4896 Performance improvements for KCF AES modes
        
*** 20,144 ****
   */
  /*
   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
   * Use is subject to license terms.
   */
  
  #include <sys/types.h>
  #include <sys/sysmacros.h>
  #include <modes/modes.h>
  #include "aes_impl.h"
  #ifndef _KERNEL
  #include <stdlib.h>
  #endif  /* !_KERNEL */
  
  
  /* Copy a 16-byte AES block from "in" to "out" */
  void
! aes_copy_block(uint8_t *in, uint8_t *out)
  {
          if (IS_P2ALIGNED2(in, out, sizeof (uint32_t))) {
!                 /* LINTED: pointer alignment */
!                 *(uint32_t *)&out[0] = *(uint32_t *)&in[0];
!                 /* LINTED: pointer alignment */
!                 *(uint32_t *)&out[4] = *(uint32_t *)&in[4];
!                 /* LINTED: pointer alignment */
!                 *(uint32_t *)&out[8] = *(uint32_t *)&in[8];
!                 /* LINTED: pointer alignment */
!                 *(uint32_t *)&out[12] = *(uint32_t *)&in[12];
          } else {
!                 AES_COPY_BLOCK(in, out);
          }
  }
  
- 
  /* XOR a 16-byte AES block of data into dst */
  void
! aes_xor_block(uint8_t *data, uint8_t *dst)
  {
          if (IS_P2ALIGNED2(dst, data, sizeof (uint32_t))) {
!                 /* LINTED: pointer alignment */
!                 *(uint32_t *)&dst[0] ^= *(uint32_t *)&data[0];
!                 /* LINTED: pointer alignment */
!                 *(uint32_t *)&dst[4] ^= *(uint32_t *)&data[4];
!                 /* LINTED: pointer alignment */
!                 *(uint32_t *)&dst[8] ^= *(uint32_t *)&data[8];
!                 /* LINTED: pointer alignment */
!                 *(uint32_t *)&dst[12] ^= *(uint32_t *)&data[12];
          } else {
!                 AES_XOR_BLOCK(data, dst);
          }
  }
  
- 
  /*
   * Encrypt multiple blocks of data according to mode.
   */
  int
  aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length,
      crypto_data_t *out)
  {
          aes_ctx_t *aes_ctx = ctx;
!         int rv;
  
          if (aes_ctx->ac_flags & CTR_MODE) {
!                 rv = ctr_mode_contiguous_blocks(ctx, data, length, out,
!                     AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
  #ifdef _KERNEL
          } else if (aes_ctx->ac_flags & CCM_MODE) {
!                 rv = ccm_mode_encrypt_contiguous_blocks(ctx, data, length,
!                     out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
!                     aes_xor_block);
          } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
!                 rv = gcm_mode_encrypt_contiguous_blocks(ctx, data, length,
!                     out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
!                     aes_xor_block);
  #endif
          } else if (aes_ctx->ac_flags & CBC_MODE) {
!                 rv = cbc_encrypt_contiguous_blocks(ctx,
!                     data, length, out, AES_BLOCK_LEN, aes_encrypt_block,
!                     aes_copy_block, aes_xor_block);
          } else {
!                 rv = ecb_cipher_contiguous_blocks(ctx, data, length, out,
!                     AES_BLOCK_LEN, aes_encrypt_block);
          }
          return (rv);
  }
  
- 
  /*
   * Decrypt multiple blocks of data according to mode.
   */
  int
  aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length,
      crypto_data_t *out)
  {
          aes_ctx_t *aes_ctx = ctx;
!         int rv;
  
          if (aes_ctx->ac_flags & CTR_MODE) {
!                 rv = ctr_mode_contiguous_blocks(ctx, data, length, out,
!                     AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
                  if (rv == CRYPTO_DATA_LEN_RANGE)
                          rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
  #ifdef _KERNEL
          } else if (aes_ctx->ac_flags & CCM_MODE) {
!                 rv = ccm_mode_decrypt_contiguous_blocks(ctx, data, length,
!                     out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
!                     aes_xor_block);
          } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
!                 rv = gcm_mode_decrypt_contiguous_blocks(ctx, data, length,
!                     out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
!                     aes_xor_block);
  #endif
          } else if (aes_ctx->ac_flags & CBC_MODE) {
!                 rv = cbc_decrypt_contiguous_blocks(ctx, data, length, out,
!                     AES_BLOCK_LEN, aes_decrypt_block, aes_copy_block,
!                     aes_xor_block);
          } else {
!                 rv = ecb_cipher_contiguous_blocks(ctx, data, length, out,
!                     AES_BLOCK_LEN, aes_decrypt_block);
                  if (rv == CRYPTO_DATA_LEN_RANGE)
                          rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
          }
          return (rv);
  }
--- 20,214 ----
   */
  /*
   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
   * Use is subject to license terms.
   */
+ /*
+  * Copyright 2015 by Saso Kiselkov. All rights reserved.
+  */
  
  #include <sys/types.h>
  #include <sys/sysmacros.h>
  #include <modes/modes.h>
  #include "aes_impl.h"
  #ifndef _KERNEL
  #include <stdlib.h>
  #endif  /* !_KERNEL */
  
+ #if defined(__amd64)
  
+ /*
+  * XORs a range of contiguous AES blocks in `data' with the blocks in
+  * `dst' and places the result in `dst'. On x86-64 this exploits the
+  * CPU's 128-bit XMM (SSE) registers to maximize performance.
+  */
+ static void
+ aes_xor_range(const uint8_t *data, uint8_t *dst, uint64_t length)
+ {
+         uint64_t i = 0;
+ 
+         /* First use the unrolled version. */
+         for (; i + 8 * AES_BLOCK_LEN <= length; i += 8 * AES_BLOCK_LEN)
+                 aes_xor_intel8(&data[i], &dst[i]);
+         /* Finish the rest in single blocks. */
+         for (; i < length; i += AES_BLOCK_LEN)
+                 aes_xor_intel(&data[i], &dst[i]);
+ }
+ 
+ #else   /* !__amd64 */
+ 
+ /*
+  * XORs a range of contiguous AES blocks in `data' with the blocks in
+  * `dst' and places the result in `dst'.
+  */
+ static void
+ aes_xor_range(const uint8_t *data, uint8_t *dst, uint64_t length)
+ {
+         uint64_t i = 0;
+ 
+         if (IS_P2ALIGNED2(dst, data, sizeof (uint64_t))) {
+                 /* Use an eightfold-unrolled loop for speed. */
+                 for (; i + 8 * AES_BLOCK_LEN < length; i += 8 * AES_BLOCK_LEN) {
+                         AES_XOR_BLOCK_ALIGNED(&data[i + 0x00], &dst[i + 0x00]);
+                         AES_XOR_BLOCK_ALIGNED(&data[i + 0x10], &dst[i + 0x10]);
+                         AES_XOR_BLOCK_ALIGNED(&data[i + 0x20], &dst[i + 0x20]);
+                         AES_XOR_BLOCK_ALIGNED(&data[i + 0x30], &dst[i + 0x30]);
+                         AES_XOR_BLOCK_ALIGNED(&data[i + 0x40], &dst[i + 0x40]);
+                         AES_XOR_BLOCK_ALIGNED(&data[i + 0x50], &dst[i + 0x50]);
+                         AES_XOR_BLOCK_ALIGNED(&data[i + 0x60], &dst[i + 0x60]);
+                         AES_XOR_BLOCK_ALIGNED(&data[i + 0x70], &dst[i + 0x70]);
+                 }
+         }
+         /* Finish the rest in single blocks. */
+         for (; i < length; i += AES_BLOCK_LEN)
+                 AES_XOR_BLOCK(&data[i], &dst[i]);
+ }
+ 
+ #endif  /* !__amd64 */
+ 
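
The aes_xor_intel8() and aes_xor_intel() routines called in the amd64 path
above are hand-optimized primitives defined elsewhere in the patch. As a
rough illustration of their intended semantics only (the _sketch names and
the use of SSE2 intrinsics in place of the patch's assembly are assumptions
here), the single-block and eight-block XOR amount to:

    #include <stdint.h>
    #include <emmintrin.h>      /* SSE2 intrinsics */

    /* XOR one 16-byte block: dst ^= data (unaligned-safe). */
    static inline void
    aes_xor_intel_sketch(const uint8_t *data, uint8_t *dst)
    {
            __m128i d = _mm_loadu_si128((const __m128i *)data);
            __m128i x = _mm_loadu_si128((const __m128i *)dst);

            _mm_storeu_si128((__m128i *)dst, _mm_xor_si128(d, x));
    }

    /* XOR eight consecutive 16-byte blocks in one call. */
    static inline void
    aes_xor_intel8_sketch(const uint8_t *data, uint8_t *dst)
    {
            for (int i = 0; i < 8 * 16; i += 16)
                    aes_xor_intel_sketch(&data[i], &dst[i]);
    }
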
  /* Copy a 16-byte AES block from "in" to "out" */
  void
! aes_copy_block(const uint8_t *in, uint8_t *out)
  {
          if (IS_P2ALIGNED2(in, out, sizeof (uint32_t))) {
!                 AES_COPY_BLOCK_ALIGNED(in, out);
          } else {
!                 AES_COPY_BLOCK_UNALIGNED(in, out);
          }
  }
  
  /* XOR a 16-byte AES block of data into dst */
  void
! aes_xor_block(const uint8_t *data, uint8_t *dst)
  {
          if (IS_P2ALIGNED2(dst, data, sizeof (uint32_t))) {
!                 AES_XOR_BLOCK_ALIGNED(data, dst);
          } else {
!                 AES_XOR_BLOCK_UNALIGNED(data, dst);
          }
  }
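
The AES_COPY_BLOCK_ALIGNED()/AES_XOR_BLOCK_ALIGNED() macros and their
_UNALIGNED() counterparts replace the word-at-a-time statements that were
open-coded on the left-hand side; the definitions presumably live in
aes_impl.h. Modeled directly on the pre-patch inline code, the aligned
variants would expand to something like:

    /* Copy an aligned 16-byte block as four 32-bit words. */
    #define AES_COPY_BLOCK_ALIGNED(in, out) {                          \
            *(uint32_t *)&(out)[0] = *(const uint32_t *)&(in)[0];      \
            *(uint32_t *)&(out)[4] = *(const uint32_t *)&(in)[4];      \
            *(uint32_t *)&(out)[8] = *(const uint32_t *)&(in)[8];      \
            *(uint32_t *)&(out)[12] = *(const uint32_t *)&(in)[12];    \
    }

    /* XOR an aligned 16-byte block of data into dst, word by word. */
    #define AES_XOR_BLOCK_ALIGNED(data, dst) {                         \
            *(uint32_t *)&(dst)[0] ^= *(const uint32_t *)&(data)[0];   \
            *(uint32_t *)&(dst)[4] ^= *(const uint32_t *)&(data)[4];   \
            *(uint32_t *)&(dst)[8] ^= *(const uint32_t *)&(data)[8];   \
            *(uint32_t *)&(dst)[12] ^= *(const uint32_t *)&(data)[12]; \
    }

The _UNALIGNED() variants would fall back to byte-at-a-time loops, as the
old AES_COPY_BLOCK()/AES_XOR_BLOCK() fallbacks did.
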
  
  /*
   * Encrypt multiple blocks of data according to mode.
   */
  int
  aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length,
      crypto_data_t *out)
  {
          aes_ctx_t *aes_ctx = ctx;
!         int rv = CRYPTO_SUCCESS;
  
+         for (size_t i = 0; i < length; i += AES_OPSZ) {
+                 size_t opsz = MIN(length - i, AES_OPSZ);
+                 AES_ACCEL_SAVESTATE(savestate);
+                 aes_accel_enter(savestate);
+ 
                  if (aes_ctx->ac_flags & CTR_MODE) {
!                         rv = ctr_mode_contiguous_blocks(ctx, &data[i], opsz,
!                             out, AES_BLOCK_LEN, aes_encrypt_block,
!                             AES_XOR_BLOCK, aes_ctr_mode);
  #ifdef _KERNEL
                  } else if (aes_ctx->ac_flags & CCM_MODE) {
!                         rv = ccm_mode_encrypt_contiguous_blocks(ctx, &data[i],
!                             opsz, out, AES_BLOCK_LEN, aes_encrypt_block,
!                             AES_COPY_BLOCK, AES_XOR_BLOCK);
                  } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
!                         rv = gcm_mode_encrypt_contiguous_blocks(ctx, &data[i],
!                             opsz, out, AES_BLOCK_LEN, aes_encrypt_block,
!                             AES_COPY_BLOCK, AES_XOR_BLOCK, aes_ctr_mode);
  #endif
                  } else if (aes_ctx->ac_flags & CBC_MODE) {
!                         rv = cbc_encrypt_contiguous_blocks(ctx, &data[i], opsz,
!                             out, AES_BLOCK_LEN, aes_encrypt_block,
!                             AES_COPY_BLOCK, AES_XOR_BLOCK, aes_encrypt_cbc);
                  } else {
!                         rv = ecb_cipher_contiguous_blocks(ctx, &data[i], opsz,
!                             out, AES_BLOCK_LEN, aes_encrypt_block,
!                             aes_encrypt_ecb);
                  }
+ 
+                 aes_accel_exit(savestate);
+ 
+                 if (rv != CRYPTO_SUCCESS)
+                         break;
+         }
+ 
          return (rv);
  }
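
AES_ACCEL_SAVESTATE(), aes_accel_enter() and aes_accel_exit() are likewise
defined elsewhere in the patch. A plausible reading, stated here as an
assumption, is that the kernel variants save and restore the FPU/XMM state
that the accelerated code borrows (hence the bounded AES_OPSZ chunks),
while in userland, where the FPU is always available, the bracket collapses
to no-ops:

    /* Hypothetical userland variants; the kernel versions would have to
     * preserve the FPU/XMM state borrowed by the accelerated code. */
    #ifndef _KERNEL
    #define AES_ACCEL_SAVESTATE(name)       void *name = NULL

    static inline void
    aes_accel_enter(void *savestate)
    {
            (void) savestate;       /* the FPU is always usable here */
    }

    static inline void
    aes_accel_exit(void *savestate)
    {
            (void) savestate;
    }
    #endif  /* !_KERNEL */
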
  
  /*
   * Decrypt multiple blocks of data according to mode.
   */
  int
  aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length,
      crypto_data_t *out)
  {
          aes_ctx_t *aes_ctx = ctx;
!         int rv = CRYPTO_SUCCESS;
  
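+         /* Chunked the same way as aes_encrypt_contiguous_blocks() above. */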
+         for (size_t i = 0; i < length; i += AES_OPSZ) {
+                 size_t opsz = MIN(length - i, AES_OPSZ);
+                 AES_ACCEL_SAVESTATE(savestate);
+                 aes_accel_enter(savestate);
+ 
                  if (aes_ctx->ac_flags & CTR_MODE) {
!                         rv = ctr_mode_contiguous_blocks(ctx, &data[i], opsz,
!                             out, AES_BLOCK_LEN, aes_encrypt_block,
!                             AES_XOR_BLOCK, aes_ctr_mode);
                          if (rv == CRYPTO_DATA_LEN_RANGE)
                                  rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
  #ifdef _KERNEL
                  } else if (aes_ctx->ac_flags & CCM_MODE) {
!                         rv = ccm_mode_decrypt_contiguous_blocks(ctx, &data[i],
!                             opsz, out, AES_BLOCK_LEN, aes_encrypt_block,
!                             AES_COPY_BLOCK, AES_XOR_BLOCK);
                  } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
!                         rv = gcm_mode_decrypt_contiguous_blocks(ctx, &data[i],
!                             opsz, out, AES_BLOCK_LEN, aes_encrypt_block,
!                             AES_COPY_BLOCK, AES_XOR_BLOCK, aes_ctr_mode);
  #endif
                  } else if (aes_ctx->ac_flags & CBC_MODE) {
!                         rv = cbc_decrypt_contiguous_blocks(ctx, &data[i],
!                             opsz, out, AES_BLOCK_LEN, aes_decrypt_block,
!                             AES_COPY_BLOCK, AES_XOR_BLOCK, aes_decrypt_ecb,
!                             aes_xor_range);
                  } else {
!                         rv = ecb_cipher_contiguous_blocks(ctx, &data[i],
!                             opsz, out, AES_BLOCK_LEN, aes_decrypt_block,
!                             aes_decrypt_ecb);
                          if (rv == CRYPTO_DATA_LEN_RANGE)
                                  rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
                  }
+ 
+                 aes_accel_exit(savestate);
+ 
+                 if (rv != CRYPTO_SUCCESS)
+                         break;
+         }
+ 
          return (rv);
  }
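
The new trailing arguments (aes_ctr_mode, aes_encrypt_ecb/aes_decrypt_ecb
and, for CBC decryption, aes_xor_range) are presumably bulk variants of the
per-block callbacks, letting the modes code hand a whole run of blocks to
the cipher at once. CBC decryption illustrates why this pays off: unlike
encryption, it has no inter-block dependency on the plaintext, so a run of
ciphertext can be decrypted as if in ECB mode and the chaining XOR applied
afterwards in one pass. A sketch of that core step, with the name and the
signature of aes_decrypt_ecb assumed for illustration:

    /*
     * Bulk CBC decryption of nblocks blocks (out-of-place, nblocks >= 1).
     * CBC gives P[i] = D(C[i]) ^ C[i-1] with C[-1] = IV, so the whole run
     * is decrypted first and the chaining XOR applied in a single pass.
     */
    static void
    cbc_decrypt_run_sketch(const void *key, const uint8_t *ct, uint8_t *pt,
        size_t nblocks, uint8_t *iv)
    {
            /* Decrypt every block as if in ECB mode. */
            aes_decrypt_ecb(key, ct, pt, nblocks * AES_BLOCK_LEN);
            /* XOR the IV into the first plaintext block... */
            aes_xor_block(iv, pt);
            /* ...and each preceding ciphertext block into the rest. */
            aes_xor_range(ct, &pt[AES_BLOCK_LEN],
                (nblocks - 1) * AES_BLOCK_LEN);
            /* The final ciphertext block becomes the next IV. */
            aes_copy_block(&ct[(nblocks - 1) * AES_BLOCK_LEN], iv);
    }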