4896 Performance improvements for KCF AES modes

*** 18,27 ****
--- 18,28 ----
   *
   * CDDL HEADER END
   */
  
  /*
   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+  * Copyright 2015 by Saso Kiselkov. All rights reserved.
   */
  #include <sys/types.h>
  #include <sys/systm.h>
  #include <sys/sysmacros.h>
*** 28,37 ****
--- 29,39 ----
  #include <netinet/in.h>
  #include "aes_impl.h"
  
  #ifndef _KERNEL
  #include <strings.h>
  #include <stdlib.h>
+ #include <sys/note.h>
  #endif  /* !_KERNEL */
  
  #ifdef __amd64
  #ifdef _KERNEL
*** 92,103 ****
  extern void aes_encrypt_impl(const uint32_t rk[], int Nr,
      const uint32_t pt[4], uint32_t ct[4]);
  extern void aes_decrypt_impl(const uint32_t rk[], int Nr,
      const uint32_t ct[4], uint32_t pt[4]);
  
! #define AES_ENCRYPT_IMPL(a, b, c, d, e) aes_encrypt_impl(a, b, c, d)
! #define AES_DECRYPT_IMPL(a, b, c, d, e) aes_decrypt_impl(a, b, c, d)
  
  #elif defined(__amd64)
  
  /* These functions are used to execute amd64 instructions for AMD or Intel: */
  extern int rijndael_key_setup_enc_amd64(uint32_t rk[],
--- 94,105 ----
  extern void aes_encrypt_impl(const uint32_t rk[], int Nr,
      const uint32_t pt[4], uint32_t ct[4]);
  extern void aes_decrypt_impl(const uint32_t rk[], int Nr,
      const uint32_t ct[4], uint32_t pt[4]);
  
! #define AES_ENCRYPT_IMPL(a, b, c, d)    aes_encrypt_impl(a, b, c, d)
! #define AES_DECRYPT_IMPL(a, b, c, d)    aes_decrypt_impl(a, b, c, d)
  
  #elif defined(__amd64)
  
  /* These functions are used to execute amd64 instructions for AMD or Intel: */
  extern int rijndael_key_setup_enc_amd64(uint32_t rk[],
*** 116,136 ****
      const uint32_t cipherKey[], uint64_t keyBits);
  extern void aes_encrypt_intel(const uint32_t rk[], int Nr,
      const uint32_t pt[4], uint32_t ct[4]);
  extern void aes_decrypt_intel(const uint32_t rk[], int Nr,
      const uint32_t ct[4], uint32_t pt[4]);
! static int intel_aes_instructions_present(void);
  
! #define AES_ENCRYPT_IMPL(a, b, c, d, e) rijndael_encrypt(a, b, c, d, e)
! #define AES_DECRYPT_IMPL(a, b, c, d, e) rijndael_decrypt(a, b, c, d, e)
  
  #else /* Generic C implementation */
!
! #define AES_ENCRYPT_IMPL(a, b, c, d, e) rijndael_encrypt(a, b, c, d)
! #define AES_DECRYPT_IMPL(a, b, c, d, e) rijndael_decrypt(a, b, c, d)
  #define rijndael_key_setup_enc_raw rijndael_key_setup_enc
  
  #endif  /* sun4u || __amd64 */
  
  #if defined(_LITTLE_ENDIAN) && !defined(__amd64)
  #define AES_BYTE_SWAP
  #endif
--- 118,158 ----
      const uint32_t cipherKey[], uint64_t keyBits);
  extern void aes_encrypt_intel(const uint32_t rk[], int Nr,
      const uint32_t pt[4], uint32_t ct[4]);
  extern void aes_decrypt_intel(const uint32_t rk[], int Nr,
      const uint32_t ct[4], uint32_t pt[4]);
+ extern void aes_encrypt_intel8(const uint32_t rk[], int Nr,
+     const void *pt, void *ct);
+ extern void aes_decrypt_intel8(const uint32_t rk[], int Nr,
+     const void *ct, void *pt);
+ extern void aes_encrypt_cbc_intel8(const uint32_t rk[], int Nr,
+     const void *pt, void *ct, const void *iv);
+ extern void aes_ctr_intel8(const uint32_t rk[], int Nr,
+     const void *input, void *output, uint64_t counter_upper_BE,
+     uint64_t counter_lower_LE);
+ extern void aes_xor_intel(const uint8_t *, uint8_t *);
! static inline int intel_aes_instructions_present(void);
  
! #ifdef _KERNEL
! /*
!  * Some form of floating-point acceleration is available, so declare these.
!  * The implementations will be in a platform-specific assembly file (e.g.
!  * amd64/aes_intel.s for SSE2/AES-NI).
!  */
! extern void aes_accel_save(void *savestate);
! extern void aes_accel_restore(void *savestate);
! #endif  /* _KERNEL */
  
  #else /* Generic C implementation */
! static void rijndael_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
!     uint32_t ct[4]);
! static void rijndael_decrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
!     uint32_t ct[4]);
  #define rijndael_key_setup_enc_raw rijndael_key_setup_enc
+ #define AES_ENCRYPT_IMPL(a, b, c, d)    rijndael_encrypt(a, b, c, d)
+ #define AES_DECRYPT_IMPL(a, b, c, d)    rijndael_decrypt(a, b, c, d)
  
  #endif  /* sun4u || __amd64 */
  
  #if defined(_LITTLE_ENDIAN) && !defined(__amd64)
  #define AES_BYTE_SWAP
  #endif
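
The eight-block entry points declared above (aes_encrypt_intel8() and friends) exist so that bulk data can keep several independent AESENC pipelines busy per core, instead of serializing on one block's latency. Below is a minimal sketch of the batching pattern they enable, processing wide 8-block strides first and finishing the tail per block; cipher8_stub() and cipher1_stub() are hypothetical stand-ins for the assembly routines, not part of the webrev:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define AES_BLOCK_LEN   16

    /* hypothetical stand-ins for aes_encrypt_intel8()/aes_encrypt_intel() */
    static void
    cipher8_stub(const uint8_t *in, uint8_t *out)
    {
            memmove(out, in, 8 * AES_BLOCK_LEN);    /* 8 blocks per call */
    }

    static void
    cipher1_stub(const uint8_t *in, uint8_t *out)
    {
            memmove(out, in, AES_BLOCK_LEN);        /* one block per call */
    }

    static void
    process_blocks(const uint8_t *in, uint8_t *out, uint64_t length)
    {
            uint64_t i = 0;

            /* wide strides first: 8 independent blocks per call */
            for (; i + 8 * AES_BLOCK_LEN <= length; i += 8 * AES_BLOCK_LEN)
                    cipher8_stub(&in[i], &out[i]);
            /* finish the tail one block at a time */
            for (; i < length; i += AES_BLOCK_LEN)
                    cipher1_stub(&in[i], &out[i]);
    }

    int
    main(void)
    {
            uint8_t buf[11 * AES_BLOCK_LEN] = { 0 }, out[11 * AES_BLOCK_LEN];

            process_blocks(buf, out, sizeof (buf)); /* one 8-block stride, 3 tail */
            (void) printf("processed %zu bytes\n", sizeof (buf));
            return (0);
    }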
*** 1138,1221 ****
   * keyBits      AES key size (128, 192, or 256 bits)
   */
  static void
  aes_setupkeys(aes_key_t *key, const uint32_t *keyarr32, int keybits)
  {
          if (intel_aes_instructions_present()) {
-                 key->flags = INTEL_AES_NI_CAPABLE;
-                 KPREEMPT_DISABLE;
                  key->nr = rijndael_key_setup_enc_intel(&(key->encr_ks.ks32[0]),
                      keyarr32, keybits);
                  key->nr = rijndael_key_setup_dec_intel(&(key->decr_ks.ks32[0]),
                      keyarr32, keybits);
-                 KPREEMPT_ENABLE;
          } else {
-                 key->flags = 0;
                  key->nr = rijndael_key_setup_enc_amd64(&(key->encr_ks.ks32[0]),
                      keyarr32, keybits);
                  key->nr = rijndael_key_setup_dec_amd64(&(key->decr_ks.ks32[0]),
                      keyarr32, keybits);
          }
          key->type = AES_32BIT_KS;
  }
  
- /*
-  * Encrypt one block of data. The block is assumed to be an array
-  * of four uint32_t values, so copy for alignment (and byte-order
-  * reversal for little endian systems might be necessary on the
-  * input and output byte streams.
-  * The size of the key schedule depends on the number of rounds
-  * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
-  *
-  * Parameters:
-  * rk   Key schedule, of aes_ks_t (60 32-bit integers)
-  * Nr   Number of rounds
-  * pt   Input block (plain text)
-  * ct   Output block (crypto text). Can overlap with pt
-  * flags        Indicates whether we're on Intel AES-NI-capable hardware
-  */
- static void
- rijndael_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
-     uint32_t ct[4], int flags) {
-         if (flags & INTEL_AES_NI_CAPABLE) {
-                 KPREEMPT_DISABLE;
-                 aes_encrypt_intel(rk, Nr, pt, ct);
-                 KPREEMPT_ENABLE;
-         } else {
-                 aes_encrypt_amd64(rk, Nr, pt, ct);
-         }
- }
- 
- /*
-  * Decrypt one block of data. The block is assumed to be an array
-  * of four uint32_t values, so copy for alignment (and byte-order
-  * reversal for little endian systems might be necessary on the
-  * input and output byte streams.
-  * The size of the key schedule depends on the number of rounds
-  * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
-  *
-  * Parameters:
-  * rk   Key schedule, of aes_ks_t (60 32-bit integers)
-  * Nr   Number of rounds
-  * ct   Input block (crypto text)
-  * pt   Output block (plain text). Can overlap with pt
-  * flags        Indicates whether we're on Intel AES-NI-capable hardware
-  */
- static void
- rijndael_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
-     uint32_t pt[4], int flags) {
-         if (flags & INTEL_AES_NI_CAPABLE) {
-                 KPREEMPT_DISABLE;
-                 aes_decrypt_intel(rk, Nr, ct, pt);
-                 KPREEMPT_ENABLE;
-         } else {
-                 aes_decrypt_amd64(rk, Nr, ct, pt);
-         }
- }
- 
  #else /* generic C implementation */
  
  /*
   * Expand the cipher key into the decryption key schedule.
   * Return the number of rounds for the given cipher key size.
--- 1160,1188 ----
   * keyBits      AES key size (128, 192, or 256 bits)
   */
  static void
  aes_setupkeys(aes_key_t *key, const uint32_t *keyarr32, int keybits)
  {
+         AES_ACCEL_SAVESTATE(savestate);
+         aes_accel_enter(savestate);
+ 
          if (intel_aes_instructions_present()) {
                  key->nr = rijndael_key_setup_enc_intel(&(key->encr_ks.ks32[0]),
                      keyarr32, keybits);
                  key->nr = rijndael_key_setup_dec_intel(&(key->decr_ks.ks32[0]),
                      keyarr32, keybits);
          } else {
                  key->nr = rijndael_key_setup_enc_amd64(&(key->encr_ks.ks32[0]),
                      keyarr32, keybits);
                  key->nr = rijndael_key_setup_dec_amd64(&(key->decr_ks.ks32[0]),
                      keyarr32, keybits);
          }
+         aes_accel_exit(savestate);
          key->type = AES_32BIT_KS;
  }
  
  #else /* generic C implementation */
  
  /*
   * Expand the cipher key into the decryption key schedule.
   * Return the number of rounds for the given cipher key size.
*** 1620,1630 ****
--- 1587,1612 ----
  #endif
          aes_setupkeys(newbie, keyarr.ka32, keyBits);
  }
  
+ #if defined(__amd64) && defined(_KERNEL)
+ void
+ aes_accel_enter(void *savestate)
+ {
+         KPREEMPT_DISABLE;
+         aes_accel_save(savestate);
+ }
+ 
+ void
+ aes_accel_exit(void *savestate)
+ {
+         aes_accel_restore(savestate);
+         KPREEMPT_ENABLE;
+ }
+ #endif  /* defined(__amd64) && defined(_KERNEL) */
+ 
  /*
   * Encrypt one block using AES.
   * Align if needed and (for x86 32-bit only) byte-swap.
   *
   * Parameters:
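
The enter/exit pair above is the contract every FPU-using path in this patch follows: preemption stays disabled between saving and restoring the SSE register file, otherwise in-kernel AES-NI use could clobber the interrupted thread's state. Here is a userland model of that pairing; KPREEMPT_*, AES_ACCEL_SAVESTATE, and the save/restore bodies are no-op stand-ins for illustration, not the webrev's definitions:

    #include <stdio.h>
    #include <stdint.h>

    /* stand-ins for the kernel primitives */
    #define KPREEMPT_DISABLE        ((void)0)  /* kernel: kpreempt_disable() */
    #define KPREEMPT_ENABLE         ((void)0)  /* kernel: kpreempt_enable() */
    /* assumed layout: room for XMM0-XMM15 plus alignment slack */
    #define AES_ACCEL_SAVESTATE(name)       uint8_t name[16 * 16 + 8]

    static void aes_accel_save(void *s)    { (void)s; /* kernel: store XMM regs */ }
    static void aes_accel_restore(void *s) { (void)s; /* kernel: reload XMM regs */ }

    static void
    aes_accel_enter(void *savestate)
    {
            KPREEMPT_DISABLE;
            aes_accel_save(savestate);
    }

    static void
    aes_accel_exit(void *savestate)
    {
            aes_accel_restore(savestate);
            KPREEMPT_ENABLE;
    }

    int
    main(void)
    {
            AES_ACCEL_SAVESTATE(savestate);

            aes_accel_enter(savestate);
            /* ... SSE/AES-NI work would go here ... */
            aes_accel_exit(savestate);
            (void) printf("enter/exit paired OK\n");
            return (0);
    }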
*** 1635,1650 ****
  int
  aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct)
  {
          aes_key_t *ksch = (aes_key_t *)ks;
  
  #ifndef AES_BYTE_SWAP
          if (IS_P2ALIGNED2(pt, ct, sizeof (uint32_t))) {
-                 /* LINTED: pointer alignment */
                  AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
                      /* LINTED: pointer alignment */
!                     (uint32_t *)pt, (uint32_t *)ct, ksch->flags);
          } else {
  #endif
                  uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
  
                  /* Copy input block into buffer */
--- 1617,1641 ----
  int
  aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct)
  {
          aes_key_t *ksch = (aes_key_t *)ks;
  
+ #ifdef __amd64
+         if (intel_aes_instructions_present())
+                 aes_encrypt_intel(&ksch->encr_ks.ks32[0], ksch->nr,
+                     /* LINTED: pointer alignment */
+                     (uint32_t *)pt, (uint32_t *)ct);
+         else
+                 aes_encrypt_amd64(&ksch->encr_ks.ks32[0], ksch->nr,
+                     /* LINTED: pointer alignment */
+                     (uint32_t *)pt, (uint32_t *)ct);
+ #else   /* !__amd64 */
  #ifndef AES_BYTE_SWAP
          if (IS_P2ALIGNED2(pt, ct, sizeof (uint32_t))) {
                  AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
                      /* LINTED: pointer alignment */
!                     (uint32_t *)pt, (uint32_t *)ct);
          } else {
  #endif
                  uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
  
                  /* Copy input block into buffer */
*** 1654,1667 ****
  #else   /* byte swap */
                  buffer[0] = htonl(*(uint32_t *)(void *)&pt[0]);
                  buffer[1] = htonl(*(uint32_t *)(void *)&pt[4]);
                  buffer[2] = htonl(*(uint32_t *)(void *)&pt[8]);
                  buffer[3] = htonl(*(uint32_t *)(void *)&pt[12]);
! #endif
  
                  AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
!                     buffer, buffer, ksch->flags);
  
                  /* Copy result from buffer to output block */
  #ifndef AES_BYTE_SWAP
                  bcopy(&buffer, ct, AES_BLOCK_LEN);
          }
--- 1645,1658 ----
  #else   /* byte swap */
                  buffer[0] = htonl(*(uint32_t *)(void *)&pt[0]);
                  buffer[1] = htonl(*(uint32_t *)(void *)&pt[4]);
                  buffer[2] = htonl(*(uint32_t *)(void *)&pt[8]);
                  buffer[3] = htonl(*(uint32_t *)(void *)&pt[12]);
! #endif  /* byte swap */
  
                  AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
!                     buffer, buffer);
  
                  /* Copy result from buffer to output block */
  #ifndef AES_BYTE_SWAP
                  bcopy(&buffer, ct, AES_BLOCK_LEN);
          }
*** 1669,1679 ****
  #else   /* byte swap */
                  *(uint32_t *)(void *)&ct[0] = htonl(buffer[0]);
                  *(uint32_t *)(void *)&ct[4] = htonl(buffer[1]);
                  *(uint32_t *)(void *)&ct[8] = htonl(buffer[2]);
                  *(uint32_t *)(void *)&ct[12] = htonl(buffer[3]);
! #endif
  
          return (CRYPTO_SUCCESS);
  }
  
  /*
--- 1660,1672 ----
  #else   /* byte swap */
                  *(uint32_t *)(void *)&ct[0] = htonl(buffer[0]);
                  *(uint32_t *)(void *)&ct[4] = htonl(buffer[1]);
                  *(uint32_t *)(void *)&ct[8] = htonl(buffer[2]);
                  *(uint32_t *)(void *)&ct[12] = htonl(buffer[3]);
! #endif  /* byte swap */
! #endif  /* !__amd64 */
  
          return (CRYPTO_SUCCESS);
  }
  
  /*
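
On the non-amd64 little-endian (AES_BYTE_SWAP) path above, each block is staged through an aligned uint32_t buffer with htonl() applied on the way in and out, because the generic tables operate on big-endian words. A self-contained sketch of that marshaling, using memcpy() as the safe equivalent of the LINTED unaligned loads and a no-op stand-in for AES_ENCRYPT_IMPL:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>  /* htonl */

    #define AES_BLOCK_LEN   16

    static void
    cipher_stub(uint32_t block[4])
    {
            (void) block;   /* real code: AES rounds on big-endian words */
    }

    static void
    encrypt_block_swapped(const uint8_t *pt, uint8_t *ct)
    {
            uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
            uint32_t w;
            int j;

            for (j = 0; j < 4; j++) {               /* bytes -> BE words */
                    memcpy(&w, &pt[4 * j], 4);      /* safe unaligned load */
                    buffer[j] = htonl(w);
            }
            cipher_stub(buffer);
            for (j = 0; j < 4; j++) {               /* BE words -> bytes */
                    w = htonl(buffer[j]);
                    memcpy(&ct[4 * j], &w, 4);
            }
    }

    int
    main(void)
    {
            uint8_t in[AES_BLOCK_LEN] = "0123456789abcde", out[AES_BLOCK_LEN];

            encrypt_block_swapped(in, out);
            (void) printf("%.15s\n", out);  /* stub cipher: bytes round-trip */
            return (0);
    }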
*** 1688,1703 ****
  int
  aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt)
  {
          aes_key_t *ksch = (aes_key_t *)ks;
  
  #ifndef AES_BYTE_SWAP
          if (IS_P2ALIGNED2(ct, pt, sizeof (uint32_t))) {
-                 /* LINTED: pointer alignment */
                  AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
                      /* LINTED: pointer alignment */
!                     (uint32_t *)ct, (uint32_t *)pt, ksch->flags);
          } else {
  #endif
                  uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
  
                  /* Copy input block into buffer */
--- 1681,1705 ----
  int
  aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt)
  {
          aes_key_t *ksch = (aes_key_t *)ks;
  
+ #ifdef __amd64
+         if (intel_aes_instructions_present())
+                 aes_decrypt_intel(&ksch->decr_ks.ks32[0], ksch->nr,
+                     /* LINTED: pointer alignment */
+                     (uint32_t *)ct, (uint32_t *)pt);
+         else
+                 aes_decrypt_amd64(&ksch->decr_ks.ks32[0], ksch->nr,
+                     /* LINTED: pointer alignment */
+                     (uint32_t *)ct, (uint32_t *)pt);
+ #else   /* !__amd64 */
  #ifndef AES_BYTE_SWAP
          if (IS_P2ALIGNED2(ct, pt, sizeof (uint32_t))) {
                  AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
                      /* LINTED: pointer alignment */
!                     (uint32_t *)ct, (uint32_t *)pt);
          } else {
  #endif
                  uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
  
                  /* Copy input block into buffer */
*** 1707,1720 ****
  #else   /* byte swap */
                  buffer[0] = htonl(*(uint32_t *)(void *)&ct[0]);
                  buffer[1] = htonl(*(uint32_t *)(void *)&ct[4]);
                  buffer[2] = htonl(*(uint32_t *)(void *)&ct[8]);
                  buffer[3] = htonl(*(uint32_t *)(void *)&ct[12]);
! #endif
  
                  AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
!                     buffer, buffer, ksch->flags);
  
                  /* Copy result from buffer to output block */
  #ifndef AES_BYTE_SWAP
                  bcopy(&buffer, pt, AES_BLOCK_LEN);
          }
--- 1709,1722 ----
  #else   /* byte swap */
                  buffer[0] = htonl(*(uint32_t *)(void *)&ct[0]);
                  buffer[1] = htonl(*(uint32_t *)(void *)&ct[4]);
                  buffer[2] = htonl(*(uint32_t *)(void *)&ct[8]);
                  buffer[3] = htonl(*(uint32_t *)(void *)&ct[12]);
! #endif  /* byte swap */
  
                  AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
!                     buffer, buffer);
  
                  /* Copy result from buffer to output block */
  #ifndef AES_BYTE_SWAP
                  bcopy(&buffer, pt, AES_BLOCK_LEN);
          }
*** 1722,1738 ****
  #else   /* byte swap */
                  *(uint32_t *)(void *)&pt[0] = htonl(buffer[0]);
                  *(uint32_t *)(void *)&pt[4] = htonl(buffer[1]);
                  *(uint32_t *)(void *)&pt[8] = htonl(buffer[2]);
                  *(uint32_t *)(void *)&pt[12] = htonl(buffer[3]);
! #endif
  
          return (CRYPTO_SUCCESS);
  }
  
  /*
   * Allocate key schedule for AES.
   *
   * Return the pointer and set size to the number of bytes allocated.
   * Memory allocated must be freed by the caller when done.
   *
--- 1724,1954 ----
  #else   /* byte swap */
                  *(uint32_t *)(void *)&pt[0] = htonl(buffer[0]);
                  *(uint32_t *)(void *)&pt[4] = htonl(buffer[1]);
                  *(uint32_t *)(void *)&pt[8] = htonl(buffer[2]);
                  *(uint32_t *)(void *)&pt[12] = htonl(buffer[3]);
! #endif  /* byte swap */
! #endif  /* !__amd64 */
  
          return (CRYPTO_SUCCESS);
  }
  
+ #define ECB_LOOP(ciph_func)                                             \
+         do {                                                            \
+                 for (; i < length; i += AES_BLOCK_LEN)                  \
+                         ciph_func;                                      \
+                 _NOTE(CONSTCOND)                                        \
+         } while (0)
+ #define ECB_LOOP_4P(ciph_func, enc_or_dec, in, out)                     \
+         ECB_LOOP(ciph_func(&ksch->enc_or_dec ## r_ks.ks32[0],           \
+             ksch->nr, (void *)&in[i], (void *)&out[i]))
+ #define ECB_LOOP_3P(ciph_func, in, out)                                 \
+         ECB_LOOP(ciph_func(ksch, (void *)&in[i], (void *)&out[i]))
+ 
+ #ifdef  __amd64
+ #define ECB_INTEL_IMPL(enc_or_dec, in, out)                             \
+         do {                                                            \
+                 if (intel_aes_instructions_present()) {                 \
+                         /* first use the accelerated function */        \
+                         for (; i + 8 * AES_BLOCK_LEN <= length;         \
+                             i += 8 * AES_BLOCK_LEN)                     \
+                                 aes_ ## enc_or_dec ## rypt_intel8(      \
+                                     &ksch->enc_or_dec ## r_ks.ks32[0],  \
+                                     ksch->nr, &in[i], &out[i]);         \
+                         /* finish off the remainder per-block */        \
+                         ECB_LOOP_4P(aes_ ## enc_or_dec ## rypt_intel,   \
+                             enc_or_dec, in, out);                       \
+                 } else {                                                \
+                         ECB_LOOP_4P(aes_ ## enc_or_dec ## rypt_amd64,   \
+                             enc_or_dec, in, out);                       \
+                 }                                                       \
+                 _NOTE(CONSTCOND)                                        \
+         } while (0)
+ #endif  /* __amd64 */
+ 
+ /*
+  * Perform AES ECB encryption on a sequence of blocks. On x86-64 CPUs with
+  * the AES-NI extension, this performs the encryption in increments of 8
+  * blocks at a time, exploiting instruction parallelism more efficiently.
+  * On other platforms, this simply encrypts the blocks in sequence.
+  */
+ int
+ aes_encrypt_ecb(const void *ks, const uint8_t *pt, uint8_t *ct,
+     uint64_t length)
+ {
+         aes_key_t *ksch = (aes_key_t *)ks;
+         uint64_t i = 0;
+ 
+ #ifdef  __amd64
+         ECB_INTEL_IMPL(enc, pt, ct);
+ #elif defined(sun4u)
+         ECB_LOOP_4P(aes_encrypt_impl, enc, pt, ct);
+ #else   /* Generic C implementation */
+         ECB_LOOP_3P((void) aes_encrypt_block, pt, ct);
+ #endif  /* Generic C implementation */
+ 
+         return (CRYPTO_SUCCESS);
+ }
+ 
+ /*
+  * Same as aes_encrypt_ecb, but performs decryption.
+  */
+ int
+ aes_decrypt_ecb(const void *ks, const uint8_t *ct, uint8_t *pt,
+     uint64_t length)
+ {
+         aes_key_t *ksch = (aes_key_t *)ks;
+         uint64_t i = 0;
+ 
+ #ifdef  __amd64
+         ECB_INTEL_IMPL(dec, ct, pt);
+ #elif defined(sun4u)
+         ECB_LOOP_4P(aes_decrypt_impl, dec, ct, pt);
+ #else   /* Generic C implementation */
+         ECB_LOOP_3P((void) aes_decrypt_block, ct, pt);
+ #endif  /* Generic C implementation */
+ 
+         return (CRYPTO_SUCCESS);
+ }
+ 
+ #ifdef  __amd64
+ #undef  ECB_INTEL_IMPL
+ #endif  /* __amd64 */
+ 
+ #undef  ECB_LOOP
+ #undef  ECB_LOOP_4P
+ #undef  ECB_LOOP_3P
+ 
+ #define CBC_LOOP(enc_func, xor_func)                                    \
+         do {                                                            \
+                 for (; i < length; i += AES_BLOCK_LEN) {                \
+                         /* copy IV to ciphertext */                     \
+                         bcopy(iv, &ct[i], AES_BLOCK_LEN);               \
+                         /* XOR the plaintext into the IV copy */        \
+                         xor_func(&pt[i], &ct[i]);                       \
+                         /* encrypt the block in the output region */    \
+                         enc_func;                                       \
+                         iv = &ct[i];                                    \
+                 }                                                       \
+                 _NOTE(CONSTCOND)                                        \
+         } while (0)
+ #define CBC_LOOP_4P(enc_func, xor_func)                                 \
+         CBC_LOOP(enc_func(&ksch->encr_ks.ks32[0],                       \
+             ksch->nr, (void *)&ct[i], (void *)&ct[i]), xor_func)
+ #define CBC_LOOP_3P(enc_func, xor_func)                                 \
+         CBC_LOOP(enc_func(ksch, (void *)&ct[i], (void *)&ct[i]),        \
+             xor_func)
+ 
+ /*
+  * Encrypts a sequence of consecutive AES blocks in CBC mode. On x86-64
+  * with the AES-NI extension, the encryption is performed on 8 blocks at
+  * a time using an optimized assembly implementation, giving a speed boost
+  * of around 75%. On other platforms, this simply performs CBC encryption
+  * in sequence on the blocks.
+  *
+  * Decryption acceleration is implemented in the kernel kcf block cipher
+  * modes code (cbc.c), because there it doesn't require a complete
+  * hand-tuned CBC implementation in assembly.
+  */
+ int
+ aes_encrypt_cbc(const void *ks, const uint8_t *pt, uint8_t *ct,
+     const uint8_t *iv, uint64_t length)
+ {
+         aes_key_t *ksch = (aes_key_t *)ks;
+         size_t i = 0;
+ 
+ #ifdef  __amd64
+         if (intel_aes_instructions_present()) {
+                 for (; i + 8 * AES_BLOCK_LEN <= length;
+                     i += 8 * AES_BLOCK_LEN) {
+                         aes_encrypt_cbc_intel8(&ksch->encr_ks.ks32[0],
+                             ksch->nr, &pt[i], &ct[i], iv);
+                         iv = &ct[i + 7 * AES_BLOCK_LEN];
+                 }
+                 CBC_LOOP_4P(aes_encrypt_intel, aes_xor_intel);
+         } else {
+                 CBC_LOOP_4P(aes_encrypt_amd64, aes_xor_intel);
+         }
+ #elif defined(sun4u)
+         CBC_LOOP_4P(aes_encrypt_impl, aes_xor_block);
+ #else   /* Generic C implementation */
+         CBC_LOOP_3P((void) aes_encrypt_block, aes_xor_block);
+ #endif  /* Generic C implementation */
+ 
+         return (CRYPTO_SUCCESS);
+ }
+ #undef  CBC_LOOP
+ #undef  CBC_LOOP_4P
+ #undef  CBC_LOOP_3P
+ 
+ #define CTR_LOOP(enc_func, xor_func)                                    \
+         do {                                                            \
+                 for (; i < length; i += AES_BLOCK_LEN) {                \
+                         /* set up counter in output region */           \
+                         *(uint64_t *)(void *)&output[i] = counter[0];   \
+                         *(uint64_t *)(void *)&output[i + 8] =           \
+                             htonll(counter[1]++);                       \
+                         /* encrypt counter in output region */          \
+                         enc_func;                                       \
+                         /* XOR encrypted counter with input */          \
+                         xor_func(&input[i], &output[i]);                \
+                 }                                                       \
+                 _NOTE(CONSTCOND)                                        \
+         } while (0)
+ #define CTR_LOOP_4P(enc_func, xor_func)                                 \
+         CTR_LOOP(enc_func(&ksch->encr_ks.ks32[0], ksch->nr,             \
+             (void *)&output[i], (void *)&output[i]), xor_func)
+ #define CTR_LOOP_3P(enc_func, xor_func)                                 \
+         CTR_LOOP(enc_func(ksch, (void *)&output[i], (void *)&output[i]),\
+             xor_func)
+ 
+ /*
+  * Performs high-performance counter mode encryption and decryption on
+  * a sequence of blocks. In CTR mode, encryption and decryption are the
+  * same operation, just with the plaintext and ciphertext reversed:
+  *      plaintext = CTR(CTR(plaintext, K), K)
+  * Blocks also do not depend on each other, so it is an excellent mode
+  * when high performance is required and data authentication/integrity
+  * checking is provided via some other means, or isn't necessary.
+  *
+  * On x86-64 with the AES-NI extension, this code performs CTR mode
+  * encryption in parallel on 8 blocks at a time and can provide in
+  * excess of 3 GB/s/core of encryption/decryption performance (<1 CPB).
+  */
+ int
+ aes_ctr_mode(const void *ks, const uint8_t *input, uint8_t *output,
+     uint64_t length, uint64_t counter[2])
+ {
+         aes_key_t *ksch = (aes_key_t *)ks;
+         uint64_t i = 0;
+ 
+         /* swap the lower counter part to host order for arithmetic */
+         counter[1] = ntohll(counter[1]);
+ 
+ #ifdef  __amd64
+         if (intel_aes_instructions_present()) {
+                 /* first use the wide-register accelerated function */
+                 for (; i + 8 * AES_BLOCK_LEN <= length;
+                     i += 8 * AES_BLOCK_LEN) {
+                         aes_ctr_intel8(&ksch->encr_ks.ks32[0], ksch->nr,
+                             &input[i], &output[i], counter[0], counter[1]);
+                         counter[1] += 8;
+                 }
+                 /* finish off the remainder using the slow per-block method */
+                 CTR_LOOP_4P(aes_encrypt_intel, aes_xor_intel);
+         } else {
+                 CTR_LOOP_4P(aes_encrypt_amd64, aes_xor_intel);
+         }
+ #elif defined(sun4u)
+         CTR_LOOP_4P(aes_encrypt_impl, aes_xor_block);
+ #else   /* Generic C implementation */
+         CTR_LOOP_3P((void) aes_encrypt_block, aes_xor_block);
+ #endif  /* Generic C implementation */
+ 
+         /* swap the lower counter part back to big endian */
+         counter[1] = htonll(counter[1]);
+ 
+         return (CRYPTO_SUCCESS);
+ }
+ #undef  CTR_LOOP
+ 
  /*
   * Allocate key schedule for AES.
   *
   * Return the pointer and set size to the number of bytes allocated.
   * Memory allocated must be freed by the caller when done.
   *
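
The CTR comment above leans on the mode being an involution: applying the same keystream twice returns the plaintext, which is why one function serves for both directions. Below is a toy, self-contained demonstration of that property and of the per-block counter/XOR structure; a trivial (insecure) mixing function stands in for the AES block cipher, and only the low 64 bits of the counter are modeled:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <assert.h>

    #define BLOCK   16

    /* stand-in for encrypting one counter block with AES */
    static void
    toy_cipher(const uint8_t ctr[BLOCK], uint8_t out[BLOCK])
    {
            for (int j = 0; j < BLOCK; j++)
                    out[j] = (uint8_t)(ctr[j] * 167 + 13);  /* not secure! */
    }

    static void
    toy_ctr(const uint8_t *in, uint8_t *out, size_t len, uint64_t counter)
    {
            for (size_t i = 0; i < len; i += BLOCK) {
                    uint8_t ctrblk[BLOCK] = { 0 }, ks[BLOCK];

                    /* store the counter big-endian in the low 8 bytes */
                    for (int j = 0; j < 8; j++)
                            ctrblk[15 - j] = (uint8_t)(counter >> (8 * j));
                    counter++;
                    toy_cipher(ctrblk, ks);
                    /* XOR the keystream with the input */
                    for (int j = 0; j < BLOCK; j++)
                            out[i + j] = in[i + j] ^ ks[j];
            }
    }

    int
    main(void)
    {
            uint8_t pt[32] = "CTR is an involution, see?", buf[32];

            toy_ctr(pt, buf, sizeof (pt), 42);      /* "encrypt" */
            toy_ctr(buf, buf, sizeof (buf), 42);    /* same op "decrypts" */
            assert(memcmp(pt, buf, sizeof (pt)) == 0);
            (void) printf("round-trip OK\n");
            return (0);
    }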
*** 1760,1777 ****
  }
  
  #ifdef __amd64
  
  /*
!  * Return 1 if executing on Intel with AES-NI instructions,
!  * otherwise 0 (i.e., Intel without AES-NI or AMD64).
   * Cache the result, as the CPU can't change.
   *
   * Note: the userland version uses getisax(). The kernel version uses
   * global variable x86_featureset.
   */
! static int
  intel_aes_instructions_present(void)
  {
          static int cached_result = -1;
  
          if (cached_result == -1) { /* first time */
--- 1976,1992 ----
  }
  
  #ifdef __amd64
  
  /*
!  * Return 1 if executing on x86-64 with AES-NI instructions, otherwise 0.
   * Cache the result, as the CPU can't change.
   *
   * Note: the userland version uses getisax(). The kernel version uses
   * global variable x86_featureset.
   */
! static inline int
  intel_aes_instructions_present(void)
  {
          static int cached_result = -1;
  
          if (cached_result == -1) { /* first time */
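
As the comment notes, the userland build detects AES-NI via getisax(2) and caches the answer, since the CPU's capabilities can't change at runtime. A sketch of that detection pattern on illumos/x86 follows; it assumes the AV_386_AES capability bit from <sys/auxv_386.h> (pulled in by <sys/auxv.h>), and the -1 sentinel makes the query happen only once per process:

    #include <sys/auxv.h>
    #include <stdio.h>
    #include <stdint.h>

    static int
    intel_aes_instructions_present(void)
    {
            static int cached_result = -1;

            if (cached_result == -1) {      /* first time */
                    uint32_t ui = 0;

                    (void) getisax(&ui, 1);
                    cached_result = (ui & AV_386_AES) != 0;
            }
            return (cached_result);
    }

    int
    main(void)
    {
            (void) printf("AES-NI %savailable\n",
                intel_aes_instructions_present() ? "" : "not ");
            return (0);
    }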