4896 Performance improvements for KCF AES modes

@@ -18,10 +18,11 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 by Saso Kiselkov. All rights reserved.
  */
 
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>

@@ -28,10 +29,11 @@
 #include <netinet/in.h>
 #include "aes_impl.h"
 #ifndef _KERNEL
 #include <strings.h>
 #include <stdlib.h>
+#include <sys/note.h>
 #endif  /* !_KERNEL */
 
 #ifdef __amd64
 
 #ifdef _KERNEL

@@ -92,12 +94,12 @@
 extern void aes_encrypt_impl(const uint32_t rk[], int Nr, const uint32_t pt[4],
         uint32_t ct[4]);
 extern void aes_decrypt_impl(const uint32_t rk[], int Nr, const uint32_t ct[4],
         uint32_t pt[4]);
 
-#define AES_ENCRYPT_IMPL(a, b, c, d, e) aes_encrypt_impl(a, b, c, d)
-#define AES_DECRYPT_IMPL(a, b, c, d, e) aes_decrypt_impl(a, b, c, d)
+#define AES_ENCRYPT_IMPL(a, b, c, d)    aes_encrypt_impl(a, b, c, d)
+#define AES_DECRYPT_IMPL(a, b, c, d)    aes_decrypt_impl(a, b, c, d)
 
 #elif defined(__amd64)
 
 /* These functions are used to execute amd64 instructions for AMD or Intel: */
 extern int rijndael_key_setup_enc_amd64(uint32_t rk[],

@@ -116,21 +118,41 @@
         const uint32_t cipherKey[], uint64_t keyBits);
 extern void aes_encrypt_intel(const uint32_t rk[], int Nr,
         const uint32_t pt[4], uint32_t ct[4]);
 extern void aes_decrypt_intel(const uint32_t rk[], int Nr,
         const uint32_t ct[4], uint32_t pt[4]);
+extern void aes_encrypt_intel8(const uint32_t rk[], int Nr,
+        const void *pt, void *ct);
+extern void aes_decrypt_intel8(const uint32_t rk[], int Nr,
+        const void *ct, void *pt);
+extern void aes_encrypt_cbc_intel8(const uint32_t rk[], int Nr,
+        const void *pt, void *ct, const void *iv);
+extern void aes_ctr_intel8(const uint32_t rk[], int Nr,
+        const void *input, void *output, uint64_t counter_upper_BE,
+        uint64_t counter_lower_LE);
+extern void aes_xor_intel(const uint8_t *, uint8_t *);
 
-static int intel_aes_instructions_present(void);
+static inline int intel_aes_instructions_present(void);
 
-#define AES_ENCRYPT_IMPL(a, b, c, d, e) rijndael_encrypt(a, b, c, d, e)
-#define AES_DECRYPT_IMPL(a, b, c, d, e) rijndael_decrypt(a, b, c, d, e)
+#ifdef  _KERNEL
+/*
+ * Some form of floating-point (SIMD) acceleration is available, so declare
+ * the FPU-state save/restore hooks used around it. The implementations live
+ * in a platform-specific assembly file (e.g. amd64/aes_intel.s for
+ * SSE2/AES-NI).
+ */
+extern void aes_accel_save(void *savestate);
+extern void aes_accel_restore(void *savestate);
+#endif  /* _KERNEL */
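
Note: AES_ACCEL_SAVESTATE() is used below but is not defined in this file;
it presumably ships elsewhere in this change (likely aes_impl.h). As a rough
sketch of the idea (an illustration, not the actual definition from this
change), it only needs to reserve an aligned buffer large enough for the
XMM registers the assembly routines save and restore:

        /* hypothetical sketch, not the macro shipped with this change */
        #define AES_ACCEL_SAVESTATE(name) \
                uint8_t name[16 * 16] __attribute__((aligned(16)))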
 
 #else /* Generic C implementation */
-
-#define AES_ENCRYPT_IMPL(a, b, c, d, e) rijndael_encrypt(a, b, c, d)
-#define AES_DECRYPT_IMPL(a, b, c, d, e) rijndael_decrypt(a, b, c, d)
+static void rijndael_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
+    uint32_t ct[4]);
+static void rijndael_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
+    uint32_t pt[4]);
 #define rijndael_key_setup_enc_raw      rijndael_key_setup_enc
+#define AES_ENCRYPT_IMPL(a, b, c, d)    rijndael_encrypt(a, b, c, d)
+#define AES_DECRYPT_IMPL(a, b, c, d)    rijndael_decrypt(a, b, c, d)
 #endif  /* sun4u || __amd64 */
 
 #if defined(_LITTLE_ENDIAN) && !defined(__amd64)
 #define AES_BYTE_SWAP
 #endif

@@ -1138,84 +1160,29 @@
  * keyBits      AES key size (128, 192, or 256 bits)
  */
 static void
 aes_setupkeys(aes_key_t *key, const uint32_t *keyarr32, int keybits)
 {
+        AES_ACCEL_SAVESTATE(savestate);
+        aes_accel_enter(savestate);
+
         if (intel_aes_instructions_present()) {
-                key->flags = INTEL_AES_NI_CAPABLE;
-                KPREEMPT_DISABLE;
                 key->nr = rijndael_key_setup_enc_intel(&(key->encr_ks.ks32[0]),
                     keyarr32, keybits);
                 key->nr = rijndael_key_setup_dec_intel(&(key->decr_ks.ks32[0]),
                     keyarr32, keybits);
-                KPREEMPT_ENABLE;
         } else {
-                key->flags = 0;
                 key->nr = rijndael_key_setup_enc_amd64(&(key->encr_ks.ks32[0]),
                     keyarr32, keybits);
                 key->nr = rijndael_key_setup_dec_amd64(&(key->decr_ks.ks32[0]),
                     keyarr32, keybits);
         }
 
+        aes_accel_exit(savestate);
         key->type = AES_32BIT_KS;
 }
 
-/*
- * Encrypt one block of data. The block is assumed to be an array
- * of four uint32_t values, so copy for alignment (and byte-order
- * reversal for little endian systems might be necessary on the
- * input and output byte streams.
- * The size of the key schedule depends on the number of rounds
- * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
- *
- * Parameters:
- * rk           Key schedule, of aes_ks_t (60 32-bit integers)
- * Nr           Number of rounds
- * pt           Input block (plain text)
- * ct           Output block (crypto text).  Can overlap with pt
- * flags        Indicates whether we're on Intel AES-NI-capable hardware
- */
-static void
-rijndael_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
-    uint32_t ct[4], int flags) {
-        if (flags & INTEL_AES_NI_CAPABLE) {
-                KPREEMPT_DISABLE;
-                aes_encrypt_intel(rk, Nr, pt, ct);
-                KPREEMPT_ENABLE;
-        } else {
-                aes_encrypt_amd64(rk, Nr, pt, ct);
-        }
-}
-
-/*
- * Decrypt one block of data. The block is assumed to be an array
- * of four uint32_t values, so copy for alignment (and byte-order
- * reversal for little endian systems might be necessary on the
- * input and output byte streams.
- * The size of the key schedule depends on the number of rounds
- * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
- *
- * Parameters:
- * rk           Key schedule, of aes_ks_t (60 32-bit integers)
- * Nr           Number of rounds
- * ct           Input block (crypto text)
- * pt           Output block (plain text). Can overlap with pt
- * flags        Indicates whether we're on Intel AES-NI-capable hardware
- */
-static void
-rijndael_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
-    uint32_t pt[4], int flags) {
-        if (flags & INTEL_AES_NI_CAPABLE) {
-                KPREEMPT_DISABLE;
-                aes_decrypt_intel(rk, Nr, ct, pt);
-                KPREEMPT_ENABLE;
-        } else {
-                aes_decrypt_amd64(rk, Nr, ct, pt);
-        }
-}
-
-
 #else /* generic C implementation */
 
 /*
  *  Expand the cipher key into the decryption key schedule.
  *  Return the number of rounds for the given cipher key size.

@@ -1620,11 +1587,26 @@
 #endif
 
         aes_setupkeys(newbie, keyarr.ka32, keyBits);
 }
 
+#if     defined(__amd64) && defined(_KERNEL)
+void
+aes_accel_enter(void *savestate)
+{
+        KPREEMPT_DISABLE;
+        aes_accel_save(savestate);
+}
 
+void
+aes_accel_exit(void *savestate)
+{
+        aes_accel_restore(savestate);
+        KPREEMPT_ENABLE;
+}
+#endif  /* defined(__amd64) && defined(_KERNEL) */
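
Note: combined with aes_setupkeys() above, this is the bracketing pattern
every in-kernel AES-NI call is expected to follow (a sketch using the
declarations from this file; key is an aes_key_t *, and pt/ct point to 8
contiguous blocks for the intel8 variant):

        AES_ACCEL_SAVESTATE(savestate);

        aes_accel_enter(savestate);     /* disable preemption, save FPU */
        aes_encrypt_intel8(&key->encr_ks.ks32[0], key->nr, pt, ct);
        aes_accel_exit(savestate);      /* restore FPU, enable preemption */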
+
 /*
  * Encrypt one block using AES.
  * Align if needed and (for x86 32-bit only) byte-swap.
  *
  * Parameters:

@@ -1635,16 +1617,25 @@
 int
 aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct)
 {
         aes_key_t       *ksch = (aes_key_t *)ks;
 
+#ifdef  __amd64
+        if (intel_aes_instructions_present())
+                aes_encrypt_intel(&ksch->encr_ks.ks32[0], ksch->nr,
+                    /* LINTED:  pointer alignment */
+                    (uint32_t *)pt, (uint32_t *)ct);
+        else
+                aes_encrypt_amd64(&ksch->encr_ks.ks32[0], ksch->nr,
+                    /* LINTED:  pointer alignment */
+                    (uint32_t *)pt, (uint32_t *)ct);
+#else   /* !__amd64 */
 #ifndef AES_BYTE_SWAP
         if (IS_P2ALIGNED2(pt, ct, sizeof (uint32_t))) {
-                /* LINTED:  pointer alignment */
                 AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
                     /* LINTED:  pointer alignment */
-                    (uint32_t *)pt, (uint32_t *)ct, ksch->flags);
+                    (uint32_t *)pt, (uint32_t *)ct);
         } else {
 #endif
                 uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
 
                 /* Copy input block into buffer */

@@ -1654,14 +1645,14 @@
 #else   /* byte swap */
                 buffer[0] = htonl(*(uint32_t *)(void *)&pt[0]);
                 buffer[1] = htonl(*(uint32_t *)(void *)&pt[4]);
                 buffer[2] = htonl(*(uint32_t *)(void *)&pt[8]);
                 buffer[3] = htonl(*(uint32_t *)(void *)&pt[12]);
-#endif
+#endif  /* byte swap */
 
                 AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
-                    buffer, buffer, ksch->flags);
+                    buffer, buffer);
 
                 /* Copy result from buffer to output block */
 #ifndef AES_BYTE_SWAP
                 bcopy(&buffer, ct, AES_BLOCK_LEN);
         }

@@ -1669,11 +1660,13 @@
 #else   /* byte swap */
                 *(uint32_t *)(void *)&ct[0] = htonl(buffer[0]);
                 *(uint32_t *)(void *)&ct[4] = htonl(buffer[1]);
                 *(uint32_t *)(void *)&ct[8] = htonl(buffer[2]);
                 *(uint32_t *)(void *)&ct[12] = htonl(buffer[3]);
-#endif
+#endif  /* byte swap */
+#endif  /* !__amd64 */
+
         return (CRYPTO_SUCCESS);
 }
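
Note: a minimal single-block usage sketch of the entry point above, with
illustrative buffer names (ks comes from the key-schedule allocation and
setup routines in this module; the function always returns CRYPTO_SUCCESS):

        uint8_t pt[AES_BLOCK_LEN] = { 0 };
        uint8_t ct[AES_BLOCK_LEN];

        (void) aes_encrypt_block(ks, pt, ct);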
 
 
 /*

@@ -1688,16 +1681,25 @@
 int
 aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt)
 {
         aes_key_t       *ksch = (aes_key_t *)ks;
 
+#ifdef  __amd64
+        if (intel_aes_instructions_present())
+                aes_decrypt_intel(&ksch->decr_ks.ks32[0], ksch->nr,
+                    /* LINTED:  pointer alignment */
+                    (uint32_t *)ct, (uint32_t *)pt);
+        else
+                aes_decrypt_amd64(&ksch->decr_ks.ks32[0], ksch->nr,
+                    /* LINTED:  pointer alignment */
+                    (uint32_t *)ct, (uint32_t *)pt);
+#else   /* !__amd64 */
 #ifndef AES_BYTE_SWAP
         if (IS_P2ALIGNED2(ct, pt, sizeof (uint32_t))) {
-                /* LINTED:  pointer alignment */
                 AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
                     /* LINTED:  pointer alignment */
-                    (uint32_t *)ct, (uint32_t *)pt, ksch->flags);
+                    (uint32_t *)ct, (uint32_t *)pt);
         } else {
 #endif
                 uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
 
                 /* Copy input block into buffer */

@@ -1707,14 +1709,14 @@
 #else   /* byte swap */
                 buffer[0] = htonl(*(uint32_t *)(void *)&ct[0]);
                 buffer[1] = htonl(*(uint32_t *)(void *)&ct[4]);
                 buffer[2] = htonl(*(uint32_t *)(void *)&ct[8]);
                 buffer[3] = htonl(*(uint32_t *)(void *)&ct[12]);
-#endif
+#endif  /* byte swap */
 
                 AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
-                    buffer, buffer, ksch->flags);
+                    buffer, buffer);
 
                 /* Copy result from buffer to output block */
 #ifndef AES_BYTE_SWAP
                 bcopy(&buffer, pt, AES_BLOCK_LEN);
         }

@@ -1722,17 +1724,231 @@
 #else   /* byte swap */
         *(uint32_t *)(void *)&pt[0] = htonl(buffer[0]);
         *(uint32_t *)(void *)&pt[4] = htonl(buffer[1]);
         *(uint32_t *)(void *)&pt[8] = htonl(buffer[2]);
         *(uint32_t *)(void *)&pt[12] = htonl(buffer[3]);
-#endif
+#endif  /* byte swap */
+#endif  /* !__amd64 */
 
         return (CRYPTO_SUCCESS);
 }
 
+#define ECB_LOOP(ciph_func)                                             \
+        do {                                                            \
+                for (; i < length; i += AES_BLOCK_LEN)                  \
+                        ciph_func;                                      \
+                _NOTE(CONSTCOND)                                        \
+        } while (0)
+#define ECB_LOOP_4P(ciph_func, enc_or_dec, in, out)                     \
+        ECB_LOOP(ciph_func(&ksch->enc_or_dec ## r_ks.ks32[0],           \
+            ksch->nr, (void *)&in[i], (void *)&out[i]))
+#define ECB_LOOP_3P(ciph_func, in, out)                                 \
+        ECB_LOOP(ciph_func(ksch, (void *)&in[i], (void *)&out[i]))
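
Note: the token pasting in ECB_LOOP_4P selects the matching key schedule
(encr_ks or decr_ks). For example, ECB_LOOP_4P(aes_encrypt_amd64, enc, pt,
ct) expands roughly to:

        for (; i < length; i += AES_BLOCK_LEN)
                aes_encrypt_amd64(&ksch->encr_ks.ks32[0], ksch->nr,
                    (void *)&pt[i], (void *)&ct[i]);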
 
+#ifdef  __amd64
+#define ECB_INTEL_IMPL(enc_or_dec, in, out)                             \
+        do {                                                            \
+                if (intel_aes_instructions_present()) {                 \
+                        /* first use the accelerated function */        \
+                        for (; i + 8 * AES_BLOCK_LEN <= length;         \
+                            i += 8 * AES_BLOCK_LEN)                     \
+                                aes_ ## enc_or_dec ## rypt_intel8(      \
+                                    &ksch->enc_or_dec ## r_ks.ks32[0],  \
+                                    ksch->nr, &in[i], &out[i]);         \
+                        /* finish off the remainder per-block */        \
+                        ECB_LOOP_4P(aes_ ## enc_or_dec ## rypt_intel,   \
+                            enc_or_dec, in, out);                       \
+                } else {                                                \
+                        ECB_LOOP_4P(aes_ ## enc_or_dec ## rypt_amd64,   \
+                            enc_or_dec, in, out);                       \
+                }                                                       \
+                _NOTE(CONSTCOND)                                        \
+        } while (0)
+#endif  /* __amd64 */
+
 /*
+ * Perform AES ECB encryption on a sequence of blocks. On x86-64 CPUs with
+ * the AES-NI extension, this processes 8 blocks at a time to exploit
+ * instruction-level parallelism. On other platforms, it simply encrypts
+ * the blocks in sequence.
+ */
+int
+aes_encrypt_ecb(const void *ks, const uint8_t *pt, uint8_t *ct, uint64_t length)
+{
+        aes_key_t *ksch = (aes_key_t *)ks;
+        uint64_t i = 0;
+
+#ifdef  __amd64
+        ECB_INTEL_IMPL(enc, pt, ct);
+#elif   defined(sun4u)
+        ECB_LOOP_4P(aes_encrypt_impl, enc, pt, ct);
+#else   /* Generic C implementation */
+        ECB_LOOP_3P((void) aes_encrypt_block, pt, ct);
+#endif  /* Generic C implementation */
+
+        return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Same as aes_encrypt_ecb, but performs decryption.
+ */
+int
+aes_decrypt_ecb(const void *ks, const uint8_t *ct, uint8_t *pt, uint64_t length)
+{
+        aes_key_t *ksch = (aes_key_t *)ks;
+        uint64_t i = 0;
+
+#ifdef  __amd64
+        ECB_INTEL_IMPL(dec, ct, pt);
+#elif   defined(sun4u)
+        ECB_LOOP_4P(aes_decrypt_impl, dec, ct, pt);
+#else   /* Generic C implementation */
+        ECB_LOOP_3P((void) aes_decrypt_block, ct, pt);
+#endif  /* Generic C implementation */
+
+        return (CRYPTO_SUCCESS);
+}
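
Note: a short usage sketch of the two ECB entry points (length must be a
multiple of AES_BLOCK_LEN; buffer names are illustrative):

        uint8_t pt[4 * AES_BLOCK_LEN], ct[4 * AES_BLOCK_LEN];

        (void) aes_encrypt_ecb(ks, pt, ct, sizeof (ct));
        (void) aes_decrypt_ecb(ks, ct, pt, sizeof (pt));  /* roundtrip */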
+#ifdef  __amd64
+#undef  ECB_INTEL_IMPL
+#endif  /* __amd64 */
+
+#undef  ECB_LOOP
+#undef  ECB_LOOP_4P
+#undef  ECB_LOOP_3P
+
+#define CBC_LOOP(enc_func, xor_func)                                    \
+        do {                                                            \
+                for (; i < length; i += AES_BLOCK_LEN) {                \
+                        /* copy IV to ciphertext */                     \
+                        bcopy(iv, &ct[i], AES_BLOCK_LEN);               \
+                        /* XOR plaintext into the copied IV */          \
+                        xor_func(&pt[i], &ct[i]);                       \
+                        /* encrypt the block in the output region */    \
+                        enc_func;                                       \
+                        iv = &ct[i];                                    \
+                }                                                       \
+                _NOTE(CONSTCOND)                                        \
+        } while (0)
+#define CBC_LOOP_4P(enc_func, xor_func)                                 \
+        CBC_LOOP(enc_func(&ksch->encr_ks.ks32[0],                       \
+            ksch->nr, (void *)&ct[i], (void *)&ct[i]), xor_func)
+#define CBC_LOOP_3P(enc_func, xor_func)                                 \
+        CBC_LOOP(enc_func(ksch, (void *)&ct[i], (void *)&ct[i]), xor_func)
+
+/*
+ * Encrypts a sequence of consecutive AES blocks in CBC mode. On x86-64
+ * with the AES-NI extension, the encryption is performed on 8 blocks at
+ * a time using an optimized assembly implementation, giving a speed boost
+ * of around 75%. On other platforms, this simply performs CBC encryption
+ * in sequence on the blocks.
+ *
+ * Decryption acceleration is implemented in the kernel kcf block cipher
+ * modes code (cbc.c), because that doesn't require a complete hand-tuned
+ * CBC implementation in assembly.
+ */
+int
+aes_encrypt_cbc(const void *ks, const uint8_t *pt, uint8_t *ct,
+    const uint8_t *iv, uint64_t length)
+{
+        aes_key_t *ksch = (aes_key_t *)ks;
+        uint64_t i = 0;
+
+#ifdef  __amd64
+        if (intel_aes_instructions_present()) {
+                for (; i + 8 * AES_BLOCK_LEN <= length;
+                    i += 8 * AES_BLOCK_LEN) {
+                        aes_encrypt_cbc_intel8(&ksch->encr_ks.ks32[0],
+                            ksch->nr, &pt[i], &ct[i], iv);
+                        iv = &ct[i + 7 * AES_BLOCK_LEN];
+                }
+                CBC_LOOP_4P(aes_encrypt_intel, aes_xor_intel);
+        } else {
+                CBC_LOOP_4P(aes_encrypt_amd64, aes_xor_intel);
+        }
+#elif   defined(sun4u)
+        CBC_LOOP_4P(aes_encrypt_impl, aes_xor_block);
+#else   /* Generic C implementation */
+        CBC_LOOP_3P((void) aes_encrypt_block, aes_xor_block);
+#endif  /* Generic C implementation */
+
+        return (CRYPTO_SUCCESS);
+}
+#undef  CBC_LOOP
+#undef  CBC_LOOP_4P
+#undef  CBC_LOOP_3P
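
Note: as the block comment above says, CBC decryption is accelerated in
kcf's cbc.c rather than here, because each plaintext block depends only on
ciphertext: P[i] = Decrypt(C[i]) XOR C[i-1], with C[-1] = IV. A rough
sketch of that idea (hypothetical helper, assuming non-overlapping buffers;
the real code is in cbc.c):

        static void
        cbc_decrypt_sketch(const void *ks, const uint8_t *ct, uint8_t *pt,
            const uint8_t *iv, uint64_t length)
        {
                uint64_t i;
                int j;

                /* bulk ECB decryption takes the 8-block-wide AES-NI path */
                (void) aes_decrypt_ecb(ks, ct, pt, length);

                /* XOR each block with the previous ciphertext block */
                for (i = 0; i < length; i += AES_BLOCK_LEN) {
                        const uint8_t *prev = (i == 0) ? iv :
                            &ct[i - AES_BLOCK_LEN];
                        for (j = 0; j < AES_BLOCK_LEN; j++)
                                pt[i + j] ^= prev[j];
                }
        }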
+
+#define CTR_LOOP(enc_func, xor_func)                                    \
+        do {                                                            \
+                for (; i < length; i += AES_BLOCK_LEN) {                \
+                        /* set up counter in output region */           \
+                        *(uint64_t *)(void *)&output[i] = counter[0];   \
+                        *(uint64_t *)(void *)&output[i + 8] =           \
+                            htonll(counter[1]++);                       \
+                        /* encrypt counter in output region */          \
+                        enc_func;                                       \
+                        /* XOR encrypted counter with input */          \
+                        xor_func(&input[i], &output[i]);                \
+                }                                                       \
+                _NOTE(CONSTCOND)                                        \
+        } while (0)
+#define CTR_LOOP_4P(enc_func, xor_func)                                 \
+        CTR_LOOP(enc_func(&ksch->encr_ks.ks32[0], ksch->nr,             \
+            (void *)&output[i], (void *)&output[i]), xor_func)
+#define CTR_LOOP_3P(enc_func, xor_func)                                 \
+        CTR_LOOP(enc_func(ksch, (void *)&output[i], (void *)&output[i]),\
+            xor_func)
+
+/*
+ * Performs high-performance counter mode encryption and decryption on
+ * a sequence of blocks. In CTR mode, encryption and decryption are the
+ * same operation, just with the plaintext and ciphertext reversed:
+ * plaintext = CTR(CTR(plaintext, K), K)
+ * Blocks are also independent of one another, so CTR is an excellent
+ * mode when high performance is required and data authentication/integrity
+ * checking is provided via some other means, or isn't necessary.
+ *
+ * On x86-64 with the AES-NI extension, this code performs CTR mode
+ * encryption in parallel on 8 blocks at a time and can provide in
+ * excess of 3GB/s/core of encryption/decryption performance (<1 CPB).
+ */
+int
+aes_ctr_mode(const void *ks, const uint8_t *input, uint8_t *output,
+    uint64_t length, uint64_t counter[2])
+{
+        aes_key_t *ksch = (aes_key_t *)ks;
+        uint64_t i = 0;
+
+        /* swap the lower part to host order for computations */
+        counter[1] = ntohll(counter[1]);
+
+#ifdef  __amd64
+        if (intel_aes_instructions_present()) {
+                /* first use the wide-register accelerated function */
+                for (; i + 8 * AES_BLOCK_LEN <= length;
+                    i += 8 * AES_BLOCK_LEN) {
+                        aes_ctr_intel8(&ksch->encr_ks.ks32[0], ksch->nr,
+                            &input[i], &output[i], counter[0], counter[1]);
+                        counter[1] += 8;
+                }
+                /* finish off the remainder using the slow per-block method */
+                CTR_LOOP_4P(aes_encrypt_intel, aes_xor_intel);
+        } else {
+                CTR_LOOP_4P(aes_encrypt_amd64, aes_xor_intel);
+        }
+#elif   defined(sun4u)
+        CTR_LOOP_4P(aes_encrypt_impl, aes_xor_block);
+#else   /* Generic C implementation */
+        CTR_LOOP_3P((void) aes_encrypt_block, aes_xor_block);
+#endif  /* Generic C implementation */
+
+        /* swap the lower part back to big-endian */
+        counter[1] = htonll(counter[1]);
+
+        return (CRYPTO_SUCCESS);
+}
+#undef  CTR_LOOP
+#undef  CTR_LOOP_4P
+#undef  CTR_LOOP_3P
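
Note: a usage sketch for the counter layout aes_ctr_mode() expects:
counter[0] holds the upper 64 bits of the 128-bit big-endian counter block
and is stored verbatim; counter[1] holds the lower 64 bits, also passed
big-endian, and is incremented once per block (nonce, in, out, and len are
illustrative names):

        uint64_t counter[2];

        counter[0] = htonll(nonce);     /* upper 64 bits, fixed */
        counter[1] = htonll(1ULL);      /* lower 64 bits, advances per block */
        (void) aes_ctr_mode(ks, in, out, len, counter);
        /* counter[1] returns big-endian, advanced past the processed data */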
+
+/*
  * Allocate key schedule for AES.
  *
  * Return the pointer and set size to the number of bytes allocated.
  * Memory allocated must be freed by the caller when done.
  *

@@ -1760,18 +1976,17 @@
 }
 
 
 #ifdef __amd64
 /*
- * Return 1 if executing on Intel with AES-NI instructions,
- * otherwise 0 (i.e., Intel without AES-NI or AMD64).
+ * Return 1 if executing on x86-64 with AES-NI instructions, otherwise 0.
  * Cache the result, as the CPU can't change.
  *
  * Note: the userland version uses getisax().  The kernel version uses
  * global variable x86_featureset.
  */
-static int
+static inline int
 intel_aes_instructions_present(void)
 {
         static int      cached_result = -1;
 
         if (cached_result == -1) { /* first time */