4896 Performance improvements for KCF AES modes

@@ -20,10 +20,13 @@
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2015 by Saso Kiselkov. All rights reserved.
+ */
 
 #ifndef _AES_IMPL_H
 #define _AES_IMPL_H
 
 /*

@@ -43,11 +46,11 @@
 
 #define AES_BLOCK_LEN   16      /* bytes */
 /* Round constant length, in number of 32-bit elements: */
 #define RC_LENGTH       (5 * ((AES_BLOCK_LEN) / 4 - 2))
 
-#define AES_COPY_BLOCK(src, dst) \
+#define AES_COPY_BLOCK_UNALIGNED(src, dst) \
         (dst)[0] = (src)[0]; \
         (dst)[1] = (src)[1]; \
         (dst)[2] = (src)[2]; \
         (dst)[3] = (src)[3]; \
         (dst)[4] = (src)[4]; \

@@ -61,11 +64,11 @@
         (dst)[12] = (src)[12]; \
         (dst)[13] = (src)[13]; \
         (dst)[14] = (src)[14]; \
         (dst)[15] = (src)[15]
 
-#define AES_XOR_BLOCK(src, dst) \
+#define AES_XOR_BLOCK_UNALIGNED(src, dst) \
         (dst)[0] ^= (src)[0]; \
         (dst)[1] ^= (src)[1]; \
         (dst)[2] ^= (src)[2]; \
         (dst)[3] ^= (src)[3]; \
         (dst)[4] ^= (src)[4]; \

@@ -79,10 +82,18 @@
         (dst)[12] ^= (src)[12]; \
         (dst)[13] ^= (src)[13]; \
         (dst)[14] ^= (src)[14]; \
         (dst)[15] ^= (src)[15]
 
+#define AES_COPY_BLOCK_ALIGNED(src, dst) \
+        ((uint64_t *)(void *)(dst))[0] = ((uint64_t *)(void *)(src))[0]; \
+        ((uint64_t *)(void *)(dst))[1] = ((uint64_t *)(void *)(src))[1]
+
+#define AES_XOR_BLOCK_ALIGNED(src, dst) \
+        ((uint64_t *)(void *)(dst))[0] ^= ((uint64_t *)(void *)(src))[0]; \
+        ((uint64_t *)(void *)(dst))[1] ^= ((uint64_t *)(void *)(src))[1]
+
 /* AES key size definitions */
 #define AES_MINBITS             128
 #define AES_MINBYTES            ((AES_MINBITS) >> 3)
 #define AES_MAXBITS             256
 #define AES_MAXBYTES            ((AES_MAXBITS) >> 3)
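
The new _ALIGNED variants move a block as two 64-bit words, which is only
safe when both pointers are 8-byte aligned. As a minimal sketch of how a
caller might dispatch between the variants (the helper below is hypothetical
and not part of this patch; IS_P2ALIGNED is the usual illumos macro from
<sys/sysmacros.h>):

    /*
     * Hypothetical helper: XOR one 16-byte AES block of src into dst,
     * using the 64-bit-wide macro only when both pointers are 8-byte
     * aligned. Note the braces: the macros expand to multiple
     * statements, so they must not be used as a bare if/else body.
     */
    static void
    aes_xor_block_dispatch(const uint8_t *src, uint8_t *dst)
    {
            if (IS_P2ALIGNED(src, sizeof (uint64_t)) &&
                IS_P2ALIGNED(dst, sizeof (uint64_t))) {
                    AES_XOR_BLOCK_ALIGNED(src, dst);
            } else {
                    AES_XOR_BLOCK_UNALIGNED(src, dst);
            }
    }
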

@@ -97,27 +108,60 @@
 #define AES_64BIT_KS            64
 
 #define MAX_AES_NR              14 /* Maximum number of rounds */
 #define MAX_AES_NB              4  /* Number of columns comprising a state */
 
+/*
+ * Architecture-specific acceleration support autodetection.
+ * Some architectures provide hardware-assisted acceleration using floating
+ * point registers, which need special handling inside the kernel, so the
+ * macros below define the auxiliary functions needed to utilize them.
+ */
+#if     defined(__amd64) && defined(_KERNEL)
+/*
+ * Using floating point registers requires temporarily disabling kernel
+ * thread preemption, so we need to operate on small-enough chunks to
+ * prevent scheduling latency bubbles.
+ * A typical 64-bit CPU can sustain around 300-400 MB/s/core even in the
+ * slowest encryption mode (CBC), so a 32k chunk works out to ~100us per
+ * run. CPUs with AES-NI in the fast modes (ECB, CTR, CBC decryption) can
+ * easily sustain 3 GB/s/core, and the latency impact essentially vanishes.
+ */
+#define AES_OPSZ        32768
+
+#if     defined(lint) || defined(__lint)
+#define AES_ACCEL_SAVESTATE(name)       uint8_t name[16 * 16 + 8]
+#else   /* lint || __lint */
+#define AES_ACCEL_SAVESTATE(name) \
+        /* stack space for xmm0--xmm15 and cr0 (16 x 128 bits + 64 bits) */ \
+        uint8_t name[16 * 16 + 8] __attribute__((aligned(16)))
+#endif  /* lint || __lint */
+
+#else   /* !defined(__amd64) || !defined(_KERNEL) */
+/*
+ * On all other architectures, or outside of the kernel, no preemption
+ * control or FPU state saving is needed: AES_OPSZ is effectively
+ * unlimited and the accel enter/exit hooks get stubbed out.
+ */
+#define AES_OPSZ        ((size_t)-1)
+#define AES_ACCEL_SAVESTATE(name)
+#define aes_accel_enter(savestate)
+#define aes_accel_exit(savestate)
+#endif  /* !defined(__amd64) || !defined(_KERNEL) */
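
To make the AES_OPSZ latency reasoning concrete, here is a hedged sketch of
the intended usage pattern: hold preemption off for at most one
AES_OPSZ-sized run at a time. The wrapper itself is hypothetical; AES_OPSZ,
AES_ACCEL_SAVESTATE and the accel enter/exit hooks come from this header,
aes_encrypt_ecb() is declared further below, and MIN/CRYPTO_SUCCESS are the
stock illumos definitions:

    /*
     * Hypothetical wrapper: encrypt `length' bytes in runs of at most
     * AES_OPSZ so preemption is never held off for more than ~100us.
     * On non-amd64 builds the accel macros expand to nothing and the
     * loop degenerates to a single full-length call.
     */
    static int
    aes_bulk_encrypt_ecb(const void *ks, const uint8_t *pt, uint8_t *ct,
        uint64_t length)
    {
            AES_ACCEL_SAVESTATE(savestate);
            int rv = CRYPTO_SUCCESS;

            while (length > 0 && rv == CRYPTO_SUCCESS) {
                    uint64_t run = MIN(length, (uint64_t)AES_OPSZ);

                    aes_accel_enter(savestate); /* save FPU state, no preempt */
                    rv = aes_encrypt_ecb(ks, pt, ct, run);
                    aes_accel_exit(savestate);  /* restore state, preempt ok */

                    pt += run;
                    ct += run;
                    length -= run;
            }
            return (rv);
    }
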
+
 typedef union {
 #ifdef  sun4u
         uint64_t        ks64[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
 #endif
         uint32_t        ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
 } aes_ks_t;
 
-/* aes_key.flags value: */
-#define INTEL_AES_NI_CAPABLE    0x1     /* AES-NI instructions present */
-
 typedef struct aes_key aes_key_t;
 struct aes_key {
         aes_ks_t        encr_ks;  /* encryption key schedule */
         aes_ks_t        decr_ks;  /* decryption key schedule */
 #ifdef __amd64
         long double     align128; /* Align fields above for Intel AES-NI */
-        int             flags;    /* implementation-dependent flags */
 #endif  /* __amd64 */
         int             nr;       /* number of rounds (10, 12, or 14) */
         int             type;     /* key schedule size (32 or 64 bits) */
 };
 

@@ -130,25 +174,54 @@
 extern void *aes_alloc_keysched(size_t *size, int kmflag);
 extern void aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits,
         void *keysched);
 extern int aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct);
 extern int aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt);
+extern int aes_encrypt_ecb(const void *ks, const uint8_t *pt, uint8_t *ct,
+    uint64_t length);
+extern int aes_decrypt_ecb(const void *ks, const uint8_t *pt, uint8_t *ct,
+    uint64_t length);
+extern int aes_encrypt_cbc(const void *ks, const uint8_t *pt, uint8_t *ct,
+    const uint8_t *iv, uint64_t length);
+extern int aes_ctr_mode(const void *ks, const uint8_t *pt, uint8_t *ct,
+    uint64_t length, uint64_t counter[2]);
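
A hedged usage sketch for the new bulk entry points (the caller below is
hypothetical; note that the CTR counter layout, i.e. byte order and which
word increments, is defined by the implementation, not by this header):

    /*
     * Hypothetical caller: build a key schedule, then run CTR over a
     * whole contiguous buffer in one call. `key' is a 16-byte AES-128
     * key; the counter would normally be loaded from the IV.
     */
    static int
    ctr_encrypt_buf(const uint8_t *key, const uint8_t *pt, uint8_t *ct,
        uint64_t length)
    {
            size_t size;
            void *ks = aes_alloc_keysched(&size, KM_SLEEP);
            uint64_t counter[2] = { 0, 0 };

            aes_init_keysched(key, 128, ks);
            return (aes_ctr_mode(ks, pt, ct, length, counter));
    }
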
 
 /*
  * AES mode functions.
  * The first 2 functions operate on 16-byte AES blocks.
  */
-extern void aes_copy_block(uint8_t *in, uint8_t *out);
-extern void aes_xor_block(uint8_t *data, uint8_t *dst);
+#ifdef  __amd64
+#define AES_COPY_BLOCK  aes_copy_intel
+#define AES_XOR_BLOCK   aes_xor_intel
+extern void aes_copy_intel(const uint8_t *src, uint8_t *dst);
+extern void aes_xor_intel(const uint8_t *src, uint8_t *dst);
+extern void aes_xor_intel8(const uint8_t *src, uint8_t *dst);
+#else   /* !__amd64 */
+#define AES_COPY_BLOCK  aes_copy_block
+#define AES_XOR_BLOCK   aes_xor_block
+#endif  /* !__amd64 */
 
+extern void aes_copy_block(const uint8_t *src, uint8_t *dst);
+extern void aes_xor_block(const uint8_t *src, uint8_t *dst);
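
Mode code can stay architecture-neutral by going through
AES_COPY_BLOCK/AES_XOR_BLOCK: on amd64 they resolve to the SSE-based
aes_copy_intel()/aes_xor_intel(), elsewhere to the portable byte-wise
functions. A hypothetical CBC encryption step, for illustration only:

    static void
    cbc_encrypt_step(const void *ks, uint8_t *iv, const uint8_t *pt,
        uint8_t *ct)
    {
            AES_XOR_BLOCK(pt, iv);                  /* iv ^= plaintext */
            (void) aes_encrypt_block(ks, iv, ct);   /* ct = E(iv ^ pt) */
            AES_COPY_BLOCK(ct, iv);                 /* chain ciphertext */
    }
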
+
 /* Note: ctx is a pointer to aes_ctx_t defined in modes.h */
 extern int aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length,
     crypto_data_t *out);
 extern int aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length,
     crypto_data_t *out);
 
+#if     defined(__amd64) && defined(_KERNEL)
 /*
+ * When AES floating-point acceleration is available, these will be called
+ * by the worker functions to save and restore floating point state and
+ * control kernel thread preemption.
+ */
+extern void aes_accel_enter(void *savestate);
+extern void aes_accel_exit(void *savestate);
+#endif  /* __amd64 && _KERNEL */
+
+/*
  * The following definitions and declarations are only used by AES FIPS POST
  */
 #ifdef _AES_IMPL
 
 #ifdef _KERNEL