4896 Performance improvements for KCF AES modes
@@ -20,10 +20,13 @@
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright 2015 by Saso Kiselkov. All rights reserved.
+ */
#ifndef _AES_IMPL_H
#define _AES_IMPL_H
/*
@@ -43,11 +46,11 @@
#define AES_BLOCK_LEN 16 /* bytes */
/* Round constant length, in number of 32-bit elements: */
#define RC_LENGTH (5 * ((AES_BLOCK_LEN) / 4 - 2))
-#define AES_COPY_BLOCK(src, dst) \
+#define AES_COPY_BLOCK_UNALIGNED(src, dst) \
(dst)[0] = (src)[0]; \
(dst)[1] = (src)[1]; \
(dst)[2] = (src)[2]; \
(dst)[3] = (src)[3]; \
(dst)[4] = (src)[4]; \
@@ -61,11 +64,11 @@
(dst)[12] = (src)[12]; \
(dst)[13] = (src)[13]; \
(dst)[14] = (src)[14]; \
(dst)[15] = (src)[15]
-#define AES_XOR_BLOCK(src, dst) \
+#define AES_XOR_BLOCK_UNALIGNED(src, dst) \
(dst)[0] ^= (src)[0]; \
(dst)[1] ^= (src)[1]; \
(dst)[2] ^= (src)[2]; \
(dst)[3] ^= (src)[3]; \
(dst)[4] ^= (src)[4]; \
@@ -79,10 +82,18 @@
(dst)[12] ^= (src)[12]; \
(dst)[13] ^= (src)[13]; \
(dst)[14] ^= (src)[14]; \
(dst)[15] ^= (src)[15]
+#define AES_COPY_BLOCK_ALIGNED(src, dst) \
+ ((uint64_t *)(void *)(dst))[0] = ((uint64_t *)(void *)(src))[0]; \
+ ((uint64_t *)(void *)(dst))[1] = ((uint64_t *)(void *)(src))[1]
+
+#define AES_XOR_BLOCK_ALIGNED(src, dst) \
+ ((uint64_t *)(void *)(dst))[0] ^= ((uint64_t *)(void *)(src))[0]; \
+ ((uint64_t *)(void *)(dst))[1] ^= ((uint64_t *)(void *)(src))[1]
+
/* AES key size definitions */
#define AES_MINBITS 128
#define AES_MINBYTES ((AES_MINBITS) >> 3)
#define AES_MAXBITS 256
#define AES_MAXBYTES ((AES_MAXBITS) >> 3)
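
The hunks above rename the old byte-wise helpers to AES_COPY_BLOCK_UNALIGNED/AES_XOR_BLOCK_UNALIGNED, which remain safe for arbitrary pointers, and add *_ALIGNED variants that move or XOR the 16-byte block as two 64-bit words and therefore require both pointers to be 8-byte aligned. The following sketch is not part of the patch; the blk_xor() helper, its alignment test, and the header include path are illustrative assumptions showing how a caller might pick between the two variants.

#include <stdint.h>
#include "aes_impl.h"	/* the header shown in this diff */

/*
 * Hypothetical dispatcher: use the two-quadword XOR when both pointers
 * are 8-byte aligned, otherwise fall back to the byte-wise variant.
 * The macros expand to multiple statements, so they must be wrapped
 * in braces when used as the body of an if/else.
 */
static void
blk_xor(const uint8_t *src, uint8_t *dst)
{
	if ((((uintptr_t)src | (uintptr_t)dst) & 7) == 0) {
		AES_XOR_BLOCK_ALIGNED(src, dst);
	} else {
		AES_XOR_BLOCK_UNALIGNED(src, dst);
	}
}
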
@@ -97,27 +108,60 @@
#define AES_64BIT_KS 64
#define MAX_AES_NR 14 /* Maximum number of rounds */
#define MAX_AES_NB 4 /* Number of columns comprising a state */
+/*
+ * Architecture-specific acceleration support autodetection.
+ * Some architectures provide hardware-assisted acceleration using floating
+ * point registers, which need special handling inside of the kernel, so the
+ * macros below define the auxiliary functions needed to utilize them.
+ */
+#if defined(__amd64) && defined(_KERNEL)
+/*
+ * Using floating point registers requires temporarily disabling kernel
+ * thread preemption, so we need to operate on small-enough chunks to
+ * prevent scheduling latency bubbles.
+ * A typical 64-bit CPU can sustain around 300-400MB/s/core even in the
+ * slowest encryption modes (CBC), which with 32k per run works out to ~100us
+ * per run. CPUs with AES-NI in fast modes (ECB, CTR, CBC decryption) can
+ * easily sustain 3GB/s/core, so the latency potential essentially vanishes.
+ */
+#define AES_OPSZ 32768
+
+#if defined(lint) || defined(__lint)
+#define AES_ACCEL_SAVESTATE(name) uint8_t name[16 * 16 + 8]
+#else /* lint || __lint */
+#define AES_ACCEL_SAVESTATE(name) \
+ /* stack space for xmm0--xmm15 and cr0 (16 x 128 bits + 64 bits) */ \
+ uint8_t name[16 * 16 + 8] __attribute__((aligned(16)))
+#endif /* lint || __lint */
+
+#else /* !defined(__amd64) || !defined(_KERNEL) */
+/*
+ * All other accel support
+ */
+#define AES_OPSZ ((size_t)-1)
+/* On other architectures or outside of the kernel these get stubbed out */
+#define AES_ACCEL_SAVESTATE(name)
+#define aes_accel_enter(savestate)
+#define aes_accel_exit(savestate)
+#endif /* !defined(__amd64) || !defined(_KERNEL) */
+
typedef union {
#ifdef sun4u
uint64_t ks64[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
#endif
uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
} aes_ks_t;
-/* aes_key.flags value: */
-#define INTEL_AES_NI_CAPABLE 0x1 /* AES-NI instructions present */
-
typedef struct aes_key aes_key_t;
struct aes_key {
aes_ks_t encr_ks; /* encryption key schedule */
aes_ks_t decr_ks; /* decryption key schedule */
#ifdef __amd64
long double align128; /* Align fields above for Intel AES-NI */
- int flags; /* implementation-dependent flags */
#endif /* __amd64 */
int nr; /* number of rounds (10, 12, or 14) */
int type; /* key schedule size (32 or 64 bits) */
};
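
The AES_OPSZ/AES_ACCEL_SAVESTATE block above bounds how long kernel preemption stays disabled while the FPU is in use: at the quoted 300-400 MB/s/core on the slow paths, a 32 KB chunk is roughly 100 us of non-preemptible work. A minimal sketch of that pattern follows, assuming the declarations added later in this diff; the chunking loop, the ecb_encrypt_chunked() name, and the placement of the enter/exit calls around aes_encrypt_ecb() are assumptions for illustration, not the patch's actual worker code.

#include <stdint.h>
#include "aes_impl.h"	/* the header shown in this diff */

/*
 * Hypothetical bulk worker: process at most AES_OPSZ bytes per
 * aes_accel_enter()/aes_accel_exit() pair so preemption is only held
 * off for a bounded interval. On non-amd64 or userland builds the
 * accel macros compile away and AES_OPSZ is effectively unlimited,
 * so the loop body runs once.
 */
static int
ecb_encrypt_chunked(const void *ks, const uint8_t *pt, uint8_t *ct,
    uint64_t length)
{
	int rv = 0;
	AES_ACCEL_SAVESTATE(savestate);	/* xmm0-xmm15 + cr0 save area */

	while (length != 0 && rv == 0) {
		uint64_t chunk = (length < AES_OPSZ) ? length : AES_OPSZ;

		aes_accel_enter(savestate);	/* save FPU state, disable preemption */
		rv = aes_encrypt_ecb(ks, pt, ct, chunk);
		aes_accel_exit(savestate);	/* restore FPU state, re-enable preemption */

		pt += chunk;
		ct += chunk;
		length -= chunk;
	}
	return (rv);
}
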
@@ -130,25 +174,54 @@
extern void *aes_alloc_keysched(size_t *size, int kmflag);
extern void aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits,
void *keysched);
extern int aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct);
extern int aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt);
+extern int aes_encrypt_ecb(const void *ks, const uint8_t *pt, uint8_t *ct,
+ uint64_t length);
+extern int aes_decrypt_ecb(const void *ks, const uint8_t *pt, uint8_t *ct,
+ uint64_t length);
+extern int aes_encrypt_cbc(const void *ks, const uint8_t *pt, uint8_t *ct,
+ const uint8_t *iv, uint64_t length);
+extern int aes_ctr_mode(const void *ks, const uint8_t *pt, uint8_t *ct,
+ uint64_t length, uint64_t counter[2]);
/*
* AES mode functions.
* The first 2 functions operate on 16-byte AES blocks.
*/
-extern void aes_copy_block(uint8_t *in, uint8_t *out);
-extern void aes_xor_block(uint8_t *data, uint8_t *dst);
+#ifdef __amd64
+#define AES_COPY_BLOCK aes_copy_intel
+#define AES_XOR_BLOCK aes_xor_intel
+extern void aes_copy_intel(const uint8_t *src, uint8_t *dst);
+extern void aes_xor_intel(const uint8_t *src, uint8_t *dst);
+extern void aes_xor_intel8(const uint8_t *src, uint8_t *dst);
+#else /* !__amd64 */
+#define AES_COPY_BLOCK aes_copy_block
+#define AES_XOR_BLOCK aes_xor_block
+#endif /* !__amd64 */
+extern void aes_copy_block(const uint8_t *src, uint8_t *dst);
+extern void aes_xor_block(const uint8_t *src, uint8_t *dst);
+
/* Note: ctx is a pointer to aes_ctx_t defined in modes.h */
extern int aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length,
crypto_data_t *out);
extern int aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length,
crypto_data_t *out);
+#if defined(__amd64) && defined(_KERNEL)
/*
+ * When AES floating-point acceleration is available, these will be called
+ * by the worker functions to clear and restore floating point state and
+ * control kernel thread preemption.
+ */
+extern void aes_accel_enter(void *savestate);
+extern void aes_accel_exit(void *savestate);
+#endif /* __amd64 && _KERNEL */
+
+/*
* The following definitions and declarations are only used by AES FIPS POST
*/
#ifdef _AES_IMPL
#ifdef _KERNEL
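
For the per-block helpers, the new AES_COPY_BLOCK/AES_XOR_BLOCK macros let mode code dispatch to aes_copy_intel()/aes_xor_intel() on amd64 and to the generic aes_copy_block()/aes_xor_block() elsewhere, while only the bulk entry points (aes_encrypt_ecb, aes_encrypt_cbc, aes_ctr_mode, ...) take a length argument. The sketch below is an assumption used only to show the per-block dispatch, not a function from the patch.

#include <stdint.h>
#include "aes_impl.h"	/* the header shown in this diff */

/*
 * Hypothetical single-block CBC encryption step: XOR the plaintext into
 * the running IV/ciphertext block with the platform-dispatched
 * AES_XOR_BLOCK, then encrypt the result with aes_encrypt_block().
 * A caller would chain by copying ct back into iv_ct (AES_COPY_BLOCK).
 */
static int
cbc_encrypt_one(const void *ks, uint8_t *iv_ct, const uint8_t *pt,
    uint8_t *ct)
{
	AES_XOR_BLOCK(pt, iv_ct);		/* iv_ct ^= pt */
	return (aes_encrypt_block(ks, iv_ct, ct));
}
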