4896 Performance improvements for KCF AES modes

*** 20,29 ****
--- 20,32 ----
   */
  /*
   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
   * Use is subject to license terms.
   */
+ /*
+  * Copyright 2015 by Saso Kiselkov. All rights reserved.
+  */
  
  #ifndef	_AES_IMPL_H
  #define	_AES_IMPL_H
  
  /*
*** 43,53 ****
  
  #define	AES_BLOCK_LEN	16	/* bytes */
  /* Round constant length, in number of 32-bit elements: */
  #define	RC_LENGTH	(5 * ((AES_BLOCK_LEN) / 4 - 2))
  
! #define	AES_COPY_BLOCK(src, dst) \
  	(dst)[0] = (src)[0]; \
  	(dst)[1] = (src)[1]; \
  	(dst)[2] = (src)[2]; \
  	(dst)[3] = (src)[3]; \
  	(dst)[4] = (src)[4]; \
--- 46,56 ----
  
  #define	AES_BLOCK_LEN	16	/* bytes */
  /* Round constant length, in number of 32-bit elements: */
  #define	RC_LENGTH	(5 * ((AES_BLOCK_LEN) / 4 - 2))
  
! #define	AES_COPY_BLOCK_UNALIGNED(src, dst) \
  	(dst)[0] = (src)[0]; \
  	(dst)[1] = (src)[1]; \
  	(dst)[2] = (src)[2]; \
  	(dst)[3] = (src)[3]; \
  	(dst)[4] = (src)[4]; \
*** 61,71 ****
  	(dst)[12] = (src)[12]; \
  	(dst)[13] = (src)[13]; \
  	(dst)[14] = (src)[14]; \
  	(dst)[15] = (src)[15]
  
! #define	AES_XOR_BLOCK(src, dst) \
  	(dst)[0] ^= (src)[0]; \
  	(dst)[1] ^= (src)[1]; \
  	(dst)[2] ^= (src)[2]; \
  	(dst)[3] ^= (src)[3]; \
  	(dst)[4] ^= (src)[4]; \
--- 64,74 ----
  	(dst)[12] = (src)[12]; \
  	(dst)[13] = (src)[13]; \
  	(dst)[14] = (src)[14]; \
  	(dst)[15] = (src)[15]
  
! #define	AES_XOR_BLOCK_UNALIGNED(src, dst) \
  	(dst)[0] ^= (src)[0]; \
  	(dst)[1] ^= (src)[1]; \
  	(dst)[2] ^= (src)[2]; \
  	(dst)[3] ^= (src)[3]; \
  	(dst)[4] ^= (src)[4]; \
*** 79,88 ****
--- 82,99 ----
  	(dst)[12] ^= (src)[12]; \
  	(dst)[13] ^= (src)[13]; \
  	(dst)[14] ^= (src)[14]; \
  	(dst)[15] ^= (src)[15]
  
+ #define	AES_COPY_BLOCK_ALIGNED(src, dst) \
+ 	((uint64_t *)(void *)(dst))[0] = ((uint64_t *)(void *)(src))[0]; \
+ 	((uint64_t *)(void *)(dst))[1] = ((uint64_t *)(void *)(src))[1]
+ 
+ #define	AES_XOR_BLOCK_ALIGNED(src, dst) \
+ 	((uint64_t *)(void *)(dst))[0] ^= ((uint64_t *)(void *)(src))[0]; \
+ 	((uint64_t *)(void *)(dst))[1] ^= ((uint64_t *)(void *)(src))[1]
+ 
  /* AES key size definitions */
  #define	AES_MINBITS	128
  #define	AES_MINBYTES	((AES_MINBITS) >> 3)
  #define	AES_MAXBITS	256
  #define	AES_MAXBYTES	((AES_MAXBITS) >> 3)
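The new _ALIGNED variants above move a 16-byte block as two 64-bit words, which is only safe when both pointers are suitably aligned; the byte-wise _UNALIGNED macros remain as the fallback. A minimal sketch of how a caller might choose between the two is shown below; the alignment test and the wrapper function are illustrative assumptions, not part of this webrev.

    /*
     * Sketch only: use the 64-bit-wide copy when both pointers are 8-byte
     * aligned, otherwise fall back to the byte-wise variant. The alignment
     * check below is an assumed helper, not taken from this diff.
     * Requires <sys/types.h> for uintptr_t/uint8_t.
     */
    #define	PTRS_8BYTE_ALIGNED(a, b) \
    	((((uintptr_t)(a) | (uintptr_t)(b)) & 7) == 0)

    static void
    copy_aes_block(uint8_t *src, uint8_t *dst)
    {
    	if (PTRS_8BYTE_ALIGNED(src, dst)) {
    		AES_COPY_BLOCK_ALIGNED(src, dst);
    	} else {
    		AES_COPY_BLOCK_UNALIGNED(src, dst);
    	}
    }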
*** 97,123 ****
  #define	AES_64BIT_KS		64
  
  #define	MAX_AES_NR		14 /* Maximum number of rounds */
  #define	MAX_AES_NB		4  /* Number of columns comprising a state */
  
  typedef union {
  #ifdef	sun4u
  	uint64_t	ks64[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
  #endif
  	uint32_t	ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
  } aes_ks_t;
  
- /* aes_key.flags value: */
- #define	INTEL_AES_NI_CAPABLE	0x1	/* AES-NI instructions present */
- 
  typedef struct aes_key aes_key_t;
  struct aes_key {
  	aes_ks_t	encr_ks;  /* encryption key schedule */
  	aes_ks_t	decr_ks;  /* decryption key schedule */
  #ifdef	__amd64
  	long double	align128; /* Align fields above for Intel AES-NI */
- 	int		flags;	/* implementation-dependent flags */
  #endif	/* __amd64 */
  	int		nr;	/* number of rounds (10, 12, or 14) */
  	int		type;	/* key schedule size (32 or 64 bits) */
  };
  
--- 108,167 ----
  #define	AES_64BIT_KS		64
  
  #define	MAX_AES_NR		14 /* Maximum number of rounds */
  #define	MAX_AES_NB		4  /* Number of columns comprising a state */
  
+ /*
+  * Architecture-specific acceleration support autodetection.
+  * Some architectures provide hardware-assisted acceleration using floating
+  * point registers, which need special handling inside of the kernel, so the
+  * macros below define the auxiliary functions needed to utilize them.
+  */
+ #if	defined(__amd64) && defined(_KERNEL)
+ /*
+  * Using floating point registers requires temporarily disabling kernel
+  * thread preemption, so we need to operate on small-enough chunks to
+  * prevent scheduling latency bubbles.
+  * A typical 64-bit CPU can sustain around 300-400MB/s/core even in the
+  * slowest encryption modes (CBC), which with 32k per run works out to ~100us
+  * per run. CPUs with AES-NI in fast modes (ECB, CTR, CBC decryption) can
+  * easily sustain 3GB/s/core, so the latency potential essentially vanishes.
+  */
+ #define	AES_OPSZ	32768
+ 
+ #if	defined(lint) || defined(__lint)
+ #define	AES_ACCEL_SAVESTATE(name)	uint8_t name[16 * 16 + 8]
+ #else	/* lint || __lint */
+ #define	AES_ACCEL_SAVESTATE(name) \
+ 	/* stack space for xmm0--xmm15 and cr0 (16 x 128 bits + 64 bits) */ \
+ 	uint8_t name[16 * 16 + 8] __attribute__((aligned(16)))
+ #endif	/* lint || __lint */
+ 
+ #else	/* !defined(__amd64) || !defined(_KERNEL) */
+ /*
+  * All other accel support
+  */
+ #define	AES_OPSZ	((size_t)-1)
+ /* On other architectures or outside of the kernel these get stubbed out */
+ #define	AES_ACCEL_SAVESTATE(name)
+ #define	aes_accel_enter(savestate)
+ #define	aes_accel_exit(savestate)
+ #endif	/* !defined(__amd64) || !defined(_KERNEL) */
+ 
  typedef union {
  #ifdef	sun4u
  	uint64_t	ks64[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
  #endif
  	uint32_t	ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
  } aes_ks_t;
  
  typedef struct aes_key aes_key_t;
  struct aes_key {
  	aes_ks_t	encr_ks;  /* encryption key schedule */
  	aes_ks_t	decr_ks;  /* decryption key schedule */
  #ifdef	__amd64
  	long double	align128; /* Align fields above for Intel AES-NI */
  #endif	/* __amd64 */
  	int		nr;	/* number of rounds (10, 12, or 14) */
  	int		type;	/* key schedule size (32 or 64 bits) */
  };
  
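To make the AES_OPSZ reasoning concrete, here is a hedged sketch of how a kernel worker might bound the preemption-disabled window: at most AES_OPSZ bytes are handed to the accelerated routine per aes_accel_enter()/aes_accel_exit() bracket (both declared later in this header). The function name, the MIN() macro from <sys/sysmacros.h>, and the zero-return-on-success assumption are illustrative, not taken from the webrev.

    /*
     * Sketch: encrypt a large contiguous buffer in AES_OPSZ-sized runs so
     * that kernel preemption stays disabled for only ~100us at a time (see
     * the latency estimate in the comment above). Illustrative only.
     */
    static int
    ecb_encrypt_chunked(const void *ks, const uint8_t *pt, uint8_t *ct,
        uint64_t length)
    {
    	AES_ACCEL_SAVESTATE(savestate);	/* xmm0-xmm15 + cr0 save area */
    	uint64_t done = 0;
    	int rv = 0;			/* assume 0 == success */

    	while (done < length && rv == 0) {
    		uint64_t run = MIN(length - done, (uint64_t)AES_OPSZ);

    		aes_accel_enter(savestate);	/* disable preemption, save FPU */
    		rv = aes_encrypt_ecb(ks, pt + done, ct + done, run);
    		aes_accel_exit(savestate);	/* restore FPU, enable preemption */
    		done += run;
    	}
    	return (rv);
    }

On architectures without the acceleration support, AES_ACCEL_SAVESTATE() and the enter/exit calls compile away and AES_OPSZ becomes effectively unlimited, so the same loop degenerates into a single pass.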
*** 130,154 ****
  extern void *aes_alloc_keysched(size_t *size, int kmflag);
  extern void aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits,
  	void *keysched);
  extern int aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct);
  extern int aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt);
  
  /*
   * AES mode functions.
   * The first 2 functions operate on 16-byte AES blocks.
   */
! extern void aes_copy_block(uint8_t *in, uint8_t *out);
! extern void aes_xor_block(uint8_t *data, uint8_t *dst);
  
  /* Note: ctx is a pointer to aes_ctx_t defined in modes.h */
  extern int aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length,
      crypto_data_t *out);
  extern int aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length,
      crypto_data_t *out);
  
  /*
   * The following definitions and declarations are only used by AES FIPS POST
   */
  #ifdef	_AES_IMPL
  
  #ifdef	_KERNEL
--- 174,227 ----
  extern void *aes_alloc_keysched(size_t *size, int kmflag);
  extern void aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits,
  	void *keysched);
  extern int aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct);
  extern int aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt);
+ extern int aes_encrypt_ecb(const void *ks, const uint8_t *pt, uint8_t *ct,
+     uint64_t length);
+ extern int aes_decrypt_ecb(const void *ks, const uint8_t *pt, uint8_t *ct,
+     uint64_t length);
+ extern int aes_encrypt_cbc(const void *ks, const uint8_t *pt, uint8_t *ct,
+     const uint8_t *iv, uint64_t length);
+ extern int aes_ctr_mode(const void *ks, const uint8_t *pt, uint8_t *ct,
+     uint64_t length, uint64_t counter[2]);
  
  /*
   * AES mode functions.
   * The first 2 functions operate on 16-byte AES blocks.
   */
! #ifdef	__amd64
! #define	AES_COPY_BLOCK	aes_copy_intel
! #define	AES_XOR_BLOCK	aes_xor_intel
! extern void aes_copy_intel(const uint8_t *src, uint8_t *dst);
! extern void aes_xor_intel(const uint8_t *src, uint8_t *dst);
! extern void aes_xor_intel8(const uint8_t *src, uint8_t *dst);
! #else	/* !__amd64 */
! #define	AES_COPY_BLOCK	aes_copy_block
! #define	AES_XOR_BLOCK	aes_xor_block
! #endif	/* !__amd64 */
  
+ extern void aes_copy_block(const uint8_t *src, uint8_t *dst);
+ extern void aes_xor_block(const uint8_t *src, uint8_t *dst);
+ 
  /* Note: ctx is a pointer to aes_ctx_t defined in modes.h */
  extern int aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length,
      crypto_data_t *out);
  extern int aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length,
      crypto_data_t *out);
  
+ #if	defined(__amd64) && defined(_KERNEL)
  /*
+  * When AES floating-point acceleration is available, these will be called
+  * by the worker functions to clear and restore floating point state and
+  * control kernel thread preemption.
+  */
+ extern void aes_accel_enter(void *savestate);
+ extern void aes_accel_exit(void *savestate);
+ #endif	/* __amd64 && _KERNEL */
+ 
+ /*
   * The following definitions and declarations are only used by AES FIPS POST
   */
  #ifdef	_AES_IMPL
  
  #ifdef	_KERNEL
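A short usage sketch tying the new one-shot mode entry points to the key-schedule helpers declared at the top of this hunk. The KM_SLEEP flag, the kmem_free()/bzero() cleanup, and the return-value convention are assumptions about the kernel environment, not something this header guarantees.

    /*
     * Sketch: one-shot CBC encryption of a buffer whose length is a
     * multiple of AES_BLOCK_LEN, using the new bulk aes_encrypt_cbc()
     * entry point. Error-handling details are assumptions.
     */
    static int
    cbc_encrypt_buf(const uint8_t *key, uint_t keybits, const uint8_t iv[16],
        const uint8_t *pt, uint8_t *ct, uint64_t len)
    {
    	size_t size;
    	void *ks = aes_alloc_keysched(&size, KM_SLEEP);
    	int rv;

    	if (ks == NULL)
    		return (-1);
    	aes_init_keysched(key, keybits, ks);
    	rv = aes_encrypt_cbc(ks, pt, ct, iv, len);
    	bzero(ks, size);		/* don't leave key material behind */
    	kmem_free(ks, size);
    	return (rv);
    }

Generic mode code that still works block-by-block would instead go through the AES_COPY_BLOCK/AES_XOR_BLOCK macros above, which resolve to the SSE-backed aes_copy_intel()/aes_xor_intel() on amd64 and to the portable aes_copy_block()/aes_xor_block() elsewhere.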