4896 Performance improvements for KCF AES modes
@@ -18,10 +18,11 @@
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 by Saso Kiselkov. All rights reserved.
*/
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
@@ -28,10 +29,11 @@
#include <netinet/in.h>
#include "aes_impl.h"
#ifndef _KERNEL
#include <strings.h>
#include <stdlib.h>
+#include <sys/note.h>
#endif /* !_KERNEL */
#ifdef __amd64
#ifdef _KERNEL
@@ -92,12 +94,12 @@
extern void aes_encrypt_impl(const uint32_t rk[], int Nr, const uint32_t pt[4],
uint32_t ct[4]);
extern void aes_decrypt_impl(const uint32_t rk[], int Nr, const uint32_t ct[4],
uint32_t pt[4]);
-#define AES_ENCRYPT_IMPL(a, b, c, d, e) aes_encrypt_impl(a, b, c, d)
-#define AES_DECRYPT_IMPL(a, b, c, d, e) aes_decrypt_impl(a, b, c, d)
+#define AES_ENCRYPT_IMPL(a, b, c, d) aes_encrypt_impl(a, b, c, d)
+#define AES_DECRYPT_IMPL(a, b, c, d) aes_decrypt_impl(a, b, c, d)
#elif defined(__amd64)
/* These functions are used to execute amd64 instructions for AMD or Intel: */
extern int rijndael_key_setup_enc_amd64(uint32_t rk[],
@@ -116,21 +118,41 @@
const uint32_t cipherKey[], uint64_t keyBits);
extern void aes_encrypt_intel(const uint32_t rk[], int Nr,
const uint32_t pt[4], uint32_t ct[4]);
extern void aes_decrypt_intel(const uint32_t rk[], int Nr,
const uint32_t ct[4], uint32_t pt[4]);
+extern void aes_encrypt_intel8(const uint32_t rk[], int Nr,
+ const void *pt, void *ct);
+extern void aes_decrypt_intel8(const uint32_t rk[], int Nr,
+ const void *ct, void *pt);
+extern void aes_encrypt_cbc_intel8(const uint32_t rk[], int Nr,
+ const void *pt, void *ct, const void *iv);
+extern void aes_ctr_intel8(const uint32_t rk[], int Nr,
+ const void *input, void *output, uint64_t counter_upper_BE,
+ uint64_t counter_lower_LE);
+extern void aes_xor_intel(const uint8_t *, uint8_t *);
-static int intel_aes_instructions_present(void);
+static inline int intel_aes_instructions_present(void);
-#define AES_ENCRYPT_IMPL(a, b, c, d, e) rijndael_encrypt(a, b, c, d, e)
-#define AES_DECRYPT_IMPL(a, b, c, d, e) rijndael_decrypt(a, b, c, d, e)
+#ifdef _KERNEL
+/*
+ * Some form of floating-point acceleration is available, so declare these.
+ * The implementations will be in a platform-specific assembly file (e.g.
+ * amd64/aes_intel.s for SSE2/AES-NI).
+ */
+extern void aes_accel_save(void *savestate);
+extern void aes_accel_restore(void *savestate);
+#endif /* _KERNEL */
#else /* Generic C implementation */
-
-#define AES_ENCRYPT_IMPL(a, b, c, d, e) rijndael_encrypt(a, b, c, d)
-#define AES_DECRYPT_IMPL(a, b, c, d, e) rijndael_decrypt(a, b, c, d)
+static void rijndael_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
+ uint32_t ct[4]);
+static void rijndael_decrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
+ uint32_t ct[4]);
#define rijndael_key_setup_enc_raw rijndael_key_setup_enc
+#define AES_ENCRYPT_IMPL(a, b, c, d) rijndael_encrypt(a, b, c, d)
+#define AES_DECRYPT_IMPL(a, b, c, d) rijndael_decrypt(a, b, c, d)
#endif /* sun4u || __amd64 */
#if defined(_LITTLE_ENDIAN) && !defined(__amd64)
#define AES_BYTE_SWAP
#endif
@@ -1138,84 +1160,29 @@
* keyBits AES key size (128, 192, or 256 bits)
*/
static void
aes_setupkeys(aes_key_t *key, const uint32_t *keyarr32, int keybits)
{
+ AES_ACCEL_SAVESTATE(savestate);
+ aes_accel_enter(savestate);
+
if (intel_aes_instructions_present()) {
- key->flags = INTEL_AES_NI_CAPABLE;
- KPREEMPT_DISABLE;
key->nr = rijndael_key_setup_enc_intel(&(key->encr_ks.ks32[0]),
keyarr32, keybits);
key->nr = rijndael_key_setup_dec_intel(&(key->decr_ks.ks32[0]),
keyarr32, keybits);
- KPREEMPT_ENABLE;
} else {
- key->flags = 0;
key->nr = rijndael_key_setup_enc_amd64(&(key->encr_ks.ks32[0]),
keyarr32, keybits);
key->nr = rijndael_key_setup_dec_amd64(&(key->decr_ks.ks32[0]),
keyarr32, keybits);
}
+ aes_accel_exit(savestate);
key->type = AES_32BIT_KS;
}
-/*
- * Encrypt one block of data. The block is assumed to be an array
- * of four uint32_t values, so copy for alignment (and byte-order
- * reversal for little endian systems might be necessary on the
- * input and output byte streams.
- * The size of the key schedule depends on the number of rounds
- * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
- *
- * Parameters:
- * rk Key schedule, of aes_ks_t (60 32-bit integers)
- * Nr Number of rounds
- * pt Input block (plain text)
- * ct Output block (crypto text). Can overlap with pt
- * flags Indicates whether we're on Intel AES-NI-capable hardware
- */
-static void
-rijndael_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
- uint32_t ct[4], int flags) {
- if (flags & INTEL_AES_NI_CAPABLE) {
- KPREEMPT_DISABLE;
- aes_encrypt_intel(rk, Nr, pt, ct);
- KPREEMPT_ENABLE;
- } else {
- aes_encrypt_amd64(rk, Nr, pt, ct);
- }
-}
-
-/*
- * Decrypt one block of data. The block is assumed to be an array
- * of four uint32_t values, so copy for alignment (and byte-order
- * reversal for little endian systems might be necessary on the
- * input and output byte streams.
- * The size of the key schedule depends on the number of rounds
- * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
- *
- * Parameters:
- * rk Key schedule, of aes_ks_t (60 32-bit integers)
- * Nr Number of rounds
- * ct Input block (crypto text)
- * pt Output block (plain text). Can overlap with pt
- * flags Indicates whether we're on Intel AES-NI-capable hardware
- */
-static void
-rijndael_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
- uint32_t pt[4], int flags) {
- if (flags & INTEL_AES_NI_CAPABLE) {
- KPREEMPT_DISABLE;
- aes_decrypt_intel(rk, Nr, ct, pt);
- KPREEMPT_ENABLE;
- } else {
- aes_decrypt_amd64(rk, Nr, ct, pt);
- }
-}
-
-
#else /* generic C implementation */
/*
* Expand the cipher key into the decryption key schedule.
* Return the number of rounds for the given cipher key size.
@@ -1620,11 +1587,26 @@
#endif
aes_setupkeys(newbie, keyarr.ka32, keyBits);
}
+#if defined(__amd64) && defined(_KERNEL)
+void
+aes_accel_enter(void *savestate)
+{
+ KPREEMPT_DISABLE;
+ aes_accel_save(savestate);
+}
+void
+aes_accel_exit(void *savestate)
+{
+ aes_accel_restore(savestate);
+ KPREEMPT_ENABLE;
+}
+#endif /* defined(__amd64) && defined(_KERNEL) */
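
/*
 * Illustrative sketch (not part of this change, not built): the intended
 * call pattern around a burst of AES-NI work in kernel context, mirroring
 * aes_setupkeys() above. The helper name example_encrypt8() is hypothetical;
 * AES_ACCEL_SAVESTATE, aes_accel_enter(), aes_accel_exit() and
 * aes_encrypt_intel8() are interfaces introduced or declared by this change,
 * and the caller is assumed to have checked
 * intel_aes_instructions_present().
 */
#if 0
static void
example_encrypt8(const aes_key_t *key, const uint8_t *pt, uint8_t *ct)
{
	AES_ACCEL_SAVESTATE(savestate);	/* stack space for the FPU state */

	aes_accel_enter(savestate);	/* disable preemption, save FPU state */
	aes_encrypt_intel8(&key->encr_ks.ks32[0], key->nr, pt, ct);
	aes_accel_exit(savestate);	/* restore FPU state, allow preemption */
}
#endif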
+
/*
* Encrypt one block using AES.
* Align if needed and (for x86 32-bit only) byte-swap.
*
* Parameters:
@@ -1635,16 +1617,25 @@
int
aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct)
{
aes_key_t *ksch = (aes_key_t *)ks;
+#ifdef __amd64
+ if (intel_aes_instructions_present())
+ aes_encrypt_intel(&ksch->encr_ks.ks32[0], ksch->nr,
+ /* LINTED: pointer alignment */
+ (uint32_t *)pt, (uint32_t *)ct);
+ else
+ aes_encrypt_amd64(&ksch->encr_ks.ks32[0], ksch->nr,
+ /* LINTED: pointer alignment */
+ (uint32_t *)pt, (uint32_t *)ct);
+#else /* !__amd64 */
#ifndef AES_BYTE_SWAP
if (IS_P2ALIGNED2(pt, ct, sizeof (uint32_t))) {
- /* LINTED: pointer alignment */
AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
/* LINTED: pointer alignment */
- (uint32_t *)pt, (uint32_t *)ct, ksch->flags);
+ (uint32_t *)pt, (uint32_t *)ct);
} else {
#endif
uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
/* Copy input block into buffer */
@@ -1654,14 +1645,14 @@
#else /* byte swap */
buffer[0] = htonl(*(uint32_t *)(void *)&pt[0]);
buffer[1] = htonl(*(uint32_t *)(void *)&pt[4]);
buffer[2] = htonl(*(uint32_t *)(void *)&pt[8]);
buffer[3] = htonl(*(uint32_t *)(void *)&pt[12]);
-#endif
+#endif /* byte swap */
AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
- buffer, buffer, ksch->flags);
+ buffer, buffer);
/* Copy result from buffer to output block */
#ifndef AES_BYTE_SWAP
bcopy(&buffer, ct, AES_BLOCK_LEN);
}
@@ -1669,11 +1660,13 @@
#else /* byte swap */
*(uint32_t *)(void *)&ct[0] = htonl(buffer[0]);
*(uint32_t *)(void *)&ct[4] = htonl(buffer[1]);
*(uint32_t *)(void *)&ct[8] = htonl(buffer[2]);
*(uint32_t *)(void *)&ct[12] = htonl(buffer[3]);
-#endif
+#endif /* byte swap */
+#endif /* !__amd64 */
+
return (CRYPTO_SUCCESS);
}
/*
@@ -1688,16 +1681,25 @@
int
aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt)
{
aes_key_t *ksch = (aes_key_t *)ks;
+#ifdef __amd64
+ if (intel_aes_instructions_present())
+ aes_decrypt_intel(&ksch->decr_ks.ks32[0], ksch->nr,
+ /* LINTED: pointer alignment */
+ (uint32_t *)ct, (uint32_t *)pt);
+ else
+ aes_decrypt_amd64(&ksch->decr_ks.ks32[0], ksch->nr,
+ /* LINTED: pointer alignment */
+ (uint32_t *)ct, (uint32_t *)pt);
+#else /* !__amd64 */
#ifndef AES_BYTE_SWAP
if (IS_P2ALIGNED2(ct, pt, sizeof (uint32_t))) {
- /* LINTED: pointer alignment */
AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
/* LINTED: pointer alignment */
- (uint32_t *)ct, (uint32_t *)pt, ksch->flags);
+ (uint32_t *)ct, (uint32_t *)pt);
} else {
#endif
uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
/* Copy input block into buffer */
@@ -1707,14 +1709,14 @@
#else /* byte swap */
buffer[0] = htonl(*(uint32_t *)(void *)&ct[0]);
buffer[1] = htonl(*(uint32_t *)(void *)&ct[4]);
buffer[2] = htonl(*(uint32_t *)(void *)&ct[8]);
buffer[3] = htonl(*(uint32_t *)(void *)&ct[12]);
-#endif
+#endif /* byte swap */
AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
- buffer, buffer, ksch->flags);
+ buffer, buffer);
/* Copy result from buffer to output block */
#ifndef AES_BYTE_SWAP
bcopy(&buffer, pt, AES_BLOCK_LEN);
}
@@ -1722,17 +1724,231 @@
#else /* byte swap */
*(uint32_t *)(void *)&pt[0] = htonl(buffer[0]);
*(uint32_t *)(void *)&pt[4] = htonl(buffer[1]);
*(uint32_t *)(void *)&pt[8] = htonl(buffer[2]);
*(uint32_t *)(void *)&pt[12] = htonl(buffer[3]);
-#endif
+#endif /* byte swap */
+#endif /* !__amd64 */
return (CRYPTO_SUCCESS);
}
+#define ECB_LOOP(ciph_func) \
+ do { \
+ for (; i < length; i += AES_BLOCK_LEN) \
+ ciph_func; \
+ _NOTE(CONSTCOND) \
+ } while (0)
+#define ECB_LOOP_4P(ciph_func, enc_or_dec, in, out) \
+ ECB_LOOP(ciph_func(&ksch->enc_or_dec ## r_ks.ks32[0], \
+ ksch->nr, (void *)&in[i], (void *)&out[i]))
+#define ECB_LOOP_3P(ciph_func, in, out) \
+ ECB_LOOP(ciph_func(ksch, (void *)&in[i], (void *)&out[i]))
+#ifdef __amd64
+#define ECB_INTEL_IMPL(enc_or_dec, in, out) \
+ do { \
+ if (intel_aes_instructions_present()) { \
+ /* first use the accelerated function */ \
+ for (; i + 8 * AES_BLOCK_LEN <= length; \
+ i += 8 * AES_BLOCK_LEN) \
+ aes_ ## enc_or_dec ## rypt_intel8( \
+ &ksch->enc_or_dec ## r_ks.ks32[0], \
+ ksch->nr, &in[i], &out[i]); \
+ /* finish off the remainder per-block */ \
+ ECB_LOOP_4P(aes_ ## enc_or_dec ## rypt_intel, \
+ enc_or_dec, in, out); \
+ } else { \
+ ECB_LOOP_4P(aes_ ## enc_or_dec ## rypt_amd64, \
+ enc_or_dec, in, out); \
+ } \
+ _NOTE(CONSTCOND) \
+ } while (0)
+#endif /* __amd64 */
+
/*
+ * Perform AES ECB encryption on a sequence of blocks. On x86-64 CPUs with
+ * the AES-NI extension, this performs the encryption in batches of 8
+ * blocks, exploiting instruction-level parallelism more effectively.
+ * On other platforms, this simply encrypts the blocks in sequence.
+ */
+int
+aes_encrypt_ecb(const void *ks, const uint8_t *pt, uint8_t *ct, uint64_t length)
+{
+ aes_key_t *ksch = (aes_key_t *)ks;
+ uint64_t i = 0;
+
+#ifdef __amd64
+ ECB_INTEL_IMPL(enc, pt, ct);
+#elif defined(sun4u)
+ ECB_LOOP_4P(aes_encrypt_impl, enc, pt, ct);
+#else /* Generic C implementation */
+ ECB_LOOP_3P((void) aes_encrypt_block, pt, ct);
+#endif /* Generic C implementation */
+
+ return (CRYPTO_SUCCESS);
+}
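
/*
 * Illustrative sketch (not part of this change, not built): what
 * ECB_INTEL_IMPL(enc, pt, ct) above expands to on the amd64 path, written
 * out without macros. The function name is hypothetical; all callees are
 * declared earlier in this file.
 */
#if 0
static void
aes_encrypt_ecb_sketch(const aes_key_t *ksch, const uint8_t *pt, uint8_t *ct,
    uint64_t length)
{
	uint64_t i = 0;

	if (intel_aes_instructions_present()) {
		/* bulk path: 8 blocks per call into the wide AES-NI routine */
		for (; i + 8 * AES_BLOCK_LEN <= length;
		    i += 8 * AES_BLOCK_LEN)
			aes_encrypt_intel8(&ksch->encr_ks.ks32[0], ksch->nr,
			    &pt[i], &ct[i]);
		/* tail: whatever is left over, one block at a time */
		for (; i < length; i += AES_BLOCK_LEN)
			aes_encrypt_intel(&ksch->encr_ks.ks32[0], ksch->nr,
			    (void *)&pt[i], (void *)&ct[i]);
	} else {
		for (; i < length; i += AES_BLOCK_LEN)
			aes_encrypt_amd64(&ksch->encr_ks.ks32[0], ksch->nr,
			    (void *)&pt[i], (void *)&ct[i]);
	}
}
#endif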
+
+/*
+ * Same as aes_encrypt_ecb, but performs decryption.
+ */
+int
+aes_decrypt_ecb(const void *ks, const uint8_t *ct, uint8_t *pt, uint64_t length)
+{
+ aes_key_t *ksch = (aes_key_t *)ks;
+ uint64_t i = 0;
+
+#ifdef __amd64
+ ECB_INTEL_IMPL(dec, ct, pt);
+#elif defined(sun4u)
+ ECB_LOOP_4P(aes_decrypt_impl, dec, ct, pt);
+#else /* Generic C implementation */
+ ECB_LOOP_3P((void) aes_decrypt_block, ct, pt);
+#endif /* Generic C implementation */
+
+ return (CRYPTO_SUCCESS);
+}
+#ifdef __amd64
+#undef ECB_INTEL_IMPL
+#endif /* __amd64 */
+
+#undef ECB_LOOP
+#undef ECB_LOOP_4P
+#undef ECB_LOOP_3P
+
+#define CBC_LOOP(enc_func, xor_func) \
+ do { \
+ for (; i < length; i += AES_BLOCK_LEN) { \
+ /* copy IV to ciphertext */ \
+ bcopy(iv, &ct[i], AES_BLOCK_LEN); \
+ /* XOR the plaintext block into the copied IV */ \
+ xor_func(&pt[i], &ct[i]); \
+ /* encrypt the block in place in the output region */ \
+ enc_func; \
+ iv = &ct[i]; \
+ } \
+ _NOTE(CONSTCOND) \
+ } while (0)
+#define CBC_LOOP_4P(enc_func, xor_func) \
+ CBC_LOOP(enc_func(&ksch->encr_ks.ks32[0], \
+ ksch->nr, (void *)&ct[i], (void *)&ct[i]), xor_func)
+#define CBC_LOOP_3P(enc_func, xor_func) \
+ CBC_LOOP(enc_func(ksch, (void *)&ct[i], (void *)&ct[i]), xor_func)
+
+/*
+ * Encrypts a sequence of consecutive AES blocks in CBC mode. On x86-64
+ * with the AES-NI extension, the encryption is performed on 8 blocks at
+ * a time using an optimized assembly implementation, giving a speed boost
+ * of around 75%. On other platforms, this simply performs CBC encryption
+ * in sequence on the blocks.
+ *
+ * Decryption acceleration is implemented in the kernel kcf block cipher
+ * modes code (cbc.c), because that doesn't require a complete hand-tuned
+ * CBC implementation in assembly.
+ */
+int
+aes_encrypt_cbc(const void *ks, const uint8_t *pt, uint8_t *ct,
+ const uint8_t *iv, uint64_t length)
+{
+ aes_key_t *ksch = (aes_key_t *)ks;
+ size_t i = 0;
+
+#ifdef __amd64
+ if (intel_aes_instructions_present()) {
+ for (; i + 8 * AES_BLOCK_LEN <= length;
+ i += 8 * AES_BLOCK_LEN) {
+ aes_encrypt_cbc_intel8(&ksch->encr_ks.ks32[0],
+ ksch->nr, &pt[i], &ct[i], iv);
+ iv = &ct[i + 7 * AES_BLOCK_LEN];
+ }
+ CBC_LOOP_4P(aes_encrypt_intel, aes_xor_intel);
+ } else {
+ CBC_LOOP_4P(aes_encrypt_amd64, aes_xor_intel);
+ }
+#elif defined(sun4u)
+ CBC_LOOP_4P(aes_encrypt_impl, aes_xor_block);
+#else /* Generic C implementation */
+ CBC_LOOP_3P((void) aes_encrypt_block, aes_xor_block);
+#endif /* Generic C implementation */
+
+ return (CRYPTO_SUCCESS);
+}
+#undef CBC_LOOP
+#undef CBC_LOOP_4P
+#undef CBC_LOOP_3P
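
/*
 * Illustrative sketch (not part of this change, not built): the per-block
 * chaining that CBC_LOOP_4P(aes_encrypt_intel, aes_xor_intel) above expands
 * to. Note that the IV is staged in ct[i] before pt[i] is XORed in, so pt
 * and ct are treated as distinct buffers. The function name is hypothetical;
 * all callees are declared earlier in this file.
 */
#if 0
static void
aes_encrypt_cbc_sketch(const aes_key_t *ksch, const uint8_t *pt, uint8_t *ct,
    const uint8_t *iv, uint64_t length)
{
	uint64_t i;

	for (i = 0; i < length; i += AES_BLOCK_LEN) {
		/* seed the output block with the current IV */
		bcopy(iv, &ct[i], AES_BLOCK_LEN);
		/* XOR the plaintext block into it: ct[i] = pt[i] ^ iv */
		aes_xor_intel(&pt[i], &ct[i]);
		/* encrypt in place: ct[i] = E_K(pt[i] ^ iv) */
		aes_encrypt_intel(&ksch->encr_ks.ks32[0], ksch->nr,
		    (void *)&ct[i], (void *)&ct[i]);
		/* the ciphertext just produced becomes the next IV */
		iv = &ct[i];
	}
}
#endif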
+
+#define CTR_LOOP(enc_func, xor_func) \
+ do { \
+ for (; i < length; i += AES_BLOCK_LEN) { \
+ /* set up counter in output region */ \
+ *(uint64_t *)(void *)&output[i] = counter[0]; \
+ *(uint64_t *)(void *)&output[i + 8] = \
+ htonll(counter[1]++); \
+ /* encrypt counter in output region */ \
+ enc_func; \
+ /* XOR encrypted counter with input */ \
+ xor_func(&input[i], &output[i]); \
+ } \
+ _NOTE(CONSTCOND) \
+ } while (0)
+#define CTR_LOOP_4P(enc_func, xor_func) \
+ CTR_LOOP(enc_func(&ksch->encr_ks.ks32[0], ksch->nr, \
+ (void *)&output[i], (void *)&output[i]), xor_func)
+#define CTR_LOOP_3P(enc_func, xor_func) \
+ CTR_LOOP(enc_func(ksch, (void *)&output[i], (void *)&output[i]),\
+ xor_func)
+/*
+ * Performs high-performance counter mode encryption and decryption on
+ * a sequence of blocks. In CTR mode, encryption and decryption are the
+ * same operation, just with the plaintext and ciphertext reversed:
+ * plaintext = CTR(CTR(plaintext, K), K)
+ * Blocks are also independent of one another, so it is an excellent
+ * mode when high performance is required and data authentication/integrity
+ * checking is provided via some other means, or isn't necessary.
+ *
+ * On x86-64 with the AES-NI extension, this code performs CTR mode
+ * encryption in parallel on 8 blocks at a time and can provide in
+ * excess of 3GB/s/core of encryption/decryption performance (<1 CPB).
+ */
+int
+aes_ctr_mode(const void *ks, const uint8_t *input, uint8_t *output,
+ uint64_t length, uint64_t counter[2])
+{
+ aes_key_t *ksch = (aes_key_t *)ks;
+ uint64_t i = 0;
+
+ /* swap the lower part to host byte order for computations */
+ counter[1] = ntohll(counter[1]);
+
+#ifdef __amd64
+ if (intel_aes_instructions_present()) {
+ /* first use the wide-register accelerated function */
+ for (; i + 8 * AES_BLOCK_LEN <= length;
+ i += 8 * AES_BLOCK_LEN) {
+ aes_ctr_intel8(&ksch->encr_ks.ks32[0], ksch->nr,
+ &input[i], &output[i], counter[0], counter[1]);
+ counter[1] += 8;
+ }
+ /* finish off the remainder using the slow per-block method */
+ CTR_LOOP_4P(aes_encrypt_intel, aes_xor_intel);
+ } else {
+ CTR_LOOP_4P(aes_encrypt_amd64, aes_xor_intel);
+ }
+#elif defined(sun4u)
+ CTR_LOOP_4P(aes_encrypt_impl, aes_xor_block);
+#else /* Generic C implementation */
+ CTR_LOOP_3P((void) aes_encrypt_block, aes_xor_block);
+#endif /* Generic C implementation */
+
+ /* swap the lower part back to big-endian */
+ counter[1] = htonll(counter[1]);
+
+ return (CRYPTO_SUCCESS);
+}
+#undef CTR_LOOP
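
/*
 * Usage sketch (not part of this change, not built): because CTR is its own
 * inverse, decryption is just a second aes_ctr_mode() call with the same key
 * schedule and the same initial counter. aes_ctr_mode() advances the counter
 * in place (lower half, kept big-endian across calls), so the caller keeps a
 * copy of the initial value. Since the loop above stages the counter in the
 * output region before XORing the input in, input and output are treated as
 * distinct buffers here. The function and variable names are hypothetical.
 */
#if 0
static void
aes_ctr_roundtrip_sketch(const void *ks, const uint8_t *pt, uint8_t *ct,
    uint8_t *out, uint64_t length, const uint64_t iv[2])
{
	uint64_t ctr[2];

	ctr[0] = iv[0];		/* upper 64 bits of the counter, big-endian */
	ctr[1] = iv[1];		/* lower 64 bits, big-endian */
	(void) aes_ctr_mode(ks, pt, ct, length, ctr);	/* encrypt */

	ctr[0] = iv[0];		/* reset: the call above advanced ctr[1] */
	ctr[1] = iv[1];
	(void) aes_ctr_mode(ks, ct, out, length, ctr);	/* decrypt */
	/* out now matches pt */
}
#endif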
+
+/*
* Allocate key schedule for AES.
*
* Return the pointer and set size to the number of bytes allocated.
* Memory allocated must be freed by the caller when done.
*
@@ -1760,18 +1976,17 @@
}
#ifdef __amd64
/*
- * Return 1 if executing on Intel with AES-NI instructions,
- * otherwise 0 (i.e., Intel without AES-NI or AMD64).
+ * Return 1 if executing on x86-64 with AES-NI instructions, otherwise 0.
* Cache the result, as the CPU can't change.
*
* Note: the userland version uses getisax(). The kernel version uses
* global variable x86_featureset.
*/
-static int
+static inline int
intel_aes_instructions_present(void)
{
static int cached_result = -1;
if (cached_result == -1) { /* first time */