4896 Performance improvements for KCF AES modes

*** 18,27 ****
--- 18,28 ----
   *
   * CDDL HEADER END
   */
  
  /*
   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+  * Copyright 2015 by Saso Kiselkov. All rights reserved.
   */
  #include <sys/types.h>
  #include <sys/systm.h>
  #include <sys/sysmacros.h>
*** 28,37 ****
--- 29,39 ----
  #include <netinet/in.h>
  #include "aes_impl.h"
  
  #ifndef _KERNEL
  #include <strings.h>
  #include <stdlib.h>
+ #include <sys/note.h>
  #endif  /* !_KERNEL */
  
  #ifdef __amd64
  #ifdef _KERNEL
*** 92,103 ****
  extern void aes_encrypt_impl(const uint32_t rk[], int Nr,
      const uint32_t pt[4], uint32_t ct[4]);
  extern void aes_decrypt_impl(const uint32_t rk[], int Nr,
      const uint32_t ct[4], uint32_t pt[4]);
  
! #define AES_ENCRYPT_IMPL(a, b, c, d, e) aes_encrypt_impl(a, b, c, d)
! #define AES_DECRYPT_IMPL(a, b, c, d, e) aes_decrypt_impl(a, b, c, d)
  
  #elif defined(__amd64)
  
  /* These functions are used to execute amd64 instructions for AMD or Intel: */
  extern int rijndael_key_setup_enc_amd64(uint32_t rk[],
--- 94,105 ----
  extern void aes_encrypt_impl(const uint32_t rk[], int Nr,
      const uint32_t pt[4], uint32_t ct[4]);
  extern void aes_decrypt_impl(const uint32_t rk[], int Nr,
      const uint32_t ct[4], uint32_t pt[4]);
  
! #define AES_ENCRYPT_IMPL(a, b, c, d)    aes_encrypt_impl(a, b, c, d)
! #define AES_DECRYPT_IMPL(a, b, c, d)    aes_decrypt_impl(a, b, c, d)
  
  #elif defined(__amd64)
  
  /* These functions are used to execute amd64 instructions for AMD or Intel: */
  extern int rijndael_key_setup_enc_amd64(uint32_t rk[],
*** 116,136 ****
      const uint32_t cipherKey[], uint64_t keyBits);
  extern void aes_encrypt_intel(const uint32_t rk[], int Nr,
      const uint32_t pt[4], uint32_t ct[4]);
  extern void aes_decrypt_intel(const uint32_t rk[], int Nr,
      const uint32_t ct[4], uint32_t pt[4]);
! static int intel_aes_instructions_present(void);
  
! #define AES_ENCRYPT_IMPL(a, b, c, d, e) rijndael_encrypt(a, b, c, d, e)
! #define AES_DECRYPT_IMPL(a, b, c, d, e) rijndael_decrypt(a, b, c, d, e)
  
  #else /* Generic C implementation */
!
! #define AES_ENCRYPT_IMPL(a, b, c, d, e) rijndael_encrypt(a, b, c, d)
! #define AES_DECRYPT_IMPL(a, b, c, d, e) rijndael_decrypt(a, b, c, d)
  #define rijndael_key_setup_enc_raw rijndael_key_setup_enc
  
  #endif  /* sun4u || __amd64 */
  
  #if defined(_LITTLE_ENDIAN) && !defined(__amd64)
  #define AES_BYTE_SWAP
  #endif
--- 118,158 ----
      const uint32_t cipherKey[], uint64_t keyBits);
  extern void aes_encrypt_intel(const uint32_t rk[], int Nr,
      const uint32_t pt[4], uint32_t ct[4]);
  extern void aes_decrypt_intel(const uint32_t rk[], int Nr,
      const uint32_t ct[4], uint32_t pt[4]);
+ extern void aes_encrypt_intel8(const uint32_t rk[], int Nr,
+     const void *pt, void *ct);
+ extern void aes_decrypt_intel8(const uint32_t rk[], int Nr,
+     const void *ct, void *pt);
+ extern void aes_encrypt_cbc_intel8(const uint32_t rk[], int Nr,
+     const void *pt, void *ct, const void *iv);
+ extern void aes_ctr_intel8(const uint32_t rk[], int Nr,
+     const void *input, void *output, uint64_t counter_upper_BE,
+     uint64_t counter_lower_LE);
+ extern void aes_xor_intel(const uint8_t *, uint8_t *);
! static inline int intel_aes_instructions_present(void);
  
! #ifdef _KERNEL
! /*
!  * Some form of floating-point acceleration is available, so declare these.
!  * The implementations will be in a platform-specific assembly file (e.g.
!  * amd64/aes_intel.s for SSE2/AES-NI).
!  */
! extern void aes_accel_save(void *savestate);
! extern void aes_accel_restore(void *savestate);
! #endif  /* _KERNEL */
  
  #else /* Generic C implementation */
! static void rijndael_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
!     uint32_t ct[4]);
! static void rijndael_decrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
!     uint32_t ct[4]);
  #define rijndael_key_setup_enc_raw rijndael_key_setup_enc
+ #define AES_ENCRYPT_IMPL(a, b, c, d)    rijndael_encrypt(a, b, c, d)
+ #define AES_DECRYPT_IMPL(a, b, c, d)    rijndael_decrypt(a, b, c, d)
  
  #endif  /* sun4u || __amd64 */
  
  #if defined(_LITTLE_ENDIAN) && !defined(__amd64)
  #define AES_BYTE_SWAP
  #endif
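
The eight-block entry points declared above (aes_encrypt_intel8() and friends) exist so that bulk data can keep several independent AESENC pipelines busy per core, instead of serializing on one block's latency. Below is a minimal sketch of the batching pattern they enable, processing wide 8-block strides first and finishing the tail per block; cipher8_stub() and cipher1_stub() are hypothetical stand-ins for the assembly routines, not part of the webrev:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define AES_BLOCK_LEN   16

    /* hypothetical stand-ins for aes_encrypt_intel8()/aes_encrypt_intel() */
    static void
    cipher8_stub(const uint8_t *in, uint8_t *out)
    {
            memmove(out, in, 8 * AES_BLOCK_LEN);    /* 8 blocks per call */
    }

    static void
    cipher1_stub(const uint8_t *in, uint8_t *out)
    {
            memmove(out, in, AES_BLOCK_LEN);        /* one block per call */
    }

    static void
    process_blocks(const uint8_t *in, uint8_t *out, uint64_t length)
    {
            uint64_t i = 0;

            /* wide strides first: 8 independent blocks per call */
            for (; i + 8 * AES_BLOCK_LEN <= length; i += 8 * AES_BLOCK_LEN)
                    cipher8_stub(&in[i], &out[i]);
            /* finish the tail one block at a time */
            for (; i < length; i += AES_BLOCK_LEN)
                    cipher1_stub(&in[i], &out[i]);
    }

    int
    main(void)
    {
            uint8_t buf[11 * AES_BLOCK_LEN] = { 0 }, out[11 * AES_BLOCK_LEN];

            process_blocks(buf, out, sizeof (buf)); /* one 8-block stride, 3 tail */
            (void) printf("processed %zu bytes\n", sizeof (buf));
            return (0);
    }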
*** 1138,1221 ****
   * keyBits      AES key size (128, 192, or 256 bits)
   */
  static void
  aes_setupkeys(aes_key_t *key, const uint32_t *keyarr32, int keybits)
  {
          if (intel_aes_instructions_present()) {
-                 key->flags = INTEL_AES_NI_CAPABLE;
-                 KPREEMPT_DISABLE;
                  key->nr = rijndael_key_setup_enc_intel(&(key->encr_ks.ks32[0]),
                      keyarr32, keybits);
                  key->nr = rijndael_key_setup_dec_intel(&(key->decr_ks.ks32[0]),
                      keyarr32, keybits);
-                 KPREEMPT_ENABLE;
          } else {
-                 key->flags = 0;
                  key->nr = rijndael_key_setup_enc_amd64(&(key->encr_ks.ks32[0]),
                      keyarr32, keybits);
                  key->nr = rijndael_key_setup_dec_amd64(&(key->decr_ks.ks32[0]),
                      keyarr32, keybits);
          }
          key->type = AES_32BIT_KS;
  }
  
- /*
-  * Encrypt one block of data. The block is assumed to be an array
-  * of four uint32_t values, so copy for alignment (and byte-order
-  * reversal for little endian systems might be necessary on the
-  * input and output byte streams.
-  * The size of the key schedule depends on the number of rounds
-  * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
-  *
-  * Parameters:
-  * rk   Key schedule, of aes_ks_t (60 32-bit integers)
-  * Nr   Number of rounds
-  * pt   Input block (plain text)
-  * ct   Output block (crypto text). Can overlap with pt
-  * flags        Indicates whether we're on Intel AES-NI-capable hardware
-  */
- static void
- rijndael_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
-     uint32_t ct[4], int flags) {
-         if (flags & INTEL_AES_NI_CAPABLE) {
-                 KPREEMPT_DISABLE;
-                 aes_encrypt_intel(rk, Nr, pt, ct);
-                 KPREEMPT_ENABLE;
-         } else {
-                 aes_encrypt_amd64(rk, Nr, pt, ct);
-         }
- }
- 
- /*
-  * Decrypt one block of data. The block is assumed to be an array
-  * of four uint32_t values, so copy for alignment (and byte-order
-  * reversal for little endian systems might be necessary on the
-  * input and output byte streams.
-  * The size of the key schedule depends on the number of rounds
-  * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
-  *
-  * Parameters:
-  * rk   Key schedule, of aes_ks_t (60 32-bit integers)
-  * Nr   Number of rounds
-  * ct   Input block (crypto text)
-  * pt   Output block (plain text). Can overlap with pt
-  * flags        Indicates whether we're on Intel AES-NI-capable hardware
-  */
- static void
- rijndael_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
-     uint32_t pt[4], int flags) {
-         if (flags & INTEL_AES_NI_CAPABLE) {
-                 KPREEMPT_DISABLE;
-                 aes_decrypt_intel(rk, Nr, ct, pt);
-                 KPREEMPT_ENABLE;
-         } else {
-                 aes_decrypt_amd64(rk, Nr, ct, pt);
-         }
- }
- 
  #else /* generic C implementation */
  
  /*
   * Expand the cipher key into the decryption key schedule.
   * Return the number of rounds for the given cipher key size.
--- 1160,1188 ----
   * keyBits      AES key size (128, 192, or 256 bits)
   */
  static void
  aes_setupkeys(aes_key_t *key, const uint32_t *keyarr32, int keybits)
  {
+         AES_ACCEL_SAVESTATE(savestate);
+         aes_accel_enter(savestate);
+ 
          if (intel_aes_instructions_present()) {
                  key->nr = rijndael_key_setup_enc_intel(&(key->encr_ks.ks32[0]),
                      keyarr32, keybits);
                  key->nr = rijndael_key_setup_dec_intel(&(key->decr_ks.ks32[0]),
                      keyarr32, keybits);
          } else {
                  key->nr = rijndael_key_setup_enc_amd64(&(key->encr_ks.ks32[0]),
                      keyarr32, keybits);
                  key->nr = rijndael_key_setup_dec_amd64(&(key->decr_ks.ks32[0]),
                      keyarr32, keybits);
          }
+         aes_accel_exit(savestate);
          key->type = AES_32BIT_KS;
  }
  
  #else /* generic C implementation */
  
  /*
   * Expand the cipher key into the decryption key schedule.
   * Return the number of rounds for the given cipher key size.
*** 1620,1630 ****
--- 1587,1612 ----
  #endif
          aes_setupkeys(newbie, keyarr.ka32, keyBits);
  }
  
+ #if defined(__amd64) && defined(_KERNEL)
+ void
+ aes_accel_enter(void *savestate)
+ {
+         KPREEMPT_DISABLE;
+         aes_accel_save(savestate);
+ }
+ 
+ void
+ aes_accel_exit(void *savestate)
+ {
+         aes_accel_restore(savestate);
+         KPREEMPT_ENABLE;
+ }
+ #endif  /* defined(__amd64) && defined(_KERNEL) */
+ 
  /*
   * Encrypt one block using AES.
   * Align if needed and (for x86 32-bit only) byte-swap.
   *
   * Parameters:
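
The enter/exit pair above is the contract every FPU-using path in this patch follows: preemption stays disabled between saving and restoring the SSE register file, otherwise in-kernel AES-NI use could clobber the interrupted thread's state. Here is a userland model of that pairing; KPREEMPT_*, AES_ACCEL_SAVESTATE, and the save/restore bodies are no-op stand-ins for illustration, not the webrev's definitions:

    #include <stdio.h>
    #include <stdint.h>

    /* stand-ins for the kernel primitives */
    #define KPREEMPT_DISABLE        ((void)0)  /* kernel: kpreempt_disable() */
    #define KPREEMPT_ENABLE         ((void)0)  /* kernel: kpreempt_enable() */
    /* assumed layout: room for XMM0-XMM15 plus alignment slack */
    #define AES_ACCEL_SAVESTATE(name)       uint8_t name[16 * 16 + 8]

    static void aes_accel_save(void *s)    { (void)s; /* kernel: store XMM regs */ }
    static void aes_accel_restore(void *s) { (void)s; /* kernel: reload XMM regs */ }

    static void
    aes_accel_enter(void *savestate)
    {
            KPREEMPT_DISABLE;
            aes_accel_save(savestate);
    }

    static void
    aes_accel_exit(void *savestate)
    {
            aes_accel_restore(savestate);
            KPREEMPT_ENABLE;
    }

    int
    main(void)
    {
            AES_ACCEL_SAVESTATE(savestate);

            aes_accel_enter(savestate);
            /* ... SSE/AES-NI work would go here ... */
            aes_accel_exit(savestate);
            (void) printf("enter/exit paired OK\n");
            return (0);
    }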
*** 1635,1650 ****
  int
  aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct)
  {
          aes_key_t *ksch = (aes_key_t *)ks;
  
  #ifndef AES_BYTE_SWAP
          if (IS_P2ALIGNED2(pt, ct, sizeof (uint32_t))) {
-                 /* LINTED: pointer alignment */
                  AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
                      /* LINTED: pointer alignment */
!                     (uint32_t *)pt, (uint32_t *)ct, ksch->flags);
          } else {
  #endif
                  uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
  
                  /* Copy input block into buffer */
--- 1617,1641 ----
  int
  aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct)
  {
          aes_key_t *ksch = (aes_key_t *)ks;
  
+ #ifdef __amd64
+         if (intel_aes_instructions_present())
+                 aes_encrypt_intel(&ksch->encr_ks.ks32[0], ksch->nr,
+                     /* LINTED: pointer alignment */
+                     (uint32_t *)pt, (uint32_t *)ct);
+         else
+                 aes_encrypt_amd64(&ksch->encr_ks.ks32[0], ksch->nr,
+                     /* LINTED: pointer alignment */
+                     (uint32_t *)pt, (uint32_t *)ct);
+ #else   /* !__amd64 */
  #ifndef AES_BYTE_SWAP
          if (IS_P2ALIGNED2(pt, ct, sizeof (uint32_t))) {
                  AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
                      /* LINTED: pointer alignment */
!                     (uint32_t *)pt, (uint32_t *)ct);
          } else {
  #endif
                  uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
  
                  /* Copy input block into buffer */
*** 1654,1667 ****
  #else   /* byte swap */
                  buffer[0] = htonl(*(uint32_t *)(void *)&pt[0]);
                  buffer[1] = htonl(*(uint32_t *)(void *)&pt[4]);
                  buffer[2] = htonl(*(uint32_t *)(void *)&pt[8]);
                  buffer[3] = htonl(*(uint32_t *)(void *)&pt[12]);
! #endif
  
                  AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
!                     buffer, buffer, ksch->flags);
  
                  /* Copy result from buffer to output block */
  #ifndef AES_BYTE_SWAP
                  bcopy(&buffer, ct, AES_BLOCK_LEN);
          }
--- 1645,1658 ----
  #else   /* byte swap */
                  buffer[0] = htonl(*(uint32_t *)(void *)&pt[0]);
                  buffer[1] = htonl(*(uint32_t *)(void *)&pt[4]);
                  buffer[2] = htonl(*(uint32_t *)(void *)&pt[8]);
                  buffer[3] = htonl(*(uint32_t *)(void *)&pt[12]);
! #endif  /* byte swap */
  
                  AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
!                     buffer, buffer);
  
                  /* Copy result from buffer to output block */
  #ifndef AES_BYTE_SWAP
                  bcopy(&buffer, ct, AES_BLOCK_LEN);
          }
*** 1669,1679 ****
  #else   /* byte swap */
                  *(uint32_t *)(void *)&ct[0] = htonl(buffer[0]);
                  *(uint32_t *)(void *)&ct[4] = htonl(buffer[1]);
                  *(uint32_t *)(void *)&ct[8] = htonl(buffer[2]);
                  *(uint32_t *)(void *)&ct[12] = htonl(buffer[3]);
! #endif
  
          return (CRYPTO_SUCCESS);
  }
  
  /*
--- 1660,1672 ----
  #else   /* byte swap */
                  *(uint32_t *)(void *)&ct[0] = htonl(buffer[0]);
                  *(uint32_t *)(void *)&ct[4] = htonl(buffer[1]);
                  *(uint32_t *)(void *)&ct[8] = htonl(buffer[2]);
                  *(uint32_t *)(void *)&ct[12] = htonl(buffer[3]);
! #endif  /* byte swap */
! #endif  /* !__amd64 */
  
          return (CRYPTO_SUCCESS);
  }
  
  /*
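
On the non-amd64 little-endian (AES_BYTE_SWAP) path above, each block is staged through an aligned uint32_t buffer with htonl() applied on the way in and out, because the generic tables operate on big-endian words. A self-contained sketch of that marshaling, using memcpy() as the safe equivalent of the LINTED unaligned loads and a no-op stand-in for AES_ENCRYPT_IMPL:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>  /* htonl */

    #define AES_BLOCK_LEN   16

    static void
    cipher_stub(uint32_t block[4])
    {
            (void) block;   /* real code: AES rounds on big-endian words */
    }

    static void
    encrypt_block_swapped(const uint8_t *pt, uint8_t *ct)
    {
            uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
            uint32_t w;
            int j;

            for (j = 0; j < 4; j++) {               /* bytes -> BE words */
                    memcpy(&w, &pt[4 * j], 4);      /* safe unaligned load */
                    buffer[j] = htonl(w);
            }
            cipher_stub(buffer);
            for (j = 0; j < 4; j++) {               /* BE words -> bytes */
                    w = htonl(buffer[j]);
                    memcpy(&ct[4 * j], &w, 4);
            }
    }

    int
    main(void)
    {
            uint8_t in[AES_BLOCK_LEN] = "0123456789abcde", out[AES_BLOCK_LEN];

            encrypt_block_swapped(in, out);
            (void) printf("%.15s\n", out);  /* stub cipher: bytes round-trip */
            return (0);
    }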
*** 1688,1703 ****
  int
  aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt)
  {
          aes_key_t *ksch = (aes_key_t *)ks;
  
  #ifndef AES_BYTE_SWAP
          if (IS_P2ALIGNED2(ct, pt, sizeof (uint32_t))) {
-                 /* LINTED: pointer alignment */
                  AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
                      /* LINTED: pointer alignment */
!                     (uint32_t *)ct, (uint32_t *)pt, ksch->flags);
          } else {
  #endif
                  uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
  
                  /* Copy input block into buffer */
--- 1681,1705 ----
  int
  aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt)
  {
          aes_key_t *ksch = (aes_key_t *)ks;
  
+ #ifdef __amd64
+         if (intel_aes_instructions_present())
+                 aes_decrypt_intel(&ksch->decr_ks.ks32[0], ksch->nr,
+                     /* LINTED: pointer alignment */
+                     (uint32_t *)ct, (uint32_t *)pt);
+         else
+                 aes_decrypt_amd64(&ksch->decr_ks.ks32[0], ksch->nr,
+                     /* LINTED: pointer alignment */
+                     (uint32_t *)ct, (uint32_t *)pt);
+ #else   /* !__amd64 */
  #ifndef AES_BYTE_SWAP
          if (IS_P2ALIGNED2(ct, pt, sizeof (uint32_t))) {
                  AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
                      /* LINTED: pointer alignment */
!                     (uint32_t *)ct, (uint32_t *)pt);
          } else {
  #endif
                  uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
  
                  /* Copy input block into buffer */
*** 1707,1720 ****
  #else   /* byte swap */
                  buffer[0] = htonl(*(uint32_t *)(void *)&ct[0]);
                  buffer[1] = htonl(*(uint32_t *)(void *)&ct[4]);
                  buffer[2] = htonl(*(uint32_t *)(void *)&ct[8]);
                  buffer[3] = htonl(*(uint32_t *)(void *)&ct[12]);
! #endif
  
                  AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
!                     buffer, buffer, ksch->flags);
  
                  /* Copy result from buffer to output block */
  #ifndef AES_BYTE_SWAP
                  bcopy(&buffer, pt, AES_BLOCK_LEN);
          }
--- 1709,1722 ----
  #else   /* byte swap */
                  buffer[0] = htonl(*(uint32_t *)(void *)&ct[0]);
                  buffer[1] = htonl(*(uint32_t *)(void *)&ct[4]);
                  buffer[2] = htonl(*(uint32_t *)(void *)&ct[8]);
                  buffer[3] = htonl(*(uint32_t *)(void *)&ct[12]);
! #endif  /* byte swap */
  
                  AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
!                     buffer, buffer);
  
                  /* Copy result from buffer to output block */
  #ifndef AES_BYTE_SWAP
                  bcopy(&buffer, pt, AES_BLOCK_LEN);
          }
*** 1722,1738 ****
  #else   /* byte swap */
                  *(uint32_t *)(void *)&pt[0] = htonl(buffer[0]);
                  *(uint32_t *)(void *)&pt[4] = htonl(buffer[1]);
                  *(uint32_t *)(void *)&pt[8] = htonl(buffer[2]);
                  *(uint32_t *)(void *)&pt[12] = htonl(buffer[3]);
! #endif
  
          return (CRYPTO_SUCCESS);
  }
  
  /*
   * Allocate key schedule for AES.
   *
   * Return the pointer and set size to the number of bytes allocated.
   * Memory allocated must be freed by the caller when done.
   *
--- 1724,1954 ----
  #else   /* byte swap */
                  *(uint32_t *)(void *)&pt[0] = htonl(buffer[0]);
                  *(uint32_t *)(void *)&pt[4] = htonl(buffer[1]);
                  *(uint32_t *)(void *)&pt[8] = htonl(buffer[2]);
                  *(uint32_t *)(void *)&pt[12] = htonl(buffer[3]);
! #endif  /* byte swap */
! #endif  /* !__amd64 */
  
          return (CRYPTO_SUCCESS);
  }
  
+ #define ECB_LOOP(ciph_func)                                             \
+         do {                                                            \
+                 for (; i < length; i += AES_BLOCK_LEN)                  \
+                         ciph_func;                                      \
+                 _NOTE(CONSTCOND)                                        \
+         } while (0)
+ #define ECB_LOOP_4P(ciph_func, enc_or_dec, in, out)                     \
+         ECB_LOOP(ciph_func(&ksch->enc_or_dec ## r_ks.ks32[0],           \
+             ksch->nr, (void *)&in[i], (void *)&out[i]))
+ #define ECB_LOOP_3P(ciph_func, in, out)                                 \
+         ECB_LOOP(ciph_func(ksch, (void *)&in[i], (void *)&out[i]))
+ 
+ #ifdef  __amd64
+ #define ECB_INTEL_IMPL(enc_or_dec, in, out)                             \
+         do {                                                            \
+                 if (intel_aes_instructions_present()) {                 \
+                         /* first use the accelerated function */        \
+                         for (; i + 8 * AES_BLOCK_LEN <= length;         \
+                             i += 8 * AES_BLOCK_LEN)                     \
+                                 aes_ ## enc_or_dec ## rypt_intel8(      \
+                                     &ksch->enc_or_dec ## r_ks.ks32[0],  \
+                                     ksch->nr, &in[i], &out[i]);         \
+                         /* finish off the remainder per-block */        \
+                         ECB_LOOP_4P(aes_ ## enc_or_dec ## rypt_intel,   \
+                             enc_or_dec, in, out);                       \
+                 } else {                                                \
+                         ECB_LOOP_4P(aes_ ## enc_or_dec ## rypt_amd64,   \
+                             enc_or_dec, in, out);                       \
+                 }                                                       \
+                 _NOTE(CONSTCOND)                                        \
+         } while (0)
+ #endif  /* __amd64 */
+ 
+ /*
+  * Perform AES ECB encryption on a sequence of blocks. On x86-64 CPUs with
+  * the AES-NI extension, this performs the encryption in increments of 8
+  * blocks at a time, exploiting instruction parallelism more efficiently.
+  * On other platforms, this simply encrypts the blocks in sequence.
+  */
+ int
+ aes_encrypt_ecb(const void *ks, const uint8_t *pt, uint8_t *ct,
+     uint64_t length)
+ {
+         aes_key_t *ksch = (aes_key_t *)ks;
+         uint64_t i = 0;
+ 
+ #ifdef  __amd64
+         ECB_INTEL_IMPL(enc, pt, ct);
+ #elif defined(sun4u)
+         ECB_LOOP_4P(aes_encrypt_impl, enc, pt, ct);
+ #else   /* Generic C implementation */
+         ECB_LOOP_3P((void) aes_encrypt_block, pt, ct);
+ #endif  /* Generic C implementation */
+ 
+         return (CRYPTO_SUCCESS);
+ }
+ 
+ /*
+  * Same as aes_encrypt_ecb, but performs decryption.
+  */
+ int
+ aes_decrypt_ecb(const void *ks, const uint8_t *ct, uint8_t *pt,
+     uint64_t length)
+ {
+         aes_key_t *ksch = (aes_key_t *)ks;
+         uint64_t i = 0;
+ 
+ #ifdef  __amd64
+         ECB_INTEL_IMPL(dec, ct, pt);
+ #elif defined(sun4u)
+         ECB_LOOP_4P(aes_decrypt_impl, dec, ct, pt);
+ #else   /* Generic C implementation */
+         ECB_LOOP_3P((void) aes_decrypt_block, ct, pt);
+ #endif  /* Generic C implementation */
+ 
+         return (CRYPTO_SUCCESS);
+ }
+ 
+ #ifdef  __amd64
+ #undef  ECB_INTEL_IMPL
+ #endif  /* __amd64 */
+ 
+ #undef  ECB_LOOP
+ #undef  ECB_LOOP_4P
+ #undef  ECB_LOOP_3P
+ 
+ #define CBC_LOOP(enc_func, xor_func)                                    \
+         do {                                                            \
+                 for (; i < length; i += AES_BLOCK_LEN) {                \
+                         /* copy IV to ciphertext */                     \
+                         bcopy(iv, &ct[i], AES_BLOCK_LEN);               \
+                         /* XOR the plaintext into the IV copy */        \
+                         xor_func(&pt[i], &ct[i]);                       \
+                         /* encrypt the block in the output region */    \
+                         enc_func;                                       \
+                         iv = &ct[i];                                    \
+                 }                                                       \
+                 _NOTE(CONSTCOND)                                        \
+         } while (0)
+ #define CBC_LOOP_4P(enc_func, xor_func)                                 \
+         CBC_LOOP(enc_func(&ksch->encr_ks.ks32[0],                       \
+             ksch->nr, (void *)&ct[i], (void *)&ct[i]), xor_func)
+ #define CBC_LOOP_3P(enc_func, xor_func)                                 \
+         CBC_LOOP(enc_func(ksch, (void *)&ct[i], (void *)&ct[i]),        \
+             xor_func)
+ 
+ /*
+  * Encrypts a sequence of consecutive AES blocks in CBC mode. On x86-64
+  * with the AES-NI extension, the encryption is performed on 8 blocks at
+  * a time using an optimized assembly implementation, giving a speed boost
+  * of around 75%. On other platforms, this simply performs CBC encryption
+  * in sequence on the blocks.
+  *
+  * Decryption acceleration is implemented in the kernel kcf block cipher
+  * modes code (cbc.c), because there it doesn't require a complete
+  * hand-tuned CBC implementation in assembly.
+  */
+ int
+ aes_encrypt_cbc(const void *ks, const uint8_t *pt, uint8_t *ct,
+     const uint8_t *iv, uint64_t length)
+ {
+         aes_key_t *ksch = (aes_key_t *)ks;
+         size_t i = 0;
+ 
+ #ifdef  __amd64
+         if (intel_aes_instructions_present()) {
+                 for (; i + 8 * AES_BLOCK_LEN <= length;
+                     i += 8 * AES_BLOCK_LEN) {
+                         aes_encrypt_cbc_intel8(&ksch->encr_ks.ks32[0],
+                             ksch->nr, &pt[i], &ct[i], iv);
+                         iv = &ct[i + 7 * AES_BLOCK_LEN];
+                 }
+                 CBC_LOOP_4P(aes_encrypt_intel, aes_xor_intel);
+         } else {
+                 CBC_LOOP_4P(aes_encrypt_amd64, aes_xor_intel);
+         }
+ #elif defined(sun4u)
+         CBC_LOOP_4P(aes_encrypt_impl, aes_xor_block);
+ #else   /* Generic C implementation */
+         CBC_LOOP_3P((void) aes_encrypt_block, aes_xor_block);
+ #endif  /* Generic C implementation */
+ 
+         return (CRYPTO_SUCCESS);
+ }
+ #undef  CBC_LOOP
+ #undef  CBC_LOOP_4P
+ #undef  CBC_LOOP_3P
+ 
+ #define CTR_LOOP(enc_func, xor_func)                                    \
+         do {                                                            \
+                 for (; i < length; i += AES_BLOCK_LEN) {                \
+                         /* set up counter in output region */           \
+                         *(uint64_t *)(void *)&output[i] = counter[0];   \
+                         *(uint64_t *)(void *)&output[i + 8] =           \
+                             htonll(counter[1]++);                       \
+                         /* encrypt counter in output region */          \
+                         enc_func;                                       \
+                         /* XOR encrypted counter with input */          \
+                         xor_func(&input[i], &output[i]);                \
+                 }                                                       \
+                 _NOTE(CONSTCOND)                                        \
+         } while (0)
+ #define CTR_LOOP_4P(enc_func, xor_func)                                 \
+         CTR_LOOP(enc_func(&ksch->encr_ks.ks32[0], ksch->nr,             \
+             (void *)&output[i], (void *)&output[i]), xor_func)
+ #define CTR_LOOP_3P(enc_func, xor_func)                                 \
+         CTR_LOOP(enc_func(ksch, (void *)&output[i], (void *)&output[i]),\
+             xor_func)
+ 
+ /*
+  * Performs high-performance counter mode encryption and decryption on
+  * a sequence of blocks. In CTR mode, encryption and decryption are the
+  * same operation, just with the plaintext and ciphertext reversed:
+  *      plaintext = CTR(CTR(plaintext, K), K)
+  * Blocks also do not depend on each other, so it is an excellent mode
+  * when high performance is required and data authentication/integrity
+  * checking is provided via some other means, or isn't necessary.
+  *
+  * On x86-64 with the AES-NI extension, this code performs CTR mode
+  * encryption in parallel on 8 blocks at a time and can provide in
+  * excess of 3 GB/s/core of encryption/decryption performance (<1 CPB).
+  */
+ int
+ aes_ctr_mode(const void *ks, const uint8_t *input, uint8_t *output,
+     uint64_t length, uint64_t counter[2])
+ {
+         aes_key_t *ksch = (aes_key_t *)ks;
+         uint64_t i = 0;
+ 
+         /* swap the lower counter part to host order for arithmetic */
+         counter[1] = ntohll(counter[1]);
+ 
+ #ifdef  __amd64
+         if (intel_aes_instructions_present()) {
+                 /* first use the wide-register accelerated function */
+                 for (; i + 8 * AES_BLOCK_LEN <= length;
+                     i += 8 * AES_BLOCK_LEN) {
+                         aes_ctr_intel8(&ksch->encr_ks.ks32[0], ksch->nr,
+                             &input[i], &output[i], counter[0], counter[1]);
+                         counter[1] += 8;
+                 }
+                 /* finish off the remainder using the slow per-block method */
+                 CTR_LOOP_4P(aes_encrypt_intel, aes_xor_intel);
+         } else {
+                 CTR_LOOP_4P(aes_encrypt_amd64, aes_xor_intel);
+         }
+ #elif defined(sun4u)
+         CTR_LOOP_4P(aes_encrypt_impl, aes_xor_block);
+ #else   /* Generic C implementation */
+         CTR_LOOP_3P((void) aes_encrypt_block, aes_xor_block);
+ #endif  /* Generic C implementation */
+ 
+         /* swap the lower counter part back to big endian */
+         counter[1] = htonll(counter[1]);
+ 
+         return (CRYPTO_SUCCESS);
+ }
+ #undef  CTR_LOOP
+ 
  /*
   * Allocate key schedule for AES.
   *
   * Return the pointer and set size to the number of bytes allocated.
   * Memory allocated must be freed by the caller when done.
   *
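
The CTR comment above leans on the mode being an involution: applying the same keystream twice returns the plaintext, which is why one function serves for both directions. Below is a toy, self-contained demonstration of that property and of the per-block counter/XOR structure; a trivial (insecure) mixing function stands in for the AES block cipher, and only the low 64 bits of the counter are modeled:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <assert.h>

    #define BLOCK   16

    /* stand-in for encrypting one counter block with AES */
    static void
    toy_cipher(const uint8_t ctr[BLOCK], uint8_t out[BLOCK])
    {
            for (int j = 0; j < BLOCK; j++)
                    out[j] = (uint8_t)(ctr[j] * 167 + 13);  /* not secure! */
    }

    static void
    toy_ctr(const uint8_t *in, uint8_t *out, size_t len, uint64_t counter)
    {
            for (size_t i = 0; i < len; i += BLOCK) {
                    uint8_t ctrblk[BLOCK] = { 0 }, ks[BLOCK];

                    /* store the counter big-endian in the low 8 bytes */
                    for (int j = 0; j < 8; j++)
                            ctrblk[15 - j] = (uint8_t)(counter >> (8 * j));
                    counter++;
                    toy_cipher(ctrblk, ks);
                    /* XOR the keystream with the input */
                    for (int j = 0; j < BLOCK; j++)
                            out[i + j] = in[i + j] ^ ks[j];
            }
    }

    int
    main(void)
    {
            uint8_t pt[32] = "CTR is an involution, see?", buf[32];

            toy_ctr(pt, buf, sizeof (pt), 42);      /* "encrypt" */
            toy_ctr(buf, buf, sizeof (buf), 42);    /* same op "decrypts" */
            assert(memcmp(pt, buf, sizeof (pt)) == 0);
            (void) printf("round-trip OK\n");
            return (0);
    }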
*** 1760,1777 ****
  }
  
  #ifdef __amd64
  
  /*
!  * Return 1 if executing on Intel with AES-NI instructions,
!  * otherwise 0 (i.e., Intel without AES-NI or AMD64).
   * Cache the result, as the CPU can't change.
   *
   * Note: the userland version uses getisax(). The kernel version uses
   * global variable x86_featureset.
   */
! static int
  intel_aes_instructions_present(void)
  {
          static int cached_result = -1;
  
          if (cached_result == -1) { /* first time */
--- 1976,1992 ----
  }
  
  #ifdef __amd64
  
  /*
!  * Return 1 if executing on x86-64 with AES-NI instructions, otherwise 0.
   * Cache the result, as the CPU can't change.
   *
   * Note: the userland version uses getisax(). The kernel version uses
   * global variable x86_featureset.
   */
! static inline int
  intel_aes_instructions_present(void)
  {
          static int cached_result = -1;
  
          if (cached_result == -1) { /* first time */
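
As the comment notes, the userland build detects AES-NI via getisax(2) and caches the answer, since the CPU's capabilities can't change at runtime. A sketch of that detection pattern on illumos/x86 follows; it assumes the AV_386_AES capability bit from <sys/auxv_386.h> (pulled in by <sys/auxv.h>), and the -1 sentinel makes the query happen only once per process:

    #include <sys/auxv.h>
    #include <stdio.h>
    #include <stdint.h>

    static int
    intel_aes_instructions_present(void)
    {
            static int cached_result = -1;

            if (cached_result == -1) {      /* first time */
                    uint32_t ui = 0;

                    (void) getisax(&ui, 1);
                    cached_result = (ui & AV_386_AES) != 0;
            }
            return (cached_result);
    }

    int
    main(void)
    {
            (void) printf("AES-NI %savailable\n",
                intel_aes_instructions_present() ? "" : "not ");
            return (0);
    }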