4896 Performance improvements for KCF AES modes

*** 146,155 ****
--- 146,158 ----
   *
   * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary.
   *
   * ====================================================================
   */
+ /*
+  * Copyright 2015 by Saso Kiselkov. All rights reserved.
+  */

  #if defined(lint) || defined(__lint)
  #include <sys/types.h>
*** 279,289 ****
--- 282,359 ----
  	STTS(tmpreg); \
  2: \
  	mov	%rbp, %rsp; \
  	pop	%rbp

+ /*
+  * void aes_accel_save(void *savestate);
+  *
+  * Saves all 16 XMM registers and CR0 to a temporary location pointed to
+  * in the first argument and clears TS in CR0. This must be invoked before
+  * executing any floating point operations inside the kernel (and kernel
+  * thread preemption must be disabled as well). The memory region to which
+  * all state is saved must be at least 16x 128-bit + 64-bit long and must
+  * be 128-bit aligned.
+  */
+ ENTRY_NP(aes_accel_save)
+ 	movq	%cr0, %rax
+ 	movq	%rax, 0x100(%rdi)
+ 	testq	$CR0_TS, %rax
+ 	jnz	1f
+ 	movaps	%xmm0, 0x00(%rdi)
+ 	movaps	%xmm1, 0x10(%rdi)
+ 	movaps	%xmm2, 0x20(%rdi)
+ 	movaps	%xmm3, 0x30(%rdi)
+ 	movaps	%xmm4, 0x40(%rdi)
+ 	movaps	%xmm5, 0x50(%rdi)
+ 	movaps	%xmm6, 0x60(%rdi)
+ 	movaps	%xmm7, 0x70(%rdi)
+ 	movaps	%xmm8, 0x80(%rdi)
+ 	movaps	%xmm9, 0x90(%rdi)
+ 	movaps	%xmm10, 0xa0(%rdi)
+ 	movaps	%xmm11, 0xb0(%rdi)
+ 	movaps	%xmm12, 0xc0(%rdi)
+ 	movaps	%xmm13, 0xd0(%rdi)
+ 	movaps	%xmm14, 0xe0(%rdi)
+ 	movaps	%xmm15, 0xf0(%rdi)
+ 	ret
+ 1:
+ 	PROTECTED_CLTS
+ 	ret
+ 	SET_SIZE(aes_accel_save)
+ 
+ /*
+  * void aes_accel_restore(void *savestate);
+  *
+  * Restores the saved XMM and CR0.TS state from aes_accel_save.
+  */
+ ENTRY_NP(aes_accel_restore)
+ 	mov	0x100(%rdi), %rax
+ 	testq	$CR0_TS, %rax
+ 	jnz	1f
+ 	movaps	0x00(%rdi), %xmm0
+ 	movaps	0x10(%rdi), %xmm1
+ 	movaps	0x20(%rdi), %xmm2
+ 	movaps	0x30(%rdi), %xmm3
+ 	movaps	0x40(%rdi), %xmm4
+ 	movaps	0x50(%rdi), %xmm5
+ 	movaps	0x60(%rdi), %xmm6
+ 	movaps	0x70(%rdi), %xmm7
+ 	movaps	0x80(%rdi), %xmm8
+ 	movaps	0x90(%rdi), %xmm9
+ 	movaps	0xa0(%rdi), %xmm10
+ 	movaps	0xb0(%rdi), %xmm11
+ 	movaps	0xc0(%rdi), %xmm12
+ 	movaps	0xd0(%rdi), %xmm13
+ 	movaps	0xe0(%rdi), %xmm14
+ 	movaps	0xf0(%rdi), %xmm15
+ 	ret
+ 1:
+ 	STTS(%rax)
+ 	ret
+ 	SET_SIZE(aes_accel_restore)
+ 
  #else
  #define	PROTECTED_CLTS
  #define	CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
  #define	SET_TS_OR_POP_XMM0_XMM1(tmpreg)
  #define	CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)
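For reviewers tracing the kernel-side call discipline described in the comment above, here is a minimal C sketch of how a caller might bracket the AES-NI code with aes_accel_save()/aes_accel_restore(). The aes_fpu_state_t wrapper and the aes_encrypt_block_fpu_safe() helper are hypothetical illustrations, not part of this webrev; only the save-area layout (16 XMM slots plus one 64-bit CR0 slot, 128-bit aligned) and the preemption/FPU bracketing follow the comments in the new code.

#include <sys/types.h>
#include "aes_impl.h"		/* aes_ks_t (path per the aes module, assumed) */

/* Save area layout implied by aes_accel_save(): 16 XMM regs, then CR0. */
typedef struct aes_fpu_state {
	uint8_t		afs_xmm[16][16];	/* %xmm0-%xmm15 */
	uint64_t	afs_cr0;		/* saved CR0 (CR0.TS) */
} __attribute__((aligned(16))) aes_fpu_state_t;

extern void aes_accel_save(void *savestate);
extern void aes_accel_restore(void *savestate);
extern void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
    const uint32_t pt[4], uint32_t ct[4]);

/* Hypothetical helper: encrypt one block with the caller's FPU state kept. */
static void
aes_encrypt_block_fpu_safe(const aes_ks_t *ks, int nr,
    const uint32_t pt[4], uint32_t ct[4])
{
	aes_fpu_state_t fpu;

	kpreempt_disable();		/* no thread migration while FPU live */
	aes_accel_save(&fpu);		/* stash %xmm0-%xmm15 and CR0.TS */
	aes_encrypt_intel(ks, nr, pt, ct);
	aes_accel_restore(&fpu);	/* hand the FPU back untouched */
	kpreempt_enable();
}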
*** 376,387 ****
--- 446,527 ----
  	movaps	%xmm2, (%rcx)
  	add	$0x10, %rcx
  	ret
  	SET_SIZE(_key_expansion_256b)

+ /*
+  * void aes_copy_intel(const uint8_t *src, uint8_t *dst);
+  *
+  * Copies one unaligned 128-bit block from `src' to `dst'. The copy is
+  * performed using FPU registers, so make sure FPU state is saved when
+  * running this in the kernel.
+  */
+ ENTRY_NP(aes_copy_intel)
+ 	movdqu	(%rdi), %xmm0
+ 	movdqu	%xmm0, (%rsi)
+ 	ret
+ 	SET_SIZE(aes_copy_intel)

  /*
+  * void aes_xor_intel(const uint8_t *src, uint8_t *dst);
+  *
+  * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
+  * stores the result at `dst'. The XOR is performed using FPU registers,
+  * so make sure FPU state is saved when running this in the kernel.
+  */
+ ENTRY_NP(aes_xor_intel)
+ 	movdqu	(%rdi), %xmm0
+ 	movdqu	(%rsi), %xmm1
+ 	pxor	%xmm1, %xmm0
+ 	movdqu	%xmm0, (%rsi)
+ 	ret
+ 	SET_SIZE(aes_xor_intel)
+ 
+ /*
+  * void aes_xor_intel8(const uint8_t *src, uint8_t *dst);
+  *
+  * XORs eight pairs of consecutive unaligned 128-bit blocks from `src' and
+  * 'dst' and stores the results at `dst'. The XOR is performed using FPU
+  * registers, so make sure FPU state is saved when running this in the kernel.
+  */
+ ENTRY_NP(aes_xor_intel8)
+ 	movdqu	0x00(%rdi), %xmm0
+ 	movdqu	0x00(%rsi), %xmm1
+ 	movdqu	0x10(%rdi), %xmm2
+ 	movdqu	0x10(%rsi), %xmm3
+ 	movdqu	0x20(%rdi), %xmm4
+ 	movdqu	0x20(%rsi), %xmm5
+ 	movdqu	0x30(%rdi), %xmm6
+ 	movdqu	0x30(%rsi), %xmm7
+ 	movdqu	0x40(%rdi), %xmm8
+ 	movdqu	0x40(%rsi), %xmm9
+ 	movdqu	0x50(%rdi), %xmm10
+ 	movdqu	0x50(%rsi), %xmm11
+ 	movdqu	0x60(%rdi), %xmm12
+ 	movdqu	0x60(%rsi), %xmm13
+ 	movdqu	0x70(%rdi), %xmm14
+ 	movdqu	0x70(%rsi), %xmm15
+ 	pxor	%xmm1, %xmm0
+ 	pxor	%xmm3, %xmm2
+ 	pxor	%xmm5, %xmm4
+ 	pxor	%xmm7, %xmm6
+ 	pxor	%xmm9, %xmm8
+ 	pxor	%xmm11, %xmm10
+ 	pxor	%xmm13, %xmm12
+ 	pxor	%xmm15, %xmm14
+ 	movdqu	%xmm0, 0x00(%rsi)
+ 	movdqu	%xmm2, 0x10(%rsi)
+ 	movdqu	%xmm4, 0x20(%rsi)
+ 	movdqu	%xmm6, 0x30(%rsi)
+ 	movdqu	%xmm8, 0x40(%rsi)
+ 	movdqu	%xmm10, 0x50(%rsi)
+ 	movdqu	%xmm12, 0x60(%rsi)
+ 	movdqu	%xmm14, 0x70(%rsi)
+ 	ret
+ 	SET_SIZE(aes_xor_intel8)
+ 
+ /*
   * rijndael_key_setup_enc_intel()
   * Expand the cipher key into the encryption key schedule.
   *
   * For kernel code, caller is responsible for ensuring kpreempt_disable()
   * has been called. This is because %xmm registers are not saved/restored.
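As a cross-check of the new helpers above, this is a plain C model of what aes_xor_intel() and aes_xor_intel8() compute; the assembly does the same work with unaligned 128-bit SSE loads and stores, eight blocks per call in the intel8 variant. The C names are illustrative only and are not part of this change.

#include <stdint.h>

#define	AES_BLOCK_LEN	16

/* dst[i] ^= src[i] over one 16-byte block (model of aes_xor_intel). */
static void
aes_xor_block_model(const uint8_t *src, uint8_t *dst)
{
	for (int i = 0; i < AES_BLOCK_LEN; i++)
		dst[i] ^= src[i];
}

/* Eight consecutive blocks (model of aes_xor_intel8). */
static void
aes_xor_block8_model(const uint8_t *src, uint8_t *dst)
{
	for (int i = 0; i < 8; i++)
		aes_xor_block_model(src + i * AES_BLOCK_LEN,
		    dst + i * AES_BLOCK_LEN);
}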
*** 657,689 ****
  	/ OpenSSL: rax = 0 for OK, or non-zero for error
  	ret
  	SET_SIZE(rijndael_key_setup_dec_intel)

- /*
-  * aes_encrypt_intel()
-  * Encrypt a single block (in and out can overlap).
-  *
-  * For kernel code, caller is responsible for ensuring kpreempt_disable()
-  * has been called. This is because %xmm registers are not saved/restored.
-  * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
-  * on entry. Otherwise, if TS is not set, save and restore %xmm registers
-  * on the stack.
-  *
-  * Temporary register usage:
-  *	%xmm0	State
-  *	%xmm1	Key
-  *
-  * Original OpenSolaris Interface:
-  * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
-  *	const uint32_t pt[4], uint32_t ct[4])
-  *
-  * Original Intel OpenSSL Interface:
-  * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
-  *	const AES_KEY *key)
-  */
- 
  #ifdef	OPENSSL_INTERFACE
  #define	aes_encrypt_intel	intel_AES_encrypt
  #define	aes_decrypt_intel	intel_AES_decrypt

  #define	INP	rdi	/* P1, 64 bits */
--- 797,806 ----
*** 697,854 ****
  #else	/* OpenSolaris Interface */
  #define	KEYP	rdi	/* P1, 64 bits */
  #define	NROUNDS	esi	/* P2, 32 bits */
  #define	INP	rdx	/* P3, 64 bits */
  #define	OUTP	rcx	/* P4, 64 bits */
  #endif	/* OPENSSL_INTERFACE */

! #define	STATE	xmm0	/* temporary, 128 bits */
! #define	KEY	xmm1	/* temporary, 128 bits */

! ENTRY_NP(aes_encrypt_intel)
! 	CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)

! 	movups	(%INP), %STATE		/ input
  	movaps	(%KEYP), %KEY		/ key
  #ifdef	OPENSSL_INTERFACE
  	mov	240(%KEYP), %NROUNDS32	/ round count
  #else	/* OpenSolaris Interface */
  	/* Round count is already present as P2 in %rsi/%esi */
  #endif	/* OPENSSL_INTERFACE */

! 	pxor	%KEY, %STATE		/ round 0
  	lea	0x30(%KEYP), %KEYP
  	cmp	$12, %NROUNDS
  	jb	.Lenc128
  	lea	0x20(%KEYP), %KEYP
  	je	.Lenc192

  	/ AES 256
  	lea	0x20(%KEYP), %KEYP
! 	movaps	-0x60(%KEYP), %KEY
! 	aesenc	%KEY, %STATE
! 	movaps	-0x50(%KEYP), %KEY
! 	aesenc	%KEY, %STATE

.align 4
.Lenc192:
  	/ AES 192 and 256
! 	movaps	-0x40(%KEYP), %KEY
! 	aesenc	%KEY, %STATE
! 	movaps	-0x30(%KEYP), %KEY
! 	aesenc	%KEY, %STATE

.align 4
.Lenc128:
  	/ AES 128, 192, and 256
! 	movaps	-0x20(%KEYP), %KEY
! 	aesenc	%KEY, %STATE
! 	movaps	-0x10(%KEYP), %KEY
! 	aesenc	%KEY, %STATE
! 	movaps	(%KEYP), %KEY
! 	aesenc	%KEY, %STATE
! 	movaps	0x10(%KEYP), %KEY
! 	aesenc	%KEY, %STATE
! 	movaps	0x20(%KEYP), %KEY
! 	aesenc	%KEY, %STATE
! 	movaps	0x30(%KEYP), %KEY
! 	aesenc	%KEY, %STATE
! 	movaps	0x40(%KEYP), %KEY
! 	aesenc	%KEY, %STATE
! 	movaps	0x50(%KEYP), %KEY
! 	aesenc	%KEY, %STATE
! 	movaps	0x60(%KEYP), %KEY
! 	aesenc	%KEY, %STATE
! 	movaps	0x70(%KEYP), %KEY
! 	aesenclast %KEY, %STATE		/ last round
! 	movups	%STATE, (%OUTP)		/ output

- 	SET_TS_OR_POP_XMM0_XMM1(%r10)
  	ret
  	SET_SIZE(aes_encrypt_intel)

- 
  /*
   * aes_decrypt_intel()
   * Decrypt a single block (in and out can overlap).
   *
!  * For kernel code, caller is responsible for ensuring kpreempt_disable()
!  * has been called. This is because %xmm registers are not saved/restored.
!  * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
!  * on entry. Otherwise, if TS is not set, save and restore %xmm registers
!  * on the stack.
   *
   * Temporary register usage:
   *	%xmm0	State
   *	%xmm1	Key
   *
   * Original OpenSolaris Interface:
   * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
!  *	const uint32_t pt[4], uint32_t ct[4])/
   *
   * Original Intel OpenSSL Interface:
   * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
   *	const AES_KEY *key);
   */
  ENTRY_NP(aes_decrypt_intel)
! 	CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
! 
! 	movups	(%INP), %STATE		/ input
  	movaps	(%KEYP), %KEY		/ key
  #ifdef	OPENSSL_INTERFACE
  	mov	240(%KEYP), %NROUNDS32	/ round count
  #else	/* OpenSolaris Interface */
  	/* Round count is already present as P2 in %rsi/%esi */
  #endif	/* OPENSSL_INTERFACE */

! 	pxor	%KEY, %STATE		/ round 0
  	lea	0x30(%KEYP), %KEYP
  	cmp	$12, %NROUNDS
  	jb	.Ldec128
  	lea	0x20(%KEYP), %KEYP
  	je	.Ldec192

  	/ AES 256
  	lea	0x20(%KEYP), %KEYP
! 	movaps	-0x60(%KEYP), %KEY
! 	aesdec	%KEY, %STATE
! 	movaps	-0x50(%KEYP), %KEY
! 	aesdec	%KEY, %STATE

.align 4
.Ldec192:
  	/ AES 192 and 256
! 	movaps	-0x40(%KEYP), %KEY
! 	aesdec	%KEY, %STATE
! 	movaps	-0x30(%KEYP), %KEY
! 	aesdec	%KEY, %STATE

.align 4
.Ldec128:
  	/ AES 128, 192, and 256
! 	movaps	-0x20(%KEYP), %KEY
! 	aesdec	%KEY, %STATE
! 	movaps	-0x10(%KEYP), %KEY
! 	aesdec	%KEY, %STATE
! 	movaps	(%KEYP), %KEY
! 	aesdec	%KEY, %STATE
! 	movaps	0x10(%KEYP), %KEY
! 	aesdec	%KEY, %STATE
! 	movaps	0x20(%KEYP), %KEY
! 	aesdec	%KEY, %STATE
! 	movaps	0x30(%KEYP), %KEY
! 	aesdec	%KEY, %STATE
! 	movaps	0x40(%KEYP), %KEY
! 	aesdec	%KEY, %STATE
! 	movaps	0x50(%KEYP), %KEY
! 	aesdec	%KEY, %STATE
! 	movaps	0x60(%KEYP), %KEY
! 	aesdec	%KEY, %STATE
! 	movaps	0x70(%KEYP), %KEY
! 	aesdeclast %KEY, %STATE		/ last round
! 	movups	%STATE, (%OUTP)		/ output

- 	SET_TS_OR_POP_XMM0_XMM1(%r10)
  	ret
  	SET_SIZE(aes_decrypt_intel)

  #endif	/* lint || __lint */
--- 814,1407 ----
  #else	/* OpenSolaris Interface */
  #define	KEYP	rdi	/* P1, 64 bits */
  #define	NROUNDS	esi	/* P2, 32 bits */
  #define	INP	rdx	/* P3, 64 bits */
  #define	OUTP	rcx	/* P4, 64 bits */
+ #define	LENGTH	r8	/* P5, 64 bits */
  #endif	/* OPENSSL_INTERFACE */

! #define	KEY	xmm0	/* temporary, 128 bits */
! #define	STATE0	xmm8	/* temporary, 128 bits */
! #define	STATE1	xmm9	/* temporary, 128 bits */
! #define	STATE2	xmm10	/* temporary, 128 bits */
! #define	STATE3	xmm11	/* temporary, 128 bits */
! #define	STATE4	xmm12	/* temporary, 128 bits */
! #define	STATE5	xmm13	/* temporary, 128 bits */
! #define	STATE6	xmm14	/* temporary, 128 bits */
! #define	STATE7	xmm15	/* temporary, 128 bits */
! 
! /*
!  * Runs the first two rounds of AES256 on a state register. `op' should be
!  * aesenc or aesdec.
!  */
! #define	AES256_ROUNDS(op, statereg) \
! 	movaps	-0x60(%KEYP), %KEY; \
! 	op	%KEY, %statereg; \
! 	movaps	-0x50(%KEYP), %KEY; \
! 	op	%KEY, %statereg
! 
! /*
!  * Runs the first two rounds of AES192, or the 3rd & 4th round of AES256 on
!  * a state register. `op' should be aesenc or aesdec.
!  */
! #define	AES192_ROUNDS(op, statereg) \
! 	movaps	-0x40(%KEYP), %KEY; \
! 	op	%KEY, %statereg; \
! 	movaps	-0x30(%KEYP), %KEY; \
! 	op	%KEY, %statereg
! 
! /*
!  * Runs the full 10 rounds of AES128, or the last 10 rounds of AES192/AES256
!  * on a state register. `op' should be aesenc or aesdec and `lastop' should
!  * be aesenclast or aesdeclast.
!  */
! #define	AES128_ROUNDS(op, lastop, statereg) \
! 	movaps	-0x20(%KEYP), %KEY; \
! 	op	%KEY, %statereg; \
! 	movaps	-0x10(%KEYP), %KEY; \
! 	op	%KEY, %statereg; \
! 	movaps	(%KEYP), %KEY; \
! 	op	%KEY, %statereg; \
! 	movaps	0x10(%KEYP), %KEY; \
! 	op	%KEY, %statereg; \
! 	movaps	0x20(%KEYP), %KEY; \
! 	op	%KEY, %statereg; \
! 	movaps	0x30(%KEYP), %KEY; \
! 	op	%KEY, %statereg; \
! 	movaps	0x40(%KEYP), %KEY; \
! 	op	%KEY, %statereg; \
! 	movaps	0x50(%KEYP), %KEY; \
! 	op	%KEY, %statereg; \
! 	movaps	0x60(%KEYP), %KEY; \
! 	op	%KEY, %statereg; \
! 	movaps	0x70(%KEYP), %KEY; \
! 	lastop	%KEY, %statereg
! 
! /*
!  * Macros to run AES encryption rounds. Input must be prefilled in state
!  * register - output will be left there as well.
!  * To run AES256, invoke all of these macros in sequence. To run AES192,
!  * invoke only the -192 and -128 variants. To run AES128, invoke only the
!  * -128 variant.
!  */
! #define	AES256_ENC_ROUNDS(statereg) \
! 	AES256_ROUNDS(aesenc, statereg)
! #define	AES192_ENC_ROUNDS(statereg) \
! 	AES192_ROUNDS(aesenc, statereg)
! #define	AES128_ENC_ROUNDS(statereg) \
! 	AES128_ROUNDS(aesenc, aesenclast, statereg)
! 
! /* Same as the AES*_ENC_ROUNDS macros, but for decryption. */
! #define	AES256_DEC_ROUNDS(statereg) \
! 	AES256_ROUNDS(aesdec, statereg)
! #define	AES192_DEC_ROUNDS(statereg) \
! 	AES192_ROUNDS(aesdec, statereg)
! #define	AES128_DEC_ROUNDS(statereg) \
! 	AES128_ROUNDS(aesdec, aesdeclast, statereg)
! 
! 
! /*
!  * aes_encrypt_intel()
!  * Encrypt a single block (in and out can overlap).
!  *
!  * For kernel code, caller is responsible for bracketing this call with
!  * disabling kernel thread preemption and calling aes_accel_save/restore().
!  *
!  * Temporary register usage:
!  *	%xmm0	Key
!  *	%xmm8	State
!  *
!  * Original OpenSolaris Interface:
!  * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
!  *	const uint32_t pt[4], uint32_t ct[4])
!  *
!  * Original Intel OpenSSL Interface:
!  * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
!  *	const AES_KEY *key)
!  */
! ENTRY_NP(aes_encrypt_intel)
! 	movups	(%INP), %STATE0		/ input
  	movaps	(%KEYP), %KEY		/ key
+ 
  #ifdef	OPENSSL_INTERFACE
  	mov	240(%KEYP), %NROUNDS32	/ round count
  #else	/* OpenSolaris Interface */
  	/* Round count is already present as P2 in %rsi/%esi */
  #endif	/* OPENSSL_INTERFACE */

! 	pxor	%KEY, %STATE0		/ round 0
  	lea	0x30(%KEYP), %KEYP
  	cmp	$12, %NROUNDS
  	jb	.Lenc128
  	lea	0x20(%KEYP), %KEYP
  	je	.Lenc192

  	/ AES 256
  	lea	0x20(%KEYP), %KEYP
! 	AES256_ENC_ROUNDS(STATE0)

.align 4
.Lenc192:
  	/ AES 192 and 256
! 	AES192_ENC_ROUNDS(STATE0)

.align 4
.Lenc128:
  	/ AES 128, 192, and 256
! 	AES128_ENC_ROUNDS(STATE0)
! 	movups	%STATE0, (%OUTP)	/ output

  	ret
  	SET_SIZE(aes_encrypt_intel)

  /*
   * aes_decrypt_intel()
   * Decrypt a single block (in and out can overlap).
   *
!  * For kernel code, caller is responsible for bracketing this call with
!  * disabling kernel thread preemption and calling aes_accel_save/restore().
   *
   * Temporary register usage:
   *	%xmm0	State
   *	%xmm1	Key
   *
   * Original OpenSolaris Interface:
   * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
!  *	const uint32_t pt[4], uint32_t ct[4])
   *
   * Original Intel OpenSSL Interface:
   * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
   *	const AES_KEY *key);
   */
  ENTRY_NP(aes_decrypt_intel)
! 	movups	(%INP), %STATE0		/ input
  	movaps	(%KEYP), %KEY		/ key
+ 
  #ifdef	OPENSSL_INTERFACE
  	mov	240(%KEYP), %NROUNDS32	/ round count
  #else	/* OpenSolaris Interface */
  	/* Round count is already present as P2 in %rsi/%esi */
  #endif	/* OPENSSL_INTERFACE */

! 	pxor	%KEY, %STATE0		/ round 0
  	lea	0x30(%KEYP), %KEYP
  	cmp	$12, %NROUNDS
  	jb	.Ldec128
  	lea	0x20(%KEYP), %KEYP
  	je	.Ldec192

  	/ AES 256
  	lea	0x20(%KEYP), %KEYP
! 	AES256_DEC_ROUNDS(STATE0)

.align 4
.Ldec192:
  	/ AES 192 and 256
! 	AES192_DEC_ROUNDS(STATE0)

.align 4
.Ldec128:
  	/ AES 128, 192, and 256
! 	AES128_DEC_ROUNDS(STATE0)
! 	movups	%STATE0, (%OUTP)	/ output

  	ret
  	SET_SIZE(aes_decrypt_intel)

+ /* Does a pipelined load of eight input blocks into our AES state registers. */
+ #define	AES_LOAD_INPUT_8BLOCKS \
+ 	movups	0x00(%INP), %STATE0; \
+ 	movups	0x10(%INP), %STATE1; \
+ 	movups	0x20(%INP), %STATE2; \
+ 	movups	0x30(%INP), %STATE3; \
+ 	movups	0x40(%INP), %STATE4; \
+ 	movups	0x50(%INP), %STATE5; \
+ 	movups	0x60(%INP), %STATE6; \
+ 	movups	0x70(%INP), %STATE7;
+ 
+ /* Does a pipelined store of eight AES state registers to the output. */
+ #define	AES_STORE_OUTPUT_8BLOCKS \
+ 	movups	%STATE0, 0x00(%OUTP); \
+ 	movups	%STATE1, 0x10(%OUTP); \
+ 	movups	%STATE2, 0x20(%OUTP); \
+ 	movups	%STATE3, 0x30(%OUTP); \
+ 	movups	%STATE4, 0x40(%OUTP); \
+ 	movups	%STATE5, 0x50(%OUTP); \
+ 	movups	%STATE6, 0x60(%OUTP); \
+ 	movups	%STATE7, 0x70(%OUTP);
+ 
+ /* Performs a pipelined AES instruction with the key on all state registers. */
+ #define	AES_KEY_STATE_OP_8BLOCKS(op) \
+ 	op	%KEY, %STATE0; \
+ 	op	%KEY, %STATE1; \
+ 	op	%KEY, %STATE2; \
+ 	op	%KEY, %STATE3; \
+ 	op	%KEY, %STATE4; \
+ 	op	%KEY, %STATE5; \
+ 	op	%KEY, %STATE6; \
+ 	op	%KEY, %STATE7
+ 
+ /* XOR all AES state regs with key to initiate encryption/decryption. */
+ #define	AES_XOR_STATE_8BLOCKS \
+ 	AES_KEY_STATE_OP_8BLOCKS(pxor)
+ 
+ /*
+  * Loads a round key from the key schedule offset `off' into the KEY
+  * register and performs `op' using the KEY on all 8 STATE registers.
+  */
+ #define	AES_RND_8BLOCKS(op, off) \
+ 	movaps	off(%KEYP), %KEY; \
+ 	AES_KEY_STATE_OP_8BLOCKS(op)
+ 
+ /*
+  * void aes_encrypt_intel8(const uint32_t roundkeys[], int numrounds,
+  *	const void *plaintext, void *ciphertext)
+  *
+  * Same as aes_encrypt_intel, but performs the encryption operation on
+  * 8 independent blocks in sequence, exploiting instruction pipelining.
+  * This function doesn't support the OpenSSL interface, it's only meant
+  * for kernel use.
+  */
+ ENTRY_NP(aes_encrypt_intel8)
+ 	AES_LOAD_INPUT_8BLOCKS		/ load input
+ 	movaps	(%KEYP), %KEY		/ key
+ 	AES_XOR_STATE_8BLOCKS		/ round 0
+ 
+ 	lea	0x30(%KEYP), %KEYP	/ point to key schedule
+ 	cmp	$12, %NROUNDS		/ determine AES variant
+ 	jb	.Lenc8_128
+ 	lea	0x20(%KEYP), %KEYP	/ AES192 has larger key schedule
+ 	je	.Lenc8_192
+ 
+ 	lea	0x20(%KEYP), %KEYP	/ AES256 has even larger key schedule
+ 	AES_RND_8BLOCKS(aesenc, -0x60)	/ AES256 R.1
+ 	AES_RND_8BLOCKS(aesenc, -0x50)	/ AES256 R.2
+ 
+ .align 4
+ .Lenc8_192:
+ 	AES_RND_8BLOCKS(aesenc, -0x40)	/ AES192 R.1; AES256 R.3
+ 	AES_RND_8BLOCKS(aesenc, -0x30)	/ AES192 R.2; AES256 R.4
+ 
+ .align 4
+ .Lenc8_128:
+ 	AES_RND_8BLOCKS(aesenc, -0x20)	/ AES128 R.1; AES192 R.3; AES256 R.5
+ 	AES_RND_8BLOCKS(aesenc, -0x10)	/ AES128 R.2; AES192 R.4; AES256 R.6
+ 	AES_RND_8BLOCKS(aesenc, 0x00)	/ AES128 R.3; AES192 R.5; AES256 R.7
+ 	AES_RND_8BLOCKS(aesenc, 0x10)	/ AES128 R.4; AES192 R.6; AES256 R.8
+ 	AES_RND_8BLOCKS(aesenc, 0x20)	/ AES128 R.5; AES192 R.7; AES256 R.9
+ 	AES_RND_8BLOCKS(aesenc, 0x30)	/ AES128 R.6; AES192 R.8; AES256 R.10
+ 	AES_RND_8BLOCKS(aesenc, 0x40)	/ AES128 R.7; AES192 R.9; AES256 R.11
+ 	AES_RND_8BLOCKS(aesenc, 0x50)	/ AES128 R.8; AES192 R.10; AES256 R.12
+ 	AES_RND_8BLOCKS(aesenc, 0x60)	/ AES128 R.9; AES192 R.11; AES256 R.13
+ 	AES_RND_8BLOCKS(aesenclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
+ 
+ 	AES_STORE_OUTPUT_8BLOCKS	/ store output
+ 	ret
+ 	SET_SIZE(aes_encrypt_intel8)
+ 
+ 
+ /*
+  * void aes_decrypt_intel8(const uint32_t roundkeys[], int numrounds,
+  *	const void *ciphertext, void *plaintext)
+  *
+  * Same as aes_decrypt_intel, but performs the decryption operation on
+  * 8 independent blocks in sequence, exploiting instruction pipelining.
+  * This function doesn't support the OpenSSL interface, it's only meant
+  * for kernel use.
+  */
+ ENTRY_NP(aes_decrypt_intel8)
+ 	AES_LOAD_INPUT_8BLOCKS		/ load input
+ 	movaps	(%KEYP), %KEY		/ key
+ 	AES_XOR_STATE_8BLOCKS		/ round 0
+ 
+ 	lea	0x30(%KEYP), %KEYP	/ point to key schedule
+ 	cmp	$12, %NROUNDS		/ determine AES variant
+ 	jb	.Ldec8_128
+ 	lea	0x20(%KEYP), %KEYP	/ AES192 has larger key schedule
+ 	je	.Ldec8_192
+ 
+ 	lea	0x20(%KEYP), %KEYP	/ AES256 has even larger key schedule
+ 	AES_RND_8BLOCKS(aesdec, -0x60)	/ AES256 R.1
+ 	AES_RND_8BLOCKS(aesdec, -0x50)	/ AES256 R.2
+ 
+ .align 4
+ .Ldec8_192:
+ 	AES_RND_8BLOCKS(aesdec, -0x40)	/ AES192 R.1; AES256 R.3
+ 	AES_RND_8BLOCKS(aesdec, -0x30)	/ AES192 R.2; AES256 R.4
+ 
+ .align 4
+ .Ldec8_128:
+ 	AES_RND_8BLOCKS(aesdec, -0x20)	/ AES128 R.1; AES192 R.3; AES256 R.5
+ 	AES_RND_8BLOCKS(aesdec, -0x10)	/ AES128 R.2; AES192 R.4; AES256 R.6
+ 	AES_RND_8BLOCKS(aesdec, 0x00)	/ AES128 R.3; AES192 R.5; AES256 R.7
+ 	AES_RND_8BLOCKS(aesdec, 0x10)	/ AES128 R.4; AES192 R.6; AES256 R.8
+ 	AES_RND_8BLOCKS(aesdec, 0x20)	/ AES128 R.5; AES192 R.7; AES256 R.9
+ 	AES_RND_8BLOCKS(aesdec, 0x30)	/ AES128 R.6; AES192 R.8; AES256 R.10
+ 	AES_RND_8BLOCKS(aesdec, 0x40)	/ AES128 R.7; AES192 R.9; AES256 R.11
+ 	AES_RND_8BLOCKS(aesdec, 0x50)	/ AES128 R.8; AES192 R.10; AES256 R.12
+ 	AES_RND_8BLOCKS(aesdec, 0x60)	/ AES128 R.9; AES192 R.11; AES256 R.13
+ 	AES_RND_8BLOCKS(aesdeclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
+ 
+ 	AES_STORE_OUTPUT_8BLOCKS	/ store output
+ 	ret
+ 	SET_SIZE(aes_decrypt_intel8)
+ 
+ 
+ /*
+  * This macro encapsulates the entire AES encryption algo for a single
+  * block, which is prefilled in statereg and which will be replaced by
+  * the encrypted output. The KEYP register must already point to the
+  * AES128 key schedule ("lea 0x30(%KEYP), %KEYP" from encryption
+  * function call) so that consecutive invocations of this macro are
+  * supported (KEYP is restored after each invocation).
+  */
+ #define	AES_ENC(statereg, label_128, label_192, label_out) \
+ 	cmp	$12, %NROUNDS; \
+ 	jb	label_128; \
+ 	je	label_192; \
+ 	/* AES 256 only */ \
+ 	lea	0x40(%KEYP), %KEYP; \
+ 	AES256_ENC_ROUNDS(statereg); \
+ 	AES192_ENC_ROUNDS(statereg); \
+ 	AES128_ENC_ROUNDS(statereg); \
+ 	lea	-0x40(%KEYP), %KEYP; \
+ 	jmp	label_out; \
+ .align 4; \
+ label_192: \
+ 	lea	0x20(%KEYP), %KEYP; \
+ 	/* AES 192 only */ \
+ 	AES192_ENC_ROUNDS(statereg); \
+ 	AES128_ENC_ROUNDS(statereg); \
+ 	lea	-0x20(%KEYP), %KEYP; \
+ 	jmp	label_out; \
+ .align 4; \
+ label_128: \
+ 	/* AES 128 only */ \
+ 	AES128_ENC_ROUNDS(statereg); \
+ .align 4; \
+ label_out:
+ 
+ 
+ /*
+  * void aes_encrypt_cbc_intel8(const uint32_t roundkeys[], int numrounds,
+  *	const void *plaintext, void *ciphertext, const void *IV)
+  *
+  * Encrypts 8 consecutive AES blocks in the CBC mode. Input and output
+  * may overlap. This provides a modest performance boost over invoking
+  * the encryption and XOR in separate functions because we can avoid
+  * copying the ciphertext block to and from memory between encryption
+  * and XOR calls.
+  */
+ #define	CBC_IV		r8	/* input - IV blk pointer */
+ #define	CBC_IV_XMM	xmm1	/* tmp IV location for alignment */
+ 
+ ENTRY_NP(aes_encrypt_cbc_intel8)
+ 	AES_LOAD_INPUT_8BLOCKS		/ load input
+ 	movaps	(%KEYP), %KEY		/ key
+ 	AES_XOR_STATE_8BLOCKS		/ round 0
+ 
+ 	lea	0x30(%KEYP), %KEYP	/ point to key schedule
+ 	movdqu	(%CBC_IV), %CBC_IV_XMM	/ load IV from unaligned memory
+ 	pxor	%CBC_IV_XMM, %STATE0	/ XOR IV with input block and encrypt
+ 	AES_ENC(STATE0, .Lenc_cbc_0_128, .Lenc_cbc_0_192, .Lenc_cbc_0_out)
+ 	pxor	%STATE0, %STATE1
+ 	AES_ENC(STATE1, .Lenc_cbc_1_128, .Lenc_cbc_1_192, .Lenc_cbc_1_out)
+ 	pxor	%STATE1, %STATE2
+ 	AES_ENC(STATE2, .Lenc_cbc_2_128, .Lenc_cbc_2_192, .Lenc_cbc_2_out)
+ 	pxor	%STATE2, %STATE3
+ 	AES_ENC(STATE3, .Lenc_cbc_3_128, .Lenc_cbc_3_192, .Lenc_cbc_3_out)
+ 	pxor	%STATE3, %STATE4
+ 	AES_ENC(STATE4, .Lenc_cbc_4_128, .Lenc_cbc_4_192, .Lenc_cbc_4_out)
+ 	pxor	%STATE4, %STATE5
+ 	AES_ENC(STATE5, .Lenc_cbc_5_128, .Lenc_cbc_5_192, .Lenc_cbc_5_out)
+ 	pxor	%STATE5, %STATE6
+ 	AES_ENC(STATE6, .Lenc_cbc_6_128, .Lenc_cbc_6_192, .Lenc_cbc_6_out)
+ 	pxor	%STATE6, %STATE7
+ 	AES_ENC(STATE7, .Lenc_cbc_7_128, .Lenc_cbc_7_192, .Lenc_cbc_7_out)
+ 
+ 	AES_STORE_OUTPUT_8BLOCKS	/ store output
+ 	ret
+ 	SET_SIZE(aes_encrypt_cbc_intel8)
+ 
+ /*
+  * Prefills register state with counters suitable for the CTR encryption
+  * mode. The counter is assumed to consist of two portions:
+  * - A lower monotonically increasing 64-bit counter. If the caller wants
+  *   a smaller counter, they are responsible for checking that it doesn't
+  *   overflow between encryption calls.
+  * - An upper static "nonce" portion, in big endian, preloaded into the
+  *   lower portion of an XMM register.
+  * This macro adds `ctridx' to the lower_LE counter, swaps it to big
+  * endian and by way of a temporary general-purpose register loads the
+  * lower and upper counter portions into a target XMM result register,
+  * which can then be handed off to the encryption process.
+  */
+ #define	PREP_CTR_BLOCKS(lower_LE, upper_BE_xmm, ctridx, tmpreg, resreg) \
+ 	lea	ctridx(%lower_LE), %tmpreg; \
+ 	bswap	%tmpreg; \
+ 	movq	%tmpreg, %resreg; \
+ 	movlhps	%upper_BE_xmm, %resreg; \
+ 	pshufd	$0b01001110, %resreg, %resreg
+ 
+ #define	CTR_UPPER_BE		r8	/* input - counter upper 64 bits (BE) */
+ #define	CTR_UPPER_BE_XMM	xmm1	/* tmp for upper counter bits */
+ #define	CTR_LOWER_LE		r9	/* input - counter lower 64 bits (LE) */
+ #define	CTR_TMP0		rax	/* tmp for lower 64 bit add & bswap */
+ #define	CTR_TMP1		rbx	/* tmp for lower 64 bit add & bswap */
+ #define	CTR_TMP2		r10	/* tmp for lower 64 bit add & bswap */
+ #define	CTR_TMP3		r11	/* tmp for lower 64 bit add & bswap */
+ #define	CTR_TMP4		r12	/* tmp for lower 64 bit add & bswap */
+ #define	CTR_TMP5		r13	/* tmp for lower 64 bit add & bswap */
+ #define	CTR_TMP6		r14	/* tmp for lower 64 bit add & bswap */
+ #define	CTR_TMP7		r15	/* tmp for lower 64 bit add & bswap */
+ 
+ /*
+  * These are used in case CTR encryption input is unaligned before XORing.
+  * Must not overlap with any STATE[0-7] register.
+  */
+ #define	TMP_INPUT0	xmm0
+ #define	TMP_INPUT1	xmm1
+ #define	TMP_INPUT2	xmm2
+ #define	TMP_INPUT3	xmm3
+ #define	TMP_INPUT4	xmm4
+ #define	TMP_INPUT5	xmm5
+ #define	TMP_INPUT6	xmm6
+ #define	TMP_INPUT7	xmm7
+ 
+ /*
+  * void aes_ctr_intel8(const uint32_t roundkeys[], int numrounds,
+  *	const void *input, void *output, uint64_t counter_upper_BE,
+  *	uint64_t counter_lower_LE)
+  *
+  * Runs AES on 8 consecutive blocks in counter mode (encryption and
+  * decryption in counter mode are the same).
+  */
+ ENTRY_NP(aes_ctr_intel8)
+ 	/* save caller's regs */
+ 	pushq	%rbp
+ 	movq	%rsp, %rbp
+ 	subq	$0x38, %rsp
+ 	/ CTR_TMP0 is rax, no need to save
+ 	movq	%CTR_TMP1, -0x38(%rbp)
+ 	movq	%CTR_TMP2, -0x30(%rbp)
+ 	movq	%CTR_TMP3, -0x28(%rbp)
+ 	movq	%CTR_TMP4, -0x20(%rbp)
+ 	movq	%CTR_TMP5, -0x18(%rbp)
+ 	movq	%CTR_TMP6, -0x10(%rbp)
+ 	movq	%CTR_TMP7, -0x08(%rbp)
+ 
+ 	/*
+ 	 * CTR step 1: prepare big-endian formatted 128-bit counter values,
+ 	 * placing the result in the AES-NI input state registers.
+ 	 */
+ 	movq	%CTR_UPPER_BE, %CTR_UPPER_BE_XMM
+ 	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 0, CTR_TMP0, STATE0)
+ 	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 1, CTR_TMP1, STATE1)
+ 	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 2, CTR_TMP2, STATE2)
+ 	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 3, CTR_TMP3, STATE3)
+ 	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 4, CTR_TMP4, STATE4)
+ 	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 5, CTR_TMP5, STATE5)
+ 	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 6, CTR_TMP6, STATE6)
+ 	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 7, CTR_TMP7, STATE7)
+ 
+ 	/*
+ 	 * CTR step 2: Encrypt the counters.
+ 	 */
+ 	movaps	(%KEYP), %KEY		/ key
+ 	AES_XOR_STATE_8BLOCKS		/ round 0
+ 
+ 	/* Determine the AES variant we're going to compute */
+ 	lea	0x30(%KEYP), %KEYP	/ point to key schedule
+ 	cmp	$12, %NROUNDS		/ determine AES variant
+ 	jb	.Lctr8_128
+ 	lea	0x20(%KEYP), %KEYP	/ AES192 has larger key schedule
+ 	je	.Lctr8_192
+ 
+ 	/* AES 256 */
+ 	lea	0x20(%KEYP), %KEYP	/ AES256 has even larger key schedule
+ 	AES_RND_8BLOCKS(aesenc, -0x60)	/ AES256 R.1
+ 	AES_RND_8BLOCKS(aesenc, -0x50)	/ AES256 R.2
+ 
+ .align 4
+ .Lctr8_192:
+ 	/* AES 192 and 256 */
+ 	AES_RND_8BLOCKS(aesenc, -0x40)	/ AES192 R.1; AES256 R.3
+ 	AES_RND_8BLOCKS(aesenc, -0x30)	/ AES192 R.2; AES256 R.4
+ 
+ .align 4
+ .Lctr8_128:
+ 	/* AES 128, 192, and 256 */
+ 	AES_RND_8BLOCKS(aesenc, -0x20)	/ AES128 R.1; AES192 R.3; AES256 R.5
+ 	AES_RND_8BLOCKS(aesenc, -0x10)	/ AES128 R.2; AES192 R.4; AES256 R.6
+ 	AES_RND_8BLOCKS(aesenc, 0x00)	/ AES128 R.3; AES192 R.5; AES256 R.7
+ 	AES_RND_8BLOCKS(aesenc, 0x10)	/ AES128 R.4; AES192 R.6; AES256 R.8
+ 	AES_RND_8BLOCKS(aesenc, 0x20)	/ AES128 R.5; AES192 R.7; AES256 R.9
+ 	AES_RND_8BLOCKS(aesenc, 0x30)	/ AES128 R.6; AES192 R.8; AES256 R.10
+ 	AES_RND_8BLOCKS(aesenc, 0x40)	/ AES128 R.7; AES192 R.9; AES256 R.11
+ 	AES_RND_8BLOCKS(aesenc, 0x50)	/ AES128 R.8; AES192 R.10; AES256 R.12
+ 	AES_RND_8BLOCKS(aesenc, 0x60)	/ AES128 R.9; AES192 R.11; AES256 R.13
+ 	AES_RND_8BLOCKS(aesenclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
+ 
+ 	/*
+ 	 * CTR step 3: XOR input data blocks with encrypted counters to
+ 	 * produce result.
+ 	 */
+ 	mov	%INP, %rax		/ pxor requires alignment, so check
+ 	andq	$0xf, %rax
+ 	jnz	.Lctr_input_unaligned
+ 	pxor	0x00(%INP), %STATE0
+ 	pxor	0x10(%INP), %STATE1
+ 	pxor	0x20(%INP), %STATE2
+ 	pxor	0x30(%INP), %STATE3
+ 	pxor	0x40(%INP), %STATE4
+ 	pxor	0x50(%INP), %STATE5
+ 	pxor	0x60(%INP), %STATE6
+ 	pxor	0x70(%INP), %STATE7
+ 	jmp	.Lctr_out
+ 
+ .align 4
+ .Lctr_input_unaligned:
+ 	movdqu	0x00(%INP), %TMP_INPUT0
+ 	movdqu	0x10(%INP), %TMP_INPUT1
+ 	movdqu	0x20(%INP), %TMP_INPUT2
+ 	movdqu	0x30(%INP), %TMP_INPUT3
+ 	movdqu	0x40(%INP), %TMP_INPUT4
+ 	movdqu	0x50(%INP), %TMP_INPUT5
+ 	movdqu	0x60(%INP), %TMP_INPUT6
+ 	movdqu	0x70(%INP), %TMP_INPUT7
+ 	pxor	%TMP_INPUT0, %STATE0
+ 	pxor	%TMP_INPUT1, %STATE1
+ 	pxor	%TMP_INPUT2, %STATE2
+ 	pxor	%TMP_INPUT3, %STATE3
+ 	pxor	%TMP_INPUT4, %STATE4
+ 	pxor	%TMP_INPUT5, %STATE5
+ 	pxor	%TMP_INPUT6, %STATE6
+ 	pxor	%TMP_INPUT7, %STATE7
+ 
+ .align 4
+ .Lctr_out:
+ 	/*
+ 	 * Step 4: Write out processed blocks to memory.
+ 	 */
+ 	movdqu	%STATE0, 0x00(%OUTP)
+ 	movdqu	%STATE1, 0x10(%OUTP)
+ 	movdqu	%STATE2, 0x20(%OUTP)
+ 	movdqu	%STATE3, 0x30(%OUTP)
+ 	movdqu	%STATE4, 0x40(%OUTP)
+ 	movdqu	%STATE5, 0x50(%OUTP)
+ 	movdqu	%STATE6, 0x60(%OUTP)
+ 	movdqu	%STATE7, 0x70(%OUTP)
+ 
+ 	/* restore caller's regs */
+ 	/ CTR_TMP0 is rax, no need to restore
+ 	movq	-0x38(%rbp), %CTR_TMP1
+ 	movq	-0x30(%rbp), %CTR_TMP2
+ 	movq	-0x28(%rbp), %CTR_TMP3
+ 	movq	-0x20(%rbp), %CTR_TMP4
+ 	movq	-0x18(%rbp), %CTR_TMP5
+ 	movq	-0x10(%rbp), %CTR_TMP6
+ 	movq	-0x08(%rbp), %CTR_TMP7
+ 	leave
+ 	ret
+ 	SET_SIZE(aes_ctr_intel8)
+ 
  #endif	/* lint || __lint */
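A note for reviewers on the "cmp $12, %NROUNDS" dispatch that recurs in the new 8-block functions: the round count selects the AES variant and therefore how far back into the key schedule the negative KEYP offsets reach. In C terms the mapping is simply the following (sketch only; the helper name is made up):

/* 10 rounds -> AES-128, 12 -> AES-192, 14 -> AES-256. */
static int
aes_rounds_to_keybits(int nrounds)
{
	if (nrounds < 12)
		return (128);	/* jb .L*_128 */
	if (nrounds == 12)
		return (192);	/* je .L*_192 */
	return (256);		/* fall through to the AES-256 rounds */
}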
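The chaining performed by aes_encrypt_cbc_intel8() can be summarized in C as below; aes_encrypt_block_model() is a stand-in for a single-block AES encryption and is not part of this change. Because each block's input depends on the previous block's ciphertext, CBC encryption cannot be pipelined across blocks the way the 8-block ECB/CTR paths are; as the comment in the diff notes, the win here is only avoiding trips through memory between the XOR and the encryption.

#include <stdint.h>

#define	AES_BLOCK_LEN	16

/* Stand-in for a single-block AES encryption (e.g. aes_encrypt_intel). */
extern void aes_encrypt_block_model(const void *ks, int nr,
    const uint8_t in[AES_BLOCK_LEN], uint8_t out[AES_BLOCK_LEN]);

/* ct[i] = E(pt[i] XOR ct[i-1]), with ct[-1] = IV, over eight blocks. */
static void
aes_cbc_encrypt8_model(const void *ks, int nr, const uint8_t *pt,
    uint8_t *ct, const uint8_t *iv)
{
	const uint8_t *prev = iv;	/* chaining value */

	for (int i = 0; i < 8; i++) {
		uint8_t blk[AES_BLOCK_LEN];

		for (int j = 0; j < AES_BLOCK_LEN; j++)
			blk[j] = pt[i * AES_BLOCK_LEN + j] ^ prev[j];
		aes_encrypt_block_model(ks, nr, blk, &ct[i * AES_BLOCK_LEN]);
		prev = &ct[i * AES_BLOCK_LEN];
	}
}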
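Finally, the PREP_CTR_BLOCKS macro is the subtlest part of aes_ctr_intel8(): for each of the eight lanes it builds a 128-bit big-endian counter block out of the caller's upper half (the nonce, already big-endian) and lower half (little-endian, monotonically increasing). A byte-level C model of one such block follows; htonll() stands in for the bswap instruction, and the resulting layout matches the movq/movlhps/pshufd sequence in the macro (sketch only, not part of the change).

#include <stdint.h>
#include <string.h>
#include <sys/byteorder.h>	/* htonll() */

/* Build the counter block for lane `ctridx' of aes_ctr_intel8(). */
static void
ctr_block_model(uint64_t counter_upper_BE, uint64_t counter_lower_LE,
    unsigned ctridx, uint8_t block[16])
{
	/* lea/bswap: advance the low half and convert it to big endian */
	uint64_t lower_be = htonll(counter_lower_LE + ctridx);

	/* bytes 0-7: nonce half (already BE), bytes 8-15: counter half */
	memcpy(block, &counter_upper_BE, sizeof (counter_upper_BE));
	memcpy(block + 8, &lower_be, sizeof (lower_be));
}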