4896 Performance improvements for KCF AES modes
@@ -146,10 +146,13 @@
*
* Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary.
*
* ====================================================================
*/
+/*
+ * Copyright 2015 by Saso Kiselkov. All rights reserved.
+ */
#if defined(lint) || defined(__lint)
#include <sys/types.h>
@@ -279,11 +282,78 @@
STTS(tmpreg); \
2: \
mov %rbp, %rsp; \
pop %rbp
+/*
+ * void aes_accel_save(void *savestate);
+ *
+ * Saves all 16 XMM registers and CR0 to a temporary location pointed to
+ * by the first argument and clears TS in CR0. This must be invoked before
+ * executing any floating point operations inside the kernel (and kernel
+ * thread preemption must be disabled as well). The memory region to which
+ * all state is saved must be at least 16 x 128 bits plus 64 bits (0x108
+ * bytes) long and must be 128-bit aligned.
+ */
+ENTRY_NP(aes_accel_save)
+ movq %cr0, %rax
+ movq %rax, 0x100(%rdi)
+ testq $CR0_TS, %rax
+ jnz 1f
+ movaps %xmm0, 0x00(%rdi)
+ movaps %xmm1, 0x10(%rdi)
+ movaps %xmm2, 0x20(%rdi)
+ movaps %xmm3, 0x30(%rdi)
+ movaps %xmm4, 0x40(%rdi)
+ movaps %xmm5, 0x50(%rdi)
+ movaps %xmm6, 0x60(%rdi)
+ movaps %xmm7, 0x70(%rdi)
+ movaps %xmm8, 0x80(%rdi)
+ movaps %xmm9, 0x90(%rdi)
+ movaps %xmm10, 0xa0(%rdi)
+ movaps %xmm11, 0xb0(%rdi)
+ movaps %xmm12, 0xc0(%rdi)
+ movaps %xmm13, 0xd0(%rdi)
+ movaps %xmm14, 0xe0(%rdi)
+ movaps %xmm15, 0xf0(%rdi)
+ ret
+1:
+ PROTECTED_CLTS
+ ret
+ SET_SIZE(aes_accel_save)
+/*
+ * void aes_accel_restore(void *savestate);
+ *
+ * Restores the saved XMM and CR0.TS state from aes_accel_save.
+ */
+ENTRY_NP(aes_accel_restore)
+ mov 0x100(%rdi), %rax
+ testq $CR0_TS, %rax
+ jnz 1f
+ movaps 0x00(%rdi), %xmm0
+ movaps 0x10(%rdi), %xmm1
+ movaps 0x20(%rdi), %xmm2
+ movaps 0x30(%rdi), %xmm3
+ movaps 0x40(%rdi), %xmm4
+ movaps 0x50(%rdi), %xmm5
+ movaps 0x60(%rdi), %xmm6
+ movaps 0x70(%rdi), %xmm7
+ movaps 0x80(%rdi), %xmm8
+ movaps 0x90(%rdi), %xmm9
+ movaps 0xa0(%rdi), %xmm10
+ movaps 0xb0(%rdi), %xmm11
+ movaps 0xc0(%rdi), %xmm12
+ movaps 0xd0(%rdi), %xmm13
+ movaps 0xe0(%rdi), %xmm14
+ movaps 0xf0(%rdi), %xmm15
+ ret
+1:
+ STTS(%rax)
+ ret
+ SET_SIZE(aes_accel_restore)
+
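[Editor's note] The pair above is meant to bracket any FPU-using AES work in the kernel. A minimal C sketch of such a caller follows; it assumes the prototypes from the comments above and illumos's kpreempt_disable()/kpreempt_enable() preemption control, and the save-area declaration simply follows the size and alignment requirements stated for aes_accel_save(). It is an illustration, not part of this change.

#include <sys/types.h>

extern void aes_accel_save(void *);
extern void aes_accel_restore(void *);

static void
aes_fpu_bracket_sketch(void)
{
	/* 16 x 128-bit XMM slots plus the saved CR0 qword, 16-byte aligned */
	uint8_t savebuf[0x108] __attribute__((aligned(16)));

	kpreempt_disable();		/* keep the thread on this CPU */
	aes_accel_save(savebuf);

	/* ... aes_encrypt_intel8(), aes_xor_intel8(), etc. go here ... */

	aes_accel_restore(savebuf);
	kpreempt_enable();
}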
#else
#define PROTECTED_CLTS
#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
#define SET_TS_OR_POP_XMM0_XMM1(tmpreg)
#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)
@@ -376,12 +446,82 @@
movaps %xmm2, (%rcx)
add $0x10, %rcx
ret
SET_SIZE(_key_expansion_256b)
+/*
+ * void aes_copy_intel(const uint8_t *src, uint8_t *dst);
+ *
+ * Copies one unaligned 128-bit block from `src' to `dst'. The copy is
+ * performed using FPU registers, so make sure FPU state is saved when
+ * running this in the kernel.
+ */
+ENTRY_NP(aes_copy_intel)
+ movdqu (%rdi), %xmm0
+ movdqu %xmm0, (%rsi)
+ ret
+ SET_SIZE(aes_copy_intel)
/*
+ * void aes_xor_intel(const uint8_t *src, uint8_t *dst);
+ *
+ * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
+ * stores the result at `dst'. The XOR is performed using FPU registers,
+ * so make sure FPU state is saved when running this in the kernel.
+ */
+ENTRY_NP(aes_xor_intel)
+ movdqu (%rdi), %xmm0
+ movdqu (%rsi), %xmm1
+ pxor %xmm1, %xmm0
+ movdqu %xmm0, (%rsi)
+ ret
+ SET_SIZE(aes_xor_intel)
+
+/*
+ * void aes_xor_intel8(const uint8_t *src, uint8_t *dst);
+ *
+ * XORs eight pairs of consecutive unaligned 128-bit blocks from `src' and
+ * `dst' and stores the results at `dst'. The XOR is performed using FPU
+ * registers, so make sure FPU state is saved when running this in the kernel.
+ */
+ENTRY_NP(aes_xor_intel8)
+ movdqu 0x00(%rdi), %xmm0
+ movdqu 0x00(%rsi), %xmm1
+ movdqu 0x10(%rdi), %xmm2
+ movdqu 0x10(%rsi), %xmm3
+ movdqu 0x20(%rdi), %xmm4
+ movdqu 0x20(%rsi), %xmm5
+ movdqu 0x30(%rdi), %xmm6
+ movdqu 0x30(%rsi), %xmm7
+ movdqu 0x40(%rdi), %xmm8
+ movdqu 0x40(%rsi), %xmm9
+ movdqu 0x50(%rdi), %xmm10
+ movdqu 0x50(%rsi), %xmm11
+ movdqu 0x60(%rdi), %xmm12
+ movdqu 0x60(%rsi), %xmm13
+ movdqu 0x70(%rdi), %xmm14
+ movdqu 0x70(%rsi), %xmm15
+ pxor %xmm1, %xmm0
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm4
+ pxor %xmm7, %xmm6
+ pxor %xmm9, %xmm8
+ pxor %xmm11, %xmm10
+ pxor %xmm13, %xmm12
+ pxor %xmm15, %xmm14
+ movdqu %xmm0, 0x00(%rsi)
+ movdqu %xmm2, 0x10(%rsi)
+ movdqu %xmm4, 0x20(%rsi)
+ movdqu %xmm6, 0x30(%rsi)
+ movdqu %xmm8, 0x40(%rsi)
+ movdqu %xmm10, 0x50(%rsi)
+ movdqu %xmm12, 0x60(%rsi)
+ movdqu %xmm14, 0x70(%rsi)
+ ret
+ SET_SIZE(aes_xor_intel8)
+
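[Editor's note] For reference, a plain-C rendering of what aes_xor_intel8() computes (byte-wise, without the FPU); the assembly above does the same work with eight unaligned 128-bit loads, pxor operations and stores, which is what makes the FPU-state bracketing necessary.

#include <sys/types.h>

static void
aes_xor_c_8blocks(const uint8_t *src, uint8_t *dst)
{
	/* XOR eight consecutive 16-byte blocks of src into dst */
	for (int i = 0; i < 8 * 16; i++)
		dst[i] ^= src[i];
}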
+/*
* rijndael_key_setup_enc_intel()
* Expand the cipher key into the encryption key schedule.
*
* For kernel code, caller is responsible for ensuring kpreempt_disable()
* has been called. This is because %xmm registers are not saved/restored.
@@ -657,33 +797,10 @@
/ OpenSSL: rax = 0 for OK, or non-zero for error
ret
SET_SIZE(rijndael_key_setup_dec_intel)
-/*
- * aes_encrypt_intel()
- * Encrypt a single block (in and out can overlap).
- *
- * For kernel code, caller is responsible for ensuring kpreempt_disable()
- * has been called. This is because %xmm registers are not saved/restored.
- * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
- * on entry. Otherwise, if TS is not set, save and restore %xmm registers
- * on the stack.
- *
- * Temporary register usage:
- * %xmm0 State
- * %xmm1 Key
- *
- * Original OpenSolaris Interface:
- * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
- * const uint32_t pt[4], uint32_t ct[4])
- *
- * Original Intel OpenSSL Interface:
- * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
- * const AES_KEY *key)
- */
-
#ifdef OPENSSL_INTERFACE
#define aes_encrypt_intel intel_AES_encrypt
#define aes_decrypt_intel intel_AES_decrypt
#define INP rdi /* P1, 64 bits */
@@ -697,158 +814,594 @@
#else /* OpenSolaris Interface */
#define KEYP rdi /* P1, 64 bits */
#define NROUNDS esi /* P2, 32 bits */
#define INP rdx /* P3, 64 bits */
#define OUTP rcx /* P4, 64 bits */
+#define LENGTH r8 /* P5, 64 bits */
#endif /* OPENSSL_INTERFACE */
-#define STATE xmm0 /* temporary, 128 bits */
-#define KEY xmm1 /* temporary, 128 bits */
+#define KEY xmm0 /* temporary, 128 bits */
+#define STATE0 xmm8 /* temporary, 128 bits */
+#define STATE1 xmm9 /* temporary, 128 bits */
+#define STATE2 xmm10 /* temporary, 128 bits */
+#define STATE3 xmm11 /* temporary, 128 bits */
+#define STATE4 xmm12 /* temporary, 128 bits */
+#define STATE5 xmm13 /* temporary, 128 bits */
+#define STATE6 xmm14 /* temporary, 128 bits */
+#define STATE7 xmm15 /* temporary, 128 bits */
-ENTRY_NP(aes_encrypt_intel)
- CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
+/*
+ * Runs the first two rounds of AES256 on a state register. `op' should be
+ * aesenc or aesdec.
+ */
+#define AES256_ROUNDS(op, statereg) \
+ movaps -0x60(%KEYP), %KEY; \
+ op %KEY, %statereg; \
+ movaps -0x50(%KEYP), %KEY; \
+ op %KEY, %statereg
- movups (%INP), %STATE / input
+/*
+ * Runs the first two rounds of AES192, or the 3rd & 4th rounds of AES256 on
+ * a state register. `op' should be aesenc or aesdec.
+ */
+#define AES192_ROUNDS(op, statereg) \
+ movaps -0x40(%KEYP), %KEY; \
+ op %KEY, %statereg; \
+ movaps -0x30(%KEYP), %KEY; \
+ op %KEY, %statereg
+
+/*
+ * Runs the full 10 rounds of AES128, or the last 10 rounds of AES192/AES256
+ * on a state register. `op' should be aesenc or aesdec and `lastop' should
+ * be aesenclast or aesdeclast.
+ */
+#define AES128_ROUNDS(op, lastop, statereg) \
+ movaps -0x20(%KEYP), %KEY; \
+ op %KEY, %statereg; \
+ movaps -0x10(%KEYP), %KEY; \
+ op %KEY, %statereg; \
+ movaps (%KEYP), %KEY; \
+ op %KEY, %statereg; \
+ movaps 0x10(%KEYP), %KEY; \
+ op %KEY, %statereg; \
+ movaps 0x20(%KEYP), %KEY; \
+ op %KEY, %statereg; \
+ movaps 0x30(%KEYP), %KEY; \
+ op %KEY, %statereg; \
+ movaps 0x40(%KEYP), %KEY; \
+ op %KEY, %statereg; \
+ movaps 0x50(%KEYP), %KEY; \
+ op %KEY, %statereg; \
+ movaps 0x60(%KEYP), %KEY; \
+ op %KEY, %statereg; \
+ movaps 0x70(%KEYP), %KEY; \
+ lastop %KEY, %statereg
+
+/*
+ * Macros to run AES encryption rounds. Input must be prefilled in state
+ * register - output will be left there as well.
+ * To run AES256, invoke all of these macros in sequence. To run AES192,
+ * invoke only the -192 and -128 variants. To run AES128, invoke only the
+ * -128 variant.
+ */
+#define AES256_ENC_ROUNDS(statereg) \
+ AES256_ROUNDS(aesenc, statereg)
+#define AES192_ENC_ROUNDS(statereg) \
+ AES192_ROUNDS(aesenc, statereg)
+#define AES128_ENC_ROUNDS(statereg) \
+ AES128_ROUNDS(aesenc, aesenclast, statereg)
+
+/* Same as the AES*_ENC_ROUNDS macros, but for decryption. */
+#define AES256_DEC_ROUNDS(statereg) \
+ AES256_ROUNDS(aesdec, statereg)
+#define AES192_DEC_ROUNDS(statereg) \
+ AES192_ROUNDS(aesdec, statereg)
+#define AES128_DEC_ROUNDS(statereg) \
+ AES128_ROUNDS(aesdec, aesdeclast, statereg)
+
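[Editor's note] To make the macro chaining concrete, here is a userland sketch of the same round structure for AES-128, written with the compiler's AES-NI intrinsics (compile with -maes); it is an illustration only, not something the kernel code uses: whitening XOR with round key 0, nine aesenc rounds, one aesenclast. AES-192 and AES-256 simply prepend two or four more aesenc rounds, exactly as the -192/-256 macros do.

#include <wmmintrin.h>	/* _mm_aesenc_si128() et al. */

static __m128i
aes128_encrypt_block_sketch(__m128i block, const __m128i rk[11])
{
	block = _mm_xor_si128(block, rk[0]);		/* round 0 */
	for (int i = 1; i < 10; i++)
		block = _mm_aesenc_si128(block, rk[i]);	/* rounds 1-9 */
	return (_mm_aesenclast_si128(block, rk[10]));	/* round 10 */
}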
+
+/*
+ * aes_encrypt_intel()
+ * Encrypt a single block (in and out can overlap).
+ *
+ * For kernel code, caller is responsible for bracketing this call with
+ * disabling kernel thread preemption and calling aes_accel_save/restore().
+ *
+ * Temporary register usage:
+ * %xmm0 Key
+ * %xmm8 State
+ *
+ * Original OpenSolaris Interface:
+ * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])
+ *
+ * Original Intel OpenSSL Interface:
+ * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key)
+ */
+ENTRY_NP(aes_encrypt_intel)
+ movups (%INP), %STATE0 / input
movaps (%KEYP), %KEY / key
+
#ifdef OPENSSL_INTERFACE
mov 240(%KEYP), %NROUNDS32 / round count
#else /* OpenSolaris Interface */
/* Round count is already present as P2 in %rsi/%esi */
#endif /* OPENSSL_INTERFACE */
- pxor %KEY, %STATE / round 0
+ pxor %KEY, %STATE0 / round 0
lea 0x30(%KEYP), %KEYP
cmp $12, %NROUNDS
jb .Lenc128
lea 0x20(%KEYP), %KEYP
je .Lenc192
/ AES 256
lea 0x20(%KEYP), %KEYP
- movaps -0x60(%KEYP), %KEY
- aesenc %KEY, %STATE
- movaps -0x50(%KEYP), %KEY
- aesenc %KEY, %STATE
+ AES256_ENC_ROUNDS(STATE0)
.align 4
.Lenc192:
/ AES 192 and 256
- movaps -0x40(%KEYP), %KEY
- aesenc %KEY, %STATE
- movaps -0x30(%KEYP), %KEY
- aesenc %KEY, %STATE
+ AES192_ENC_ROUNDS(STATE0)
.align 4
.Lenc128:
/ AES 128, 192, and 256
- movaps -0x20(%KEYP), %KEY
- aesenc %KEY, %STATE
- movaps -0x10(%KEYP), %KEY
- aesenc %KEY, %STATE
- movaps (%KEYP), %KEY
- aesenc %KEY, %STATE
- movaps 0x10(%KEYP), %KEY
- aesenc %KEY, %STATE
- movaps 0x20(%KEYP), %KEY
- aesenc %KEY, %STATE
- movaps 0x30(%KEYP), %KEY
- aesenc %KEY, %STATE
- movaps 0x40(%KEYP), %KEY
- aesenc %KEY, %STATE
- movaps 0x50(%KEYP), %KEY
- aesenc %KEY, %STATE
- movaps 0x60(%KEYP), %KEY
- aesenc %KEY, %STATE
- movaps 0x70(%KEYP), %KEY
- aesenclast %KEY, %STATE / last round
- movups %STATE, (%OUTP) / output
+ AES128_ENC_ROUNDS(STATE0)
+ movups %STATE0, (%OUTP) / output
- SET_TS_OR_POP_XMM0_XMM1(%r10)
ret
SET_SIZE(aes_encrypt_intel)
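[Editor's note] The lea adjustments above are easy to misread, so here is a small C sketch of the pointer arithmetic, assuming the usual layout of one 16-byte round key per round starting with round key 0 at the base of the schedule: the pointer is advanced to round key 3, 5 or 7 so that the shared AES128_ENC_ROUNDS tail at offsets -0x20 through 0x70 always names the final ten round keys.

#include <sys/types.h>

static const uint8_t *
keyp_after_leas_sketch(const uint8_t *ks, int nrounds)
{
	/* 10 rounds -> ks+0x30 (rk3), 12 -> ks+0x50 (rk5), 14 -> ks+0x70 (rk7) */
	return (ks + 0x30 + (nrounds >= 12 ? 0x20 : 0) +
	    (nrounds > 12 ? 0x20 : 0));
}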
-
/*
* aes_decrypt_intel()
* Decrypt a single block (in and out can overlap).
*
- * For kernel code, caller is responsible for ensuring kpreempt_disable()
- * has been called. This is because %xmm registers are not saved/restored.
- * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
- * on entry. Otherwise, if TS is not set, save and restore %xmm registers
- * on the stack.
+ * For kernel code, caller is responsible for bracketing this call with
+ * disabling kernel thread preemption and calling aes_accel_save/restore().
*
* Temporary register usage:
* %xmm0 State
* %xmm1 Key
*
* Original OpenSolaris Interface:
* void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
- * const uint32_t pt[4], uint32_t ct[4])/
+ * const uint32_t pt[4], uint32_t ct[4])
*
* Original Intel OpenSSL Interface:
* void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
* const AES_KEY *key);
*/
ENTRY_NP(aes_decrypt_intel)
- CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
-
- movups (%INP), %STATE / input
+ movups (%INP), %STATE0 / input
movaps (%KEYP), %KEY / key
+
#ifdef OPENSSL_INTERFACE
mov 240(%KEYP), %NROUNDS32 / round count
#else /* OpenSolaris Interface */
/* Round count is already present as P2 in %rsi/%esi */
#endif /* OPENSSL_INTERFACE */
- pxor %KEY, %STATE / round 0
+ pxor %KEY, %STATE0 / round 0
lea 0x30(%KEYP), %KEYP
cmp $12, %NROUNDS
jb .Ldec128
lea 0x20(%KEYP), %KEYP
je .Ldec192
/ AES 256
lea 0x20(%KEYP), %KEYP
- movaps -0x60(%KEYP), %KEY
- aesdec %KEY, %STATE
- movaps -0x50(%KEYP), %KEY
- aesdec %KEY, %STATE
+ AES256_DEC_ROUNDS(STATE0)
.align 4
.Ldec192:
/ AES 192 and 256
- movaps -0x40(%KEYP), %KEY
- aesdec %KEY, %STATE
- movaps -0x30(%KEYP), %KEY
- aesdec %KEY, %STATE
+ AES192_DEC_ROUNDS(STATE0)
.align 4
.Ldec128:
/ AES 128, 192, and 256
- movaps -0x20(%KEYP), %KEY
- aesdec %KEY, %STATE
- movaps -0x10(%KEYP), %KEY
- aesdec %KEY, %STATE
- movaps (%KEYP), %KEY
- aesdec %KEY, %STATE
- movaps 0x10(%KEYP), %KEY
- aesdec %KEY, %STATE
- movaps 0x20(%KEYP), %KEY
- aesdec %KEY, %STATE
- movaps 0x30(%KEYP), %KEY
- aesdec %KEY, %STATE
- movaps 0x40(%KEYP), %KEY
- aesdec %KEY, %STATE
- movaps 0x50(%KEYP), %KEY
- aesdec %KEY, %STATE
- movaps 0x60(%KEYP), %KEY
- aesdec %KEY, %STATE
- movaps 0x70(%KEYP), %KEY
- aesdeclast %KEY, %STATE / last round
- movups %STATE, (%OUTP) / output
+ AES128_DEC_ROUNDS(STATE0)
+ movups %STATE0, (%OUTP) / output
- SET_TS_OR_POP_XMM0_XMM1(%r10)
ret
SET_SIZE(aes_decrypt_intel)
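[Editor's note] The decryption path mirrors the encryption sketch given after the round macros, only with aesdec/aesdeclast and the decryption key schedule produced by rijndael_key_setup_dec_intel() above, consumed in the same forward order as the assembly uses. Again a userland illustration, assuming drk[] holds the 11 decryption round keys in that order.

#include <wmmintrin.h>	/* _mm_aesdec_si128() et al. */

static __m128i
aes128_decrypt_block_sketch(__m128i block, const __m128i drk[11])
{
	block = _mm_xor_si128(block, drk[0]);		/* round 0 */
	for (int i = 1; i < 10; i++)
		block = _mm_aesdec_si128(block, drk[i]);	/* rounds 1-9 */
	return (_mm_aesdeclast_si128(block, drk[10]));	/* round 10 */
}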
+/* Does a pipelined load of eight input blocks into our AES state registers. */
+#define AES_LOAD_INPUT_8BLOCKS \
+ movups 0x00(%INP), %STATE0; \
+ movups 0x10(%INP), %STATE1; \
+ movups 0x20(%INP), %STATE2; \
+ movups 0x30(%INP), %STATE3; \
+ movups 0x40(%INP), %STATE4; \
+ movups 0x50(%INP), %STATE5; \
+ movups 0x60(%INP), %STATE6; \
+ movups 0x70(%INP), %STATE7;
+
+/* Does a pipelined store of eight AES state registers to the output. */
+#define AES_STORE_OUTPUT_8BLOCKS \
+ movups %STATE0, 0x00(%OUTP); \
+ movups %STATE1, 0x10(%OUTP); \
+ movups %STATE2, 0x20(%OUTP); \
+ movups %STATE3, 0x30(%OUTP); \
+ movups %STATE4, 0x40(%OUTP); \
+ movups %STATE5, 0x50(%OUTP); \
+ movups %STATE6, 0x60(%OUTP); \
+ movups %STATE7, 0x70(%OUTP);
+
+/* Performs a pipelined AES instruction with the key on all state registers. */
+#define AES_KEY_STATE_OP_8BLOCKS(op) \
+ op %KEY, %STATE0; \
+ op %KEY, %STATE1; \
+ op %KEY, %STATE2; \
+ op %KEY, %STATE3; \
+ op %KEY, %STATE4; \
+ op %KEY, %STATE5; \
+ op %KEY, %STATE6; \
+ op %KEY, %STATE7
+
+/* XOR all AES state regs with key to initiate encryption/decryption. */
+#define AES_XOR_STATE_8BLOCKS \
+ AES_KEY_STATE_OP_8BLOCKS(pxor)
+
+/*
+ * Loads a round key from the key schedule offset `off' into the KEY
+ * register and performs `op' using the KEY on all 8 STATE registers.
+ */
+#define AES_RND_8BLOCKS(op, off) \
+ movaps off(%KEYP), %KEY; \
+ AES_KEY_STATE_OP_8BLOCKS(op)
+
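[Editor's note] The point of AES_RND_8BLOCKS() is that the eight aesenc (or aesdec) operations in one expansion are data-independent, so the CPU can overlap their multi-cycle latencies. An intrinsics sketch of a single expansion, with the round key already loaded (illustration only):

#include <wmmintrin.h>

static void
aes_round_8blocks_sketch(__m128i state[8], __m128i roundkey)
{
	/* eight independent aesenc operations; no result feeds the next one */
	for (int i = 0; i < 8; i++)
		state[i] = _mm_aesenc_si128(state[i], roundkey);
}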
+/*
+ * void aes_encrypt_intel8(const uint32_t roundkeys[], int numrounds,
+ * const void *plaintext, void *ciphertext)
+ *
+ * Same as aes_encrypt_intel, but performs the encryption operation on
+ * 8 independent blocks in sequence, exploiting instruction pipelining.
+ * This function doesn't support the OpenSSL interface; it is meant only
+ * for kernel use.
+ */
+ENTRY_NP(aes_encrypt_intel8)
+ AES_LOAD_INPUT_8BLOCKS / load input
+ movaps (%KEYP), %KEY / key
+ AES_XOR_STATE_8BLOCKS / round 0
+
+ lea 0x30(%KEYP), %KEYP / point to key schedule
+ cmp $12, %NROUNDS / determine AES variant
+ jb .Lenc8_128
+ lea 0x20(%KEYP), %KEYP / AES192 has larger key schedule
+ je .Lenc8_192
+
+ lea 0x20(%KEYP), %KEYP / AES256 has even larger key schedule
+ AES_RND_8BLOCKS(aesenc, -0x60) / AES256 R.1
+ AES_RND_8BLOCKS(aesenc, -0x50) / AES256 R.2
+
+.align 4
+.Lenc8_192:
+ AES_RND_8BLOCKS(aesenc, -0x40) / AES192 R.1; AES256 R.3
+ AES_RND_8BLOCKS(aesenc, -0x30) / AES192 R.2; AES256 R.4
+
+.align 4
+.Lenc8_128:
+ AES_RND_8BLOCKS(aesenc, -0x20) / AES128 R.1; AES192 R.3; AES256 R.5
+ AES_RND_8BLOCKS(aesenc, -0x10) / AES128 R.2; AES192 R.4; AES256 R.6
+ AES_RND_8BLOCKS(aesenc, 0x00) / AES128 R.3; AES192 R.5; AES256 R.7
+ AES_RND_8BLOCKS(aesenc, 0x10) / AES128 R.4; AES192 R.6; AES256 R.8
+ AES_RND_8BLOCKS(aesenc, 0x20) / AES128 R.5; AES192 R.7; AES256 R.9
+ AES_RND_8BLOCKS(aesenc, 0x30) / AES128 R.6; AES192 R.8; AES256 R.10
+ AES_RND_8BLOCKS(aesenc, 0x40) / AES128 R.7; AES192 R.9; AES256 R.11
+ AES_RND_8BLOCKS(aesenc, 0x50) / AES128 R.8; AES192 R.10; AES256 R.12
+ AES_RND_8BLOCKS(aesenc, 0x60) / AES128 R.9; AES192 R.11; AES256 R.13
+ AES_RND_8BLOCKS(aesenclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
+
+ AES_STORE_OUTPUT_8BLOCKS / store output
+ ret
+ SET_SIZE(aes_encrypt_intel8)
+
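[Editor's note] A hypothetical caller (not part of this change) would walk a bulk buffer eight blocks at a time with aes_encrypt_intel8() and hand any shorter tail to the single-block aes_encrypt_intel(), all inside an aes_accel_save()/restore() bracket as described earlier:

#include <sys/types.h>

extern void aes_encrypt_intel8(const uint32_t *, int, const void *, void *);

static void
bulk_encrypt_sketch(const uint32_t *rk, int nr, const uint8_t *in,
    uint8_t *out, size_t len)
{
	size_t off = 0;

	for (; off + 8 * 16 <= len; off += 8 * 16)
		aes_encrypt_intel8(rk, nr, in + off, out + off);
	/* any remaining (< 8) blocks would go through aes_encrypt_intel() */
}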
+
+/*
+ * void aes_decrypt_intel8(const uint32_t roundkeys[], int numrounds,
+ * const void *ciphertext, void *plaintext)
+ *
+ * Same as aes_decrypt_intel, but performs the decryption operation on
+ * 8 independent blocks in sequence, exploiting instruction pipelining.
+ * This function doesn't support the OpenSSL interface; it is meant only
+ * for kernel use.
+ */
+ENTRY_NP(aes_decrypt_intel8)
+ AES_LOAD_INPUT_8BLOCKS / load input
+ movaps (%KEYP), %KEY / key
+ AES_XOR_STATE_8BLOCKS / round 0
+
+ lea 0x30(%KEYP), %KEYP / point to key schedule
+ cmp $12, %NROUNDS / determine AES variant
+ jb .Ldec8_128
+ lea 0x20(%KEYP), %KEYP / AES192 has larger key schedule
+ je .Ldec8_192
+
+ lea 0x20(%KEYP), %KEYP / AES256 has even larger key schedule
+ AES_RND_8BLOCKS(aesdec, -0x60) / AES256 R.1
+ AES_RND_8BLOCKS(aesdec, -0x50) / AES256 R.2
+
+.align 4
+.Ldec8_192:
+ AES_RND_8BLOCKS(aesdec, -0x40) / AES192 R.1; AES256 R.3
+ AES_RND_8BLOCKS(aesdec, -0x30) / AES192 R.2; AES256 R.4
+
+.align 4
+.Ldec8_128:
+ AES_RND_8BLOCKS(aesdec, -0x20) / AES128 R.1; AES192 R.3; AES256 R.5
+ AES_RND_8BLOCKS(aesdec, -0x10) / AES128 R.2; AES192 R.4; AES256 R.6
+ AES_RND_8BLOCKS(aesdec, 0x00) / AES128 R.3; AES192 R.5; AES256 R.7
+ AES_RND_8BLOCKS(aesdec, 0x10) / AES128 R.4; AES192 R.6; AES256 R.8
+ AES_RND_8BLOCKS(aesdec, 0x20) / AES128 R.5; AES192 R.7; AES256 R.9
+ AES_RND_8BLOCKS(aesdec, 0x30) / AES128 R.6; AES192 R.8; AES256 R.10
+ AES_RND_8BLOCKS(aesdec, 0x40) / AES128 R.7; AES192 R.9; AES256 R.11
+ AES_RND_8BLOCKS(aesdec, 0x50) / AES128 R.8; AES192 R.10; AES256 R.12
+ AES_RND_8BLOCKS(aesdec, 0x60) / AES128 R.9; AES192 R.11; AES256 R.13
+ AES_RND_8BLOCKS(aesdeclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
+
+ AES_STORE_OUTPUT_8BLOCKS / store output
+ ret
+ SET_SIZE(aes_decrypt_intel8)
+
+
+/*
+ * This macro encapsulates the entire AES encryption algo for a single
+ * block, which is prefilled in statereg and which will be replaced by
+ * the encrypted output. The KEYP register must already point to the
+ * AES128 key schedule ("lea 0x30(%KEYP), %KEYP" from encryption
+ * function call) so that consecutive invocations of this macro are
+ * supported (KEYP is restored after each invocation).
+ */
+#define AES_ENC(statereg, label_128, label_192, label_out) \
+ cmp $12, %NROUNDS; \
+ jb label_128; \
+ je label_192; \
+ /* AES 256 only */ \
+ lea 0x40(%KEYP), %KEYP; \
+ AES256_ENC_ROUNDS(statereg); \
+ AES192_ENC_ROUNDS(statereg); \
+ AES128_ENC_ROUNDS(statereg); \
+ lea -0x40(%KEYP), %KEYP; \
+ jmp label_out; \
+.align 4; \
+label_192: \
+ lea 0x20(%KEYP), %KEYP; \
+ /* AES 192 only */ \
+ AES192_ENC_ROUNDS(statereg); \
+ AES128_ENC_ROUNDS(statereg); \
+ lea -0x20(%KEYP), %KEYP; \
+ jmp label_out; \
+.align 4; \
+label_128: \
+ /* AES 128 only */ \
+ AES128_ENC_ROUNDS(statereg); \
+.align 4; \
+label_out:
+
+
+/*
+ * void aes_encrypt_cbc_intel8(const uint32_t roundkeys[], int numrounds,
+ * const void *plaintext, void *ciphertext, const void *IV)
+ *
+ * Encrypts 8 consecutive AES blocks in CBC mode. Input and output
+ * may overlap. This provides a modest performance boost over invoking
+ * the encryption and XOR in separate functions because we can avoid
+ * copying the ciphertext block to and from memory between encryption
+ * and XOR calls.
+ */
+#define CBC_IV r8 /* input - IV blk pointer */
+#define CBC_IV_XMM xmm1 /* tmp IV location for alignment */
+
+ENTRY_NP(aes_encrypt_cbc_intel8)
+ AES_LOAD_INPUT_8BLOCKS / load input
+ movaps (%KEYP), %KEY / key
+ AES_XOR_STATE_8BLOCKS / round 0
+
+ lea 0x30(%KEYP), %KEYP / point to key schedule
+ movdqu (%CBC_IV), %CBC_IV_XMM / load IV from unaligned memory
+ pxor %CBC_IV_XMM, %STATE0 / XOR IV with input block and encrypt
+ AES_ENC(STATE0, .Lenc_cbc_0_128, .Lenc_cbc_0_192, .Lenc_cbc_0_out)
+ pxor %STATE0, %STATE1
+ AES_ENC(STATE1, .Lenc_cbc_1_128, .Lenc_cbc_1_192, .Lenc_cbc_1_out)
+ pxor %STATE1, %STATE2
+ AES_ENC(STATE2, .Lenc_cbc_2_128, .Lenc_cbc_2_192, .Lenc_cbc_2_out)
+ pxor %STATE2, %STATE3
+ AES_ENC(STATE3, .Lenc_cbc_3_128, .Lenc_cbc_3_192, .Lenc_cbc_3_out)
+ pxor %STATE3, %STATE4
+ AES_ENC(STATE4, .Lenc_cbc_4_128, .Lenc_cbc_4_192, .Lenc_cbc_4_out)
+ pxor %STATE4, %STATE5
+ AES_ENC(STATE5, .Lenc_cbc_5_128, .Lenc_cbc_5_192, .Lenc_cbc_5_out)
+ pxor %STATE5, %STATE6
+ AES_ENC(STATE6, .Lenc_cbc_6_128, .Lenc_cbc_6_192, .Lenc_cbc_6_out)
+ pxor %STATE6, %STATE7
+ AES_ENC(STATE7, .Lenc_cbc_7_128, .Lenc_cbc_7_192, .Lenc_cbc_7_out)
+
+ AES_STORE_OUTPUT_8BLOCKS / store output
+ ret
+ SET_SIZE(aes_encrypt_cbc_intel8)
+
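[Editor's note] The chaining performed above is ordinary CBC: the IV is XORed into the first plaintext block, and each subsequent plaintext block is XORed with the previous ciphertext block before encryption, which is why the eight encryptions here are serialized rather than pipelined like the ECB/CTR paths. A plain-C outline follows; the encrypt_block callback is hypothetical and stands in for the AES_ENC macro.

#include <sys/types.h>
#include <string.h>	/* memcpy; a kernel caller would use its own copy routine */

static void
cbc_encrypt_8blocks_sketch(void (*encrypt_block)(uint8_t blk[16]),
    const uint8_t *in, uint8_t *out, const uint8_t iv[16])
{
	uint8_t prev[16];

	(void) memcpy(prev, iv, 16);
	for (int i = 0; i < 8; i++) {
		for (int j = 0; j < 16; j++)
			out[i * 16 + j] = in[i * 16 + j] ^ prev[j];
		encrypt_block(&out[i * 16]);	/* C[i] = E(P[i] ^ C[i-1]) */
		(void) memcpy(prev, &out[i * 16], 16);
	}
}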
+/*
+ * Prefills register state with counters suitable for the CTR encryption
+ * mode. The counter is assumed to consist of two portions:
+ * - A lower monotonically increasing 64-bit counter. If the caller wants
+ * a smaller counter, they are responsible for checking that it doesn't
+ * overflow between encryption calls.
+ * - An upper static "nonce" portion, in big endian, preloaded into the
+ * lower portion of an XMM register.
+ * This macro adds `ctridx' to the lower_LE counter, swaps it to big
+ * endian and, by way of a temporary general-purpose register, loads the
+ * lower and upper counter portions into a target XMM result register,
+ * which can then be handed off to the encryption process.
+ */
+#define PREP_CTR_BLOCKS(lower_LE, upper_BE_xmm, ctridx, tmpreg, resreg) \
+ lea ctridx(%lower_LE), %tmpreg; \
+ bswap %tmpreg; \
+ movq %tmpreg, %resreg; \
+ movlhps %upper_BE_xmm, %resreg; \
+ pshufd $0b01001110, %resreg, %resreg
+
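[Editor's note] In memory, the counter block PREP_CTR_BLOCKS assembles is the caller's big-endian upper (nonce) half followed by the big-endian form of the incremented lower counter (the pshufd swaps the two quadwords into that order). A C sketch of the same layout for one block index, as an illustration only:

#include <sys/types.h>
#include <string.h>	/* memcpy; illustration only */

static void
make_ctr_block_sketch(uint64_t counter_upper_BE, uint64_t counter_lower_LE,
    int blockidx, uint8_t blk[16])
{
	uint64_t ctr = counter_lower_LE + blockidx;

	/* upper half is passed in already big-endian; keep its byte order */
	(void) memcpy(blk, &counter_upper_BE, 8);
	/* lower half: incremented counter, most significant byte first */
	for (int i = 0; i < 8; i++)
		blk[8 + i] = (uint8_t)(ctr >> (56 - 8 * i));
}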
+#define CTR_UPPER_BE r8 /* input - counter upper 64 bits (BE) */
+#define CTR_UPPER_BE_XMM xmm1 /* tmp for upper counter bits */
+#define CTR_LOWER_LE r9 /* input - counter lower 64 bits (LE) */
+#define CTR_TMP0 rax /* tmp for lower 64 bit add & bswap */
+#define CTR_TMP1 rbx /* tmp for lower 64 bit add & bswap */
+#define CTR_TMP2 r10 /* tmp for lower 64 bit add & bswap */
+#define CTR_TMP3 r11 /* tmp for lower 64 bit add & bswap */
+#define CTR_TMP4 r12 /* tmp for lower 64 bit add & bswap */
+#define CTR_TMP5 r13 /* tmp for lower 64 bit add & bswap */
+#define CTR_TMP6 r14 /* tmp for lower 64 bit add & bswap */
+#define CTR_TMP7 r15 /* tmp for lower 64 bit add & bswap */
+
+/*
+ * These are used in case CTR encryption input is unaligned before XORing.
+ * Must not overlap with any STATE[0-7] register.
+ */
+#define TMP_INPUT0 xmm0
+#define TMP_INPUT1 xmm1
+#define TMP_INPUT2 xmm2
+#define TMP_INPUT3 xmm3
+#define TMP_INPUT4 xmm4
+#define TMP_INPUT5 xmm5
+#define TMP_INPUT6 xmm6
+#define TMP_INPUT7 xmm7
+
+/*
+ * void aes_ctr_intel8(const uint32_t roundkeys[], int numrounds,
+ * const void *input, void *output, uint64_t counter_upper_BE,
+ * uint64_t counter_lower_LE)
+ *
+ * Runs AES on 8 consecutive blocks in counter mode (encryption and
+ * decryption in counter mode are the same).
+ */
+ENTRY_NP(aes_ctr_intel8)
+ /* save caller's regs */
+ pushq %rbp
+ movq %rsp, %rbp
+ subq $0x38, %rsp
+ / CTR_TMP0 is rax, no need to save
+ movq %CTR_TMP1, -0x38(%rbp)
+ movq %CTR_TMP2, -0x30(%rbp)
+ movq %CTR_TMP3, -0x28(%rbp)
+ movq %CTR_TMP4, -0x20(%rbp)
+ movq %CTR_TMP5, -0x18(%rbp)
+ movq %CTR_TMP6, -0x10(%rbp)
+ movq %CTR_TMP7, -0x08(%rbp)
+
+ /*
+ * CTR step 1: prepare big-endian formatted 128-bit counter values,
+ * placing the result in the AES-NI input state registers.
+ */
+ movq %CTR_UPPER_BE, %CTR_UPPER_BE_XMM
+ PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 0, CTR_TMP0, STATE0)
+ PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 1, CTR_TMP1, STATE1)
+ PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 2, CTR_TMP2, STATE2)
+ PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 3, CTR_TMP3, STATE3)
+ PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 4, CTR_TMP4, STATE4)
+ PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 5, CTR_TMP5, STATE5)
+ PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 6, CTR_TMP6, STATE6)
+ PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 7, CTR_TMP7, STATE7)
+
+ /*
+ * CTR step 2: Encrypt the counters.
+ */
+ movaps (%KEYP), %KEY / key
+ AES_XOR_STATE_8BLOCKS / round 0
+
+ /* Determine the AES variant we're going to compute */
+ lea 0x30(%KEYP), %KEYP / point to key schedule
+ cmp $12, %NROUNDS / determine AES variant
+ jb .Lctr8_128
+ lea 0x20(%KEYP), %KEYP / AES192 has larger key schedule
+ je .Lctr8_192
+
+ /* AES 256 */
+ lea 0x20(%KEYP), %KEYP / AES256 has even larger key schedule
+ AES_RND_8BLOCKS(aesenc, -0x60) / AES256 R.1
+ AES_RND_8BLOCKS(aesenc, -0x50) / AES256 R.2
+
+.align 4
+.Lctr8_192:
+ /* AES 192 and 256 */
+ AES_RND_8BLOCKS(aesenc, -0x40) / AES192 R.1; AES256 R.3
+ AES_RND_8BLOCKS(aesenc, -0x30) / AES192 R.2; AES256 R.4
+
+.align 4
+.Lctr8_128:
+ /* AES 128, 192, and 256 */
+ AES_RND_8BLOCKS(aesenc, -0x20) / AES128 R.1; AES192 R.3; AES256 R.5
+ AES_RND_8BLOCKS(aesenc, -0x10) / AES128 R.2; AES192 R.4; AES256 R.6
+ AES_RND_8BLOCKS(aesenc, 0x00) / AES128 R.3; AES192 R.5; AES256 R.7
+ AES_RND_8BLOCKS(aesenc, 0x10) / AES128 R.4; AES192 R.6; AES256 R.8
+ AES_RND_8BLOCKS(aesenc, 0x20) / AES128 R.5; AES192 R.7; AES256 R.9
+ AES_RND_8BLOCKS(aesenc, 0x30) / AES128 R.6; AES192 R.8; AES256 R.10
+ AES_RND_8BLOCKS(aesenc, 0x40) / AES128 R.7; AES192 R.9; AES256 R.11
+ AES_RND_8BLOCKS(aesenc, 0x50) / AES128 R.8; AES192 R.10; AES256 R.12
+ AES_RND_8BLOCKS(aesenc, 0x60) / AES128 R.9; AES192 R.11; AES256 R.13
+ AES_RND_8BLOCKS(aesenclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
+
+ /*
+ * CTR step 3: XOR input data blocks with encrypted counters to
+ * produce result.
+ */
+ mov %INP, %rax / pxor requires alignment, so check
+ andq $0xf, %rax
+ jnz .Lctr_input_unaligned
+ pxor 0x00(%INP), %STATE0
+ pxor 0x10(%INP), %STATE1
+ pxor 0x20(%INP), %STATE2
+ pxor 0x30(%INP), %STATE3
+ pxor 0x40(%INP), %STATE4
+ pxor 0x50(%INP), %STATE5
+ pxor 0x60(%INP), %STATE6
+ pxor 0x70(%INP), %STATE7
+ jmp .Lctr_out
+
+.align 4
+.Lctr_input_unaligned:
+ movdqu 0x00(%INP), %TMP_INPUT0
+ movdqu 0x10(%INP), %TMP_INPUT1
+ movdqu 0x20(%INP), %TMP_INPUT2
+ movdqu 0x30(%INP), %TMP_INPUT3
+ movdqu 0x40(%INP), %TMP_INPUT4
+ movdqu 0x50(%INP), %TMP_INPUT5
+ movdqu 0x60(%INP), %TMP_INPUT6
+ movdqu 0x70(%INP), %TMP_INPUT7
+ pxor %TMP_INPUT0, %STATE0
+ pxor %TMP_INPUT1, %STATE1
+ pxor %TMP_INPUT2, %STATE2
+ pxor %TMP_INPUT3, %STATE3
+ pxor %TMP_INPUT4, %STATE4
+ pxor %TMP_INPUT5, %STATE5
+ pxor %TMP_INPUT6, %STATE6
+ pxor %TMP_INPUT7, %STATE7
+
+.align 4
+.Lctr_out:
+ /*
+ * Step 4: Write out processed blocks to memory.
+ */
+ movdqu %STATE0, 0x00(%OUTP)
+ movdqu %STATE1, 0x10(%OUTP)
+ movdqu %STATE2, 0x20(%OUTP)
+ movdqu %STATE3, 0x30(%OUTP)
+ movdqu %STATE4, 0x40(%OUTP)
+ movdqu %STATE5, 0x50(%OUTP)
+ movdqu %STATE6, 0x60(%OUTP)
+ movdqu %STATE7, 0x70(%OUTP)
+
+ /* restore caller's regs */
+ / CTR_TMP0 is rax, no need to restore
+ movq -0x38(%rbp), %CTR_TMP1
+ movq -0x30(%rbp), %CTR_TMP2
+ movq -0x28(%rbp), %CTR_TMP3
+ movq -0x20(%rbp), %CTR_TMP4
+ movq -0x18(%rbp), %CTR_TMP5
+ movq -0x10(%rbp), %CTR_TMP6
+ movq -0x08(%rbp), %CTR_TMP7
+ leave
+ ret
+ SET_SIZE(aes_ctr_intel8)
+
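[Editor's note] A hypothetical consumer of aes_ctr_intel8() (again, not part of this change) advances the lower counter by eight per call while the big-endian upper half stays fixed; checking for overflow of a narrower counter is, as noted above, the caller's responsibility.

#include <sys/types.h>

extern void aes_ctr_intel8(const uint32_t *, int, const void *, void *,
    uint64_t, uint64_t);

static void
ctr_bulk_sketch(const uint32_t *rk, int nr, const uint8_t *in, uint8_t *out,
    size_t nblocks, uint64_t nonce_be, uint64_t ctr_lo)
{
	size_t i = 0;

	for (; i + 8 <= nblocks; i += 8, ctr_lo += 8)
		aes_ctr_intel8(rk, nr, in + i * 16, out + i * 16,
		    nonce_be, ctr_lo);
	/* a tail of fewer than 8 blocks would use a slower fallback path */
}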
#endif /* lint || __lint */