4896 Performance improvements for KCF AES modes

@@ -146,10 +146,13 @@
  *
  * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary.
  *
  * ====================================================================
  */
+/*
+ * Copyright 2015 by Saso Kiselkov. All rights reserved.
+ */
 
 #if defined(lint) || defined(__lint)
 
 #include <sys/types.h>
 

@@ -279,11 +282,78 @@
         STTS(tmpreg); \
 2: \
         mov     %rbp, %rsp; \
         pop     %rbp
 
+/*
+ * void aes_accel_save(void *savestate);
+ *
+ * Saves CR0 and, if CR0.TS was clear, all 16 XMM registers to the save
+ * area pointed to by the first argument; if CR0.TS was set, it is cleared
+ * instead so that FPU instructions may be used. This must be invoked
+ * before executing any floating point operations inside the kernel (and
+ * kernel thread preemption must be disabled as well). The save area must
+ * be at least 16x 128-bit + 64-bit long and must be 128-bit aligned.
+ */
+ENTRY_NP(aes_accel_save)
+        movq    %cr0, %rax
+        movq    %rax, 0x100(%rdi)
+        testq   $CR0_TS, %rax
+        jnz     1f
+        movaps  %xmm0, 0x00(%rdi)
+        movaps  %xmm1, 0x10(%rdi)
+        movaps  %xmm2, 0x20(%rdi)
+        movaps  %xmm3, 0x30(%rdi)
+        movaps  %xmm4, 0x40(%rdi)
+        movaps  %xmm5, 0x50(%rdi)
+        movaps  %xmm6, 0x60(%rdi)
+        movaps  %xmm7, 0x70(%rdi)
+        movaps  %xmm8, 0x80(%rdi)
+        movaps  %xmm9, 0x90(%rdi)
+        movaps  %xmm10, 0xa0(%rdi)
+        movaps  %xmm11, 0xb0(%rdi)
+        movaps  %xmm12, 0xc0(%rdi)
+        movaps  %xmm13, 0xd0(%rdi)
+        movaps  %xmm14, 0xe0(%rdi)
+        movaps  %xmm15, 0xf0(%rdi)
+        ret
+1:
+        PROTECTED_CLTS
+        ret
+        SET_SIZE(aes_accel_save)
 
+/*
+ * void aes_accel_restore(void *savestate);
+ *
+ * Restores the CR0.TS and XMM register state saved by aes_accel_save().
+ */
+ENTRY_NP(aes_accel_restore)
+        mov     0x100(%rdi), %rax
+        testq   $CR0_TS, %rax
+        jnz     1f
+        movaps  0x00(%rdi), %xmm0
+        movaps  0x10(%rdi), %xmm1
+        movaps  0x20(%rdi), %xmm2
+        movaps  0x30(%rdi), %xmm3
+        movaps  0x40(%rdi), %xmm4
+        movaps  0x50(%rdi), %xmm5
+        movaps  0x60(%rdi), %xmm6
+        movaps  0x70(%rdi), %xmm7
+        movaps  0x80(%rdi), %xmm8
+        movaps  0x90(%rdi), %xmm9
+        movaps  0xa0(%rdi), %xmm10
+        movaps  0xb0(%rdi), %xmm11
+        movaps  0xc0(%rdi), %xmm12
+        movaps  0xd0(%rdi), %xmm13
+        movaps  0xe0(%rdi), %xmm14
+        movaps  0xf0(%rdi), %xmm15
+        ret
+1:
+        STTS(%rax)
+        ret
+        SET_SIZE(aes_accel_restore)
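+
+/*
+ * A minimal caller sketch (in C) of how kernel code might bracket use of
+ * the XMM-based routines below with the two functions above. The buffer
+ * size and alignment follow from the aes_accel_save() comment; the
+ * variable names and the __attribute__ spelling of the alignment are
+ * illustrative assumptions only:
+ *
+ *      uint8_t fpu_save[16 * 16 + 8] __attribute__((aligned(16)));
+ *
+ *      kpreempt_disable();             // no preemption while FPU is in use
+ *      aes_accel_save(fpu_save);       // save XMM/CR0.TS state
+ *      ... call the AES-NI routines below ...
+ *      aes_accel_restore(fpu_save);    // put the saved state back
+ *      kpreempt_enable();
+ */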
+
 #else
 #define PROTECTED_CLTS
 #define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
 #define SET_TS_OR_POP_XMM0_XMM1(tmpreg)
 #define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)

@@ -376,12 +446,82 @@
         movaps  %xmm2, (%rcx)
         add     $0x10, %rcx
         ret
         SET_SIZE(_key_expansion_256b)
 
+/*
+ * void aes_copy_intel(const uint8_t *src, uint8_t *dst);
+ *
+ * Copies one unaligned 128-bit block from `src' to `dst'. The copy is
+ * performed using FPU registers, so make sure FPU state is saved when
+ * running this in the kernel.
+ */
+ENTRY_NP(aes_copy_intel)
+        movdqu  (%rdi), %xmm0
+        movdqu  %xmm0, (%rsi)
+        ret
+        SET_SIZE(aes_copy_intel)
 
 /*
+ * void aes_xor_intel(const uint8_t *src, uint8_t *dst);
+ *
+ * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
+ * stores the result at `dst'. The XOR is performed using FPU registers,
+ * so make sure FPU state is saved when running this in the kernel.
+ */
+ENTRY_NP(aes_xor_intel)
+        movdqu  (%rdi), %xmm0
+        movdqu  (%rsi), %xmm1
+        pxor    %xmm1, %xmm0
+        movdqu  %xmm0, (%rsi)
+        ret
+        SET_SIZE(aes_xor_intel)
+
+/*
+ * void aes_xor_intel8(const uint8_t *src, uint8_t *dst);
+ *
+ * XORs eight pairs of consecutive unaligned 128-bit blocks from `src' and
+ * `dst' and stores the results at `dst'. The XOR is performed using FPU
+ * registers, so make sure FPU state is saved when running this in the kernel.
+ */
+ENTRY_NP(aes_xor_intel8)
+        movdqu  0x00(%rdi), %xmm0
+        movdqu  0x00(%rsi), %xmm1
+        movdqu  0x10(%rdi), %xmm2
+        movdqu  0x10(%rsi), %xmm3
+        movdqu  0x20(%rdi), %xmm4
+        movdqu  0x20(%rsi), %xmm5
+        movdqu  0x30(%rdi), %xmm6
+        movdqu  0x30(%rsi), %xmm7
+        movdqu  0x40(%rdi), %xmm8
+        movdqu  0x40(%rsi), %xmm9
+        movdqu  0x50(%rdi), %xmm10
+        movdqu  0x50(%rsi), %xmm11
+        movdqu  0x60(%rdi), %xmm12
+        movdqu  0x60(%rsi), %xmm13
+        movdqu  0x70(%rdi), %xmm14
+        movdqu  0x70(%rsi), %xmm15
+        pxor    %xmm1, %xmm0
+        pxor    %xmm3, %xmm2
+        pxor    %xmm5, %xmm4
+        pxor    %xmm7, %xmm6
+        pxor    %xmm9, %xmm8
+        pxor    %xmm11, %xmm10
+        pxor    %xmm13, %xmm12
+        pxor    %xmm15, %xmm14
+        movdqu  %xmm0, 0x00(%rsi)
+        movdqu  %xmm2, 0x10(%rsi)
+        movdqu  %xmm4, 0x20(%rsi)
+        movdqu  %xmm6, 0x30(%rsi)
+        movdqu  %xmm8, 0x40(%rsi)
+        movdqu  %xmm10, 0x50(%rsi)
+        movdqu  %xmm12, 0x60(%rsi)
+        movdqu  %xmm14, 0x70(%rsi)
+        ret
+        SET_SIZE(aes_xor_intel8)
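+
+/*
+ * For reference, the three helpers above are semantically equivalent to
+ * the following C sketch (the real routines go through XMM registers, so
+ * the FPU-state caveats above still apply; memcpy is used purely for
+ * illustration):
+ *
+ *      void aes_copy_intel(const uint8_t *src, uint8_t *dst) {
+ *              memcpy(dst, src, 16);           // one 128-bit block
+ *      }
+ *      void aes_xor_intel(const uint8_t *src, uint8_t *dst) {
+ *              for (int i = 0; i < 16; i++)    // one 128-bit block
+ *                      dst[i] ^= src[i];
+ *      }
+ *      void aes_xor_intel8(const uint8_t *src, uint8_t *dst) {
+ *              for (int i = 0; i < 128; i++)   // eight 128-bit blocks
+ *                      dst[i] ^= src[i];
+ *      }
+ */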
+
+/*
  * rijndael_key_setup_enc_intel()
  * Expand the cipher key into the encryption key schedule.
  *
  * For kernel code, caller is responsible for ensuring kpreempt_disable()
  * has been called.  This is because %xmm registers are not saved/restored.

@@ -657,33 +797,10 @@
         / OpenSSL: rax = 0 for OK, or non-zero for error
         ret
         SET_SIZE(rijndael_key_setup_dec_intel)
 
 
-/*
- * aes_encrypt_intel()
- * Encrypt a single block (in and out can overlap).
- *
- * For kernel code, caller is responsible for ensuring kpreempt_disable()
- * has been called.  This is because %xmm registers are not saved/restored.
- * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
- * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
- * on the stack.
- *
- * Temporary register usage:
- * %xmm0        State
- * %xmm1        Key
- *
- * Original OpenSolaris Interface:
- * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
- *      const uint32_t pt[4], uint32_t ct[4])
- *
- * Original Intel OpenSSL Interface:
- * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
- *      const AES_KEY *key)
- */
-
 #ifdef  OPENSSL_INTERFACE
 #define aes_encrypt_intel       intel_AES_encrypt
 #define aes_decrypt_intel       intel_AES_decrypt
 
 #define INP             rdi     /* P1, 64 bits */

@@ -697,158 +814,594 @@
 #else   /* OpenSolaris Interface */
 #define KEYP            rdi     /* P1, 64 bits */
 #define NROUNDS         esi     /* P2, 32 bits */
 #define INP             rdx     /* P3, 64 bits */
 #define OUTP            rcx     /* P4, 64 bits */
+#define LENGTH          r8      /* P5, 64 bits */
 #endif  /* OPENSSL_INTERFACE */
 
-#define STATE           xmm0    /* temporary, 128 bits */
-#define KEY             xmm1    /* temporary, 128 bits */
+#define KEY             xmm0    /* temporary, 128 bits */
+#define STATE0          xmm8    /* temporary, 128 bits */
+#define STATE1          xmm9    /* temporary, 128 bits */
+#define STATE2          xmm10   /* temporary, 128 bits */
+#define STATE3          xmm11   /* temporary, 128 bits */
+#define STATE4          xmm12   /* temporary, 128 bits */
+#define STATE5          xmm13   /* temporary, 128 bits */
+#define STATE6          xmm14   /* temporary, 128 bits */
+#define STATE7          xmm15   /* temporary, 128 bits */
 
-ENTRY_NP(aes_encrypt_intel)
-        CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
+/*
+ * Runs the first two rounds of AES256 on a state register. `op' should be
+ * aesenc or aesdec.
+ */
+#define AES256_ROUNDS(op, statereg)     \
+        movaps  -0x60(%KEYP), %KEY;     \
+        op      %KEY, %statereg;        \
+        movaps  -0x50(%KEYP), %KEY;     \
+        op      %KEY, %statereg
 
-        movups  (%INP), %STATE                  / input
+/*
+ * Runs the first two rounds of AES192, or the 3rd and 4th rounds of
+ * AES256 on a state register. `op' should be aesenc or aesdec.
+ */
+#define AES192_ROUNDS(op, statereg)     \
+        movaps  -0x40(%KEYP), %KEY;     \
+        op      %KEY, %statereg;        \
+        movaps  -0x30(%KEYP), %KEY;     \
+        op      %KEY, %statereg
+
+/*
+ * Runs the full 10 rounds of AES128, or the last 10 rounds of AES192/AES256
+ * on a state register. `op' should be aesenc or aesdec and `lastop' should
+ * be aesenclast or aesdeclast.
+ */
+#define AES128_ROUNDS(op, lastop, statereg) \
+        movaps  -0x20(%KEYP), %KEY;     \
+        op      %KEY, %statereg;        \
+        movaps  -0x10(%KEYP), %KEY;     \
+        op      %KEY, %statereg;        \
+        movaps  (%KEYP), %KEY;          \
+        op      %KEY, %statereg;        \
+        movaps  0x10(%KEYP), %KEY;      \
+        op      %KEY, %statereg;        \
+        movaps  0x20(%KEYP), %KEY;      \
+        op      %KEY, %statereg;        \
+        movaps  0x30(%KEYP), %KEY;      \
+        op      %KEY, %statereg;        \
+        movaps  0x40(%KEYP), %KEY;      \
+        op      %KEY, %statereg;        \
+        movaps  0x50(%KEYP), %KEY;      \
+        op      %KEY, %statereg;        \
+        movaps  0x60(%KEYP), %KEY;      \
+        op      %KEY, %statereg;        \
+        movaps  0x70(%KEYP), %KEY;      \
+        lastop  %KEY, %statereg
+
+/*
+ * Macros to run AES encryption rounds. The input must be prefilled in the
+ * state register; the output will be left there as well.
+ * To run AES256, invoke all of these macros in sequence. To run AES192,
+ * invoke only the -192 and -128 variants. To run AES128, invoke only the
+ * -128 variant.
+ */
+#define AES256_ENC_ROUNDS(statereg) \
+        AES256_ROUNDS(aesenc, statereg)
+#define AES192_ENC_ROUNDS(statereg) \
+        AES192_ROUNDS(aesenc, statereg)
+#define AES128_ENC_ROUNDS(statereg) \
+        AES128_ROUNDS(aesenc, aesenclast, statereg)
+
+/* Same as the AES*_ENC_ROUNDS macros, but for decryption. */
+#define AES256_DEC_ROUNDS(statereg) \
+        AES256_ROUNDS(aesdec, statereg)
+#define AES192_DEC_ROUNDS(statereg) \
+        AES192_ROUNDS(aesdec, statereg)
+#define AES128_DEC_ROUNDS(statereg) \
+        AES128_ROUNDS(aesdec, aesdeclast, statereg)
+
+
+/*
+ * aes_encrypt_intel()
+ * Encrypt a single block (in and out can overlap).
+ *
+ * For kernel code, the caller is responsible for disabling kernel thread
+ * preemption and for bracketing this call with aes_accel_save() and
+ * aes_accel_restore().
+ *
+ * Temporary register usage:
+ * %xmm0        Key
+ * %xmm8        State
+ *
+ * Original OpenSolaris Interface:
+ * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
+ *      const uint32_t pt[4], uint32_t ct[4])
+ *
+ * Original Intel OpenSSL Interface:
+ * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
+ *      const AES_KEY *key)
+ */
+ENTRY_NP(aes_encrypt_intel)
+        movups  (%INP), %STATE0                 / input
         movaps  (%KEYP), %KEY                   / key
+
 #ifdef  OPENSSL_INTERFACE
         mov     240(%KEYP), %NROUNDS32          / round count
 #else   /* OpenSolaris Interface */
         /* Round count is already present as P2 in %rsi/%esi */
 #endif  /* OPENSSL_INTERFACE */
 
-        pxor    %KEY, %STATE                    / round 0
+        pxor    %KEY, %STATE0                   / round 0
         lea     0x30(%KEYP), %KEYP
         cmp     $12, %NROUNDS
         jb      .Lenc128
         lea     0x20(%KEYP), %KEYP
         je      .Lenc192
 
         / AES 256
         lea     0x20(%KEYP), %KEYP
-        movaps  -0x60(%KEYP), %KEY
-        aesenc  %KEY, %STATE
-        movaps  -0x50(%KEYP), %KEY
-        aesenc  %KEY, %STATE
+        AES256_ENC_ROUNDS(STATE0)
 
 .align 4
 .Lenc192:
         / AES 192 and 256
-        movaps  -0x40(%KEYP), %KEY
-        aesenc  %KEY, %STATE
-        movaps  -0x30(%KEYP), %KEY
-        aesenc  %KEY, %STATE
+        AES192_ENC_ROUNDS(STATE0)
 
 .align 4
 .Lenc128:
         / AES 128, 192, and 256
-        movaps  -0x20(%KEYP), %KEY
-        aesenc  %KEY, %STATE
-        movaps  -0x10(%KEYP), %KEY
-        aesenc  %KEY, %STATE
-        movaps  (%KEYP), %KEY
-        aesenc  %KEY, %STATE
-        movaps  0x10(%KEYP), %KEY
-        aesenc  %KEY, %STATE
-        movaps  0x20(%KEYP), %KEY
-        aesenc  %KEY, %STATE
-        movaps  0x30(%KEYP), %KEY
-        aesenc  %KEY, %STATE
-        movaps  0x40(%KEYP), %KEY
-        aesenc  %KEY, %STATE
-        movaps  0x50(%KEYP), %KEY
-        aesenc  %KEY, %STATE
-        movaps  0x60(%KEYP), %KEY
-        aesenc  %KEY, %STATE
-        movaps  0x70(%KEYP), %KEY
-        aesenclast       %KEY, %STATE           / last round
-        movups  %STATE, (%OUTP)                 / output
+        AES128_ENC_ROUNDS(STATE0)
+        movups  %STATE0, (%OUTP)                / output
 
-        SET_TS_OR_POP_XMM0_XMM1(%r10)
         ret
         SET_SIZE(aes_encrypt_intel)
 
-
 /*
  * aes_decrypt_intel()
  * Decrypt a single block (in and out can overlap).
  *
- * For kernel code, caller is responsible for ensuring kpreempt_disable()
- * has been called.  This is because %xmm registers are not saved/restored.
- * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
- * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
- * on the stack.
+ * For kernel code, the caller is responsible for disabling kernel thread
+ * preemption and for bracketing this call with aes_accel_save() and
+ * aes_accel_restore().
  *
  * Temporary register usage:
- * %xmm0        State
- * %xmm1        Key
+ * %xmm0        Key
+ * %xmm8        State
  *
  * Original OpenSolaris Interface:
  * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
- *      const uint32_t pt[4], uint32_t ct[4])/
+ *      const uint32_t pt[4], uint32_t ct[4])
  *
  * Original Intel OpenSSL Interface:
  * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
  *      const AES_KEY *key);
  */
 ENTRY_NP(aes_decrypt_intel)
-        CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
-
-        movups  (%INP), %STATE                  / input
+        movups  (%INP), %STATE0                 / input
         movaps  (%KEYP), %KEY                   / key
+
 #ifdef  OPENSSL_INTERFACE
         mov     240(%KEYP), %NROUNDS32          / round count
 #else   /* OpenSolaris Interface */
         /* Round count is already present as P2 in %rsi/%esi */
 #endif  /* OPENSSL_INTERFACE */
 
-        pxor    %KEY, %STATE                    / round 0
+        pxor    %KEY, %STATE0                   / round 0
         lea     0x30(%KEYP), %KEYP
         cmp     $12, %NROUNDS
         jb      .Ldec128
         lea     0x20(%KEYP), %KEYP
         je      .Ldec192
 
         / AES 256
         lea     0x20(%KEYP), %KEYP
-        movaps  -0x60(%KEYP), %KEY
-        aesdec  %KEY, %STATE
-        movaps  -0x50(%KEYP), %KEY
-        aesdec  %KEY, %STATE
+        AES256_DEC_ROUNDS(STATE0)
 
 .align 4
 .Ldec192:
         / AES 192 and 256
-        movaps  -0x40(%KEYP), %KEY
-        aesdec  %KEY, %STATE
-        movaps  -0x30(%KEYP), %KEY
-        aesdec  %KEY, %STATE
+        AES192_DEC_ROUNDS(STATE0)
 
 .align 4
 .Ldec128:
         / AES 128, 192, and 256
-        movaps  -0x20(%KEYP), %KEY
-        aesdec  %KEY, %STATE
-        movaps  -0x10(%KEYP), %KEY
-        aesdec  %KEY, %STATE
-        movaps  (%KEYP), %KEY
-        aesdec  %KEY, %STATE
-        movaps  0x10(%KEYP), %KEY
-        aesdec  %KEY, %STATE
-        movaps  0x20(%KEYP), %KEY
-        aesdec  %KEY, %STATE
-        movaps  0x30(%KEYP), %KEY
-        aesdec  %KEY, %STATE
-        movaps  0x40(%KEYP), %KEY
-        aesdec  %KEY, %STATE
-        movaps  0x50(%KEYP), %KEY
-        aesdec  %KEY, %STATE
-        movaps  0x60(%KEYP), %KEY
-        aesdec  %KEY, %STATE
-        movaps  0x70(%KEYP), %KEY
-        aesdeclast      %KEY, %STATE            / last round
-        movups  %STATE, (%OUTP)                 / output
+        AES128_DEC_ROUNDS(STATE0)
+        movups  %STATE0, (%OUTP)                / output
 
-        SET_TS_OR_POP_XMM0_XMM1(%r10)
         ret
         SET_SIZE(aes_decrypt_intel)
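+
+/*
+ * A minimal usage sketch (OpenSolaris interface) for the two single-block
+ * routines above; `enc_ks', `dec_ks' and `nrounds' are illustrative names
+ * for the expanded key schedules and round count, and the preemption/FPU
+ * handling described earlier is assumed to be in place:
+ *
+ *      uint32_t pt[4], ct[4], out[4];
+ *      aes_encrypt_intel(enc_ks, nrounds, pt, ct);     // ct = AES(pt)
+ *      aes_decrypt_intel(dec_ks, nrounds, ct, out);    // out == pt again
+ */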
 
+/* Does a pipelined load of eight input blocks into our AES state registers. */
+#define AES_LOAD_INPUT_8BLOCKS          \
+        movups  0x00(%INP), %STATE0;    \
+        movups  0x10(%INP), %STATE1;    \
+        movups  0x20(%INP), %STATE2;    \
+        movups  0x30(%INP), %STATE3;    \
+        movups  0x40(%INP), %STATE4;    \
+        movups  0x50(%INP), %STATE5;    \
+        movups  0x60(%INP), %STATE6;    \
+        movups  0x70(%INP), %STATE7;
+
+/* Does a pipelined store of eight AES state registers to the output. */
+#define AES_STORE_OUTPUT_8BLOCKS        \
+        movups  %STATE0, 0x00(%OUTP);   \
+        movups  %STATE1, 0x10(%OUTP);   \
+        movups  %STATE2, 0x20(%OUTP);   \
+        movups  %STATE3, 0x30(%OUTP);   \
+        movups  %STATE4, 0x40(%OUTP);   \
+        movups  %STATE5, 0x50(%OUTP);   \
+        movups  %STATE6, 0x60(%OUTP);   \
+        movups  %STATE7, 0x70(%OUTP);
+
+/* Performs a pipelined AES instruction with the key on all state registers. */
+#define AES_KEY_STATE_OP_8BLOCKS(op)    \
+        op      %KEY, %STATE0;          \
+        op      %KEY, %STATE1;          \
+        op      %KEY, %STATE2;          \
+        op      %KEY, %STATE3;          \
+        op      %KEY, %STATE4;          \
+        op      %KEY, %STATE5;          \
+        op      %KEY, %STATE6;          \
+        op      %KEY, %STATE7
+
+/* XOR all AES state regs with key to initiate encryption/decryption. */
+#define AES_XOR_STATE_8BLOCKS           \
+        AES_KEY_STATE_OP_8BLOCKS(pxor)
+
+/*
+ * Loads a round key from the key schedule offset `off' into the KEY
+ * register and performs `op' using the KEY on all 8 STATE registers.
+ */
+#define AES_RND_8BLOCKS(op, off)        \
+        movaps  off(%KEYP), %KEY;       \
+        AES_KEY_STATE_OP_8BLOCKS(op)
+
+/*
+ * void aes_encrypt_intel8(const uint32_t roundkeys[], int numrounds,
+ *      const void *plaintext, void *ciphertext)
+ *
+ * Same as aes_encrypt_intel, but performs the encryption operation on
+ * 8 independent blocks, interleaving the per-round instructions to exploit
+ * instruction pipelining. This function does not support the OpenSSL
+ * interface; it is meant for kernel use only.
+ */
+ENTRY_NP(aes_encrypt_intel8)
+        AES_LOAD_INPUT_8BLOCKS          / load input
+        movaps  (%KEYP), %KEY           / key
+        AES_XOR_STATE_8BLOCKS           / round 0
+
+        lea     0x30(%KEYP), %KEYP      / point to key schedule
+        cmp     $12, %NROUNDS           / determine AES variant
+        jb      .Lenc8_128
+        lea     0x20(%KEYP), %KEYP      / AES192 has larger key schedule
+        je      .Lenc8_192
+
+        lea     0x20(%KEYP), %KEYP      / AES256 has even larger key schedule
+        AES_RND_8BLOCKS(aesenc, -0x60)  / AES256 R.1
+        AES_RND_8BLOCKS(aesenc, -0x50)  / AES256 R.2
+
+.align 4
+.Lenc8_192:
+        AES_RND_8BLOCKS(aesenc, -0x40)  / AES192 R.1; AES256 R.3
+        AES_RND_8BLOCKS(aesenc, -0x30)  / AES192 R.2; AES256 R.4
+
+.align 4
+.Lenc8_128:
+        AES_RND_8BLOCKS(aesenc, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
+        AES_RND_8BLOCKS(aesenc, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
+        AES_RND_8BLOCKS(aesenc, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
+        AES_RND_8BLOCKS(aesenc, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
+        AES_RND_8BLOCKS(aesenc, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
+        AES_RND_8BLOCKS(aesenc, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
+        AES_RND_8BLOCKS(aesenc, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
+        AES_RND_8BLOCKS(aesenc, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
+        AES_RND_8BLOCKS(aesenc, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
+        AES_RND_8BLOCKS(aesenclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
+
+        AES_STORE_OUTPUT_8BLOCKS        / store output
+        ret
+        SET_SIZE(aes_encrypt_intel8)
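+
+/*
+ * A hedged sketch of how a bulk caller might use the 8-block variant,
+ * falling back to the single-block routine for a short tail. The names
+ * `ks', `nr', `in', `out' and `len' are illustrative (byte pointers and
+ * the expanded key schedule), and preemption/FPU handling is assumed:
+ *
+ *      size_t off = 0;
+ *      for (; off + 128 <= len; off += 128)    // 8 blocks at a time
+ *              aes_encrypt_intel8(ks, nr, in + off, out + off);
+ *      for (; off + 16 <= len; off += 16)      // leftover whole blocks
+ *              aes_encrypt_intel(ks, nr,
+ *                  (const uint32_t *)(in + off), (uint32_t *)(out + off));
+ */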
+
+
+/*
+ * void aes_decrypt_intel8(const uint32_t roundkeys[], int numrounds,
+ *      const void *ciphertext, void *plaintext)
+ *
+ * Same as aes_decrypt_intel, but performs the decryption operation on
+ * 8 independent blocks, interleaving the per-round instructions to exploit
+ * instruction pipelining. This function does not support the OpenSSL
+ * interface; it is meant for kernel use only.
+ */
+ENTRY_NP(aes_decrypt_intel8)
+        AES_LOAD_INPUT_8BLOCKS          / load input
+        movaps  (%KEYP), %KEY           / key
+        AES_XOR_STATE_8BLOCKS           / round 0
+
+        lea     0x30(%KEYP), %KEYP      / point to key schedule
+        cmp     $12, %NROUNDS           / determine AES variant
+        jb      .Ldec8_128
+        lea     0x20(%KEYP), %KEYP      / AES192 has larger key schedule
+        je      .Ldec8_192
+
+        lea     0x20(%KEYP), %KEYP      / AES256 has even larger key schedule
+        AES_RND_8BLOCKS(aesdec, -0x60)  / AES256 R.1
+        AES_RND_8BLOCKS(aesdec, -0x50)  / AES256 R.2
+
+.align 4
+.Ldec8_192:
+        AES_RND_8BLOCKS(aesdec, -0x40)  / AES192 R.1; AES256 R.3
+        AES_RND_8BLOCKS(aesdec, -0x30)  / AES192 R.2; AES256 R.4
+
+.align 4
+.Ldec8_128:
+        AES_RND_8BLOCKS(aesdec, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
+        AES_RND_8BLOCKS(aesdec, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
+        AES_RND_8BLOCKS(aesdec, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
+        AES_RND_8BLOCKS(aesdec, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
+        AES_RND_8BLOCKS(aesdec, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
+        AES_RND_8BLOCKS(aesdec, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
+        AES_RND_8BLOCKS(aesdec, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
+        AES_RND_8BLOCKS(aesdec, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
+        AES_RND_8BLOCKS(aesdec, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
+        AES_RND_8BLOCKS(aesdeclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
+
+        AES_STORE_OUTPUT_8BLOCKS        / store output
+        ret
+        SET_SIZE(aes_decrypt_intel8)
+
+
+/*
+ * This macro encapsulates the entire AES encryption algorithm for a
+ * single block, which must be prefilled in `statereg' and which is
+ * replaced by the encrypted output. The KEYP register must already
+ * point to the AES128 key schedule (i.e. after the "lea 0x30(%KEYP),
+ * %KEYP" done on entry to the encryption functions), so that
+ * consecutive invocations of this macro are supported (KEYP is
+ * restored to that position after each invocation).
+ */
+#define AES_ENC(statereg, label_128, label_192, label_out)      \
+        cmp     $12, %NROUNDS;                                  \
+        jb      label_128;                                      \
+        je      label_192;                                      \
+        /* AES 256 only */                                      \
+        lea     0x40(%KEYP), %KEYP;                             \
+        AES256_ENC_ROUNDS(statereg);                            \
+        AES192_ENC_ROUNDS(statereg);                            \
+        AES128_ENC_ROUNDS(statereg);                            \
+        lea     -0x40(%KEYP), %KEYP;                            \
+        jmp     label_out;                                      \
+.align 4;                                                       \
+label_192:                                                      \
+        lea     0x20(%KEYP), %KEYP;                             \
+        /* AES 192 only */                                      \
+        AES192_ENC_ROUNDS(statereg);                            \
+        AES128_ENC_ROUNDS(statereg);                            \
+        lea     -0x20(%KEYP), %KEYP;                            \
+        jmp     label_out;                                      \
+.align 4;                                                       \
+label_128:                                                      \
+        /* AES 128 only */                                      \
+        AES128_ENC_ROUNDS(statereg);                            \
+.align 4;                                                       \
+label_out:
+
+
+/*
+ * void aes_encrypt_cbc_intel8(const uint32_t roundkeys[], int numrounds,
+ *      const void *plaintext, void *ciphertext, const void *IV)
+ *
+ * Encrypts 8 consecutive AES blocks in CBC mode. Input and output
+ * may overlap. This provides a modest performance boost over invoking
+ * the encryption and XOR in separate functions, because it avoids
+ * copying each ciphertext block to and from memory between the
+ * encryption and XOR steps.
+ */
+#define CBC_IV                  r8      /* input - IV blk pointer */
+#define CBC_IV_XMM              xmm1    /* tmp IV location for alignment */
+
+ENTRY_NP(aes_encrypt_cbc_intel8)
+        AES_LOAD_INPUT_8BLOCKS          / load input
+        movaps  (%KEYP), %KEY           / key
+        AES_XOR_STATE_8BLOCKS           / round 0
+
+        lea     0x30(%KEYP), %KEYP      / point to key schedule
+        movdqu  (%CBC_IV), %CBC_IV_XMM  / load IV from unaligned memory
+        pxor    %CBC_IV_XMM, %STATE0    / XOR IV with input block and encrypt
+        AES_ENC(STATE0, .Lenc_cbc_0_128, .Lenc_cbc_0_192, .Lenc_cbc_0_out)
+        pxor    %STATE0, %STATE1
+        AES_ENC(STATE1, .Lenc_cbc_1_128, .Lenc_cbc_1_192, .Lenc_cbc_1_out)
+        pxor    %STATE1, %STATE2
+        AES_ENC(STATE2, .Lenc_cbc_2_128, .Lenc_cbc_2_192, .Lenc_cbc_2_out)
+        pxor    %STATE2, %STATE3
+        AES_ENC(STATE3, .Lenc_cbc_3_128, .Lenc_cbc_3_192, .Lenc_cbc_3_out)
+        pxor    %STATE3, %STATE4
+        AES_ENC(STATE4, .Lenc_cbc_4_128, .Lenc_cbc_4_192, .Lenc_cbc_4_out)
+        pxor    %STATE4, %STATE5
+        AES_ENC(STATE5, .Lenc_cbc_5_128, .Lenc_cbc_5_192, .Lenc_cbc_5_out)
+        pxor    %STATE5, %STATE6
+        AES_ENC(STATE6, .Lenc_cbc_6_128, .Lenc_cbc_6_192, .Lenc_cbc_6_out)
+        pxor    %STATE6, %STATE7
+        AES_ENC(STATE7, .Lenc_cbc_7_128, .Lenc_cbc_7_192, .Lenc_cbc_7_out)
+
+        AES_STORE_OUTPUT_8BLOCKS        / store output
+        ret
+        SET_SIZE(aes_encrypt_cbc_intel8)
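+
+/*
+ * For reference, the chaining performed above matches the usual CBC
+ * recurrence, with E() standing for the single-block AES encryption done
+ * by the AES_ENC macro and pt[]/ct[] denoting 16-byte blocks (a sketch of
+ * the semantics, not of the register-level implementation):
+ *
+ *      ct[0] = E(pt[0] ^ IV);
+ *      for (int i = 1; i < 8; i++)
+ *              ct[i] = E(pt[i] ^ ct[i - 1]);
+ */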
+
+/*
+ * Prefills register state with counters suitable for the CTR encryption
+ * mode. The counter is assumed to consist of two portions:
+ * - A lower monotonically increasing 64-bit counter. If the caller wants
+ *   a smaller counter, they are responsible for checking that it doesn't
+ *   overflow between encryption calls.
+ * - An upper static "nonce" portion, in big endian, preloaded into the
+ *   lower portion of an XMM register.
+ * This macro adds `ctridx' to the lower_LE counter, swaps it to big
+ * endian and, by way of a temporary general-purpose register, loads the
+ * lower and upper counter portions into the target XMM result register,
+ * which can then be handed off to the encryption process.
+ */
+#define PREP_CTR_BLOCKS(lower_LE, upper_BE_xmm, ctridx, tmpreg, resreg) \
+        lea     ctridx(%lower_LE), %tmpreg;                             \
+        bswap   %tmpreg;                                                \
+        movq    %tmpreg, %resreg;                                       \
+        movlhps %upper_BE_xmm, %resreg;                                 \
+        pshufd  $0b01001110, %resreg, %resreg
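+
+/*
+ * Conceptually, one prepared counter block looks like the following C
+ * sketch on a little-endian x86 host (`upper_be' is assumed to already be
+ * in big-endian byte order, as noted above; the function name and the
+ * __builtin_bswap64 spelling are illustrative):
+ *
+ *      void prep_ctr_block(uint64_t upper_be, uint64_t lower_le,
+ *          unsigned idx, uint8_t blk[16])
+ *      {
+ *              uint64_t lower_be = __builtin_bswap64(lower_le + idx);
+ *              memcpy(blk, &upper_be, 8);      // nonce/upper half, big endian
+ *              memcpy(blk + 8, &lower_be, 8);  // lower counter, big endian
+ *      }
+ */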
+
+#define CTR_UPPER_BE            r8      /* input - counter upper 64 bits (BE) */
+#define CTR_UPPER_BE_XMM        xmm1    /* tmp for upper counter bits */
+#define CTR_LOWER_LE            r9      /* input - counter lower 64 bits (LE) */
+#define CTR_TMP0                rax     /* tmp for lower 64 bit add & bswap */
+#define CTR_TMP1                rbx     /* tmp for lower 64 bit add & bswap */
+#define CTR_TMP2                r10     /* tmp for lower 64 bit add & bswap */
+#define CTR_TMP3                r11     /* tmp for lower 64 bit add & bswap */
+#define CTR_TMP4                r12     /* tmp for lower 64 bit add & bswap */
+#define CTR_TMP5                r13     /* tmp for lower 64 bit add & bswap */
+#define CTR_TMP6                r14     /* tmp for lower 64 bit add & bswap */
+#define CTR_TMP7                r15     /* tmp for lower 64 bit add & bswap */
+
+/*
+ * These are used to load the CTR encryption input when it is not 16-byte
+ * aligned prior to the XOR step. They must not overlap with any STATE[0-7]
+ * register.
+ */
+#define TMP_INPUT0      xmm0
+#define TMP_INPUT1      xmm1
+#define TMP_INPUT2      xmm2
+#define TMP_INPUT3      xmm3
+#define TMP_INPUT4      xmm4
+#define TMP_INPUT5      xmm5
+#define TMP_INPUT6      xmm6
+#define TMP_INPUT7      xmm7
+
+/*
+ * void aes_ctr_intel8(const uint32_t roundkeys[], int numrounds,
+ *      const void *input, void *output, uint64_t counter_upper_BE,
+ *      uint64_t counter_lower_LE)
+ *
+ * Runs AES on 8 consecutive blocks in counter mode (encryption and
+ * decryption in counter mode are the same).
+ */
+ENTRY_NP(aes_ctr_intel8)
+        /* save caller's regs */
+        pushq   %rbp
+        movq    %rsp, %rbp
+        subq    $0x38, %rsp
+        / CTR_TMP0 is rax, no need to save
+        movq    %CTR_TMP1, -0x38(%rbp)
+        movq    %CTR_TMP2, -0x30(%rbp)
+        movq    %CTR_TMP3, -0x28(%rbp)
+        movq    %CTR_TMP4, -0x20(%rbp)
+        movq    %CTR_TMP5, -0x18(%rbp)
+        movq    %CTR_TMP6, -0x10(%rbp)
+        movq    %CTR_TMP7, -0x08(%rbp)
+
+        /*
+         * CTR step 1: prepare big-endian formatted 128-bit counter values,
+         * placing the result in the AES-NI input state registers.
+         */
+        movq    %CTR_UPPER_BE, %CTR_UPPER_BE_XMM
+        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 0, CTR_TMP0, STATE0)
+        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 1, CTR_TMP1, STATE1)
+        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 2, CTR_TMP2, STATE2)
+        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 3, CTR_TMP3, STATE3)
+        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 4, CTR_TMP4, STATE4)
+        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 5, CTR_TMP5, STATE5)
+        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 6, CTR_TMP6, STATE6)
+        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 7, CTR_TMP7, STATE7)
+
+        /*
+         * CTR step 2: Encrypt the counters.
+         */
+        movaps  (%KEYP), %KEY           / key
+        AES_XOR_STATE_8BLOCKS           / round 0
+
+        /* Determine the AES variant we're going to compute */
+        lea     0x30(%KEYP), %KEYP      / point to key schedule
+        cmp     $12, %NROUNDS           / determine AES variant
+        jb      .Lctr8_128
+        lea     0x20(%KEYP), %KEYP      / AES192 has larger key schedule
+        je      .Lctr8_192
+
+        /* AES 256 */
+        lea     0x20(%KEYP), %KEYP      / AES256 has even larger key schedule
+        AES_RND_8BLOCKS(aesenc, -0x60)  / AES256 R.1
+        AES_RND_8BLOCKS(aesenc, -0x50)  / AES256 R.2
+
+.align 4
+.Lctr8_192:
+        /* AES 192 and 256 */
+        AES_RND_8BLOCKS(aesenc, -0x40)  / AES192 R.1; AES256 R.3
+        AES_RND_8BLOCKS(aesenc, -0x30)  / AES192 R.2; AES256 R.4
+
+.align 4
+.Lctr8_128:
+        /* AES 128, 192, and 256 */
+        AES_RND_8BLOCKS(aesenc, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
+        AES_RND_8BLOCKS(aesenc, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
+        AES_RND_8BLOCKS(aesenc, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
+        AES_RND_8BLOCKS(aesenc, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
+        AES_RND_8BLOCKS(aesenc, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
+        AES_RND_8BLOCKS(aesenc, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
+        AES_RND_8BLOCKS(aesenc, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
+        AES_RND_8BLOCKS(aesenc, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
+        AES_RND_8BLOCKS(aesenc, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
+        AES_RND_8BLOCKS(aesenclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
+
+        /*
+         * CTR step 3: XOR input data blocks with encrypted counters to
+         * produce result.
+         */
+        mov     %INP, %rax              / pxor requires alignment, so check
+        andq    $0xf, %rax
+        jnz     .Lctr_input_unaligned
+        pxor    0x00(%INP), %STATE0
+        pxor    0x10(%INP), %STATE1
+        pxor    0x20(%INP), %STATE2
+        pxor    0x30(%INP), %STATE3
+        pxor    0x40(%INP), %STATE4
+        pxor    0x50(%INP), %STATE5
+        pxor    0x60(%INP), %STATE6
+        pxor    0x70(%INP), %STATE7
+        jmp     .Lctr_out
+
+.align 4
+.Lctr_input_unaligned:
+        movdqu  0x00(%INP), %TMP_INPUT0
+        movdqu  0x10(%INP), %TMP_INPUT1
+        movdqu  0x20(%INP), %TMP_INPUT2
+        movdqu  0x30(%INP), %TMP_INPUT3
+        movdqu  0x40(%INP), %TMP_INPUT4
+        movdqu  0x50(%INP), %TMP_INPUT5
+        movdqu  0x60(%INP), %TMP_INPUT6
+        movdqu  0x70(%INP), %TMP_INPUT7
+        pxor    %TMP_INPUT0, %STATE0
+        pxor    %TMP_INPUT1, %STATE1
+        pxor    %TMP_INPUT2, %STATE2
+        pxor    %TMP_INPUT3, %STATE3
+        pxor    %TMP_INPUT4, %STATE4
+        pxor    %TMP_INPUT5, %STATE5
+        pxor    %TMP_INPUT6, %STATE6
+        pxor    %TMP_INPUT7, %STATE7
+
+.align 4
+.Lctr_out:
+        /*
+         * CTR step 4: Write out processed blocks to memory.
+         */
+        movdqu  %STATE0, 0x00(%OUTP)
+        movdqu  %STATE1, 0x10(%OUTP)
+        movdqu  %STATE2, 0x20(%OUTP)
+        movdqu  %STATE3, 0x30(%OUTP)
+        movdqu  %STATE4, 0x40(%OUTP)
+        movdqu  %STATE5, 0x50(%OUTP)
+        movdqu  %STATE6, 0x60(%OUTP)
+        movdqu  %STATE7, 0x70(%OUTP)
+
+        /* restore caller's regs */
+        / CTR_TMP0 is rax, no need to restore
+        movq    -0x38(%rbp), %CTR_TMP1
+        movq    -0x30(%rbp), %CTR_TMP2
+        movq    -0x28(%rbp), %CTR_TMP3
+        movq    -0x20(%rbp), %CTR_TMP4
+        movq    -0x18(%rbp), %CTR_TMP5
+        movq    -0x10(%rbp), %CTR_TMP6
+        movq    -0x08(%rbp), %CTR_TMP7
+        leave
+        ret
+        SET_SIZE(aes_ctr_intel8)
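+
+/*
+ * A hedged caller sketch for the CTR routine above: each call consumes
+ * 8 blocks (128 bytes), so the lower counter advances by 8 per call.
+ * The names `ks', `nr', `in', `out', `len', `ctr_upper_be' and
+ * `ctr_lower_le' are illustrative; checking that the lower counter does
+ * not overflow remains the caller's responsibility, as noted above:
+ *
+ *      for (size_t off = 0; off + 128 <= len; off += 128, ctr_lower_le += 8)
+ *              aes_ctr_intel8(ks, nr, in + off, out + off,
+ *                  ctr_upper_be, ctr_lower_le);
+ */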
+
 #endif  /* lint || __lint */