4896 Performance improvements for KCF AES modes
@@ -25,10 +25,13 @@
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright 2015 by Saso Kiselkov. All rights reserved.
+ */
/*
* Accelerated GHASH implementation with Intel PCLMULQDQ-NI
* instructions. This file contains an accelerated
* Galois Field Multiplication implementation.
@@ -88,10 +91,24 @@
/* ARGSUSED */
void
gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
}
+#ifdef _KERNEL
+/*ARGSUSED*/
+void
+gcm_accel_save(void *savestate)
+{
+}
+
+/*ARGSUSED*/
+void
+gcm_accel_restore(void *savestate)
+{
+}
+#endif /* _KERNEL */
+
#else /* lint */
#include <sys/asm_linkage.h>
#include <sys/controlregs.h>
#ifdef _KERNEL
@@ -114,101 +131,107 @@
pop %rsi
#else
#define PROTECTED_CLTS \
CLTS
#endif /* __xpv */
-
- /*
- * If CR0_TS is not set, align stack (with push %rbp) and push
- * %xmm0 - %xmm10 on stack, otherwise clear CR0_TS
- */
-#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
- push %rbp; \
- mov %rsp, %rbp; \
- movq %cr0, tmpreg; \
- testq $CR0_TS, tmpreg; \
- jnz 1f; \
- and $-XMM_ALIGN, %rsp; \
- sub $[XMM_SIZE * 11], %rsp; \
- movaps %xmm0, 160(%rsp); \
- movaps %xmm1, 144(%rsp); \
- movaps %xmm2, 128(%rsp); \
- movaps %xmm3, 112(%rsp); \
- movaps %xmm4, 96(%rsp); \
- movaps %xmm5, 80(%rsp); \
- movaps %xmm6, 64(%rsp); \
- movaps %xmm7, 48(%rsp); \
- movaps %xmm8, 32(%rsp); \
- movaps %xmm9, 16(%rsp); \
- movaps %xmm10, (%rsp); \
- jmp 2f; \
-1: \
- PROTECTED_CLTS; \
-2:
-
-
- /*
- * If CR0_TS was not set above, pop %xmm0 - %xmm10 off stack,
- * otherwise set CR0_TS.
- */
-#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
- testq $CR0_TS, tmpreg; \
- jnz 1f; \
- movaps (%rsp), %xmm10; \
- movaps 16(%rsp), %xmm9; \
- movaps 32(%rsp), %xmm8; \
- movaps 48(%rsp), %xmm7; \
- movaps 64(%rsp), %xmm6; \
- movaps 80(%rsp), %xmm5; \
- movaps 96(%rsp), %xmm4; \
- movaps 112(%rsp), %xmm3; \
- movaps 128(%rsp), %xmm2; \
- movaps 144(%rsp), %xmm1; \
- movaps 160(%rsp), %xmm0; \
- jmp 2f; \
-1: \
- STTS(tmpreg); \
-2: \
- mov %rbp, %rsp; \
- pop %rbp
-
-
-#else
-#define PROTECTED_CLTS
-#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
-#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
#endif /* _KERNEL */
-/*
- * Use this mask to byte-swap a 16-byte integer with the pshufb instruction
- */
-
-// static uint8_t byte_swap16_mask[] = {
-// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
.text
.align XMM_ALIGN
+/*
+ * Use this mask to byte-swap a 16-byte integer with the pshufb instruction:
+ * static uint8_t byte_swap16_mask[] = {
+ * 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+ */
.Lbyte_swap16_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
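
For reference, a pshufb with this mask reverses the byte order of a 16-byte
value: destination byte i receives source byte mask[i] = 15 - i. A minimal C
sketch of the equivalent operation (the function name bswap16_bytes is
illustrative, not part of this change):

    #include <stdint.h>

    /*
     * Byte-reverse a 16-byte block, mirroring what pshufb does with
     * .Lbyte_swap16_mask: out[i] = in[15 - i].
     */
    static void
    bswap16_bytes(uint8_t out[16], const uint8_t in[16])
    {
            int i;

            for (i = 0; i < 16; i++)
                    out[i] = in[15 - i];
    }
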
+#ifdef _KERNEL
+/*
+ * void gcm_accel_save(void *savestate)
+ *
+ * Saves the XMM0-XMM15 registers and CR0 to a temporary location pointed
+ * to by the first argument and clears TS in CR0. This must be invoked before
+ * executing accelerated GCM computations inside the kernel (and kernel
+ * thread preemption must be disabled as well). The memory region to which
+ * all state is saved must be at least 16x 128-bit + 64-bit long and must
+ * be 128-bit aligned.
+ */
+ENTRY_NP(gcm_accel_save)
+ movq %cr0, %rax
+ movq %rax, 0x100(%rdi)
+ testq $CR0_TS, %rax
+ jnz 1f
+ /* FPU is in use, save registers */
+ movaps %xmm0, 0x00(%rdi)
+ movaps %xmm1, 0x10(%rdi)
+ movaps %xmm2, 0x20(%rdi)
+ movaps %xmm3, 0x30(%rdi)
+ movaps %xmm4, 0x40(%rdi)
+ movaps %xmm5, 0x50(%rdi)
+ movaps %xmm6, 0x60(%rdi)
+ movaps %xmm7, 0x70(%rdi)
+ movaps %xmm8, 0x80(%rdi)
+ movaps %xmm9, 0x90(%rdi)
+ movaps %xmm10, 0xa0(%rdi)
+ movaps %xmm11, 0xb0(%rdi)
+ movaps %xmm12, 0xc0(%rdi)
+ movaps %xmm13, 0xd0(%rdi)
+ movaps %xmm14, 0xe0(%rdi)
+ movaps %xmm15, 0xf0(%rdi)
+ ret
+1:
+ PROTECTED_CLTS
+ ret
+ SET_SIZE(gcm_accel_save)
+/*
+ * void gcm_accel_restore(void *savestate)
+ *
+ * Restores the saved XMM and CR0.TS state from gcm_accel_save.
+ */
+ENTRY_NP(gcm_accel_restore)
+ movq 0x100(%rdi), %rax
+ testq $CR0_TS, %rax
+ jnz 1f
+ movaps 0x00(%rdi), %xmm0
+ movaps 0x10(%rdi), %xmm1
+ movaps 0x20(%rdi), %xmm2
+ movaps 0x30(%rdi), %xmm3
+ movaps 0x40(%rdi), %xmm4
+ movaps 0x50(%rdi), %xmm5
+ movaps 0x60(%rdi), %xmm6
+ movaps 0x70(%rdi), %xmm7
+ movaps 0x80(%rdi), %xmm8
+ movaps 0x90(%rdi), %xmm9
+ movaps 0xa0(%rdi), %xmm10
+ movaps 0xb0(%rdi), %xmm11
+ movaps 0xc0(%rdi), %xmm12
+ movaps 0xd0(%rdi), %xmm13
+ movaps 0xe0(%rdi), %xmm14
+ movaps 0xf0(%rdi), %xmm15
+ ret
+1:
+ STTS(%rax)
+ ret
+ SET_SIZE(gcm_accel_restore)
+#endif /* _KERNEL */
+
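
Taken together, the save/restore pair is meant to bracket accelerated GCM
work from C roughly as follows. This is a minimal caller-side sketch under
the constraints stated above; the gcm_save_t layout and the example function
are illustrative, not part of this change, while kpreempt_disable() and
kpreempt_enable() are the standard illumos kernel preemption controls:

    /*
     * Illustrative save area matching the layout gcm_accel_save() uses:
     * %xmm0-%xmm15 at offsets 0x00-0xf0 and %cr0 at offset 0x100,
     * 128-bit aligned as required.
     */
    typedef struct gcm_save {
            uint8_t         gs_xmm[16][16]; /* %xmm0-%xmm15 */
            uint64_t        gs_cr0;         /* saved %cr0 */
    } __attribute__((aligned(16))) gcm_save_t;

    extern void gcm_accel_save(void *);
    extern void gcm_accel_restore(void *);
    extern void gcm_mul_pclmulqdq(uint64_t *, uint64_t *, uint64_t *);

    static void
    example_ghash_mul(uint64_t *x, uint64_t *h, uint64_t *res)
    {
            gcm_save_t st;

            kpreempt_disable();             /* no preemption while FPU state is live */
            gcm_accel_save(&st);            /* save XMM state, clear CR0.TS */
            gcm_mul_pclmulqdq(x, h, res);   /* accelerated GF(2^128) multiply */
            gcm_accel_restore(&st);         /* restore XMM state / CR0.TS */
            kpreempt_enable();
    }
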
/*
* void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
*
* Perform a carry-less multiplication (that is, use XOR instead of the
* multiply operator) on P1 and P2 and place the result in P3.
*
* Byte swap the input and the output.
*
- * Note: x_in, y, and res all point to a block of 20-byte numbers
+ * Note: x_in, y, and res all point to a block of 16-byte numbers
* (an array of two 64-bit integers).
*
- * Note2: For kernel code, caller is responsible for ensuring
- * kpreempt_disable() has been called. This is because %xmm registers are
- * not saved/restored. Clear and set the CR0.TS bit on entry and exit,
- * respectively, if TS is set on entry. Otherwise, if TS is not set,
- * save and restore %xmm registers on the stack.
+ * Note2: For kernel code, the caller is responsible for disabling kernel
+ * thread preemption and bracketing this call with gcm_accel_save() and
+ * gcm_accel_restore().
*
* Note3: Original Intel definition:
* void galois_hash_asm(unsigned char *hk, unsigned char *s,
* unsigned char *d, int length)
*
@@ -222,12 +245,10 @@
* Parameter 2: %rsi (copied to %xmm1) y
* Parameter 3: %rdx (result) res
*/
ENTRY_NP(gcm_mul_pclmulqdq)
- CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)
-
//
// Copy Parameters
//
movdqu (%rdi), %xmm0 // P1
movdqu (%rsi), %xmm1 // P2
@@ -328,10 +349,9 @@
//
// Cleanup and Return
//
- SET_TS_OR_POP_XMM_REGISTERS(%r10)
ret
SET_SIZE(gcm_mul_pclmulqdq)
#endif /* lint || __lint */
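
For readers unfamiliar with the operation, the carry-less multiplication that
PCLMULQDQ performs for each 64x64 step can be expressed in portable C as
below. This shift-and-XOR sketch is for illustration only (the function name
clmul_64 is not from this change) and is far slower than the instruction:

    #include <stdint.h>

    /*
     * Carry-less multiply of two 64-bit operands into a 128-bit result:
     * partial products are combined with XOR instead of addition, i.e.
     * multiplication in GF(2)[x].
     */
    static void
    clmul_64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
    {
            uint64_t rhi = 0, rlo = 0;
            int i;

            for (i = 0; i < 64; i++) {
                    if (b & (1ULL << i)) {
                            rlo ^= a << i;
                            if (i != 0)
                                    rhi ^= a >> (64 - i);
                    }
            }
            *hi = rhi;
            *lo = rlo;
    }
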