4896 Performance improvements for KCF AES modes
--- old/usr/src/common/crypto/modes/amd64/gcm_intel.s
+++ new/usr/src/common/crypto/modes/amd64/gcm_intel.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2009 Intel Corporation
24 24 * All Rights Reserved.
25 25 */
26 26 /*
27 27 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
28 28 * Use is subject to license terms.
29 29 */
30 +/*
31 + * Copyright 2015 by Saso Kiselkov. All rights reserved.
32 + */
30 33
31 34 /*
32 35 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
33 36 * instructions. This file contains an accelerated
34 37 * Galois Field Multiplication implementation.
35 38 *
36 39 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
37 40 * carry-less multiplication. More information about PCLMULQDQ can be
38 41 * found at:
39 42 * http://software.intel.com/en-us/articles/
40 43 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
41 44 *
42 45 */
43 46
44 47 /*
45 48 * ====================================================================
46 49 * OpenSolaris OS modifications
47 50 *
48 51 * This source originates as file galois_hash_asm.c from
49 52 * Intel Corporation dated September 21, 2009.
50 53 *
51 54 * This OpenSolaris version has these major changes from the original source:
52 55 *
53 56 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
54 57 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
55 58 * definition for lint.
56 59 *
57 60 * 2. Formatted code, added comments, and added #includes and #defines.
58 61 *
59 62 * 3. If bit CR0.TS is set, clear it after kpreempt_disable() is called
60 63 *    and set it again before kpreempt_enable() is called.
61 64 *    If the TS bit is not set, save and restore the %xmm registers at the
62 65 *    beginning and end of function calls (%xmm* registers are not saved
63 66 *    and restored during kernel thread preemption).
64 67 *
65 68 * 4. Removed code to perform hashing. This is already done with C macro
66 69 * GHASH in gcm.c. For better performance, this removed code should be
67 70 * reintegrated in the future to replace the C GHASH macro.
68 71 *
69 72 * 5. Added code to byte swap 16-byte input and output.
70 73 *
71 74 * 6. Folded in comments from the original C source with embedded assembly
72 75 * (SB_w_shift_xor.c)
73 76 *
74 77 * 7. Renamed function and reordered parameters to match OpenSolaris:
75 78 * Intel interface:
76 79 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
77 80 * unsigned char *d, int length)
78 81 * OpenSolaris OS interface:
79 82 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
80 83 * ====================================================================
81 84 */
82 85
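Modification note 4 above points out that the hashing loop itself stays in the C GHASH macro in gcm.c and that only the Galois-field multiplication is implemented here. As a rough sketch only (not the actual gcm.c macro; the helper name ghash_step is made up for illustration), one GHASH step amounts to XORing the next 16-byte block into the running hash and then multiplying by the hash subkey H:

```c
#include <sys/types.h>

/*
 * Illustrative sketch only -- not the gcm.c GHASH macro. One GHASH step:
 * ghash <- (ghash ^ block) * H over GF(2^128), where the multiplication
 * is the operation gcm_mul_pclmulqdq() below implements.
 */
extern void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);

static void
ghash_step(uint64_t ghash[2], const uint64_t block[2], uint64_t H[2])
{
	ghash[0] ^= block[0];		/* fold the next block into the hash */
	ghash[1] ^= block[1];
	/* in-place output is fine: the asm loads both inputs before storing */
	gcm_mul_pclmulqdq(ghash, H, ghash);
}
```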
83 86
84 87 #if defined(lint) || defined(__lint)
85 88
86 89 #include <sys/types.h>
87 90
88 91 /* ARGSUSED */
89 92 void
90 93 gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
91 94 }
92 95
96 +#ifdef _KERNEL
97 +/*ARGSUSED*/
98 +void
99 +gcm_accel_save(void *savestate)
100 +{
101 +}
102 +
103 +/*ARGSUSED*/
104 +void
105 +gcm_accel_restore(void *savestate)
106 +{
107 +}
108 +#endif /* _KERNEL */
109 +
93 110 #else /* lint */
94 111
95 112 #include <sys/asm_linkage.h>
96 113 #include <sys/controlregs.h>
97 114 #ifdef _KERNEL
98 115 #include <sys/machprivregs.h>
99 116 #endif
100 117
101 118 #ifdef _KERNEL
102 119 /*
103 120 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is,
104 121 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
105 122 * uses it to pass P2 to syscall.
106 123 * This also occurs with the STTS macro, but we don't care if
107 124 * P2 (%rsi) is modified just before function exit.
108 125 * The CLTS and STTS macros push and pop P1 (%rdi) already.
109 126 */
110 127 #ifdef __xpv
111 128 #define PROTECTED_CLTS \
112 129 push %rsi; \
113 130 CLTS; \
114 131 pop %rsi
115 132 #else
116 133 #define PROTECTED_CLTS \
117 134 CLTS
118 135 #endif /* __xpv */
119 -
120 - /*
121 - * If CR0_TS is not set, align stack (with push %rbp) and push
122 - * %xmm0 - %xmm10 on stack, otherwise clear CR0_TS
123 - */
124 -#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
125 - push %rbp; \
126 - mov %rsp, %rbp; \
127 - movq %cr0, tmpreg; \
128 - testq $CR0_TS, tmpreg; \
129 - jnz 1f; \
130 - and $-XMM_ALIGN, %rsp; \
131 - sub $[XMM_SIZE * 11], %rsp; \
132 - movaps %xmm0, 160(%rsp); \
133 - movaps %xmm1, 144(%rsp); \
134 - movaps %xmm2, 128(%rsp); \
135 - movaps %xmm3, 112(%rsp); \
136 - movaps %xmm4, 96(%rsp); \
137 - movaps %xmm5, 80(%rsp); \
138 - movaps %xmm6, 64(%rsp); \
139 - movaps %xmm7, 48(%rsp); \
140 - movaps %xmm8, 32(%rsp); \
141 - movaps %xmm9, 16(%rsp); \
142 - movaps %xmm10, (%rsp); \
143 - jmp 2f; \
144 -1: \
145 - PROTECTED_CLTS; \
146 -2:
147 -
148 -
149 - /*
150 - * If CR0_TS was not set above, pop %xmm0 - %xmm10 off stack,
151 - * otherwise set CR0_TS.
152 - */
153 -#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
154 - testq $CR0_TS, tmpreg; \
155 - jnz 1f; \
156 - movaps (%rsp), %xmm10; \
157 - movaps 16(%rsp), %xmm9; \
158 - movaps 32(%rsp), %xmm8; \
159 - movaps 48(%rsp), %xmm7; \
160 - movaps 64(%rsp), %xmm6; \
161 - movaps 80(%rsp), %xmm5; \
162 - movaps 96(%rsp), %xmm4; \
163 - movaps 112(%rsp), %xmm3; \
164 - movaps 128(%rsp), %xmm2; \
165 - movaps 144(%rsp), %xmm1; \
166 - movaps 160(%rsp), %xmm0; \
167 - jmp 2f; \
168 -1: \
169 - STTS(tmpreg); \
170 -2: \
171 - mov %rbp, %rsp; \
172 - pop %rbp
173 -
174 -
175 -#else
176 -#define PROTECTED_CLTS
177 -#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
178 -#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
179 136 #endif /* _KERNEL */
180 137
181 -/*
182 - * Use this mask to byte-swap a 16-byte integer with the pshufb instruction
183 - */
184 -
185 -// static uint8_t byte_swap16_mask[] = {
186 -// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
187 138 .text
188 139 .align XMM_ALIGN
140 +/*
141 + * Use this mask to byte-swap a 16-byte integer with the pshufb instruction:
142 + * static uint8_t byte_swap16_mask[] = {
143 + * 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
144 + */
189 145 .Lbyte_swap16_mask:
190 146 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
191 147
148 +#ifdef _KERNEL
149 +/*
150 + * void gcm_accel_save(void *savestate)
151 + *
152 + * Saves the XMM0-XMM15 registers and CR0 to a temporary location pointed
153 + * to by the first argument and clears TS in CR0. This must be invoked before
154 + * executing accelerated GCM computations inside the kernel (and kernel
155 + * thread preemption must be disabled as well). The memory region to which
156 + * all state is saved must be at least 16x 128-bit + 64-bit long and must
157 + * be 128-bit aligned.
158 + */
159 +ENTRY_NP(gcm_accel_save)
160 + movq %cr0, %rax
161 + movq %rax, 0x100(%rdi)
162 + testq $CR0_TS, %rax
163 + jnz 1f
164 + /* FPU is in use, save registers */
165 + movaps %xmm0, 0x00(%rdi)
166 + movaps %xmm1, 0x10(%rdi)
167 + movaps %xmm2, 0x20(%rdi)
168 + movaps %xmm3, 0x30(%rdi)
169 + movaps %xmm4, 0x40(%rdi)
170 + movaps %xmm5, 0x50(%rdi)
171 + movaps %xmm6, 0x60(%rdi)
172 + movaps %xmm7, 0x70(%rdi)
173 + movaps %xmm8, 0x80(%rdi)
174 + movaps %xmm9, 0x90(%rdi)
175 + movaps %xmm10, 0xa0(%rdi)
176 + movaps %xmm11, 0xb0(%rdi)
177 + movaps %xmm12, 0xc0(%rdi)
178 + movaps %xmm13, 0xd0(%rdi)
179 + movaps %xmm14, 0xe0(%rdi)
180 + movaps %xmm15, 0xf0(%rdi)
181 + ret
182 +1:
183 + PROTECTED_CLTS
184 + ret
185 + SET_SIZE(gcm_accel_save)
192 186
187 +/*
188 + * void gcm_accel_restore(void *savestate)
189 + *
190 + * Restores the saved XMM and CR0.TS state from gcm_accel_save().
191 + */
192 +ENTRY_NP(gcm_accel_restore)
193 + movq 0x100(%rdi), %rax
194 + testq $CR0_TS, %rax
195 + jnz 1f
196 + movaps 0x00(%rdi), %xmm0
197 + movaps 0x10(%rdi), %xmm1
198 + movaps 0x20(%rdi), %xmm2
199 + movaps 0x30(%rdi), %xmm3
200 + movaps 0x40(%rdi), %xmm4
201 + movaps 0x50(%rdi), %xmm5
202 + movaps 0x60(%rdi), %xmm6
203 + movaps 0x70(%rdi), %xmm7
204 + movaps 0x80(%rdi), %xmm8
205 + movaps 0x90(%rdi), %xmm9
206 + movaps 0xa0(%rdi), %xmm10
207 + movaps 0xb0(%rdi), %xmm11
208 + movaps 0xc0(%rdi), %xmm12
209 + movaps 0xd0(%rdi), %xmm13
210 + movaps 0xe0(%rdi), %xmm14
211 + movaps 0xf0(%rdi), %xmm15
212 + ret
213 +1:
214 + STTS(%rax)
215 + ret
216 + SET_SIZE(gcm_accel_restore)
193 217
218 +#endif /* _KERNEL */
219 +
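As the comments above (and Note2 below) describe, kernel callers must disable preemption and bracket accelerated GCM work with gcm_accel_save()/gcm_accel_restore(). A hypothetical caller-side sketch, not part of this patch (the wrapper name, the header choice, and the 264-byte save area are illustrative assumptions), might look like:

```c
#ifdef _KERNEL
#include <sys/types.h>
#include <sys/disp.h>		/* kpreempt_disable()/kpreempt_enable() */

extern void gcm_accel_save(void *savestate);
extern void gcm_accel_restore(void *savestate);
extern void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);

/* Illustrative wrapper: 16 XMM registers (16 bytes each) plus saved %cr0. */
static void
gcm_mul_fpu_safe(uint64_t *x, uint64_t *y, uint64_t *res)
{
	/* 16 * 16 + 8 = 264 bytes, 16-byte aligned as the comment requires */
	uint8_t fpu_save[16 * 16 + 8] __attribute__((aligned(16)));

	kpreempt_disable();		/* no thread switch while FPU is live */
	gcm_accel_save(fpu_save);	/* save %cr0 and, if the FPU was in
					   use, %xmm0-%xmm15; else clear TS */
	gcm_mul_pclmulqdq(x, y, res);
	gcm_accel_restore(fpu_save);	/* restore %xmm state or set TS back */
	kpreempt_enable();
}
#endif	/* _KERNEL */
```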
194 220 /*
195 221 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
196 222 *
197 223 * Perform a carry-less multiplication (combining partial products with
198 224 * XOR instead of addition) on P1 and P2 and place the result in P3.
199 225 *
200 226 * Byte swap the input and the output.
201 227 *
202 - * Note: x_in, y, and res all point to a block of 20-byte numbers
228 + * Note: x_in, y, and res all point to a block of 16-byte numbers
203 229 * (an array of two 64-bit integers).
204 230 *
205 - * Note2: For kernel code, caller is responsible for ensuring
206 - * kpreempt_disable() has been called. This is because %xmm registers are
207 - * not saved/restored. Clear and set the CR0.TS bit on entry and exit,
208 - * respectively, if TS is set on entry. Otherwise, if TS is not set,
209 - * save and restore %xmm registers on the stack.
231 + * Note2: For kernel code, caller is responsible for bracketing this call with
232 + * disabling kernel thread preemption and calling gcm_accel_save/restore().
210 233 *
211 234 * Note3: Original Intel definition:
212 235 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
213 236 * unsigned char *d, int length)
214 237 *
215 238 * Note4: Register/parameter mapping:
216 239 * Intel:
217 240 * Parameter 1: %rcx (copied to %xmm0) hk or x_in
218 241 * Parameter 2: %rdx (copied to %xmm1) s or y
219 242 * Parameter 3: %rdi (result) d or res
220 243 * OpenSolaris:
221 244 * Parameter 1: %rdi (copied to %xmm0) x_in
222 245 * Parameter 2: %rsi (copied to %xmm1) y
223 246 * Parameter 3: %rdx (result) res
224 247 */
225 248
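Before the assembly body, a plain-C model may help clarify what each pclmulqdq below computes: a 64x64 -> 128-bit multiplication in which partial products are combined with XOR rather than added. The routine builds the full 128x128-bit product from four such products (a0*b0, a0*b1, a1*b0 and a1*b1, per its own comments); the sketch below mirrors that schoolbook decomposition (helper names are made up for illustration, not part of this file):

```c
#include <sys/types.h>

/*
 * Bit-by-bit model of PCLMULQDQ: carry-less multiply of two 64-bit
 * operands, producing a 128-bit result in (*hi, *lo). Partial products
 * are folded in with XOR, so no carries propagate between bit positions.
 */
static void
clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t h = 0, l = 0;
	int i;

	for (i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			l ^= a << i;
			if (i != 0)
				h ^= a >> (64 - i);
		}
	}
	*hi = h;
	*lo = l;
}

/*
 * 128x128-bit carry-less multiply by schoolbook decomposition, mirroring
 * the four pclmulqdq instructions below: the middle terms a0*b1 and a1*b0
 * are XORed together and split across the 256-bit result r[3]:r[0].
 */
static void
clmul128(const uint64_t a[2], const uint64_t b[2], uint64_t r[4])
{
	uint64_t p00h, p00l, p01h, p01l, p10h, p10l, p11h, p11l;

	clmul64(a[0], b[0], &p00h, &p00l);	/* a0*b0 */
	clmul64(a[0], b[1], &p01h, &p01l);	/* a0*b1 */
	clmul64(a[1], b[0], &p10h, &p10l);	/* a1*b0 */
	clmul64(a[1], b[1], &p11h, &p11l);	/* a1*b1 */

	r[0] = p00l;
	r[1] = p00h ^ p01l ^ p10l;	/* low half of the middle terms */
	r[2] = p11l ^ p01h ^ p10h;	/* high half of the middle terms */
	r[3] = p11h;
}
```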
226 249 ENTRY_NP(gcm_mul_pclmulqdq)
227 - CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)
228 -
229 250 //
230 251 // Copy Parameters
231 252 //
232 253 movdqu (%rdi), %xmm0 // P1
233 254 movdqu (%rsi), %xmm1 // P2
234 255
235 256 //
236 257 // Byte swap 16-byte input
237 258 //
238 259 lea .Lbyte_swap16_mask(%rip), %rax
239 260 movaps (%rax), %xmm10
240 261 pshufb %xmm10, %xmm0
241 262 pshufb %xmm10, %xmm1
242 263
243 264
244 265 //
245 266 // Multiply with the hash key
246 267 //
247 268 movdqu %xmm0, %xmm3
248 269 pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0
249 270
250 271 movdqu %xmm0, %xmm4
251 272 pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1
252 273
253 274 movdqu %xmm0, %xmm5
254 275 pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0
255 276 movdqu %xmm0, %xmm6
256 277 pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1
257 278
258 279 pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0
259 280
260 281 movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5
261 282 psrldq $8, %xmm4 // shift xmm4 right by 64 bits
262 283 pslldq $8, %xmm5 // shift xmm5 left by 64 bits
263 284 pxor %xmm5, %xmm3
264 285 pxor %xmm4, %xmm6 // Register pair <xmm6:xmm3> holds the result
265 286 // of the carry-less multiplication of
266 287 // xmm0 by xmm1.
267 288
268 289 // We shift the result of the multiplication by one bit position
269 290 // to the left to account for the fact that the bits are reversed.
270 291 movdqu %xmm3, %xmm7
271 292 movdqu %xmm6, %xmm8
272 293 pslld $1, %xmm3
273 294 pslld $1, %xmm6
274 295 psrld $31, %xmm7
275 296 psrld $31, %xmm8
276 297 movdqu %xmm7, %xmm9
277 298 pslldq $4, %xmm8
278 299 pslldq $4, %xmm7
279 300 psrldq $12, %xmm9
280 301 por %xmm7, %xmm3
281 302 por %xmm8, %xmm6
282 303 por %xmm9, %xmm6
283 304
284 305 //
285 306 // First phase of the reduction
286 307 //
287 308 // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
288 309 // independently.
289 310 movdqu %xmm3, %xmm7
290 311 movdqu %xmm3, %xmm8
291 312 movdqu %xmm3, %xmm9
292 313 pslld $31, %xmm7 // packed right shift shifting << 31
293 314 pslld $30, %xmm8 // packed right shift shifting << 30
294 315 pslld $25, %xmm9 // packed right shift shifting << 25
295 316 pxor %xmm8, %xmm7 // xor the shifted versions
296 317 pxor %xmm9, %xmm7
297 318 movdqu %xmm7, %xmm8
298 319 pslldq $12, %xmm7
299 320 psrldq $4, %xmm8
300 321 pxor %xmm7, %xmm3 // first phase of the reduction complete
301 322
302 323 //
303 324 // Second phase of the reduction
304 325 //
305 326 // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
306 327 // shift operations.
307 328 movdqu %xmm3, %xmm2
308 329 movdqu %xmm3, %xmm4 // packed left shifting >> 1
309 330 movdqu %xmm3, %xmm5
310 331 psrld $1, %xmm2
311 332 psrld $2, %xmm4 // packed left shifting >> 2
312 333 psrld $7, %xmm5 // packed left shifting >> 7
313 334 pxor %xmm4, %xmm2 // xor the shifted versions
314 335 pxor %xmm5, %xmm2
315 336 pxor %xmm8, %xmm2
316 337 pxor %xmm2, %xmm3
317 338 pxor %xmm3, %xmm6 // the result is in xmm6
318 339
319 340 //
320 341 // Byte swap 16-byte result
321 342 //
322 343 pshufb %xmm10, %xmm6 // %xmm10 has the swap mask
323 344
324 345 //
325 346 // Store the result
326 347 //
327 348 movdqu %xmm6, (%rdx) // P3
328 349
329 350
330 351 //
331 352 // Cleanup and Return
332 353 //
333 - SET_TS_OR_POP_XMM_REGISTERS(%r10)
334 354 ret
335 355 SET_SIZE(gcm_mul_pclmulqdq)
336 356
337 357 #endif /* lint || __lint */
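The two "phase of the reduction" blocks in the function above fold the 256-bit carry-less product back into 128 bits modulo the GHASH polynomial g(x) = x^128 + x^7 + x^2 + x + 1; the hand-tuned shift constants also account for GCM's bit-reflected representation. Ignoring that reflection, a naive bit-at-a-time model of the same reduction (illustrative only; the function name is made up) is:

```c
#include <sys/types.h>

/*
 * Naive model of reducing a 256-bit carry-less product r[3]:r[2]:r[1]:r[0]
 * (little-endian 64-bit words) modulo g(x) = x^128 + x^7 + x^2 + x + 1.
 * Each set bit at position i >= 128 is cancelled by XORing in g(x)*x^(i-128).
 * The pclmulqdq code above achieves the same effect with a fixed two-phase
 * shift-and-XOR sequence, operating in the bit-reflected domain.
 */
static void
gf128_reduce(uint64_t r[4])
{
	static const int g_bits[] = { 128, 7, 2, 1, 0 };	/* bits of g(x) */
	int i, j;

	for (i = 255; i >= 128; i--) {
		if ((r[i / 64] >> (i % 64)) & 1) {
			for (j = 0; j < 5; j++) {
				int k = g_bits[j] + (i - 128);
				r[k / 64] ^= 1ULL << (k % 64);
			}
		}
	}
	/* the reduced result is now in r[1]:r[0]; r[3] and r[2] are zero */
}
```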