10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2009 Intel Corporation
24 * All Rights Reserved.
25 */
26 /*
27 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
28 * Use is subject to license terms.
29 */
30
31 /*
32 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
33 * instructions. This file contains an accelerated
34 * Galois Field Multiplication implementation.
35 *
36 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
37 * carry-less multiplication. More information about PCLMULQDQ can be
38 * found at:
39 * http://software.intel.com/en-us/articles/
40 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
41 *
42 */
43
44 /*
45 * ====================================================================
46 * OpenSolaris OS modifications
47 *
48 * This source originates as file galois_hash_asm.c from
49 * Intel Corporation dated September 21, 2009.
73 *
74 * 7. Renamed function and reordered parameters to match OpenSolaris:
75 * Intel interface:
76 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
77 * unsigned char *d, int length)
78 * OpenSolaris OS interface:
79 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
80 * ====================================================================
81 */
82
83
84 #if defined(lint) || defined(__lint)
85
86 #include <sys/types.h>
87
88 /* ARGSUSED */
89 void
90 gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
91 } /* empty on purpose: lint-only stand-in for the assembly routine in the #else branch */
92
93 #else /* lint */
94
95 #include <sys/asm_linkage.h>
96 #include <sys/controlregs.h>
97 #ifdef _KERNEL
98 #include <sys/machprivregs.h>
99 #endif
100
101 #ifdef _KERNEL
102 /*
103 * PROTECTED_CLTS runs CLTS while keeping P2 (%rsi) intact. Under i86xpv
104 * the CLTS macro calls HYPERVISOR_fpu_taskswitch(), which modifies %rsi
105 * when it uses it to pass P2 to syscall.
106 * This also occurs with the STTS macro, but we don't care if
107 * P2 (%rsi) is modified just before function exit.
108 * The CLTS and STTS macros push and pop P1 (%rdi) already.
109 */
110 #ifdef __xpv
111 #define PROTECTED_CLTS \
112 push %rsi; /* P2 is still needed after CLTS */ \
113 CLTS; \
114 pop %rsi
115 #else
116 #define PROTECTED_CLTS /* %rsi is only clobbered under i86xpv */ \
117 CLTS
118 #endif /* __xpv */
119
120 /*
121 * Save %rsp in %rbp, then, if CR0_TS is not set, align the stack and store
122 * %xmm0 - %xmm10 on it; otherwise clear CR0_TS and leave the stack alone.
123 */
124 #define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
125 push %rbp; /* done on both paths */ \
126 mov %rsp, %rbp; /* %rbp keeps the unaligned %rsp for the epilogue */ \
127 movq %cr0, tmpreg; /* tmpreg is tested again by SET_TS_OR_POP_XMM_REGISTERS */ \
128 testq $CR0_TS, tmpreg; \
129 jnz 1f; /* TS set: nothing to save, only clear TS */ \
130 and $-XMM_ALIGN, %rsp; /* movaps below needs an aligned save area */ \
131 sub $[XMM_SIZE * 11], %rsp; /* room for %xmm0 - %xmm10 */ \
132 movaps %xmm0, 160(%rsp); \
133 movaps %xmm1, 144(%rsp); \
134 movaps %xmm2, 128(%rsp); \
135 movaps %xmm3, 112(%rsp); \
136 movaps %xmm4, 96(%rsp); \
137 movaps %xmm5, 80(%rsp); \
138 movaps %xmm6, 64(%rsp); \
139 movaps %xmm7, 48(%rsp); \
140 movaps %xmm8, 32(%rsp); \
141 movaps %xmm9, 16(%rsp); \
142 movaps %xmm10, (%rsp); \
143 jmp 2f; \
144 1: \
145 PROTECTED_CLTS; /* clear TS without losing P2 (%rsi) */ \
146 2:
147
148
149 /*
150 * If CR0_TS was not set above (judged by the CR0 copy still held in tmpreg),
151 * reload %xmm0 - %xmm10 from the stack, otherwise set CR0_TS again.
152 */
153 #define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
154 testq $CR0_TS, tmpreg; /* tmpreg must still hold the CR0 value read on entry */ \
155 jnz 1f; \
156 movaps (%rsp), %xmm10; /* same offsets as in CLEAR_TS_OR_PUSH_XMM_REGISTERS */ \
157 movaps 16(%rsp), %xmm9; \
158 movaps 32(%rsp), %xmm8; \
159 movaps 48(%rsp), %xmm7; \
160 movaps 64(%rsp), %xmm6; \
161 movaps 80(%rsp), %xmm5; \
162 movaps 96(%rsp), %xmm4; \
163 movaps 112(%rsp), %xmm3; \
164 movaps 128(%rsp), %xmm2; \
165 movaps 144(%rsp), %xmm1; \
166 movaps 160(%rsp), %xmm0; \
167 jmp 2f; \
168 1: \
169 STTS(tmpreg); /* TS was set on entry: set it again */ \
170 2: \
171 mov %rbp, %rsp; /* drops the alignment padding and save area, if any */ \
172 pop %rbp
173
174
175 #else /* !_KERNEL: all three macros expand to nothing */
176 #define PROTECTED_CLTS
177 #define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
178 #define SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
179 #endif /* _KERNEL */
180
181 /*
182 * Use this mask to byte-swap a 16-byte integer with the pshufb instruction.
183 */
184
185 // static uint8_t byte_swap16_mask[] = {
186 // 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
187 .text
188 .align XMM_ALIGN /* gcm_mul_pclmulqdq loads the mask with movaps */
189 .Lbyte_swap16_mask:
190 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
191
192
193
194 /*
195 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
196 *
197 * Perform a carry-less multiplication (that is, use XOR instead of the
198 * multiply operator) on P1 and P2 and place the result in P3.
199 *
200 * Byte swap the input and the output.
201 *
202 * Note: x_in, y, and res all point to a block of 16-byte numbers
203 * (an array of two 64-bit integers).
204 *
205 * Note2: For kernel code, caller is responsible for ensuring
206 * kpreempt_disable() has been called. This is because %xmm registers are
207 * not saved/restored. Clear and set the CR0.TS bit on entry and exit,
208 * respectively, if TS is set on entry. Otherwise, if TS is not set,
209 * save and restore %xmm registers on the stack.
210 *
211 * Note3: Original Intel definition:
212 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
213 * unsigned char *d, int length)
214 *
215 * Note4: Register/parameter mapping:
216 * Intel:
217 * Parameter 1: %rcx (copied to %xmm0) hk or x_in
218 * Parameter 2: %rdx (copied to %xmm1) s or y
219 * Parameter 3: %rdi (result) d or res
220 * OpenSolaris:
221 * Parameter 1: %rdi (copied to %xmm0) x_in
222 * Parameter 2: %rsi (copied to %xmm1) y
223 * Parameter 3: %rdx (result) res
224 */
225
226 ENTRY_NP(gcm_mul_pclmulqdq)
227 CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)
228
229 //
230 // Copy Parameters
231 //
232 movdqu (%rdi), %xmm0 // P1
233 movdqu (%rsi), %xmm1 // P2
234
235 //
236 // Byte swap 16-byte input
237 //
238 lea .Lbyte_swap16_mask(%rip), %rax
239 movaps (%rax), %xmm10
240 pshufb %xmm10, %xmm0
241 pshufb %xmm10, %xmm1
242
243
244 //
245 // Multiply with the hash key
246 //
247 movdqu %xmm0, %xmm3
248 pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0
313 pxor %xmm4, %xmm2 // xor the shifted versions
314 pxor %xmm5, %xmm2
315 pxor %xmm8, %xmm2
316 pxor %xmm2, %xmm3
317 pxor %xmm3, %xmm6 // the result is in xmm6
318
319 //
320 // Byte swap 16-byte result
321 //
322 pshufb %xmm10, %xmm6 // %xmm10 has the swap mask
323
324 //
325 // Store the result
326 //
327 movdqu %xmm6, (%rdx) // P3
328
329
330 //
331 // Cleanup and Return
332 //
333 SET_TS_OR_POP_XMM_REGISTERS(%r10)
334 ret
335 SET_SIZE(gcm_mul_pclmulqdq)
336
337 #endif /* lint || __lint */
|
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2009 Intel Corporation
24 * All Rights Reserved.
25 */
26 /*
27 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
28 * Use is subject to license terms.
29 */
30 /*
31 * Copyright 2015 by Saso Kiselkov. All rights reserved.
32 */
33
34 /*
35 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
36 * instructions. This file contains an accelerated
37 * Galois Field Multiplication implementation.
38 *
39 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
40 * carry-less multiplication. More information about PCLMULQDQ can be
41 * found at:
42 * http://software.intel.com/en-us/articles/
43 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
44 *
45 */
46
47 /*
48 * ====================================================================
49 * OpenSolaris OS modifications
50 *
51 * This source originates as file galois_hash_asm.c from
52 * Intel Corporation dated September 21, 2009.
76 *
77 * 7. Renamed function and reordered parameters to match OpenSolaris:
78 * Intel interface:
79 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
80 * unsigned char *d, int length)
81 * OpenSolaris OS interface:
82 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
83 * ====================================================================
84 */
85
86
87 #if defined(lint) || defined(__lint)
88
89 #include <sys/types.h>
90
91 /* ARGSUSED */
92 void
93 gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
94 } /* empty on purpose: lint-only stand-in for the assembly routine in the #else branch */
95
96 #ifdef _KERNEL
97 /*ARGSUSED*/
98 void
99 gcm_accel_save(void *savestate)
100 {
101 } /* empty on purpose: lint-only stand-in for ENTRY_NP(gcm_accel_save) in the #else branch */
102
103 /*ARGSUSED*/
104 void
105 gcm_accel_restore(void *savestate)
106 {
107 } /* empty on purpose: lint-only stand-in for ENTRY_NP(gcm_accel_restore) in the #else branch */
108 #endif /* _KERNEL */
109
110 #else /* lint */
111
112 #include <sys/asm_linkage.h>
113 #include <sys/controlregs.h>
114 #ifdef _KERNEL
115 #include <sys/machprivregs.h>
116 #endif
117
118 #ifdef _KERNEL
119 /*
120 * PROTECTED_CLTS runs CLTS while keeping P2 (%rsi) intact. Under i86xpv
121 * the CLTS macro calls HYPERVISOR_fpu_taskswitch(), which modifies %rsi
122 * when it uses it to pass P2 to syscall.
123 * This also occurs with the STTS macro, but we don't care if
124 * P2 (%rsi) is modified just before function exit.
125 * The CLTS and STTS macros push and pop P1 (%rdi) already.
126 */
127 #ifdef __xpv
128 #define PROTECTED_CLTS \
129 push %rsi; /* keep P2 across the CLTS macro */ \
130 CLTS; \
131 pop %rsi
132 #else
133 #define PROTECTED_CLTS /* %rsi is only clobbered under i86xpv */ \
134 CLTS
135 #endif /* __xpv */
136 #endif /* _KERNEL */
137
138 .text
139 .align XMM_ALIGN /* gcm_mul_pclmulqdq loads the mask with movaps */
140 /*
141 * Use this mask to byte-swap a 16-byte integer with the pshufb instruction:
142 * static uint8_t byte_swap16_mask[] = {
143 * 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
144 */
145 .Lbyte_swap16_mask:
146 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
147
148 #ifdef _KERNEL
149 /*
150 * void gcm_accel_save(void *savestate)
151 *
152 * Saves CR0 at offset 0x100 of the area pointed to by the first argument.
153 * If CR0.TS is clear, XMM0--XMM15 are also saved there (offsets 0x00--0xf0);
154 * if it is set, TS is cleared instead. This must be invoked before executing
155 * accelerated GCM computations inside the kernel (and kernel thread
156 * preemption must be disabled as well). The save area must be at least
157 * 16x 128-bit + 64-bit long and must be 128-bit aligned.
158 */
159 ENTRY_NP(gcm_accel_save)
160 movq %cr0, %rax
161 movq %rax, 0x100(%rdi) /* gcm_accel_restore() tests this saved CR0 */
162 testq $CR0_TS, %rax
163 jnz 1f /* TS set: skip the XMM save, only clear TS */
164 /* FPU is in use, save registers */
165 movaps %xmm0, 0x00(%rdi) /* aligned stores: savestate must be 128-bit aligned */
166 movaps %xmm1, 0x10(%rdi)
167 movaps %xmm2, 0x20(%rdi)
168 movaps %xmm3, 0x30(%rdi)
169 movaps %xmm4, 0x40(%rdi)
170 movaps %xmm5, 0x50(%rdi)
171 movaps %xmm6, 0x60(%rdi)
172 movaps %xmm7, 0x70(%rdi)
173 movaps %xmm8, 0x80(%rdi)
174 movaps %xmm9, 0x90(%rdi)
175 movaps %xmm10, 0xa0(%rdi)
176 movaps %xmm11, 0xb0(%rdi)
177 movaps %xmm12, 0xc0(%rdi)
178 movaps %xmm13, 0xd0(%rdi)
179 movaps %xmm14, 0xe0(%rdi)
180 movaps %xmm15, 0xf0(%rdi)
181 ret
182 1:
183 PROTECTED_CLTS /* clear CR0.TS */
184 ret
185 SET_SIZE(gcm_accel_save)
186
187 /*
188 * void gcm_accel_restore(void *savestate)
189 *
190 * Restores the saved XMM and CR0.TS state from gcm_accel_save.
191 */
192 ENTRY_NP(gcm_accel_restore)
193 movq 0x100(%rdi), %rax /* CR0 value stored by gcm_accel_save() */
194 testq $CR0_TS, %rax
195 jnz 1f /* TS was set at save time: no XMM state to reload */
196 movaps 0x00(%rdi), %xmm0 /* same slot layout as in gcm_accel_save() */
197 movaps 0x10(%rdi), %xmm1
198 movaps 0x20(%rdi), %xmm2
199 movaps 0x30(%rdi), %xmm3
200 movaps 0x40(%rdi), %xmm4
201 movaps 0x50(%rdi), %xmm5
202 movaps 0x60(%rdi), %xmm6
203 movaps 0x70(%rdi), %xmm7
204 movaps 0x80(%rdi), %xmm8
205 movaps 0x90(%rdi), %xmm9
206 movaps 0xa0(%rdi), %xmm10
207 movaps 0xb0(%rdi), %xmm11
208 movaps 0xc0(%rdi), %xmm12
209 movaps 0xd0(%rdi), %xmm13
210 movaps 0xe0(%rdi), %xmm14
211 movaps 0xf0(%rdi), %xmm15
212 ret
213 1:
214 STTS(%rax) /* set CR0.TS again */
215 ret
216 SET_SIZE(gcm_accel_restore)
217
218 #endif /* _KERNEL */
219
220 /*
221 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
222 *
223 * Perform a carry-less multiplication (that is, use XOR instead of the
224 * multiply operator) on P1 and P2 and place the result in P3.
225 *
226 * Byte swap the input and the output.
227 *
228 * Note: x_in, y, and res all point to a block of 16-byte numbers
229 * (an array of two 64-bit integers).
230 *
231 * Note2: For kernel code, caller is responsible for bracketing this call with
232 * disabling kernel thread preemption and calling gcm_accel_save/restore().
233 *
234 * Note3: Original Intel definition:
235 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
236 * unsigned char *d, int length)
237 *
238 * Note4: Register/parameter mapping:
239 * Intel:
240 * Parameter 1: %rcx (copied to %xmm0) hk or x_in
241 * Parameter 2: %rdx (copied to %xmm1) s or y
242 * Parameter 3: %rdi (result) d or res
243 * OpenSolaris:
244 * Parameter 1: %rdi (copied to %xmm0) x_in
245 * Parameter 2: %rsi (copied to %xmm1) y
246 * Parameter 3: %rdx (result) res
247 */
248
249 ENTRY_NP(gcm_mul_pclmulqdq)
250 //
251 // Copy Parameters
252 //
253 movdqu (%rdi), %xmm0 // P1
254 movdqu (%rsi), %xmm1 // P2
255
256 //
257 // Byte swap 16-byte input
258 //
259 lea .Lbyte_swap16_mask(%rip), %rax
260 movaps (%rax), %xmm10
261 pshufb %xmm10, %xmm0
262 pshufb %xmm10, %xmm1
263
264
265 //
266 // Multiply with the hash key
267 //
268 movdqu %xmm0, %xmm3
269 pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0
334 pxor %xmm4, %xmm2 // xor the shifted versions
335 pxor %xmm5, %xmm2
336 pxor %xmm8, %xmm2
337 pxor %xmm2, %xmm3
338 pxor %xmm3, %xmm6 // the result is in xmm6
339
340 //
341 // Byte swap 16-byte result
342 //
343 pshufb %xmm10, %xmm6 // %xmm10 has the swap mask
344
345 //
346 // Store the result
347 //
348 movdqu %xmm6, (%rdx) // P3
349
350
351 //
352 // Cleanup and Return
353 //
354 ret
355 SET_SIZE(gcm_mul_pclmulqdq)
356
357 #endif /* lint || __lint */
|