1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2009 Intel Corporation
24 * All Rights Reserved.
25 */
26 /*
27 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
28 * Use is subject to license terms.
29 */
30 /*
31 * Copyright 2015 by Saso Kiselkov. All rights reserved.
32 */
33
34 /*
35 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
36 * instructions. This file contains an accelerated
37 * Galois Field Multiplication implementation.
38 *
39 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
40 * carry-less multiplication. More information about PCLMULQDQ can be
41 * found at:
42 * http://software.intel.com/en-us/articles/
43 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
44 *
45 */
46
47 /*
48 * ====================================================================
49 * OpenSolaris OS modifications
50 *
51 * This source originates as file galois_hash_asm.c from
52 * Intel Corporation dated September 21, 2009.
53 *
54 * This OpenSolaris version has these major changes from the original source:
55 *
56 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
57 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
58 * definition for lint.
59 *
60 * 2. Formatted code, added comments, and added #includes and #defines.
61 *
 * 3. If bit CR0.TS is set, clear the TS bit after calling
 * kpreempt_disable() and set it again before calling kpreempt_enable().
 * If the TS bit is not set, save and restore the %xmm registers at the
 * beginning and end of function calls (the %xmm* registers are not saved
 * and restored during kernel thread preemption).
67 *
68 * 4. Removed code to perform hashing. This is already done with C macro
69 * GHASH in gcm.c. For better performance, this removed code should be
70 * reintegrated in the future to replace the C GHASH macro.
71 *
72 * 5. Added code to byte swap 16-byte input and output.
73 *
74 * 6. Folded in comments from the original C source with embedded assembly
 * (SB_w_shift_xor.c).
76 *
77 * 7. Renamed function and reordered parameters to match OpenSolaris:
78 * Intel interface:
79 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
80 * unsigned char *d, int length)
81 * OpenSolaris OS interface:
82 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
83 * ====================================================================
84 */
85
86
87 #if defined(lint) || defined(__lint)
88
89 #include <sys/types.h>
90
/*ARGSUSED*/
void
gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res)
{
}
95
96 #ifdef _KERNEL
97 /*ARGSUSED*/
98 void
gcm_accel_save(void *savestate)
100 {
101 }
102
103 /*ARGSUSED*/
104 void
105 gcm_accel_restore(void *savestate)
106 {
107 }
108 #endif /* _KERNEL */
109
110 #else /* lint */
111
112 #include <sys/asm_linkage.h>
113 #include <sys/controlregs.h>
114 #ifdef _KERNEL
115 #include <sys/machprivregs.h>
116 #endif
117
118 #ifdef _KERNEL
119 /*
120 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is,
121 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
122 * uses it to pass P2 to syscall.
123 * This also occurs with the STTS macro, but we don't care if
124 * P2 (%rsi) is modified just before function exit.
125 * The CLTS and STTS macros push and pop P1 (%rdi) already.
126 */
127 #ifdef __xpv
128 #define PROTECTED_CLTS \
129 push %rsi; \
130 CLTS; \
131 pop %rsi
132 #else
133 #define PROTECTED_CLTS \
134 CLTS
135 #endif /* __xpv */
136 #endif /* _KERNEL */
137
138 .text
139 .align XMM_ALIGN
140 /*
141 * Use this mask to byte-swap a 16-byte integer with the pshufb instruction:
142 * static uint8_t byte_swap16_mask[] = {
143 * 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
144 */
145 .Lbyte_swap16_mask:
146 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
147
148 #ifdef _KERNEL
149 /*
 * void gcm_accel_save(void *savestate)
 *
 * Saves %cr0 and, when the FPU is in use (CR0.TS clear), the %xmm0-%xmm15
 * registers to the temporary location pointed to by the first argument;
 * when CR0.TS is set, it instead clears TS so the FPU may be used. This
 * must be invoked before executing accelerated GCM computations inside
 * the kernel (and kernel thread preemption must be disabled as well).
 * The memory region to which all state is saved must be at least
 * 16 x 128 bits plus 64 bits long and must be 128-bit aligned.
158 */
159 ENTRY_NP(gcm_accel_save)
160 movq %cr0, %rax
161 movq %rax, 0x100(%rdi)
162 testq $CR0_TS, %rax
163 jnz 1f
164 /* FPU is in use, save registers */
165 movaps %xmm0, 0x00(%rdi)
166 movaps %xmm1, 0x10(%rdi)
167 movaps %xmm2, 0x20(%rdi)
168 movaps %xmm3, 0x30(%rdi)
169 movaps %xmm4, 0x40(%rdi)
170 movaps %xmm5, 0x50(%rdi)
171 movaps %xmm6, 0x60(%rdi)
172 movaps %xmm7, 0x70(%rdi)
173 movaps %xmm8, 0x80(%rdi)
174 movaps %xmm9, 0x90(%rdi)
175 movaps %xmm10, 0xa0(%rdi)
176 movaps %xmm11, 0xb0(%rdi)
177 movaps %xmm12, 0xc0(%rdi)
178 movaps %xmm13, 0xd0(%rdi)
179 movaps %xmm14, 0xe0(%rdi)
180 movaps %xmm15, 0xf0(%rdi)
181 ret
182 1:
183 PROTECTED_CLTS
184 ret
185 SET_SIZE(gcm_accel_save)
186
187 /*
188 * void gcm_accel_restore(void *savestate)
189 *
 * Restores the %xmm and CR0.TS state saved by gcm_accel_save().
191 */
192 ENTRY_NP(gcm_accel_restore)
193 movq 0x100(%rdi), %rax
194 testq $CR0_TS, %rax
195 jnz 1f
196 movaps 0x00(%rdi), %xmm0
197 movaps 0x10(%rdi), %xmm1
198 movaps 0x20(%rdi), %xmm2
199 movaps 0x30(%rdi), %xmm3
200 movaps 0x40(%rdi), %xmm4
201 movaps 0x50(%rdi), %xmm5
202 movaps 0x60(%rdi), %xmm6
203 movaps 0x70(%rdi), %xmm7
204 movaps 0x80(%rdi), %xmm8
205 movaps 0x90(%rdi), %xmm9
206 movaps 0xa0(%rdi), %xmm10
207 movaps 0xb0(%rdi), %xmm11
208 movaps 0xc0(%rdi), %xmm12
209 movaps 0xd0(%rdi), %xmm13
210 movaps 0xe0(%rdi), %xmm14
211 movaps 0xf0(%rdi), %xmm15
212 ret
213 1:
214 STTS(%rax)
215 ret
216 SET_SIZE(gcm_accel_restore)
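
/*
 * Sketch of the expected kernel-side calling pattern around these two
 * routines (illustrative only; the actual caller lives in gcm.c and may
 * differ in detail, and gcm_accel_state_t is the hypothetical layout
 * sketched above):
 *
 *	gcm_accel_state_t state;
 *
 *	kpreempt_disable();
 *	gcm_accel_save(&state);
 *	gcm_mul_pclmulqdq(x_in, y, res);
 *	gcm_accel_restore(&state);
 *	kpreempt_enable();
 */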
217
218 #endif /* _KERNEL */
219
220 /*
221 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
222 *
 * Perform a carry-less multiplication (one in which partial products are
 * combined with XOR rather than carried addition) of P1 and P2 and place
 * the result in P3.
225 *
226 * Byte swap the input and the output.
227 *
 * Note: x_in, y, and res each point to a 16-byte block
 * (an array of two 64-bit integers).
230 *
231 * Note2: For kernel code, caller is responsible for bracketing this call with
232 * disabling kernel thread preemption and calling gcm_accel_save/restore().
233 *
234 * Note3: Original Intel definition:
235 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
236 * unsigned char *d, int length)
237 *
238 * Note4: Register/parameter mapping:
239 * Intel:
240 * Parameter 1: %rcx (copied to %xmm0) hk or x_in
241 * Parameter 2: %rdx (copied to %xmm1) s or y
242 * Parameter 3: %rdi (result) d or res
243 * OpenSolaris:
244 * Parameter 1: %rdi (copied to %xmm0) x_in
245 * Parameter 2: %rsi (copied to %xmm1) y
246 * Parameter 3: %rdx (result) res
247 */
248
249 ENTRY_NP(gcm_mul_pclmulqdq)
250 //
251 // Copy Parameters
252 //
253 movdqu (%rdi), %xmm0 // P1
254 movdqu (%rsi), %xmm1 // P2
255
256 //
257 // Byte swap 16-byte input
258 //
259 lea .Lbyte_swap16_mask(%rip), %rax
260 movaps (%rax), %xmm10
261 pshufb %xmm10, %xmm0
262 pshufb %xmm10, %xmm1
263
264
265 //
266 // Multiply with the hash key
267 //
268 movdqu %xmm0, %xmm3
269 pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0
270
271 movdqu %xmm0, %xmm4
272 pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1
273
274 movdqu %xmm0, %xmm5
275 pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0
276 movdqu %xmm0, %xmm6
277 pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1
278
279 pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0
280
281 movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5
psrldq $8, %xmm4 // shift xmm4 right by 64 bits
pslldq $8, %xmm5 // shift xmm5 left by 64 bits
284 pxor %xmm5, %xmm3
285 pxor %xmm4, %xmm6 // Register pair <xmm6:xmm3> holds the result
286 // of the carry-less multiplication of
287 // xmm0 by xmm1.
288
// We shift the result of the multiplication by one bit position
// to the left to compensate for the fact that the bits are reversed.
291 movdqu %xmm3, %xmm7
292 movdqu %xmm6, %xmm8
293 pslld $1, %xmm3
294 pslld $1, %xmm6
295 psrld $31, %xmm7
296 psrld $31, %xmm8
297 movdqu %xmm7, %xmm9
298 pslldq $4, %xmm8
299 pslldq $4, %xmm7
300 psrldq $12, %xmm9
301 por %xmm7, %xmm3
302 por %xmm8, %xmm6
303 por %xmm9, %xmm6
304
305 //
306 // First phase of the reduction
307 //
308 // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
309 // independently.
310 movdqu %xmm3, %xmm7
311 movdqu %xmm3, %xmm8
312 movdqu %xmm3, %xmm9
pslld $31, %xmm7 // packed left shift by 31
pslld $30, %xmm8 // packed left shift by 30
pslld $25, %xmm9 // packed left shift by 25
316 pxor %xmm8, %xmm7 // xor the shifted versions
317 pxor %xmm9, %xmm7
318 movdqu %xmm7, %xmm8
319 pslldq $12, %xmm7
320 psrldq $4, %xmm8
321 pxor %xmm7, %xmm3 // first phase of the reduction complete
322
323 //
324 // Second phase of the reduction
325 //
326 // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
327 // shift operations.
328 movdqu %xmm3, %xmm2
movdqu %xmm3, %xmm4
movdqu %xmm3, %xmm5
psrld $1, %xmm2 // packed right shift by 1
psrld $2, %xmm4 // packed right shift by 2
psrld $7, %xmm5 // packed right shift by 7
334 pxor %xmm4, %xmm2 // xor the shifted versions
335 pxor %xmm5, %xmm2
336 pxor %xmm8, %xmm2
337 pxor %xmm2, %xmm3
338 pxor %xmm3, %xmm6 // the result is in xmm6
339
340 //
341 // Byte swap 16-byte result
342 //
343 pshufb %xmm10, %xmm6 // %xmm10 has the swap mask
344
345 //
346 // Store the result
347 //
348 movdqu %xmm6, (%rdx) // P3
349
350
351 //
352 // Cleanup and Return
353 //
354 ret
355 SET_SIZE(gcm_mul_pclmulqdq)
356
357 #endif /* lint || __lint */