/*
 * Copyright (c) 2013, CRYPTOGAMS by <appro@openssl.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *	* Redistributions of source code must retain copyright notices,
 *	  this list of conditions and the following disclaimer.
 *
 *	* Redistributions in binary form must reproduce the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer in the documentation and/or other materials
 *	  provided with the distribution.
 *
 *	* Neither the name of the CRYPTOGAMS nor the names of its
 *	  copyright holder and contributors may be used to endorse or
 *	  promote products derived from this software without specific
 *	  prior written permission.
 *
 * ALTERNATIVELY, provided that this notice is retained in full, this
 * product may be distributed under the terms of the GNU General Public
 * License (GPL), in which case the provisions of the GPL apply INSTEAD OF
 * those given above.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright 2015 Saso Kiselkov (Illumos port sections).
 */

#if defined(lint) || defined(__lint)

#include <sys/types.h>

/*ARGSUSED*/
void
gcm_ghash_clmul(uint64_t ghash[2], const uint8_t Htable[256],
    const uint8_t *inp, size_t len)
{
}

/*ARGSUSED*/
void
gcm_init_clmul(const uint64_t hash_init[2], uint8_t Htable[256])
{
}

#else	/* lint */

#include <sys/asm_linkage.h>
#include <sys/controlregs.h>
#ifdef _KERNEL
#include <sys/machprivregs.h>
#endif

	.text
	.align	XMM_ALIGN
.Lbyte_swap16_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.L0x1c2_polynomial:
	.byte	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xc2
.L7_mask:
	.long	7, 0, 7, 0
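
/*
 * .Lbyte_swap16_mask is a pshufb control mask that reverses the byte
 * order of an XMM register, converting between the little-endian layout
 * of x86 and the big-endian byte order the GHASH specification uses.
 * .L0x1c2_polynomial is the bit-reflected form of the GHASH reduction
 * polynomial x^128 + x^7 + x^2 + x + 1, and .L7_mask selects the three
 * low bits of each 64-bit lane for the table-driven reduction step in
 * .Lmod4_loop below.
 */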

#define	Xi	xmm0	/* hash value */
#define	Xhi	xmm1	/* hash value high order 64 bits */
#define	Hkey	xmm2	/* hash key */
#define	T1	xmm3	/* temp1 */
#define	T2	xmm4	/* temp2 */
#define	T3	xmm5	/* temp3 */
#define	Xb0	xmm6	/* cipher block #0 */
#define	Xb1	xmm7	/* cipher block #1 */
#define	Xb2	xmm8	/* cipher block #2 */
#define	Xb3	xmm9	/* cipher block #3 */
#define	Xb4	xmm10	/* cipher block #4 */
#define	Xb5	xmm11	/* cipher block #5 */
#define	Xb6	xmm12	/* cipher block #6 */
#define	Xb7	xmm13	/* cipher block #7 */

#define	clmul64x64_T2(tmpreg) \
	movdqa	%Xi, %Xhi; \
	pshufd	$0b01001110, %Xi, %T1; \
	pxor	%Xi, %T1; \
	\
	pclmulqdq	$0x00, %Hkey, %Xi; \
	pclmulqdq	$0x11, %Hkey, %Xhi; \
	pclmulqdq	$0x00, %tmpreg, %T1; \
	pxor	%Xi, %T1; \
	pxor	%Xhi, %T1; \
	\
	movdqa	%T1, %T2; \
	psrldq	$8, %T1; \
	pslldq	$8, %T2; \
	pxor	%T1, %Xhi; \
	pxor	%T2, %Xi

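/*
 * clmul64x64_T2 is a Karatsuba carry-less 128x128 -> 256 bit multiply
 * of Xi by Hkey, leaving the low half in Xi and the high half in Xhi.
 * Writing Xi = Xh:Xl and Hkey = Hh:Hl, it forms three pclmulqdq
 * products instead of four:
 *
 *	Xi  = Hl*Xl
 *	Xhi = Hh*Xh
 *	T1  = (Hh^Hl)*(Xh^Xl) ^ Hl*Xl ^ Hh*Xh
 *
 * and folds the middle term T1 into the two halves with pslldq/psrldq.
 * tmpreg must hold the precomputed Karatsuba "salt" Hl^Hh in its low
 * qword (HK, set up by gcm_init_clmul below).
 */
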
#define	reduction_alg9 \
	/* 1st phase */ \
	movdqa	%Xi, %T2; \
	movdqa	%Xi, %T1; \
	psllq	$5, %Xi; \
	pxor	%Xi, %T1; \
	psllq	$1, %Xi; \
	pxor	%T1, %Xi; \
	psllq	$57, %Xi; \
	movdqa	%Xi, %T1; \
	pslldq	$8, %Xi; \
	psrldq	$8, %T1; \
	pxor	%T2, %Xi; \
	pxor	%T1, %Xhi; \
	/* 2nd phase */ \
	movdqa	%Xi, %T2; \
	psrlq	$1, %Xi; \
	pxor	%T2, %Xhi; \
	pxor	%Xi, %T2; \
	psrlq	$5, %Xi; \
	pxor	%T2, %Xi; \
	psrlq	$1, %Xi; \
	pxor	%Xhi, %Xi

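/*
 * reduction_alg9 reduces the 256-bit product in Xhi:Xi modulo the GHASH
 * polynomial, leaving the 128-bit result in Xi. The name appears to
 * refer to Algorithm 9 of the Intel carry-less multiplication white
 * paper. Since GHASH operands are bit-reflected, the reduction is
 * expressed as two phases of shift-and-xor sequences (psllq 5/1/57 and
 * psrlq 1/5/1) rather than as a division.
 */
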
#define	Xip	rdi
#define	Htbl	rsi
#define	inp	rdx
#define	len	rcx

#define	Xln	xmm6
#define	Xmn	xmm7
#define	Xhn	xmm8
#define	Hkey2	xmm9
#define	HK	xmm10
#define	Xl	xmm11
#define	Xm	xmm12
#define	Xh	xmm13
#define	Hkey3	xmm14
#define	Hkey4	xmm15

/*
 * void gcm_ghash_clmul(uint64_t ghash[2], const uint8_t Htable[256],
 *	const uint8_t *inp, size_t len)
 */
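/*
 * Folds len bytes of input into the running hash value ghash[2]. len
 * is expected to be a multiple of the 16-byte block size. Four blocks
 * are consumed per iteration of .Lmod4_loop (one reduction per 64
 * bytes), then two per iteration of .Lmod_loop, with a final single
 * block handled in .Lodd_tail.
 */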
ENTRY_NP(gcm_ghash_clmul)
	movdqa	.Lbyte_swap16_mask(%rip), %T3
	mov	$0xA040608020C0E000, %rax	/ ((7..0) * 0xE0) & 0xff
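	/*
	 * Each byte i of the constant above is the carry-less product
	 * (i * 0xE0) & 0xff. Indexing it with pshufb by the three low
	 * bits of each qword of Xi (masked via .L7_mask) stands in for
	 * the psllq 5/1 shift-and-xor steps of the 1st reduction phase
	 * inside .Lmod4_loop.
	 */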

	movdqu	(%Xip), %Xi
	movdqu	(%Htbl), %Hkey
	movdqu	0x20(%Htbl), %HK
	pshufb	%T3, %Xi

	sub	$0x10, %len
	jz	.Lodd_tail

	movdqu	0x10(%Htbl), %Hkey2

	cmp	$0x30, %len
	jb	.Lskip4x

	sub	$0x30, %len
	movdqu	0x30(%Htbl), %Hkey3
	movdqu	0x40(%Htbl), %Hkey4

	/* Xi+4 = [(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P */
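	/*
	 * Aggregated processing: four input blocks are multiplied by
	 * H^4, H^3, H^2 and H respectively and summed, so the modular
	 * reduction runs once per 64 bytes instead of once per block.
	 * Xln, Xmn and Xhn accumulate the low, middle (Karatsuba) and
	 * high halves of the four products.
	 */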
	movdqu	0x30(%inp), %Xln
	movdqu	0x20(%inp), %Xl
	pshufb	%T3, %Xln
	pshufb	%T3, %Xl
	movdqa	%Xln, %Xhn
	pshufd	$0b01001110, %Xln, %Xmn
	pxor	%Xln, %Xmn
	pclmulqdq	$0x00, %Hkey, %Xln
	pclmulqdq	$0x11, %Hkey, %Xhn
	pclmulqdq	$0x00, %HK, %Xmn

	movdqa	%Xl, %Xh
	pshufd	$0b01001110, %Xl, %Xm
	pxor	%Xl, %Xm
	pclmulqdq	$0x00, %Hkey2, %Xl
	pclmulqdq	$0x11, %Hkey2, %Xh
	xorps	%Xl, %Xln
	pclmulqdq	$0x10, %HK, %Xm
	xorps	%Xh, %Xhn
	movups	0x50(%Htbl), %HK
	xorps	%Xm, %Xmn

	movdqu	0x10(%inp), %Xl
	movdqu	0x00(%inp), %T1
	pshufb	%T3, %Xl
	pshufb	%T3, %T1
	movdqa	%Xl, %Xh
	pshufd	$0b01001110, %Xl, %Xm
	pxor	%T1, %Xi
	pxor	%Xl, %Xm
	pclmulqdq	$0x00, %Hkey3, %Xl
	movdqa	%Xi, %Xhi
	pshufd	$0b01001110, %Xi, %T1
	pxor	%Xi, %T1
	pclmulqdq	$0x11, %Hkey3, %Xh
	xorps	%Xl, %Xln
	pclmulqdq	$0x00, %HK, %Xm
	xorps	%Xh, %Xhn

	lea	0x40(%inp), %inp
	sub	$0x40, %len
	jc	.Ltail4x

	jmp	.Lmod4_loop

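	/*
	 * The loop body interleaves the reduction of the previous
	 * four-block group with the loads and multiplications of the
	 * next group to hide pclmulqdq and load latency; the work done
	 * is the same as .Ltail4x followed by reduction_alg9.
	 */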
	.align	32
.Lmod4_loop:
	pclmulqdq	$0x00, %Hkey4, %Xi
	xorps	%Xm, %Xmn
	movdqu	0x30(%inp), %Xl
	pshufb	%T3, %Xl
	pclmulqdq	$0x11, %Hkey4, %Xhi
	xorps	%Xln, %Xi
	movdqu	0x20(%inp), %Xln
	movdqa	%Xl, %Xh
	pshufd	$0b01001110, %Xl, %Xm
	pclmulqdq	$0x10, %HK, %T1
	xorps	%Xhn, %Xhi
	pxor	%Xl, %Xm
	pshufb	%T3, %Xln
	movups	0x20(%Htbl), %HK
	pclmulqdq	$0x00, %Hkey, %Xl
	xorps	%Xmn, %T1
	movdqa	%Xln, %Xhn
	pshufd	$0b01001110, %Xln, %Xmn

	pxor	%Xi, %T1		/ aggregated Karatsuba post-processing
	pxor	%Xln, %Xmn
	pxor	%Xhi, %T1
	movdqa	%T1, %T2
	pslldq	$8, %T1
	pclmulqdq	$0x11, %Hkey, %Xh
	psrldq	$8, %T2
	pxor	%T1, %Xi
	movdqa	.L7_mask(%rip), %T1
	pxor	%T2, %Xhi
	movq	%rax, %T2

	pand	%Xi, %T1		/ 1st phase
	pshufb	%T1, %T2
	pclmulqdq	$0x00, %HK, %Xm
	pxor	%Xi, %T2
	psllq	$57, %T2
	movdqa	%T2, %T1
	pslldq	$8, %T2
	pclmulqdq	$0x00, %Hkey2, %Xln
	psrldq	$8, %T1
	pxor	%T2, %Xi
	pxor	%T1, %Xhi
	movdqu	0(%inp), %T1

	movdqa	%Xi, %T2		/ 2nd phase
	psrlq	$1, %Xi
	pclmulqdq	$0x11, %Hkey2, %Xhn
	xorps	%Xl, %Xln
	movdqu	0x10(%inp), %Xl
	pshufb	%T3, %Xl
	pclmulqdq	$0x10, %HK, %Xmn
	xorps	%Xh, %Xhn
	movups	0x50(%Htbl), %HK
	pshufb	%T3, %T1
	pxor	%T2, %Xhi
	pxor	%Xi, %T2
	psrlq	$5, %Xi

	movdqa	%Xl, %Xh
	pxor	%Xm, %Xmn
	pshufd	$0b01001110, %Xl, %Xm
	pxor	%Xl, %Xm
	pclmulqdq	$0x00, %Hkey3, %Xl
	pxor	%T2, %Xi
	pxor	%T1, %Xhi
	psrlq	$1, %Xi
	pclmulqdq	$0x11, %Hkey3, %Xh
	xorps	%Xl, %Xln
	pxor	%Xhi, %Xi

	pclmulqdq	$0x00, %HK, %Xm
	xorps	%Xh, %Xhn

	movdqa	%Xi, %Xhi
	pshufd	$0b01001110, %Xi, %T1
	pxor	%Xi, %T1

	lea	0x40(%inp), %inp
	sub	$0x40, %len
	jnc	.Lmod4_loop

.Ltail4x:
	pclmulqdq	$0x00, %Hkey4, %Xi
	xorps	%Xm, %Xmn
	pclmulqdq	$0x11, %Hkey4, %Xhi
	xorps	%Xln, %Xi
	pclmulqdq	$0x10, %HK, %T1
	xorps	%Xhn, %Xhi
	pxor	%Xi, %Xhi		/ aggregated Karatsuba post-processing
	pxor	%Xmn, %T1

	pxor	%Xhi, %T1
	pxor	%Xi, %Xhi

	movdqa	%T1, %T2
	psrldq	$8, %T1
	pslldq	$8, %T2
	pxor	%T1, %Xhi
	pxor	%T2, %Xi

	reduction_alg9

	add	$0x40, %len
	jz	.Ldone
	movdqu	0x20(%Htbl), %HK
	sub	$0x10, %len
	jz	.Lodd_tail
.Lskip4x:

	/*
	 * Xi+2 = [H*(Ii+1 + Xi+1)] mod P =
	 *	[(H*Ii+1) + (H*Xi+1)] mod P =
	 *	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	 */
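	/* Two blocks per iteration; one reduction amortized over 32 bytes. */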
	movdqu	(%inp), %T1		/ Ii
	movdqu	16(%inp), %Xln		/ Ii+1
	pshufb	%T3, %T1
	pshufb	%T3, %Xln
	pxor	%T1, %Xi		/ Ii+Xi

	movdqa	%Xln, %Xhn
	pshufd	$0b01001110, %Xln, %T1
	pxor	%Xln, %T1
	pclmulqdq	$0x00, %Hkey, %Xln
	pclmulqdq	$0x11, %Hkey, %Xhn
	pclmulqdq	$0x00, %HK, %T1

	lea	32(%inp), %inp		/ i += 2
	sub	$0x20, %len
	jbe	.Leven_tail
	jmp	.Lmod_loop

	.align	32
.Lmod_loop:
	movdqa	%Xi, %Xhi
	pshufd	$0b01001110, %Xi, %T2
	pxor	%Xi, %T2

	pclmulqdq	$0x00, %Hkey2, %Xi
	pclmulqdq	$0x11, %Hkey2, %Xhi
	pclmulqdq	$0x10, %HK, %T2

	pxor	%Xln, %Xi		/ (H*Ii+1) + H^2*(Ii+Xi)
	pxor	%Xhn, %Xhi
	movdqu	(%inp), %Xhn		/ Ii
	pshufb	%T3, %Xhn
	movdqu	16(%inp), %Xln		/ Ii+1

	pxor	%Xi, %T1		/ aggregated Karatsuba post-processing
	pxor	%Xhi, %T1
	pxor	%Xhn, %Xhi		/ "Ii+Xi", consume early
	pxor	%T1, %T2
	pshufb	%T3, %Xln
	movdqa	%T2, %T1
	psrldq	$8, %T1
	pslldq	$8, %T2
	pxor	%T1, %Xhi
	pxor	%T2, %Xi

	movdqa	%Xln, %Xhn

	movdqa	%Xi, %T2		/ 1st phase
	movdqa	%Xi, %T1
	psllq	$5, %Xi
	pclmulqdq	$0x00, %Hkey, %Xln
	pxor	%Xi, %T1
	psllq	$1, %Xi
	pxor	%T1, %Xi
	psllq	$57, %Xi
	movdqa	%Xi, %T1
	pslldq	$8, %Xi
	psrldq	$8, %T1
	pxor	%T2, %Xi
	pxor	%T1, %Xhi
	pshufd	$0b01001110, %Xhn, %T1
	pxor	%Xhn, %T1

	pclmulqdq	$0x11, %Hkey, %Xhn
	movdqa	%Xi, %T2		/ 2nd phase
	psrlq	$1, %Xi
	pxor	%T2, %Xhi
	pxor	%Xi, %T2
	psrlq	$5, %Xi
	pxor	%T2, %Xi
	psrlq	$1, %Xi
	pclmulqdq	$0x00, %HK, %T1
	pxor	%Xhi, %Xi

	lea	32(%inp), %inp
	sub	$0x20, %len
	ja	.Lmod_loop

.Leven_tail:
	movdqa	%Xi, %Xhi
	pshufd	$0b01001110, %Xi, %T2
	pxor	%Xi, %T2

	pclmulqdq	$0x00, %Hkey2, %Xi
	pclmulqdq	$0x11, %Hkey2, %Xhi
	pclmulqdq	$0x10, %HK, %T2

	pxor	%Xln, %Xi		/ (H*Ii+1) + H^2*(Ii+Xi)
	pxor	%Xhn, %Xhi
	pxor	%Xi, %T1
	pxor	%Xhi, %T1
	pxor	%T1, %T2
	movdqa	%T2, %T1
	psrldq	$8, %T1
	pslldq	$8, %T2
	pxor	%T1, %Xhi
	pxor	%T2, %Xi

	reduction_alg9

	test	%len, %len
	jnz	.Ldone

.Lodd_tail:
	movdqu	(%inp), %T1		/ Ii
	pshufb	%T3, %T1
	pxor	%T1, %Xi		/ Ii+Xi

	clmul64x64_T2(HK)		/ H*(Ii+Xi)
	reduction_alg9

.Ldone:
	pshufb	%T3, %Xi
	movdqu	%Xi, (%Xip)

	ret
	SET_SIZE(gcm_ghash_clmul)

/*
 * void gcm_init_clmul(const uint64_t hash_init[2], uint8_t Htable[256])
 */
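/*
 * Converts the raw hash subkey in hash_init to the bit-reflected,
 * <<1-twisted form and precomputes the table that gcm_ghash_clmul
 * consumes:
 *	0x00	H
 *	0x10	H^2
 *	0x20	Karatsuba "salt": H.lo^H.hi and H^2.lo^H^2.hi
 *	0x30	H^3
 *	0x40	H^4
 *	0x50	Karatsuba "salt": H^3.lo^H^3.hi and H^4.lo^H^4.hi
 */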
ENTRY_NP(gcm_init_clmul)
	movdqu	(%Xip), %Hkey
	pshufd	$0b01001110, %Hkey, %Hkey	/ dword swap

	/ <<1 twist
	pshufd	$0b11111111, %Hkey, %T2	/ broadcast uppermost dword
	movdqa	%Hkey, %T1
	psllq	$1, %Hkey
	pxor	%T3, %T3
	psrlq	$63, %T1
	pcmpgtd	%T2, %T3	/ broadcast carry bit
	pslldq	$8, %T1
	por	%T1, %Hkey	/ H <<= 1

	/ magic reduction
	pand	.L0x1c2_polynomial(%rip), %T3
	pxor	%T3, %Hkey	/ if (carry) H ^= 0x1c2_polynomial
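
	/*
	 * Multiplying H by x up front (the <<1 twist) compensates for
	 * the bit-reflected order GHASH uses, letting the ghash code
	 * above work directly on byte-swapped data; the bit shifted
	 * out of position 127 is folded back in with the reduction
	 * polynomial.
	 */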

	/ calculate H^2
	pshufd	$0b01001110, %Hkey, %HK
	movdqa	%Hkey, %Xi
	pxor	%Hkey, %HK

	clmul64x64_T2(HK)
	reduction_alg9

	pshufd	$0b01001110, %Hkey, %T1
	pshufd	$0b01001110, %Xi, %T2
	pxor	%Hkey, %T1		/ Karatsuba pre-processing
	movdqu	%Hkey, 0x00(%Htbl)	/ save H
	pxor	%Xi, %T2		/ Karatsuba pre-processing
	movdqu	%Xi, 0x10(%Htbl)	/ save H^2
	palignr	$8, %T1, %T2		/ low part is H.lo^H.hi...
	movdqu	%T2, 0x20(%Htbl)	/ save Karatsuba "salt"

	clmul64x64_T2(HK)		/ H^3
	reduction_alg9

	movdqa	%Xi, %T3

	clmul64x64_T2(HK)		/ H^4
	reduction_alg9

	pshufd	$0b01001110, %T3, %T1
	pshufd	$0b01001110, %Xi, %T2
	pxor	%T3, %T1		/ Karatsuba pre-processing
	movdqu	%T3, 0x30(%Htbl)	/ save H^3
	pxor	%Xi, %T2		/ Karatsuba pre-processing
	movdqu	%Xi, 0x40(%Htbl)	/ save H^4
	palignr	$8, %T1, %T2		/ low part is H^3.lo^H^3.hi...
	movdqu	%T2, 0x50(%Htbl)	/ save Karatsuba "salt"

	ret
	SET_SIZE(gcm_init_clmul)

#endif	/* lint || __lint */