/*
 * ====================================================================
 * Written by Intel Corporation for the OpenSSL project to add support
 * for Intel AES-NI instructions. Rights for redistribution and usage
 * in source and binary forms are granted according to the OpenSSL
 * license.
 *
 * Author: Huang Ying <ying.huang at intel dot com>
 *         Vinodh Gopal <vinodh.gopal at intel dot com>
 *         Kahraman Akdemir
 *
 * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
 * instructions that are going to be introduced in the next generation
 * of Intel processors, as of 2009. These instructions enable fast and
 * secure data encryption and decryption, using the Advanced Encryption
 * Standard (AES), defined by FIPS Publication number 197. The
 * architecture introduces six instructions that offer full hardware
 * support for AES. Four of them support high performance data
 * encryption and decryption, and the other two instructions support
 * the AES key expansion procedure.
 * ====================================================================
 */

/*
 * ====================================================================
 * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

/*
 * ====================================================================
 * OpenSolaris OS modifications
 *
 * This source originates as files aes-intel.S and eng_aesni_asm.pl, in
 * patches sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by
 * Huang Ying of Intel to the openssl-dev mailing list under the subject
 * of "Add support to Intel AES-NI instruction set for x86_64 platform".
 *
 * This OpenSolaris version has these major changes from the original source:
 *
 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
 * definitions for lint.
 *
 * 2. Formatted code, added comments, and added #includes and #defines.
 *
 * 3. If bit CR0.TS is set, clear the TS bit on entry and set it again on
 * exit (bracketed by calls to kpreempt_disable() and kpreempt_enable()).
 * If the TS bit is not set, save and restore the %xmm registers at the
 * beginning and end of function calls (%xmm* registers are not saved and
 * restored during kernel thread preemption).
 *
 * 4. Renamed functions, reordered parameters, and changed return value
 * to match OpenSolaris:
 *
 * OpenSSL interface:
 *	int intel_AES_set_encrypt_key(const unsigned char *userKey,
 *	    const int bits, AES_KEY *key);
 *	int intel_AES_set_decrypt_key(const unsigned char *userKey,
 *	    const int bits, AES_KEY *key);
 *	Return values for above are non-zero on error, 0 on success.
 *
 *	void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
 *	    const AES_KEY *key);
 *	void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
 *	    const AES_KEY *key);
 *	typedef struct aes_key_st {
 *		unsigned int	rd_key[4 * (AES_MAXNR + 1)];
 *		int		rounds;
 *		unsigned int	pad[3];
 *	} AES_KEY;
 *	Note: AES_LONG is undefined (that is, Intel uses 32-bit key
 *	schedules (ks32) instead of 64-bit (ks64)).
 *	Number of rounds (aka round count) is at offset 240 of AES_KEY.
 *
 * OpenSolaris OS interface (#ifdefs removed for readability):
 *	int rijndael_key_setup_dec_intel(uint32_t rk[],
 *	    const uint32_t cipherKey[], uint64_t keyBits);
 *	int rijndael_key_setup_enc_intel(uint32_t rk[],
 *	    const uint32_t cipherKey[], uint64_t keyBits);
 *	Return values for above are 0 on error, number of rounds on success.
 *
 *	void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
 *	    const uint32_t pt[4], uint32_t ct[4]);
 *	void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
 *	    const uint32_t pt[4], uint32_t ct[4]);
 *	typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
 *		uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
 *
 *	typedef union {
 *		uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
 *	} aes_ks_t;
 *	typedef struct aes_key {
 *		aes_ks_t	encr_ks, decr_ks;
 *		long double	align128;
 *		int		flags, nr, type;
 *	} aes_key_t;
 *
 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plaintext,
 * ct is ciphertext, and MAX_AES_NR is 14.
 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
 *
 * Note2: aes_ks_t must be aligned on a 0 mod 128-bit (16-byte) boundary.
 *
 * ====================================================================
 */
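
/*
 * For orientation, a hedged C sketch of calling the OpenSolaris
 * interface above (`cipherkey' is a hypothetical variable used only for
 * illustration; kernel FPU state handling is omitted here and shown
 * with aes_accel_save()/aes_accel_restore() further below):
 *
 *	uint32_t ks[(14 + 1) * 4] __attribute__((aligned(16)));
 *	uint32_t pt[4], ct[4];
 *	int nr;
 *
 *	nr = rijndael_key_setup_enc_intel(ks, cipherkey, 128);
 *	if (nr != 0)	/ 0 indicates failure; else nr = round count
 *		aes_encrypt_intel((const aes_ks_t *)ks, nr, pt, ct);
 */
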
/*
 * Copyright 2015 by Saso Kiselkov. All rights reserved.
 */

#if defined(lint) || defined(__lint)

#include <sys/types.h>

/* ARGSUSED */
void
aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
    uint32_t ct[4]) {
}
/* ARGSUSED */
void
aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
    uint32_t pt[4]) {
}
/* ARGSUSED */
int
rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
    uint64_t keyBits) {
        return (0);
}
/* ARGSUSED */
int
rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
    uint64_t keyBits) {
        return (0);
}


#else   /* lint */

#include <sys/asm_linkage.h>
#include <sys/controlregs.h>
#ifdef _KERNEL
#include <sys/machprivregs.h>
#endif

#ifdef _KERNEL
/*
 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is,
 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
 * uses it to pass P2 to syscall.
 * This also occurs with the STTS macro, but we don't care if
 * P2 (%rsi) is modified just before function exit.
 * The CLTS and STTS macros push and pop P1 (%rdi) already.
 */
#ifdef __xpv
#define PROTECTED_CLTS \
        push %rsi; \
        CLTS; \
        pop %rsi
#else
#define PROTECTED_CLTS \
        CLTS
#endif  /* __xpv */

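/*
 * If CR0_TS is not set, align stack (with push %rbp) and push
 * %xmm0 and %xmm1 on stack, otherwise clear CR0_TS.
 */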
#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg) \
        push %rbp; \
        mov %rsp, %rbp; \
        movq %cr0, tmpreg; \
        testq $CR0_TS, tmpreg; \
        jnz 1f; \
        and $-XMM_ALIGN, %rsp; \
        sub $[XMM_SIZE * 2], %rsp; \
        movaps %xmm0, 16(%rsp); \
        movaps %xmm1, (%rsp); \
        jmp 2f; \
1: \
        PROTECTED_CLTS; \
2:

/*
 * If CR0_TS was not set above, pop %xmm0 and %xmm1 off stack,
 * otherwise set CR0_TS.
 */
#define SET_TS_OR_POP_XMM0_XMM1(tmpreg) \
        testq $CR0_TS, tmpreg; \
        jnz 1f; \
        movaps (%rsp), %xmm1; \
        movaps 16(%rsp), %xmm0; \
        jmp 2f; \
1: \
        STTS(tmpreg); \
2: \
        mov %rbp, %rsp; \
        pop %rbp
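
/*
 * Typical usage pattern for the macro pair above (a sketch; the actual
 * callers are the key-setup functions later in this file). The same
 * temporary register must be passed to both macros, since it carries
 * the saved CR0 value from one to the other and must not be clobbered
 * in between:
 *
 *	CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
 *	... code that uses %xmm0 and %xmm1 ...
 *	SET_TS_OR_POP_XMM0_XMM1(%r10)
 */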

/*
 * If CR0_TS is not set, align stack (with push %rbp) and push
 * %xmm0 - %xmm6 on stack, otherwise clear CR0_TS
 */
#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg) \
        push %rbp; \
        mov %rsp, %rbp; \
        movq %cr0, tmpreg; \
        testq $CR0_TS, tmpreg; \
        jnz 1f; \
        and $-XMM_ALIGN, %rsp; \
        sub $[XMM_SIZE * 7], %rsp; \
        movaps %xmm0, 96(%rsp); \
        movaps %xmm1, 80(%rsp); \
        movaps %xmm2, 64(%rsp); \
        movaps %xmm3, 48(%rsp); \
        movaps %xmm4, 32(%rsp); \
        movaps %xmm5, 16(%rsp); \
        movaps %xmm6, (%rsp); \
        jmp 2f; \
1: \
        PROTECTED_CLTS; \
2:


/*
 * If CR0_TS was not set above, pop %xmm0 - %xmm6 off stack,
 * otherwise set CR0_TS.
 */
#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) \
        testq $CR0_TS, tmpreg; \
        jnz 1f; \
        movaps (%rsp), %xmm6; \
        movaps 16(%rsp), %xmm5; \
        movaps 32(%rsp), %xmm4; \
        movaps 48(%rsp), %xmm3; \
        movaps 64(%rsp), %xmm2; \
        movaps 80(%rsp), %xmm1; \
        movaps 96(%rsp), %xmm0; \
        jmp 2f; \
1: \
        STTS(tmpreg); \
2: \
        mov %rbp, %rsp; \
        pop %rbp

/*
 * void aes_accel_save(void *savestate);
 *
 * Saves all 16 XMM registers and CR0 to a temporary location pointed to
 * by the first argument and clears TS in CR0. This must be invoked before
 * executing any floating point operations inside the kernel (and kernel
 * thread preemption must be disabled as well). The memory region to which
 * all state is saved must be at least 16 x 128 bits plus 64 bits
 * (0x108 bytes) long and must be 128-bit (16-byte) aligned.
 */
ENTRY_NP(aes_accel_save)
        movq %cr0, %rax
        movq %rax, 0x100(%rdi)
        testq $CR0_TS, %rax
        jnz 1f
        movaps %xmm0, 0x00(%rdi)
        movaps %xmm1, 0x10(%rdi)
        movaps %xmm2, 0x20(%rdi)
        movaps %xmm3, 0x30(%rdi)
        movaps %xmm4, 0x40(%rdi)
        movaps %xmm5, 0x50(%rdi)
        movaps %xmm6, 0x60(%rdi)
        movaps %xmm7, 0x70(%rdi)
        movaps %xmm8, 0x80(%rdi)
        movaps %xmm9, 0x90(%rdi)
        movaps %xmm10, 0xa0(%rdi)
        movaps %xmm11, 0xb0(%rdi)
        movaps %xmm12, 0xc0(%rdi)
        movaps %xmm13, 0xd0(%rdi)
        movaps %xmm14, 0xe0(%rdi)
        movaps %xmm15, 0xf0(%rdi)
        ret
1:
        PROTECTED_CLTS
        ret
SET_SIZE(aes_accel_save)

/*
 * void aes_accel_restore(void *savestate);
 *
 * Restores the saved XMM and CR0.TS state from aes_accel_save.
 */
ENTRY_NP(aes_accel_restore)
        mov 0x100(%rdi), %rax
        testq $CR0_TS, %rax
        jnz 1f
        movaps 0x00(%rdi), %xmm0
        movaps 0x10(%rdi), %xmm1
        movaps 0x20(%rdi), %xmm2
        movaps 0x30(%rdi), %xmm3
        movaps 0x40(%rdi), %xmm4
        movaps 0x50(%rdi), %xmm5
        movaps 0x60(%rdi), %xmm6
        movaps 0x70(%rdi), %xmm7
        movaps 0x80(%rdi), %xmm8
        movaps 0x90(%rdi), %xmm9
        movaps 0xa0(%rdi), %xmm10
        movaps 0xb0(%rdi), %xmm11
        movaps 0xc0(%rdi), %xmm12
        movaps 0xd0(%rdi), %xmm13
        movaps 0xe0(%rdi), %xmm14
        movaps 0xf0(%rdi), %xmm15
        ret
1:
        STTS(%rax)
        ret
SET_SIZE(aes_accel_restore)
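
/*
 * A hedged sketch of how a kernel caller might bracket AES-NI work with
 * the two functions above, per the comments (`savestate' is a
 * hypothetical, suitably sized buffer):
 *
 *	uint8_t savestate[16 * 16 + 8] __attribute__((aligned(16)));
 *
 *	kpreempt_disable();
 *	aes_accel_save(savestate);
 *	... AES-NI work (key setup, encryption, decryption) ...
 *	aes_accel_restore(savestate);
 *	kpreempt_enable();
 */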

#else
#define PROTECTED_CLTS
#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
#define SET_TS_OR_POP_XMM0_XMM1(tmpreg)
#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)
#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg)
#endif  /* _KERNEL */


/*
 * _key_expansion_128(), _key_expansion_192a(), _key_expansion_192b(),
 * _key_expansion_256a(), _key_expansion_256b()
 *
 * Helper functions called by rijndael_key_setup_enc_intel().
 * Also used indirectly by rijndael_key_setup_dec_intel().
 *
 * Input:
 *	%xmm0	User-provided cipher key
 *	%xmm1	Round constant
 * Output:
 *	(%rcx)	AES key
 */
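
/*
 * For reference, a sketch of the AES-128 expansion step that
 * _key_expansion_128 computes, in FIPS-197 style pseudo-C (w[0..3] is
 * the previous round key; aeskeygenassist delivers
 * SubWord(RotWord(w[3])) ^ Rcon in the dword that the pshufd below
 * broadcasts across %xmm1):
 *
 *	t = SubWord(RotWord(w[3])) ^ Rcon;
 *	w[0] ^= t;
 *	w[1] ^= w[0];
 *	w[2] ^= w[1];
 *	w[3] ^= w[2];
 *
 * The shufps/pxor pairs compute the cumulative XOR of the four words
 * entirely within the XMM register file.
 */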

        .align 16
_key_expansion_128:
_key_expansion_256a:
        pshufd $0b11111111, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0
        movaps %xmm0, (%rcx)
        add $0x10, %rcx
        ret
SET_SIZE(_key_expansion_128)
SET_SIZE(_key_expansion_256a)

        .align 16
_key_expansion_192a:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0

        movaps %xmm2, %xmm5
        movaps %xmm2, %xmm6
        pslldq $4, %xmm5
        pshufd $0b11111111, %xmm0, %xmm3
        pxor %xmm3, %xmm2
        pxor %xmm5, %xmm2

        movaps %xmm0, %xmm1
        shufps $0b01000100, %xmm0, %xmm6
        movaps %xmm6, (%rcx)
        shufps $0b01001110, %xmm2, %xmm1
        movaps %xmm1, 0x10(%rcx)
        add $0x20, %rcx
        ret
SET_SIZE(_key_expansion_192a)

        .align 16
_key_expansion_192b:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0

        movaps %xmm2, %xmm5
        pslldq $4, %xmm5
        pshufd $0b11111111, %xmm0, %xmm3
        pxor %xmm3, %xmm2
        pxor %xmm5, %xmm2

        movaps %xmm0, (%rcx)
        add $0x10, %rcx
        ret
SET_SIZE(_key_expansion_192b)

        .align 16
_key_expansion_256b:
        pshufd $0b10101010, %xmm1, %xmm1
        shufps $0b00010000, %xmm2, %xmm4
        pxor %xmm4, %xmm2
        shufps $0b10001100, %xmm2, %xmm4
        pxor %xmm4, %xmm2
        pxor %xmm1, %xmm2
        movaps %xmm2, (%rcx)
        add $0x10, %rcx
        ret
SET_SIZE(_key_expansion_256b)

/*
 * void aes_copy_intel(const uint8_t *src, uint8_t *dst);
 *
 * Copies one unaligned 128-bit block from `src' to `dst'. The copy is
 * performed using FPU registers, so make sure FPU state is saved when
 * running this in the kernel.
 */
ENTRY_NP(aes_copy_intel)
        movdqu (%rdi), %xmm0
        movdqu %xmm0, (%rsi)
        ret
SET_SIZE(aes_copy_intel)

/*
 * void aes_xor_intel(const uint8_t *src, uint8_t *dst);
 *
 * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 * stores the result at `dst'. The XOR is performed using FPU registers,
 * so make sure FPU state is saved when running this in the kernel.
 */
ENTRY_NP(aes_xor_intel)
        movdqu (%rdi), %xmm0
        movdqu (%rsi), %xmm1
        pxor %xmm1, %xmm0
        movdqu %xmm0, (%rsi)
        ret
SET_SIZE(aes_xor_intel)

/*
 * void aes_xor_intel8(const uint8_t *src, uint8_t *dst);
 *
 * XORs eight pairs of consecutive unaligned 128-bit blocks from `src' and
 * `dst' and stores the results at `dst'. The XOR is performed using FPU
 * registers, so make sure FPU state is saved when running this in the kernel.
 */
ENTRY_NP(aes_xor_intel8)
        movdqu 0x00(%rdi), %xmm0
        movdqu 0x00(%rsi), %xmm1
        movdqu 0x10(%rdi), %xmm2
        movdqu 0x10(%rsi), %xmm3
        movdqu 0x20(%rdi), %xmm4
        movdqu 0x20(%rsi), %xmm5
        movdqu 0x30(%rdi), %xmm6
        movdqu 0x30(%rsi), %xmm7
        movdqu 0x40(%rdi), %xmm8
        movdqu 0x40(%rsi), %xmm9
        movdqu 0x50(%rdi), %xmm10
        movdqu 0x50(%rsi), %xmm11
        movdqu 0x60(%rdi), %xmm12
        movdqu 0x60(%rsi), %xmm13
        movdqu 0x70(%rdi), %xmm14
        movdqu 0x70(%rsi), %xmm15
        pxor %xmm1, %xmm0
        pxor %xmm3, %xmm2
        pxor %xmm5, %xmm4
        pxor %xmm7, %xmm6
        pxor %xmm9, %xmm8
        pxor %xmm11, %xmm10
        pxor %xmm13, %xmm12
        pxor %xmm15, %xmm14
        movdqu %xmm0, 0x00(%rsi)
        movdqu %xmm2, 0x10(%rsi)
        movdqu %xmm4, 0x20(%rsi)
        movdqu %xmm6, 0x30(%rsi)
        movdqu %xmm8, 0x40(%rsi)
        movdqu %xmm10, 0x50(%rsi)
        movdqu %xmm12, 0x60(%rsi)
        movdqu %xmm14, 0x70(%rsi)
        ret
SET_SIZE(aes_xor_intel8)
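
/*
 * For clarity, a sketch of the C equivalent of aes_xor_intel8:
 *
 *	for (int i = 0; i < 8 * 16; i++)
 *		dst[i] ^= src[i];
 *
 * The assembly version simply performs this 128 bits at a time, with
 * all loads issued up front to exploit pipelining.
 */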

/*
 * rijndael_key_setup_enc_intel()
 * Expand the cipher key into the encryption key schedule.
 *
 * For kernel code, caller is responsible for ensuring kpreempt_disable()
 * has been called. This is because %xmm registers are not saved/restored.
 * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
 * on entry. Otherwise, if TS is not set, save and restore %xmm registers
 * on the stack.
 *
 * OpenSolaris interface:
 * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
 *	uint64_t keyBits);
 * Return value is 0 on error, number of rounds on success.
 *
 * Original Intel OpenSSL interface:
 * int intel_AES_set_encrypt_key(const unsigned char *userKey,
 *	const int bits, AES_KEY *key);
 * Return value is non-zero on error, 0 on success.
 */

#ifdef OPENSSL_INTERFACE
#define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key
#define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key

#define USERCIPHERKEY rdi       /* P1, 64 bits */
#define KEYSIZE32 esi           /* P2, 32 bits */
#define KEYSIZE64 rsi           /* P2, 64 bits */
#define AESKEY rdx              /* P3, 64 bits */

#else   /* OpenSolaris Interface */
#define AESKEY rdi              /* P1, 64 bits */
#define USERCIPHERKEY rsi       /* P2, 64 bits */
#define KEYSIZE32 edx           /* P3, 32 bits */
#define KEYSIZE64 rdx           /* P3, 64 bits */
#endif  /* OPENSSL_INTERFACE */

#define ROUNDS32 KEYSIZE32      /* temp */
#define ROUNDS64 KEYSIZE64      /* temp */
#define ENDAESKEY USERCIPHERKEY /* temp */


ENTRY_NP(rijndael_key_setup_enc_intel)
        CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(%r10)

        / NULL pointer sanity check
        test %USERCIPHERKEY, %USERCIPHERKEY
        jz .Lenc_key_invalid_param
        test %AESKEY, %AESKEY
        jz .Lenc_key_invalid_param

        movups (%USERCIPHERKEY), %xmm0  / user key (first 16 bytes)
        movaps %xmm0, (%AESKEY)
        lea 0x10(%AESKEY), %rcx         / key addr
        pxor %xmm4, %xmm4               / xmm4 is assumed 0 in _key_expansion_x

        cmp $256, %KEYSIZE32
        jnz .Lenc_key192

        / AES 256: 14 rounds in encryption key schedule
#ifdef OPENSSL_INTERFACE
        mov $14, %ROUNDS32
        movl %ROUNDS32, 240(%AESKEY)    / key.rounds = 14
#endif  /* OPENSSL_INTERFACE */

        movups 0x10(%USERCIPHERKEY), %xmm2      / other user key (2nd 16 bytes)
        movaps %xmm2, (%rcx)
        add $0x10, %rcx

        aeskeygenassist $0x1, %xmm2, %xmm1      / expand the key
        call _key_expansion_256a
        aeskeygenassist $0x1, %xmm0, %xmm1
        call _key_expansion_256b
        aeskeygenassist $0x2, %xmm2, %xmm1      / expand the key
        call _key_expansion_256a
        aeskeygenassist $0x2, %xmm0, %xmm1
        call _key_expansion_256b
        aeskeygenassist $0x4, %xmm2, %xmm1      / expand the key
        call _key_expansion_256a
        aeskeygenassist $0x4, %xmm0, %xmm1
        call _key_expansion_256b
        aeskeygenassist $0x8, %xmm2, %xmm1      / expand the key
        call _key_expansion_256a
        aeskeygenassist $0x8, %xmm0, %xmm1
        call _key_expansion_256b
        aeskeygenassist $0x10, %xmm2, %xmm1     / expand the key
        call _key_expansion_256a
        aeskeygenassist $0x10, %xmm0, %xmm1
        call _key_expansion_256b
        aeskeygenassist $0x20, %xmm2, %xmm1     / expand the key
        call _key_expansion_256a
        aeskeygenassist $0x20, %xmm0, %xmm1
        call _key_expansion_256b
        aeskeygenassist $0x40, %xmm2, %xmm1     / expand the key
        call _key_expansion_256a

        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef OPENSSL_INTERFACE
        xor %rax, %rax                  / return 0 (OK)
#else   /* OpenSolaris Interface */
        mov $14, %rax                   / return # rounds = 14
#endif
        ret

        .align 4
.Lenc_key192:
        cmp $192, %KEYSIZE32
        jnz .Lenc_key128

        / AES 192: 12 rounds in encryption key schedule
#ifdef OPENSSL_INTERFACE
        mov $12, %ROUNDS32
        movl %ROUNDS32, 240(%AESKEY)    / key.rounds = 12
#endif  /* OPENSSL_INTERFACE */

        movq 0x10(%USERCIPHERKEY), %xmm2        / other user key
        aeskeygenassist $0x1, %xmm2, %xmm1      / expand the key
        call _key_expansion_192a
        aeskeygenassist $0x2, %xmm2, %xmm1      / expand the key
        call _key_expansion_192b
        aeskeygenassist $0x4, %xmm2, %xmm1      / expand the key
        call _key_expansion_192a
        aeskeygenassist $0x8, %xmm2, %xmm1      / expand the key
        call _key_expansion_192b
        aeskeygenassist $0x10, %xmm2, %xmm1     / expand the key
        call _key_expansion_192a
        aeskeygenassist $0x20, %xmm2, %xmm1     / expand the key
        call _key_expansion_192b
        aeskeygenassist $0x40, %xmm2, %xmm1     / expand the key
        call _key_expansion_192a
        aeskeygenassist $0x80, %xmm2, %xmm1     / expand the key
        call _key_expansion_192b

        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef OPENSSL_INTERFACE
        xor %rax, %rax                  / return 0 (OK)
#else   /* OpenSolaris Interface */
        mov $12, %rax                   / return # rounds = 12
#endif
        ret

        .align 4
.Lenc_key128:
        cmp $128, %KEYSIZE32
        jnz .Lenc_key_invalid_key_bits

        / AES 128: 10 rounds in encryption key schedule
#ifdef OPENSSL_INTERFACE
        mov $10, %ROUNDS32
        movl %ROUNDS32, 240(%AESKEY)    / key.rounds = 10
#endif  /* OPENSSL_INTERFACE */

        aeskeygenassist $0x1, %xmm0, %xmm1      / expand the key
        call _key_expansion_128
        aeskeygenassist $0x2, %xmm0, %xmm1      / expand the key
        call _key_expansion_128
        aeskeygenassist $0x4, %xmm0, %xmm1      / expand the key
        call _key_expansion_128
        aeskeygenassist $0x8, %xmm0, %xmm1      / expand the key
        call _key_expansion_128
        aeskeygenassist $0x10, %xmm0, %xmm1     / expand the key
        call _key_expansion_128
        aeskeygenassist $0x20, %xmm0, %xmm1     / expand the key
        call _key_expansion_128
        aeskeygenassist $0x40, %xmm0, %xmm1     / expand the key
        call _key_expansion_128
        aeskeygenassist $0x80, %xmm0, %xmm1     / expand the key
        call _key_expansion_128
        aeskeygenassist $0x1b, %xmm0, %xmm1     / expand the key
        call _key_expansion_128
        aeskeygenassist $0x36, %xmm0, %xmm1     / expand the key
        call _key_expansion_128

        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef OPENSSL_INTERFACE
        xor %rax, %rax                  / return 0 (OK)
#else   /* OpenSolaris Interface */
        mov $10, %rax                   / return # rounds = 10
#endif
        ret

.Lenc_key_invalid_param:
#ifdef OPENSSL_INTERFACE
        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
        mov $-1, %rax                   / user key or AES key pointer is NULL
        ret
#else
        /* FALLTHROUGH */
#endif  /* OPENSSL_INTERFACE */

.Lenc_key_invalid_key_bits:
        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef OPENSSL_INTERFACE
        mov $-2, %rax                   / keysize is invalid
#else   /* OpenSolaris Interface */
        xor %rax, %rax                  / a key pointer is NULL or invalid keysize
#endif  /* OPENSSL_INTERFACE */

        ret
SET_SIZE(rijndael_key_setup_enc_intel)


/*
 * rijndael_key_setup_dec_intel()
 * Expand the cipher key into the decryption key schedule.
 *
 * For kernel code, caller is responsible for ensuring kpreempt_disable()
 * has been called. This is because %xmm registers are not saved/restored.
 * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
 * on entry. Otherwise, if TS is not set, save and restore %xmm registers
 * on the stack.
 *
 * OpenSolaris interface:
 * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
 *	uint64_t keyBits);
 * Return value is 0 on error, number of rounds on success.
 * P1->P2, P2->P3, P3->P1
 *
 * Original Intel OpenSSL interface:
 * int intel_AES_set_decrypt_key(const unsigned char *userKey,
 *	const int bits, AES_KEY *key);
 * Return value is non-zero on error, 0 on success.
 */
ENTRY_NP(rijndael_key_setup_dec_intel)
        / Generate round keys used for encryption
        call rijndael_key_setup_enc_intel
        test %rax, %rax
#ifdef OPENSSL_INTERFACE
        jnz .Ldec_key_exit      / Failed if returned non-0
#else   /* OpenSolaris Interface */
        jz .Ldec_key_exit       / Failed if returned 0
#endif  /* OPENSSL_INTERFACE */

        CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)

        /*
         * Convert round keys used for encryption
         * to a form usable for decryption
         */
#ifndef OPENSSL_INTERFACE       /* OpenSolaris Interface */
        mov %rax, %ROUNDS64     / set # rounds (10, 12, or 14)
                                / (already set for OpenSSL)
#endif

        lea 0x10(%AESKEY), %rcx / key addr
        shl $4, %ROUNDS32
        add %AESKEY, %ROUNDS64
        mov %ROUNDS64, %ENDAESKEY

        .align 4
.Ldec_key_reorder_loop:
        movaps (%AESKEY), %xmm0
        movaps (%ROUNDS64), %xmm1
        movaps %xmm0, (%ROUNDS64)
        movaps %xmm1, (%AESKEY)
        lea 0x10(%AESKEY), %AESKEY
        lea -0x10(%ROUNDS64), %ROUNDS64
        cmp %AESKEY, %ROUNDS64
        ja .Ldec_key_reorder_loop

        .align 4
.Ldec_key_inv_loop:
        movaps (%rcx), %xmm0
        / Convert an encryption round key to a form usable for decryption
        / with the "AES Inverse Mix Columns" instruction
        aesimc %xmm0, %xmm1
        movaps %xmm1, (%rcx)
        lea 0x10(%rcx), %rcx
        cmp %ENDAESKEY, %rcx
        jnz .Ldec_key_inv_loop

        SET_TS_OR_POP_XMM0_XMM1(%r10)

.Ldec_key_exit:
        / OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
        / OpenSSL: rax = 0 for OK, or non-zero for error
        ret
SET_SIZE(rijndael_key_setup_dec_intel)
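
/*
 * Conceptually (a sketch of the conversion performed above, per the
 * FIPS-197 "Equivalent Inverse Cipher"):
 *
 *	reverse the order of the Nr + 1 round keys;
 *	for (i = 1; i < Nr; i++)	/ all but first and last
 *		rk[i] = InvMixColumns(rk[i]);	/ the aesimc instruction
 *
 * so the decryption code below can walk the converted schedule with the
 * same forward round structure as encryption.
 */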


#ifdef OPENSSL_INTERFACE
#define aes_encrypt_intel intel_AES_encrypt
#define aes_decrypt_intel intel_AES_decrypt

#define INP rdi         /* P1, 64 bits */
#define OUTP rsi        /* P2, 64 bits */
#define KEYP rdx        /* P3, 64 bits */

/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */
#define NROUNDS32 ecx   /* temporary, 32 bits */
#define NROUNDS cl      /* temporary, 8 bits */

#else   /* OpenSolaris Interface */
#define KEYP rdi        /* P1, 64 bits */
#define NROUNDS esi     /* P2, 32 bits */
#define INP rdx         /* P3, 64 bits */
#define OUTP rcx        /* P4, 64 bits */
#define LENGTH r8       /* P5, 64 bits */
#endif  /* OPENSSL_INTERFACE */

#define KEY xmm0        /* temporary, 128 bits */
#define STATE0 xmm8     /* temporary, 128 bits */
#define STATE1 xmm9     /* temporary, 128 bits */
#define STATE2 xmm10    /* temporary, 128 bits */
#define STATE3 xmm11    /* temporary, 128 bits */
#define STATE4 xmm12    /* temporary, 128 bits */
#define STATE5 xmm13    /* temporary, 128 bits */
#define STATE6 xmm14    /* temporary, 128 bits */
#define STATE7 xmm15    /* temporary, 128 bits */

/*
 * Runs the first two rounds of AES256 on a state register. `op' should be
 * aesenc or aesdec.
 */
#define AES256_ROUNDS(op, statereg) \
        movaps -0x60(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps -0x50(%KEYP), %KEY; \
        op %KEY, %statereg

/*
 * Runs the first two rounds of AES192, or the 3rd & 4th rounds of AES256 on
 * a state register. `op' should be aesenc or aesdec.
 */
#define AES192_ROUNDS(op, statereg) \
        movaps -0x40(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps -0x30(%KEYP), %KEY; \
        op %KEY, %statereg

/*
 * Runs the full 10 rounds of AES128, or the last 10 rounds of AES192/AES256
 * on a state register. `op' should be aesenc or aesdec and `lastop' should
 * be aesenclast or aesdeclast.
 */
#define AES128_ROUNDS(op, lastop, statereg) \
        movaps -0x20(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps -0x10(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps (%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps 0x10(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps 0x20(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps 0x30(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps 0x40(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps 0x50(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps 0x60(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps 0x70(%KEYP), %KEY; \
        lastop %KEY, %statereg

/*
 * Macros to run AES encryption rounds. Input must be prefilled in state
 * register - output will be left there as well.
 * To run AES256, invoke all of these macros in sequence. To run AES192,
 * invoke only the -192 and -128 variants. To run AES128, invoke only the
 * -128 variant.
 */
#define AES256_ENC_ROUNDS(statereg) \
        AES256_ROUNDS(aesenc, statereg)
#define AES192_ENC_ROUNDS(statereg) \
        AES192_ROUNDS(aesenc, statereg)
#define AES128_ENC_ROUNDS(statereg) \
        AES128_ROUNDS(aesenc, aesenclast, statereg)

/* Same as the AES*_ENC_ROUNDS macros, but for decryption. */
#define AES256_DEC_ROUNDS(statereg) \
        AES256_ROUNDS(aesdec, statereg)
#define AES192_DEC_ROUNDS(statereg) \
        AES192_ROUNDS(aesdec, statereg)
#define AES128_DEC_ROUNDS(statereg) \
        AES128_ROUNDS(aesdec, aesdeclast, statereg)

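/*
 * Key schedule offset map used by the functions below (a summary of the
 * biasing, derived from the lea adjustments at each entry point): KEYP
 * is advanced 0x30 (AES128), 0x50 (AES192) or 0x70 (AES256) bytes past
 * the schedule base, so the final round key is always at 0x70(%KEYP)
 * and round 1 begins at -0x20, -0x40 or -0x60 respectively. This is
 * what lets all three variants share the AES128_*_ROUNDS tail above.
 */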

/*
 * aes_encrypt_intel()
 * Encrypt a single block (in and out can overlap).
 *
 * For kernel code, caller is responsible for bracketing this call with
 * disabling kernel thread preemption and calling aes_accel_save/restore().
 *
 * Temporary register usage:
 *	%xmm0	Key
 *	%xmm8	State
 *
 * Original OpenSolaris Interface:
 * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
 *	const uint32_t pt[4], uint32_t ct[4])
 *
 * Original Intel OpenSSL Interface:
 * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
 *	const AES_KEY *key)
 */
ENTRY_NP(aes_encrypt_intel)
        movups (%INP), %STATE0          / input
        movaps (%KEYP), %KEY            / key

#ifdef OPENSSL_INTERFACE
        mov 240(%KEYP), %NROUNDS32      / round count
#else   /* OpenSolaris Interface */
        /* Round count is already present as P2 in %rsi/%esi */
#endif  /* OPENSSL_INTERFACE */

        pxor %KEY, %STATE0              / round 0
        lea 0x30(%KEYP), %KEYP
        cmp $12, %NROUNDS
        jb .Lenc128
        lea 0x20(%KEYP), %KEYP
        je .Lenc192

        / AES 256
        lea 0x20(%KEYP), %KEYP
        AES256_ENC_ROUNDS(STATE0)

        .align 4
.Lenc192:
        / AES 192 and 256
        AES192_ENC_ROUNDS(STATE0)

        .align 4
.Lenc128:
        / AES 128, 192, and 256
        AES128_ENC_ROUNDS(STATE0)
        movups %STATE0, (%OUTP)         / output

        ret
SET_SIZE(aes_encrypt_intel)

/*
 * aes_decrypt_intel()
 * Decrypt a single block (in and out can overlap).
 *
 * For kernel code, caller is responsible for bracketing this call with
 * disabling kernel thread preemption and calling aes_accel_save/restore().
 *
 * Temporary register usage:
 *	%xmm0	Key
 *	%xmm8	State
 *
 * Original OpenSolaris Interface:
 * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
 *	const uint32_t ct[4], uint32_t pt[4])
 *
 * Original Intel OpenSSL Interface:
 * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
 *	const AES_KEY *key);
 */
ENTRY_NP(aes_decrypt_intel)
        movups (%INP), %STATE0          / input
        movaps (%KEYP), %KEY            / key

#ifdef OPENSSL_INTERFACE
        mov 240(%KEYP), %NROUNDS32      / round count
#else   /* OpenSolaris Interface */
        /* Round count is already present as P2 in %rsi/%esi */
#endif  /* OPENSSL_INTERFACE */

        pxor %KEY, %STATE0              / round 0
        lea 0x30(%KEYP), %KEYP
        cmp $12, %NROUNDS
        jb .Ldec128
        lea 0x20(%KEYP), %KEYP
        je .Ldec192

        / AES 256
        lea 0x20(%KEYP), %KEYP
        AES256_DEC_ROUNDS(STATE0)

        .align 4
.Ldec192:
        / AES 192 and 256
        AES192_DEC_ROUNDS(STATE0)

        .align 4
.Ldec128:
        / AES 128, 192, and 256
        AES128_DEC_ROUNDS(STATE0)
        movups %STATE0, (%OUTP)         / output

        ret
SET_SIZE(aes_decrypt_intel)

/* Does a pipelined load of eight input blocks into our AES state registers. */
#define AES_LOAD_INPUT_8BLOCKS \
        movups 0x00(%INP), %STATE0; \
        movups 0x10(%INP), %STATE1; \
        movups 0x20(%INP), %STATE2; \
        movups 0x30(%INP), %STATE3; \
        movups 0x40(%INP), %STATE4; \
        movups 0x50(%INP), %STATE5; \
        movups 0x60(%INP), %STATE6; \
        movups 0x70(%INP), %STATE7;

/* Does a pipelined store of eight AES state registers to the output. */
#define AES_STORE_OUTPUT_8BLOCKS \
        movups %STATE0, 0x00(%OUTP); \
        movups %STATE1, 0x10(%OUTP); \
        movups %STATE2, 0x20(%OUTP); \
        movups %STATE3, 0x30(%OUTP); \
        movups %STATE4, 0x40(%OUTP); \
        movups %STATE5, 0x50(%OUTP); \
        movups %STATE6, 0x60(%OUTP); \
        movups %STATE7, 0x70(%OUTP);

/* Performs a pipelined AES instruction with the key on all state registers. */
#define AES_KEY_STATE_OP_8BLOCKS(op) \
        op %KEY, %STATE0; \
        op %KEY, %STATE1; \
        op %KEY, %STATE2; \
        op %KEY, %STATE3; \
        op %KEY, %STATE4; \
        op %KEY, %STATE5; \
        op %KEY, %STATE6; \
        op %KEY, %STATE7

/* XOR all AES state regs with key to initiate encryption/decryption. */
#define AES_XOR_STATE_8BLOCKS \
        AES_KEY_STATE_OP_8BLOCKS(pxor)

/*
 * Loads a round key from the key schedule offset `off' into the KEY
 * register and performs `op' using the KEY on all 8 STATE registers.
 */
#define AES_RND_8BLOCKS(op, off) \
        movaps off(%KEYP), %KEY; \
        AES_KEY_STATE_OP_8BLOCKS(op)

/*
 * void aes_encrypt_intel8(const uint32_t roundkeys[], int numrounds,
 *	const void *plaintext, void *ciphertext)
 *
 * Same as aes_encrypt_intel, but performs the encryption operation on
 * 8 independent blocks in sequence, exploiting instruction pipelining.
 * This function doesn't support the OpenSSL interface, it's only meant
 * for kernel use.
 */
ENTRY_NP(aes_encrypt_intel8)
        AES_LOAD_INPUT_8BLOCKS          / load input
        movaps (%KEYP), %KEY            / key
        AES_XOR_STATE_8BLOCKS           / round 0

        lea 0x30(%KEYP), %KEYP          / point to key schedule
        cmp $12, %NROUNDS               / determine AES variant
        jb .Lenc8_128
        lea 0x20(%KEYP), %KEYP          / AES192 has larger key schedule
        je .Lenc8_192

        lea 0x20(%KEYP), %KEYP          / AES256 has even larger key schedule
        AES_RND_8BLOCKS(aesenc, -0x60)  / AES256 R.1
        AES_RND_8BLOCKS(aesenc, -0x50)  / AES256 R.2

        .align 4
.Lenc8_192:
        AES_RND_8BLOCKS(aesenc, -0x40)  / AES192 R.1; AES256 R.3
        AES_RND_8BLOCKS(aesenc, -0x30)  / AES192 R.2; AES256 R.4

        .align 4
.Lenc8_128:
        AES_RND_8BLOCKS(aesenc, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
        AES_RND_8BLOCKS(aesenc, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
        AES_RND_8BLOCKS(aesenc, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
        AES_RND_8BLOCKS(aesenc, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
        AES_RND_8BLOCKS(aesenc, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
        AES_RND_8BLOCKS(aesenc, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
        AES_RND_8BLOCKS(aesenc, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
        AES_RND_8BLOCKS(aesenc, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
        AES_RND_8BLOCKS(aesenc, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
        AES_RND_8BLOCKS(aesenclast, 0x70) / AES128 R.10; AES192 R.12; AES256 R.14

        AES_STORE_OUTPUT_8BLOCKS        / store output
        ret
SET_SIZE(aes_encrypt_intel8)
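
/*
 * Hypothetical kernel usage of the 8-block entry points (a sketch; the
 * FPU bracket is as documented for aes_accel_save/restore, and `nr',
 * `in', `out' and `len' are illustrative names):
 *
 *	aes_accel_save(savestate);
 *	while (len >= 8 * 16) {
 *		aes_encrypt_intel8(ks->ks32, nr, in, out);
 *		in += 8 * 16; out += 8 * 16; len -= 8 * 16;
 *	}
 *	/ remaining blocks go through aes_encrypt_intel one at a time
 *	aes_accel_restore(savestate);
 */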


/*
 * void aes_decrypt_intel8(const uint32_t roundkeys[], int numrounds,
 *	const void *ciphertext, void *plaintext)
 *
 * Same as aes_decrypt_intel, but performs the decryption operation on
 * 8 independent blocks in sequence, exploiting instruction pipelining.
 * This function doesn't support the OpenSSL interface, it's only meant
 * for kernel use.
 */
ENTRY_NP(aes_decrypt_intel8)
        AES_LOAD_INPUT_8BLOCKS          / load input
        movaps (%KEYP), %KEY            / key
        AES_XOR_STATE_8BLOCKS           / round 0

        lea 0x30(%KEYP), %KEYP          / point to key schedule
        cmp $12, %NROUNDS               / determine AES variant
        jb .Ldec8_128
        lea 0x20(%KEYP), %KEYP          / AES192 has larger key schedule
        je .Ldec8_192

        lea 0x20(%KEYP), %KEYP          / AES256 has even larger key schedule
        AES_RND_8BLOCKS(aesdec, -0x60)  / AES256 R.1
        AES_RND_8BLOCKS(aesdec, -0x50)  / AES256 R.2

        .align 4
.Ldec8_192:
        AES_RND_8BLOCKS(aesdec, -0x40)  / AES192 R.1; AES256 R.3
        AES_RND_8BLOCKS(aesdec, -0x30)  / AES192 R.2; AES256 R.4

        .align 4
.Ldec8_128:
        AES_RND_8BLOCKS(aesdec, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
        AES_RND_8BLOCKS(aesdec, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
        AES_RND_8BLOCKS(aesdec, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
        AES_RND_8BLOCKS(aesdec, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
        AES_RND_8BLOCKS(aesdec, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
        AES_RND_8BLOCKS(aesdec, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
        AES_RND_8BLOCKS(aesdec, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
        AES_RND_8BLOCKS(aesdec, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
        AES_RND_8BLOCKS(aesdec, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
        AES_RND_8BLOCKS(aesdeclast, 0x70) / AES128 R.10; AES192 R.12; AES256 R.14

        AES_STORE_OUTPUT_8BLOCKS        / store output
        ret
SET_SIZE(aes_decrypt_intel8)


/*
 * This macro encapsulates the entire AES encryption algorithm for a
 * single block, which is prefilled in statereg and which will be
 * replaced by the encrypted output. The KEYP register must already
 * point to the AES128 key schedule ("lea 0x30(%KEYP), %KEYP" from the
 * encryption function call) so that consecutive invocations of this
 * macro are supported (KEYP is restored after each invocation).
 * Since cpp macros cannot generate unique local labels, each invocation
 * must supply its own label_128/label_192/label_out names.
 */
#define AES_ENC(statereg, label_128, label_192, label_out) \
        cmp $12, %NROUNDS; \
        jb label_128; \
        je label_192; \
        /* AES 256 only */ \
        lea 0x40(%KEYP), %KEYP; \
        AES256_ENC_ROUNDS(statereg); \
        AES192_ENC_ROUNDS(statereg); \
        AES128_ENC_ROUNDS(statereg); \
        lea -0x40(%KEYP), %KEYP; \
        jmp label_out; \
        .align 4; \
label_192: \
        lea 0x20(%KEYP), %KEYP; \
        /* AES 192 only */ \
        AES192_ENC_ROUNDS(statereg); \
        AES128_ENC_ROUNDS(statereg); \
        lea -0x20(%KEYP), %KEYP; \
        jmp label_out; \
        .align 4; \
label_128: \
        /* AES 128 only */ \
        AES128_ENC_ROUNDS(statereg); \
        .align 4; \
label_out:


/*
 * void aes_encrypt_cbc_intel8(const uint32_t roundkeys[], int numrounds,
 *	const void *plaintext, void *ciphertext, const void *IV)
 *
 * Encrypts 8 consecutive AES blocks in the CBC mode. Input and output
 * may overlap. This provides a modest performance boost over invoking
 * the encryption and XOR in separate functions because we can avoid
 * copying the ciphertext block to and from memory between encryption
 * and XOR calls.
 */
#define CBC_IV r8               /* input - IV blk pointer */
#define CBC_IV_XMM xmm1         /* tmp IV location for alignment */

ENTRY_NP(aes_encrypt_cbc_intel8)
        AES_LOAD_INPUT_8BLOCKS          / load input
        movaps (%KEYP), %KEY            / key
        AES_XOR_STATE_8BLOCKS           / round 0

        lea 0x30(%KEYP), %KEYP          / point to key schedule
        movdqu (%CBC_IV), %CBC_IV_XMM   / load IV from unaligned memory
        pxor %CBC_IV_XMM, %STATE0       / XOR IV with input block and encrypt
        AES_ENC(STATE0, .Lenc_cbc_0_128, .Lenc_cbc_0_192, .Lenc_cbc_0_out)
        pxor %STATE0, %STATE1
        AES_ENC(STATE1, .Lenc_cbc_1_128, .Lenc_cbc_1_192, .Lenc_cbc_1_out)
        pxor %STATE1, %STATE2
        AES_ENC(STATE2, .Lenc_cbc_2_128, .Lenc_cbc_2_192, .Lenc_cbc_2_out)
        pxor %STATE2, %STATE3
        AES_ENC(STATE3, .Lenc_cbc_3_128, .Lenc_cbc_3_192, .Lenc_cbc_3_out)
        pxor %STATE3, %STATE4
        AES_ENC(STATE4, .Lenc_cbc_4_128, .Lenc_cbc_4_192, .Lenc_cbc_4_out)
        pxor %STATE4, %STATE5
        AES_ENC(STATE5, .Lenc_cbc_5_128, .Lenc_cbc_5_192, .Lenc_cbc_5_out)
        pxor %STATE5, %STATE6
        AES_ENC(STATE6, .Lenc_cbc_6_128, .Lenc_cbc_6_192, .Lenc_cbc_6_out)
        pxor %STATE6, %STATE7
        AES_ENC(STATE7, .Lenc_cbc_7_128, .Lenc_cbc_7_192, .Lenc_cbc_7_out)

        AES_STORE_OUTPUT_8BLOCKS        / store output
        ret
SET_SIZE(aes_encrypt_cbc_intel8)
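
/*
 * The chaining above corresponds to this sketch (the round-0
 * AddRoundKey XOR commutes with the IV/ciphertext XOR, which is why it
 * can be applied to all eight blocks up front):
 *
 *	c[0] = E(p[0] ^ IV);
 *	c[i] = E(p[i] ^ c[i - 1]);	/ i = 1 .. 7
 *
 * Each block depends on the previous ciphertext, so unlike ECB/CTR the
 * eight encryptions cannot be overlapped; the win here is keeping the
 * chain in registers.
 */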

/*
 * Prefills register state with counters suitable for the CTR encryption
 * mode. The counter is assumed to consist of two portions:
 * - A lower monotonically increasing 64-bit counter. If the caller wants
 *   a smaller counter, they are responsible for checking that it doesn't
 *   overflow between encryption calls.
 * - An upper static "nonce" portion, in big endian, preloaded into the
 *   lower portion of an XMM register.
 * This macro adds `ctridx' to the lower_LE counter and swaps it to big
 * endian; by way of a temporary general-purpose register it then loads
 * the lower and upper counter portions into a target XMM result
 * register, which can be handed off to the encryption process.
 */
#define PREP_CTR_BLOCKS(lower_LE, upper_BE_xmm, ctridx, tmpreg, resreg) \
        lea ctridx(%lower_LE), %tmpreg; \
        bswap %tmpreg; \
        movq %tmpreg, %resreg; \
        movlhps %upper_BE_xmm, %resreg; \
        pshufd $0b01001110, %resreg, %resreg
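
/*
 * E.g. (a sketch) with the nonce N preloaded into upper_BE_xmm, lower
 * counter c and ctridx i, the result register ends up holding the
 * counter block laid out in memory order as N (bytes 0-7) followed by
 * the big-endian value c + i (bytes 8-15), i.e. a 128-bit big-endian
 * counter ready for encryption.
 */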

#define CTR_UPPER_BE r8         /* input - counter upper 64 bits (BE) */
#define CTR_UPPER_BE_XMM xmm1   /* tmp for upper counter bits */
#define CTR_LOWER_LE r9         /* input - counter lower 64 bits (LE) */
#define CTR_TMP0 rax            /* tmp for lower 64 bit add & bswap */
#define CTR_TMP1 rbx            /* tmp for lower 64 bit add & bswap */
#define CTR_TMP2 r10            /* tmp for lower 64 bit add & bswap */
#define CTR_TMP3 r11            /* tmp for lower 64 bit add & bswap */
#define CTR_TMP4 r12            /* tmp for lower 64 bit add & bswap */
#define CTR_TMP5 r13            /* tmp for lower 64 bit add & bswap */
#define CTR_TMP6 r14            /* tmp for lower 64 bit add & bswap */
#define CTR_TMP7 r15            /* tmp for lower 64 bit add & bswap */

/*
 * These hold the CTR encryption input when it is unaligned and must be
 * loaded before XORing. Must not overlap with any STATE[0-7] register.
 */
#define TMP_INPUT0 xmm0
#define TMP_INPUT1 xmm1
#define TMP_INPUT2 xmm2
#define TMP_INPUT3 xmm3
#define TMP_INPUT4 xmm4
#define TMP_INPUT5 xmm5
#define TMP_INPUT6 xmm6
#define TMP_INPUT7 xmm7

/*
 * void aes_ctr_intel8(const uint32_t roundkeys[], int numrounds,
 *	const void *input, void *output, uint64_t counter_upper_BE,
 *	uint64_t counter_lower_LE)
 *
 * Runs AES on 8 consecutive blocks in counter mode (encryption and
 * decryption in counter mode are the same).
 */
ENTRY_NP(aes_ctr_intel8)
        /* save caller's regs */
        pushq %rbp
        movq %rsp, %rbp
        subq $0x38, %rsp
        / CTR_TMP0 is rax, no need to save
        movq %CTR_TMP1, -0x38(%rbp)
        movq %CTR_TMP2, -0x30(%rbp)
        movq %CTR_TMP3, -0x28(%rbp)
        movq %CTR_TMP4, -0x20(%rbp)
        movq %CTR_TMP5, -0x18(%rbp)
        movq %CTR_TMP6, -0x10(%rbp)
        movq %CTR_TMP7, -0x08(%rbp)

        /*
         * CTR step 1: prepare big-endian formatted 128-bit counter values,
         * placing the result in the AES-NI input state registers.
         */
        movq %CTR_UPPER_BE, %CTR_UPPER_BE_XMM
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 0, CTR_TMP0, STATE0)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 1, CTR_TMP1, STATE1)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 2, CTR_TMP2, STATE2)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 3, CTR_TMP3, STATE3)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 4, CTR_TMP4, STATE4)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 5, CTR_TMP5, STATE5)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 6, CTR_TMP6, STATE6)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 7, CTR_TMP7, STATE7)

        /*
         * CTR step 2: Encrypt the counters.
         */
        movaps (%KEYP), %KEY            / key
        AES_XOR_STATE_8BLOCKS           / round 0

        /* Determine the AES variant we're going to compute */
        lea 0x30(%KEYP), %KEYP          / point to key schedule
        cmp $12, %NROUNDS               / determine AES variant
        jb .Lctr8_128
        lea 0x20(%KEYP), %KEYP          / AES192 has larger key schedule
        je .Lctr8_192

        /* AES 256 */
        lea 0x20(%KEYP), %KEYP          / AES256 has even larger key schedule
        AES_RND_8BLOCKS(aesenc, -0x60)  / AES256 R.1
        AES_RND_8BLOCKS(aesenc, -0x50)  / AES256 R.2

        .align 4
.Lctr8_192:
        /* AES 192 and 256 */
        AES_RND_8BLOCKS(aesenc, -0x40)  / AES192 R.1; AES256 R.3
        AES_RND_8BLOCKS(aesenc, -0x30)  / AES192 R.2; AES256 R.4

        .align 4
.Lctr8_128:
        /* AES 128, 192, and 256 */
        AES_RND_8BLOCKS(aesenc, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
        AES_RND_8BLOCKS(aesenc, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
        AES_RND_8BLOCKS(aesenc, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
        AES_RND_8BLOCKS(aesenc, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
        AES_RND_8BLOCKS(aesenc, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
        AES_RND_8BLOCKS(aesenc, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
        AES_RND_8BLOCKS(aesenc, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
        AES_RND_8BLOCKS(aesenc, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
        AES_RND_8BLOCKS(aesenc, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
        AES_RND_8BLOCKS(aesenclast, 0x70) / AES128 R.10; AES192 R.12; AES256 R.14

        /*
         * CTR step 3: XOR input data blocks with encrypted counters to
         * produce result.
         */
        mov %INP, %rax                  / pxor requires alignment, so check
        andq $0xf, %rax
        jnz .Lctr_input_unaligned
        pxor 0x00(%INP), %STATE0
        pxor 0x10(%INP), %STATE1
        pxor 0x20(%INP), %STATE2
        pxor 0x30(%INP), %STATE3
        pxor 0x40(%INP), %STATE4
        pxor 0x50(%INP), %STATE5
        pxor 0x60(%INP), %STATE6
        pxor 0x70(%INP), %STATE7
        jmp .Lctr_out

        .align 4
.Lctr_input_unaligned:
        movdqu 0x00(%INP), %TMP_INPUT0
        movdqu 0x10(%INP), %TMP_INPUT1
        movdqu 0x20(%INP), %TMP_INPUT2
        movdqu 0x30(%INP), %TMP_INPUT3
        movdqu 0x40(%INP), %TMP_INPUT4
        movdqu 0x50(%INP), %TMP_INPUT5
        movdqu 0x60(%INP), %TMP_INPUT6
        movdqu 0x70(%INP), %TMP_INPUT7
        pxor %TMP_INPUT0, %STATE0
        pxor %TMP_INPUT1, %STATE1
        pxor %TMP_INPUT2, %STATE2
        pxor %TMP_INPUT3, %STATE3
        pxor %TMP_INPUT4, %STATE4
        pxor %TMP_INPUT5, %STATE5
        pxor %TMP_INPUT6, %STATE6
        pxor %TMP_INPUT7, %STATE7

        .align 4
.Lctr_out:
        /*
         * CTR step 4: Write out processed blocks to memory.
         */
        movdqu %STATE0, 0x00(%OUTP)
        movdqu %STATE1, 0x10(%OUTP)
        movdqu %STATE2, 0x20(%OUTP)
        movdqu %STATE3, 0x30(%OUTP)
        movdqu %STATE4, 0x40(%OUTP)
        movdqu %STATE5, 0x50(%OUTP)
        movdqu %STATE6, 0x60(%OUTP)
        movdqu %STATE7, 0x70(%OUTP)

        /* restore caller's regs */
        / CTR_TMP0 is rax, no need to restore
        movq -0x38(%rbp), %CTR_TMP1
        movq -0x30(%rbp), %CTR_TMP2
        movq -0x28(%rbp), %CTR_TMP3
        movq -0x20(%rbp), %CTR_TMP4
        movq -0x18(%rbp), %CTR_TMP5
        movq -0x10(%rbp), %CTR_TMP6
        movq -0x08(%rbp), %CTR_TMP7
        leave
        ret
SET_SIZE(aes_ctr_intel8)
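
/*
 * C model of what one aes_ctr_intel8 call computes (a sketch; E_k is
 * the block cipher under the expanded key and `ctr' the lower counter):
 *
 *	for (i = 0; i < 8; i++)
 *		out[i] = in[i] ^ E_k(nonce_BE || bswap64(ctr + i));
 *
 * A caller processing longer streams would advance the lower counter by
 * 8 between calls and handle any tail blocks separately.
 */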

#endif  /* lint || __lint */