4896 Performance improvements for KCF AES modes
--- old/usr/src/common/crypto/aes/amd64/aes_intel.s
+++ new/usr/src/common/crypto/aes/amd64/aes_intel.s
1 1 /*
2 2 * ====================================================================
3 3 * Written by Intel Corporation for the OpenSSL project to add support
4 4 * for Intel AES-NI instructions. Rights for redistribution and usage
5 5 * in source and binary forms are granted according to the OpenSSL
6 6 * license.
7 7 *
8 8 * Author: Huang Ying <ying.huang at intel dot com>
9 9 * Vinodh Gopal <vinodh.gopal at intel dot com>
10 10 * Kahraman Akdemir
11 11 *
12 12 * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
13 13 * instructions that are going to be introduced in the next generation
14 14 * of Intel processor, as of 2009. These instructions enable fast and
15 15 * secure data encryption and decryption, using the Advanced Encryption
16 16 * Standard (AES), defined by FIPS Publication number 197. The
17 17 * architecture introduces six instructions that offer full hardware
18 18 * support for AES. Four of them support high performance data
19 19 * encryption and decryption, and the other two instructions support
20 20 * the AES key expansion procedure.
21 21 * ====================================================================
22 22 */
23 23
24 24 /*
25 25 * ====================================================================
26 26 * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved.
27 27 *
28 28 * Redistribution and use in source and binary forms, with or without
29 29 * modification, are permitted provided that the following conditions
30 30 * are met:
31 31 *
32 32 * 1. Redistributions of source code must retain the above copyright
33 33 * notice, this list of conditions and the following disclaimer.
34 34 *
35 35 * 2. Redistributions in binary form must reproduce the above copyright
36 36 * notice, this list of conditions and the following disclaimer in
37 37 * the documentation and/or other materials provided with the
38 38 * distribution.
39 39 *
40 40 * 3. All advertising materials mentioning features or use of this
41 41 * software must display the following acknowledgment:
42 42 * "This product includes software developed by the OpenSSL Project
43 43 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
44 44 *
45 45 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
46 46 * endorse or promote products derived from this software without
47 47 * prior written permission. For written permission, please contact
48 48 * openssl-core@openssl.org.
49 49 *
50 50 * 5. Products derived from this software may not be called "OpenSSL"
51 51 * nor may "OpenSSL" appear in their names without prior written
52 52 * permission of the OpenSSL Project.
53 53 *
54 54 * 6. Redistributions of any form whatsoever must retain the following
55 55 * acknowledgment:
56 56 * "This product includes software developed by the OpenSSL Project
57 57 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
58 58 *
59 59 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
60 60 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
62 62 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
63 63 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
64 64 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
65 65 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
66 66 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67 67 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
68 68 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
69 69 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
70 70 * OF THE POSSIBILITY OF SUCH DAMAGE.
71 71 * ====================================================================
72 72 */
73 73
74 74 /*
75 75 * ====================================================================
76 76 * OpenSolaris OS modifications
77 77 *
78 78 * This source originates as files aes-intel.S and eng_aesni_asm.pl, in
79 79 * patches sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by
80 80 * Huang Ying of Intel to the openssl-dev mailing list under the subject
81 81 * of "Add support to Intel AES-NI instruction set for x86_64 platform".
82 82 *
83 83 * This OpenSolaris version has these major changes from the original source:
84 84 *
85 85 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
86 86 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
87 87 * definitions for lint.
88 88 *
89 89 * 2. Formatted code, added comments, and added #includes and #defines.
90 90 *
91 91 * 3. If bit CR0.TS is set, clear the TS bit after calling
92 92 * kpreempt_disable() and set it again before calling kpreempt_enable().
93 93 * If the TS bit is not set, save and restore the %xmm registers at the
94 94 * beginning and end of function calls (%xmm* registers are not saved
95 95 * and restored during kernel thread preemption).
96 96 *
97 97 * 4. Renamed functions, reordered parameters, and changed return value
98 98 * to match OpenSolaris:
99 99 *
100 100 * OpenSSL interface:
101 101 * int intel_AES_set_encrypt_key(const unsigned char *userKey,
102 102 * const int bits, AES_KEY *key);
103 103 * int intel_AES_set_decrypt_key(const unsigned char *userKey,
104 104 * const int bits, AES_KEY *key);
105 105 * Return values for above are non-zero on error, 0 on success.
106 106 *
107 107 * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
108 108 * const AES_KEY *key);
109 109 * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
110 110 * const AES_KEY *key);
111 111 * typedef struct aes_key_st {
112 112 * unsigned int rd_key[4 *(AES_MAXNR + 1)];
113 113 * int rounds;
114 114 * unsigned int pad[3];
115 115 * } AES_KEY;
116 116 * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules
117 117 * (ks32) instead of 64-bit (ks64)).
118 118 * Number of rounds (aka round count) is at offset 240 of AES_KEY.
119 119 *
120 120 * OpenSolaris OS interface (#ifdefs removed for readability):
121 121 * int rijndael_key_setup_dec_intel(uint32_t rk[],
122 122 * const uint32_t cipherKey[], uint64_t keyBits);
123 123 * int rijndael_key_setup_enc_intel(uint32_t rk[],
124 124 * const uint32_t cipherKey[], uint64_t keyBits);
125 125 * Return values for above are 0 on error, number of rounds on success.
126 126 *
127 127 * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
128 128 * const uint32_t pt[4], uint32_t ct[4]);
129 129 * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
130 130 * const uint32_t pt[4], uint32_t ct[4]);
131 131 * typedef union {
132 132 * uint64_t ks64[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
133 133 * uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
134 134 * } aes_ks_t;
135 135 *
137 137 * typedef struct aes_key {
138 138 * aes_ks_t encr_ks, decr_ks;
139 139 * long double align128;
140 140 * int flags, nr, type;
141 141 * } aes_key_t;
142 142 *
143 143 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plaintext,
144 144 * ct is ciphertext, and MAX_AES_NR is 14.
145 145 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
146 146 *
147 147 * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary.
148 148 *
149 149 * ====================================================================
150 150 */
151 +/*
152 + * Copyright 2015 by Saso Kiselkov. All rights reserved.
153 + */
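To make the OpenSolaris calling convention documented above concrete, here is a hedged sketch of a round-trip through key setup and single-block encryption (user-level style; the wrapper function is hypothetical, and the prototypes are transcribed from the comments and lint stubs in this file):

#include <stdint.h>

#define	MAX_AES_NR	14	/* per the note above */
#define	MAX_AES_NB	4

typedef union {
	uint64_t ks64[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
	uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
} aes_ks_t;

int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
    uint64_t keyBits);
void aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
    uint32_t ct[4]);

/* Hypothetical wrapper: AES-256, one block. */
static int
encrypt_one_block(const uint32_t key[8], const uint32_t pt[4], uint32_t ct[4])
{
	aes_ks_t ks __attribute__((aligned(128)));	/* see Note2 above */
	int nr = rijndael_key_setup_enc_intel(ks.ks32, key, 256);

	if (nr == 0)	/* 0 on error, number of rounds on success */
		return (-1);
	aes_encrypt_intel(ks.ks32, nr, pt, ct);
	return (0);
}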
151 154
152 155 #if defined(lint) || defined(__lint)
153 156
154 157 #include <sys/types.h>
155 158
156 159 /* ARGSUSED */
157 160 void
158 161 aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
159 162 uint32_t ct[4]) {
160 163 }
161 164 /* ARGSUSED */
162 165 void
163 166 aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
164 167 uint32_t pt[4]) {
165 168 }
166 169 /* ARGSUSED */
167 170 int
168 171 rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
169 172 uint64_t keyBits) {
170 173 return (0);
171 174 }
172 175 /* ARGSUSED */
173 176 int
174 177 rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
175 178 uint64_t keyBits) {
176 179 return (0);
177 180 }
178 181
179 182
180 183 #else /* lint */
181 184
182 185 #include <sys/asm_linkage.h>
183 186 #include <sys/controlregs.h>
184 187 #ifdef _KERNEL
185 188 #include <sys/machprivregs.h>
186 189 #endif
187 190
188 191 #ifdef _KERNEL
189 192 /*
190 193 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is,
191 194 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
192 195 * uses it to pass P2 to syscall.
193 196 * This also occurs with the STTS macro, but we don't care if
194 197 * P2 (%rsi) is modified just before function exit.
195 198 * The CLTS and STTS macros push and pop P1 (%rdi) already.
196 199 */
197 200 #ifdef __xpv
198 201 #define PROTECTED_CLTS \
199 202 push %rsi; \
200 203 CLTS; \
201 204 pop %rsi
202 205 #else
203 206 #define PROTECTED_CLTS \
204 207 CLTS
205 208 #endif /* __xpv */
206 209
207 210 #define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg) \
208 211 push %rbp; \
209 212 mov %rsp, %rbp; \
210 213 movq %cr0, tmpreg; \
211 214 testq $CR0_TS, tmpreg; \
212 215 jnz 1f; \
213 216 and $-XMM_ALIGN, %rsp; \
214 217 sub $[XMM_SIZE * 2], %rsp; \
215 218 movaps %xmm0, 16(%rsp); \
216 219 movaps %xmm1, (%rsp); \
217 220 jmp 2f; \
218 221 1: \
219 222 PROTECTED_CLTS; \
220 223 2:
221 224
222 225 /*
223 226 * If CR0_TS was not set above, pop %xmm0 and %xmm1 off stack,
224 227 * otherwise set CR0_TS.
225 228 */
226 229 #define SET_TS_OR_POP_XMM0_XMM1(tmpreg) \
227 230 testq $CR0_TS, tmpreg; \
228 231 jnz 1f; \
229 232 movaps (%rsp), %xmm1; \
230 233 movaps 16(%rsp), %xmm0; \
231 234 jmp 2f; \
232 235 1: \
233 236 STTS(tmpreg); \
234 237 2: \
235 238 mov %rbp, %rsp; \
236 239 pop %rbp
237 240
238 241 /*
239 242 * If CR0_TS is not set, align stack (with push %rbp) and push
240 243 * %xmm0 - %xmm6 on stack, otherwise clear CR0_TS
241 244 */
242 245 #define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg) \
243 246 push %rbp; \
244 247 mov %rsp, %rbp; \
245 248 movq %cr0, tmpreg; \
246 249 testq $CR0_TS, tmpreg; \
247 250 jnz 1f; \
248 251 and $-XMM_ALIGN, %rsp; \
249 252 sub $[XMM_SIZE * 7], %rsp; \
250 253 movaps %xmm0, 96(%rsp); \
251 254 movaps %xmm1, 80(%rsp); \
252 255 movaps %xmm2, 64(%rsp); \
253 256 movaps %xmm3, 48(%rsp); \
254 257 movaps %xmm4, 32(%rsp); \
255 258 movaps %xmm5, 16(%rsp); \
256 259 movaps %xmm6, (%rsp); \
257 260 jmp 2f; \
258 261 1: \
259 262 PROTECTED_CLTS; \
260 263 2:
261 264
262 265
263 266 /*
264 267 * If CR0_TS was not set above, pop %xmm0 - %xmm6 off stack,
265 268 * otherwise set CR0_TS.
266 269 */
267 270 #define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) \
268 271 testq $CR0_TS, tmpreg; \
269 272 jnz 1f; \
270 273 movaps (%rsp), %xmm6; \
271 274 movaps 16(%rsp), %xmm5; \
272 275 movaps 32(%rsp), %xmm4; \
273 276 movaps 48(%rsp), %xmm3; \
274 277 movaps 64(%rsp), %xmm2; \
275 278 movaps 80(%rsp), %xmm1; \
276 279 movaps 96(%rsp), %xmm0; \
277 280 jmp 2f; \
278 281 1: \
279 282 STTS(tmpreg); \
280 283 2: \
281 284 mov %rbp, %rsp; \
282 285 pop %rbp
283 286
287 +/*
288 + * void aes_accel_save(void *savestate);
289 + *
290 + * Saves all 16 XMM registers and CR0 to a temporary location pointed to
291 + * by the first argument and clears TS in CR0. This must be invoked before
292 + * executing any floating point operations inside the kernel (and kernel
293 + * thread preemption must be disabled as well). The memory region to which
294 + * all state is saved must be at least 264 bytes long (16 x 128-bit XMM
295 + * slots plus a 64-bit slot for CR0) and must be 128-bit aligned.
296 + */
297 +ENTRY_NP(aes_accel_save)
298 + movq %cr0, %rax
299 + movq %rax, 0x100(%rdi)
300 + testq $CR0_TS, %rax
301 + jnz 1f
302 + movaps %xmm0, 0x00(%rdi)
303 + movaps %xmm1, 0x10(%rdi)
304 + movaps %xmm2, 0x20(%rdi)
305 + movaps %xmm3, 0x30(%rdi)
306 + movaps %xmm4, 0x40(%rdi)
307 + movaps %xmm5, 0x50(%rdi)
308 + movaps %xmm6, 0x60(%rdi)
309 + movaps %xmm7, 0x70(%rdi)
310 + movaps %xmm8, 0x80(%rdi)
311 + movaps %xmm9, 0x90(%rdi)
312 + movaps %xmm10, 0xa0(%rdi)
313 + movaps %xmm11, 0xb0(%rdi)
314 + movaps %xmm12, 0xc0(%rdi)
315 + movaps %xmm13, 0xd0(%rdi)
316 + movaps %xmm14, 0xe0(%rdi)
317 + movaps %xmm15, 0xf0(%rdi)
318 + ret
319 +1:
320 + PROTECTED_CLTS
321 + ret
322 + SET_SIZE(aes_accel_save)
284 323
324 +/*
325 + * void aes_accel_restore(void *savestate);
326 + *
327 + * Restores the saved XMM and CR0.TS state from aes_accel_save.
328 + */
329 +ENTRY_NP(aes_accel_restore)
330 + mov 0x100(%rdi), %rax
331 + testq $CR0_TS, %rax
332 + jnz 1f
333 + movaps 0x00(%rdi), %xmm0
334 + movaps 0x10(%rdi), %xmm1
335 + movaps 0x20(%rdi), %xmm2
336 + movaps 0x30(%rdi), %xmm3
337 + movaps 0x40(%rdi), %xmm4
338 + movaps 0x50(%rdi), %xmm5
339 + movaps 0x60(%rdi), %xmm6
340 + movaps 0x70(%rdi), %xmm7
341 + movaps 0x80(%rdi), %xmm8
342 + movaps 0x90(%rdi), %xmm9
343 + movaps 0xa0(%rdi), %xmm10
344 + movaps 0xb0(%rdi), %xmm11
345 + movaps 0xc0(%rdi), %xmm12
346 + movaps 0xd0(%rdi), %xmm13
347 + movaps 0xe0(%rdi), %xmm14
348 + movaps 0xf0(%rdi), %xmm15
349 + ret
350 +1:
351 + STTS(%rax)
352 + ret
353 + SET_SIZE(aes_accel_restore)
354 +
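For kernel callers, the pair above replaces the per-call CLEAR_TS/SET_TS macros: save once, run as many AES calls as needed, restore once. A hedged sketch of the required bracketing (the wrapper is hypothetical; the 264-byte size follows from the sixteen 16-byte XMM slots plus the CR0 word stored at offset 0x100 above):

#include <stdint.h>
#include <sys/systm.h>		/* assumed home of kpreempt_disable() */

void aes_accel_save(void *savestate);
void aes_accel_restore(void *savestate);
void aes_encrypt_intel8(const uint32_t rk[], int nr, const void *pt, void *ct);

static void
encrypt8_fpu_safe(const uint32_t rk[], int nr, const void *pt, void *ct)
{
	uint8_t save[264] __attribute__((aligned(16)));	/* 16 XMM + CR0 */

	kpreempt_disable();	/* no thread switch while FPU state is live */
	aes_accel_save(save);	/* stash %xmm0-%xmm15 and CR0.TS */
	aes_encrypt_intel8(rk, nr, pt, ct);
	aes_accel_restore(save);
	kpreempt_enable();
}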
285 355 #else
286 356 #define PROTECTED_CLTS
287 357 #define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
288 358 #define SET_TS_OR_POP_XMM0_XMM1(tmpreg)
289 359 #define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)
290 360 #define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg)
291 361 #endif /* _KERNEL */
292 362
293 363
294 364 /*
295 365 * _key_expansion_128(), * _key_expansion_192a(), _key_expansion_192b(),
296 366 * _key_expansion_256a(), _key_expansion_256b()
297 367 *
298 368 * Helper functions called by rijndael_key_setup_enc_intel().
299 369 * Also used indirectly by rijndael_key_setup_dec_intel().
300 370 *
301 371 * Input:
302 372 * %xmm0 User-provided cipher key
303 373 * %xmm1 Round constant
304 374 * Output:
305 375 * (%rcx) AES key
306 376 */
307 377
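For readers who know the intrinsics form better: with %xmm4 pre-zeroed, the shufps/pxor sequences below are equivalent in effect to the classic three-shift formulation of a key-schedule step. A hedged sketch for the 128-bit case (not the kernel's code):

#include <wmmintrin.h>	/* AES-NI intrinsics */

/* One AES-128 round-key step; kg is the aeskeygenassist output. */
static __m128i
key_expansion_128_sketch(__m128i key, __m128i kg)
{
	kg = _mm_shuffle_epi32(kg, 0xff);	/* broadcast rot/sub word */
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	return (_mm_xor_si128(key, kg));
}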
308 378 .align 16
309 379 _key_expansion_128:
310 380 _key_expansion_256a:
311 381 pshufd $0b11111111, %xmm1, %xmm1
312 382 shufps $0b00010000, %xmm0, %xmm4
313 383 pxor %xmm4, %xmm0
314 384 shufps $0b10001100, %xmm0, %xmm4
315 385 pxor %xmm4, %xmm0
316 386 pxor %xmm1, %xmm0
317 387 movaps %xmm0, (%rcx)
318 388 add $0x10, %rcx
319 389 ret
320 390 SET_SIZE(_key_expansion_128)
321 391 SET_SIZE(_key_expansion_256a)
322 392
323 393 .align 16
324 394 _key_expansion_192a:
325 395 pshufd $0b01010101, %xmm1, %xmm1
326 396 shufps $0b00010000, %xmm0, %xmm4
327 397 pxor %xmm4, %xmm0
328 398 shufps $0b10001100, %xmm0, %xmm4
329 399 pxor %xmm4, %xmm0
330 400 pxor %xmm1, %xmm0
331 401
332 402 movaps %xmm2, %xmm5
333 403 movaps %xmm2, %xmm6
334 404 pslldq $4, %xmm5
335 405 pshufd $0b11111111, %xmm0, %xmm3
336 406 pxor %xmm3, %xmm2
337 407 pxor %xmm5, %xmm2
338 408
339 409 movaps %xmm0, %xmm1
340 410 shufps $0b01000100, %xmm0, %xmm6
341 411 movaps %xmm6, (%rcx)
342 412 shufps $0b01001110, %xmm2, %xmm1
343 413 movaps %xmm1, 0x10(%rcx)
344 414 add $0x20, %rcx
345 415 ret
346 416 SET_SIZE(_key_expansion_192a)
347 417
348 418 .align 16
349 419 _key_expansion_192b:
350 420 pshufd $0b01010101, %xmm1, %xmm1
351 421 shufps $0b00010000, %xmm0, %xmm4
352 422 pxor %xmm4, %xmm0
353 423 shufps $0b10001100, %xmm0, %xmm4
354 424 pxor %xmm4, %xmm0
355 425 pxor %xmm1, %xmm0
356 426
357 427 movaps %xmm2, %xmm5
358 428 pslldq $4, %xmm5
359 429 pshufd $0b11111111, %xmm0, %xmm3
360 430 pxor %xmm3, %xmm2
361 431 pxor %xmm5, %xmm2
362 432
363 433 movaps %xmm0, (%rcx)
364 434 add $0x10, %rcx
365 435 ret
366 436 SET_SIZE(_key_expansion_192b)
367 437
368 438 .align 16
369 439 _key_expansion_256b:
370 440 pshufd $0b10101010, %xmm1, %xmm1
371 441 shufps $0b00010000, %xmm2, %xmm4
372 442 pxor %xmm4, %xmm2
373 443 shufps $0b10001100, %xmm2, %xmm4
374 444 pxor %xmm4, %xmm2
375 445 pxor %xmm1, %xmm2
376 446 movaps %xmm2, (%rcx)
377 447 add $0x10, %rcx
378 448 ret
379 449 SET_SIZE(_key_expansion_256b)
380 450
451 +/*
452 + * void aes_copy_intel(const uint8_t *src, uint8_t *dst);
453 + *
454 + * Copies one unaligned 128-bit block from `src' to `dst'. The copy is
455 + * performed using FPU registers, so make sure FPU state is saved when
456 + * running this in the kernel.
457 + */
458 +ENTRY_NP(aes_copy_intel)
459 + movdqu (%rdi), %xmm0
460 + movdqu %xmm0, (%rsi)
461 + ret
462 + SET_SIZE(aes_copy_intel)
381 463
382 464 /*
465 + * void aes_xor_intel(const uint8_t *src, uint8_t *dst);
466 + *
467 + * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
468 + * stores the result at `dst'. The XOR is performed using FPU registers,
469 + * so make sure FPU state is saved when running this in the kernel.
470 + */
471 +ENTRY_NP(aes_xor_intel)
472 + movdqu (%rdi), %xmm0
473 + movdqu (%rsi), %xmm1
474 + pxor %xmm1, %xmm0
475 + movdqu %xmm0, (%rsi)
476 + ret
477 + SET_SIZE(aes_xor_intel)
478 +
479 +/*
480 + * void aes_xor_intel8(const uint8_t *src, uint8_t *dst);
481 + *
482 + * XORs eight pairs of consecutive unaligned 128-bit blocks from `src' and
483 + * `dst' and stores the results at `dst'. The XOR is performed using FPU
484 + * registers, so make sure FPU state is saved when running this in the kernel.
485 + */
486 +ENTRY_NP(aes_xor_intel8)
487 + movdqu 0x00(%rdi), %xmm0
488 + movdqu 0x00(%rsi), %xmm1
489 + movdqu 0x10(%rdi), %xmm2
490 + movdqu 0x10(%rsi), %xmm3
491 + movdqu 0x20(%rdi), %xmm4
492 + movdqu 0x20(%rsi), %xmm5
493 + movdqu 0x30(%rdi), %xmm6
494 + movdqu 0x30(%rsi), %xmm7
495 + movdqu 0x40(%rdi), %xmm8
496 + movdqu 0x40(%rsi), %xmm9
497 + movdqu 0x50(%rdi), %xmm10
498 + movdqu 0x50(%rsi), %xmm11
499 + movdqu 0x60(%rdi), %xmm12
500 + movdqu 0x60(%rsi), %xmm13
501 + movdqu 0x70(%rdi), %xmm14
502 + movdqu 0x70(%rsi), %xmm15
503 + pxor %xmm1, %xmm0
504 + pxor %xmm3, %xmm2
505 + pxor %xmm5, %xmm4
506 + pxor %xmm7, %xmm6
507 + pxor %xmm9, %xmm8
508 + pxor %xmm11, %xmm10
509 + pxor %xmm13, %xmm12
510 + pxor %xmm15, %xmm14
511 + movdqu %xmm0, 0x00(%rsi)
512 + movdqu %xmm2, 0x10(%rsi)
513 + movdqu %xmm4, 0x20(%rsi)
514 + movdqu %xmm6, 0x30(%rsi)
515 + movdqu %xmm8, 0x40(%rsi)
516 + movdqu %xmm10, 0x50(%rsi)
517 + movdqu %xmm12, 0x60(%rsi)
518 + movdqu %xmm14, 0x70(%rsi)
519 + ret
520 + SET_SIZE(aes_xor_intel8)
521 +
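In plain C, the block helpers above have the following semantics (a sketch of effect only; the real routines do the work entirely in XMM registers, which is what lets callers avoid extra memory traffic):

#include <stdint.h>
#include <string.h>

/* aes_copy_intel: copy one 128-bit block. */
static void
aes_copy_sketch(const uint8_t *src, uint8_t *dst)
{
	(void) memcpy(dst, src, 16);
}

/* aes_xor_intel8: XOR eight consecutive 128-bit blocks into dst. */
static void
aes_xor8_sketch(const uint8_t *src, uint8_t *dst)
{
	for (int i = 0; i < 8 * 16; i++)
		dst[i] ^= src[i];
}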
522 +/*
383 523 * rijndael_key_setup_enc_intel()
384 524 * Expand the cipher key into the encryption key schedule.
385 525 *
386 526 * For kernel code, caller is responsible for ensuring kpreempt_disable()
387 527 * has been called. This is because %xmm registers are not saved/restored.
388 528 * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
389 529 * on entry. Otherwise, if TS is not set, save and restore %xmm registers
390 530 * on the stack.
391 531 *
392 532 * OpenSolaris interface:
393 533 * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
394 534 * uint64_t keyBits);
395 535 * Return value is 0 on error, number of rounds on success.
396 536 *
397 537 * Original Intel OpenSSL interface:
398 538 * int intel_AES_set_encrypt_key(const unsigned char *userKey,
399 539 * const int bits, AES_KEY *key);
400 540 * Return value is non-zero on error, 0 on success.
401 541 */
402 542
403 543 #ifdef OPENSSL_INTERFACE
404 544 #define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key
405 545 #define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key
406 546
407 547 #define USERCIPHERKEY rdi /* P1, 64 bits */
408 548 #define KEYSIZE32 esi /* P2, 32 bits */
409 549 #define KEYSIZE64 rsi /* P2, 64 bits */
410 550 #define AESKEY rdx /* P3, 64 bits */
411 551
412 552 #else /* OpenSolaris Interface */
413 553 #define AESKEY rdi /* P1, 64 bits */
414 554 #define USERCIPHERKEY rsi /* P2, 64 bits */
415 555 #define KEYSIZE32 edx /* P3, 32 bits */
416 556 #define KEYSIZE64 rdx /* P3, 64 bits */
417 557 #endif /* OPENSSL_INTERFACE */
418 558
419 559 #define ROUNDS32 KEYSIZE32 /* temp */
420 560 #define ROUNDS64 KEYSIZE64 /* temp */
421 561 #define ENDAESKEY USERCIPHERKEY /* temp */
422 562
423 563
424 564 ENTRY_NP(rijndael_key_setup_enc_intel)
425 565 CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(%r10)
426 566
427 567 / NULL pointer sanity check
428 568 test %USERCIPHERKEY, %USERCIPHERKEY
429 569 jz .Lenc_key_invalid_param
430 570 test %AESKEY, %AESKEY
431 571 jz .Lenc_key_invalid_param
432 572
433 573 movups (%USERCIPHERKEY), %xmm0 / user key (first 16 bytes)
434 574 movaps %xmm0, (%AESKEY)
435 575 lea 0x10(%AESKEY), %rcx / key addr
436 576 pxor %xmm4, %xmm4 / xmm4 is assumed 0 in _key_expansion_x
437 577
438 578 cmp $256, %KEYSIZE32
439 579 jnz .Lenc_key192
440 580
441 581 / AES 256: 14 rounds in encryption key schedule
442 582 #ifdef OPENSSL_INTERFACE
443 583 mov $14, %ROUNDS32
444 584 movl %ROUNDS32, 240(%AESKEY) / key.rounds = 14
445 585 #endif /* OPENSSL_INTERFACE */
446 586
447 587 movups 0x10(%USERCIPHERKEY), %xmm2 / other user key (2nd 16 bytes)
448 588 movaps %xmm2, (%rcx)
449 589 add $0x10, %rcx
450 590
451 591 aeskeygenassist $0x1, %xmm2, %xmm1 / expand the key
452 592 call _key_expansion_256a
453 593 aeskeygenassist $0x1, %xmm0, %xmm1
454 594 call _key_expansion_256b
455 595 aeskeygenassist $0x2, %xmm2, %xmm1 / expand the key
456 596 call _key_expansion_256a
457 597 aeskeygenassist $0x2, %xmm0, %xmm1
458 598 call _key_expansion_256b
459 599 aeskeygenassist $0x4, %xmm2, %xmm1 / expand the key
460 600 call _key_expansion_256a
461 601 aeskeygenassist $0x4, %xmm0, %xmm1
462 602 call _key_expansion_256b
463 603 aeskeygenassist $0x8, %xmm2, %xmm1 / expand the key
464 604 call _key_expansion_256a
465 605 aeskeygenassist $0x8, %xmm0, %xmm1
466 606 call _key_expansion_256b
467 607 aeskeygenassist $0x10, %xmm2, %xmm1 / expand the key
468 608 call _key_expansion_256a
469 609 aeskeygenassist $0x10, %xmm0, %xmm1
470 610 call _key_expansion_256b
471 611 aeskeygenassist $0x20, %xmm2, %xmm1 / expand the key
472 612 call _key_expansion_256a
473 613 aeskeygenassist $0x20, %xmm0, %xmm1
474 614 call _key_expansion_256b
475 615 aeskeygenassist $0x40, %xmm2, %xmm1 / expand the key
476 616 call _key_expansion_256a
477 617
478 618 SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
479 619 #ifdef OPENSSL_INTERFACE
480 620 xor %rax, %rax / return 0 (OK)
481 621 #else /* OpenSolaris Interface */
482 622 mov $14, %rax / return # rounds = 14
483 623 #endif
484 624 ret
485 625
486 626 .align 4
487 627 .Lenc_key192:
488 628 cmp $192, %KEYSIZE32
489 629 jnz .Lenc_key128
490 630
491 631 / AES 192: 12 rounds in encryption key schedule
492 632 #ifdef OPENSSL_INTERFACE
493 633 mov $12, %ROUNDS32
494 634 movl %ROUNDS32, 240(%AESKEY) / key.rounds = 12
495 635 #endif /* OPENSSL_INTERFACE */
496 636
497 637 movq 0x10(%USERCIPHERKEY), %xmm2 / other user key
498 638 aeskeygenassist $0x1, %xmm2, %xmm1 / expand the key
499 639 call _key_expansion_192a
500 640 aeskeygenassist $0x2, %xmm2, %xmm1 / expand the key
501 641 call _key_expansion_192b
502 642 aeskeygenassist $0x4, %xmm2, %xmm1 / expand the key
503 643 call _key_expansion_192a
504 644 aeskeygenassist $0x8, %xmm2, %xmm1 / expand the key
505 645 call _key_expansion_192b
506 646 aeskeygenassist $0x10, %xmm2, %xmm1 / expand the key
507 647 call _key_expansion_192a
508 648 aeskeygenassist $0x20, %xmm2, %xmm1 / expand the key
509 649 call _key_expansion_192b
510 650 aeskeygenassist $0x40, %xmm2, %xmm1 / expand the key
511 651 call _key_expansion_192a
512 652 aeskeygenassist $0x80, %xmm2, %xmm1 / expand the key
513 653 call _key_expansion_192b
514 654
515 655 SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
516 656 #ifdef OPENSSL_INTERFACE
517 657 xor %rax, %rax / return 0 (OK)
518 658 #else /* OpenSolaris Interface */
519 659 mov $12, %rax / return # rounds = 12
520 660 #endif
521 661 ret
522 662
523 663 .align 4
524 664 .Lenc_key128:
525 665 cmp $128, %KEYSIZE32
526 666 jnz .Lenc_key_invalid_key_bits
527 667
528 668 / AES 128: 10 rounds in encryption key schedule
529 669 #ifdef OPENSSL_INTERFACE
530 670 mov $10, %ROUNDS32
531 671 movl %ROUNDS32, 240(%AESKEY) / key.rounds = 10
532 672 #endif /* OPENSSL_INTERFACE */
533 673
534 674 aeskeygenassist $0x1, %xmm0, %xmm1 / expand the key
535 675 call _key_expansion_128
536 676 aeskeygenassist $0x2, %xmm0, %xmm1 / expand the key
537 677 call _key_expansion_128
538 678 aeskeygenassist $0x4, %xmm0, %xmm1 / expand the key
539 679 call _key_expansion_128
540 680 aeskeygenassist $0x8, %xmm0, %xmm1 / expand the key
541 681 call _key_expansion_128
542 682 aeskeygenassist $0x10, %xmm0, %xmm1 / expand the key
543 683 call _key_expansion_128
544 684 aeskeygenassist $0x20, %xmm0, %xmm1 / expand the key
545 685 call _key_expansion_128
546 686 aeskeygenassist $0x40, %xmm0, %xmm1 / expand the key
547 687 call _key_expansion_128
548 688 aeskeygenassist $0x80, %xmm0, %xmm1 / expand the key
549 689 call _key_expansion_128
550 690 aeskeygenassist $0x1b, %xmm0, %xmm1 / expand the key
551 691 call _key_expansion_128
552 692 aeskeygenassist $0x36, %xmm0, %xmm1 / expand the key
553 693 call _key_expansion_128
554 694
555 695 SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
556 696 #ifdef OPENSSL_INTERFACE
557 697 xor %rax, %rax / return 0 (OK)
558 698 #else /* OpenSolaris Interface */
559 699 mov $10, %rax / return # rounds = 10
560 700 #endif
561 701 ret
562 702
563 703 .Lenc_key_invalid_param:
564 704 #ifdef OPENSSL_INTERFACE
565 705 SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
566 706 mov $-1, %rax / user key or AES key pointer is NULL
567 707 ret
568 708 #else
569 709 /* FALLTHROUGH */
570 710 #endif /* OPENSSL_INTERFACE */
571 711
572 712 .Lenc_key_invalid_key_bits:
573 713 SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
574 714 #ifdef OPENSSL_INTERFACE
575 715 mov $-2, %rax / keysize is invalid
576 716 #else /* OpenSolaris Interface */
577 717 xor %rax, %rax / a key pointer is NULL or invalid keysize
578 718 #endif /* OPENSSL_INTERFACE */
579 719
580 720 ret
581 721 SET_SIZE(rijndael_key_setup_enc_intel)
582 722
583 723
584 724 /*
585 725 * rijndael_key_setup_dec_intel()
586 726 * Expand the cipher key into the decryption key schedule.
587 727 *
588 728 * For kernel code, caller is responsible for ensuring kpreempt_disable()
589 729 * has been called. This is because %xmm registers are not saved/restored.
590 730 * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
591 731 * on entry. Otherwise, if TS is not set, save and restore %xmm registers
592 732 * on the stack.
593 733 *
594 734 * OpenSolaris interface:
595 735 * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
596 736 * uint64_t keyBits);
597 737 * Return value is 0 on error, number of rounds on success.
598 738 * P1->P2, P2->P3, P3->P1
599 739 *
600 740 * Original Intel OpenSSL interface:
601 741 * int intel_AES_set_decrypt_key(const unsigned char *userKey,
602 742 * const int bits, AES_KEY *key);
603 743 * Return value is non-zero on error, 0 on success.
604 744 */
605 745 ENTRY_NP(rijndael_key_setup_dec_intel)
606 746 / Generate round keys used for encryption
607 747 call rijndael_key_setup_enc_intel
608 748 test %rax, %rax
609 749 #ifdef OPENSSL_INTERFACE
610 750 jnz .Ldec_key_exit / Failed if returned non-0
611 751 #else /* OpenSolaris Interface */
612 752 jz .Ldec_key_exit / Failed if returned 0
613 753 #endif /* OPENSSL_INTERFACE */
614 754
615 755 CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
616 756
617 757 /*
618 758 * Convert round keys used for encryption
619 759 * to a form usable for decryption
620 760 */
621 761 #ifndef OPENSSL_INTERFACE /* OpenSolaris Interface */
622 762 mov %rax, %ROUNDS64 / set # rounds (10, 12, or 14)
623 763 / (already set for OpenSSL)
624 764 #endif
625 765
626 766 lea 0x10(%AESKEY), %rcx / key addr
627 767 shl $4, %ROUNDS32
628 768 add %AESKEY, %ROUNDS64
629 769 mov %ROUNDS64, %ENDAESKEY
630 770
631 771 .align 4
632 772 .Ldec_key_reorder_loop:
633 773 movaps (%AESKEY), %xmm0
634 774 movaps (%ROUNDS64), %xmm1
635 775 movaps %xmm0, (%ROUNDS64)
636 776 movaps %xmm1, (%AESKEY)
637 777 lea 0x10(%AESKEY), %AESKEY
638 778 lea -0x10(%ROUNDS64), %ROUNDS64
639 779 cmp %AESKEY, %ROUNDS64
640 780 ja .Ldec_key_reorder_loop
641 781
642 782 .align 4
643 783 .Ldec_key_inv_loop:
644 784 movaps (%rcx), %xmm0
645 785 / Convert an encryption round key to a form usable for decryption
646 786 / with the "AES Inverse Mix Columns" instruction
647 787 aesimc %xmm0, %xmm1
648 788 movaps %xmm1, (%rcx)
649 789 lea 0x10(%rcx), %rcx
650 790 cmp %ENDAESKEY, %rcx
651 791 jnz .Ldec_key_inv_loop
652 792
653 793 SET_TS_OR_POP_XMM0_XMM1(%r10)
654 794
655 795 .Ldec_key_exit:
656 796 / OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
657 797 / OpenSSL: rax = 0 for OK, or non-zero for error
658 798 ret
659 799 SET_SIZE(rijndael_key_setup_dec_intel)
660 800
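The two loops above implement the standard equivalent-inverse-cipher conversion: reverse the order of the round keys, then run aesimc over every key except the outermost two. A hedged intrinsics sketch of the same transformation:

#include <wmmintrin.h>

/* Sketch: nr rounds means nr + 1 round keys, ks[0] .. ks[nr]. */
static void
enc_to_dec_schedule(__m128i ks[], int nr)
{
	/* Swap the schedule end for end. */
	for (int i = 0, j = nr; i < j; i++, j--) {
		__m128i t = ks[i];
		ks[i] = ks[j];
		ks[j] = t;
	}
	/* InvMixColumns every key except the first and the last. */
	for (int i = 1; i < nr; i++)
		ks[i] = _mm_aesimc_si128(ks[i]);
}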
661 801
662 -/*
663 - * aes_encrypt_intel()
664 - * Encrypt a single block (in and out can overlap).
665 - *
666 - * For kernel code, caller is responsible for ensuring kpreempt_disable()
667 - * has been called. This is because %xmm registers are not saved/restored.
668 - * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
669 - * on entry. Otherwise, if TS is not set, save and restore %xmm registers
670 - * on the stack.
671 - *
672 - * Temporary register usage:
673 - * %xmm0 State
674 - * %xmm1 Key
675 - *
676 - * Original OpenSolaris Interface:
677 - * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
678 - * const uint32_t pt[4], uint32_t ct[4])
679 - *
680 - * Original Intel OpenSSL Interface:
681 - * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
682 - * const AES_KEY *key)
683 - */
684 -
685 802 #ifdef OPENSSL_INTERFACE
686 803 #define aes_encrypt_intel intel_AES_encrypt
687 804 #define aes_decrypt_intel intel_AES_decrypt
688 805
689 806 #define INP rdi /* P1, 64 bits */
690 807 #define OUTP rsi /* P2, 64 bits */
691 808 #define KEYP rdx /* P3, 64 bits */
692 809
693 810 /* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */
694 811 #define NROUNDS32 ecx /* temporary, 32 bits */
695 812 #define NROUNDS cl /* temporary, 8 bits */
696 813
697 814 #else /* OpenSolaris Interface */
698 815 #define KEYP rdi /* P1, 64 bits */
699 816 #define NROUNDS esi /* P2, 32 bits */
700 817 #define INP rdx /* P3, 64 bits */
701 818 #define OUTP rcx /* P4, 64 bits */
819 +#define LENGTH r8 /* P5, 64 bits */
702 820 #endif /* OPENSSL_INTERFACE */
703 821
704 -#define STATE xmm0 /* temporary, 128 bits */
705 -#define KEY xmm1 /* temporary, 128 bits */
822 +#define KEY xmm0 /* temporary, 128 bits */
823 +#define STATE0 xmm8 /* temporary, 128 bits */
824 +#define STATE1 xmm9 /* temporary, 128 bits */
825 +#define STATE2 xmm10 /* temporary, 128 bits */
826 +#define STATE3 xmm11 /* temporary, 128 bits */
827 +#define STATE4 xmm12 /* temporary, 128 bits */
828 +#define STATE5 xmm13 /* temporary, 128 bits */
829 +#define STATE6 xmm14 /* temporary, 128 bits */
830 +#define STATE7 xmm15 /* temporary, 128 bits */
706 831
707 -ENTRY_NP(aes_encrypt_intel)
708 - CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
832 +/*
833 + * Runs the first two rounds of AES256 on a state register. `op' should be
834 + * aesenc or aesdec.
835 + */
836 +#define AES256_ROUNDS(op, statereg) \
837 + movaps -0x60(%KEYP), %KEY; \
838 + op %KEY, %statereg; \
839 + movaps -0x50(%KEYP), %KEY; \
840 + op %KEY, %statereg
709 841
710 - movups (%INP), %STATE / input
842 +/*
843 + * Runs the first two rounds of AES192, or the 3rd & 4th round of AES256 on
844 + * a state register. `op' should be aesenc or aesdec.
845 + */
846 +#define AES192_ROUNDS(op, statereg) \
847 + movaps -0x40(%KEYP), %KEY; \
848 + op %KEY, %statereg; \
849 + movaps -0x30(%KEYP), %KEY; \
850 + op %KEY, %statereg
851 +
852 +/*
853 + * Runs the full 10 rounds of AES128, or the last 10 rounds of AES192/AES256
854 + * on a state register. `op' should be aesenc or aesdec and `lastop' should
855 + * be aesenclast or aesdeclast.
856 + */
857 +#define AES128_ROUNDS(op, lastop, statereg) \
858 + movaps -0x20(%KEYP), %KEY; \
859 + op %KEY, %statereg; \
860 + movaps -0x10(%KEYP), %KEY; \
861 + op %KEY, %statereg; \
862 + movaps (%KEYP), %KEY; \
863 + op %KEY, %statereg; \
864 + movaps 0x10(%KEYP), %KEY; \
865 + op %KEY, %statereg; \
866 + movaps 0x20(%KEYP), %KEY; \
867 + op %KEY, %statereg; \
868 + movaps 0x30(%KEYP), %KEY; \
869 + op %KEY, %statereg; \
870 + movaps 0x40(%KEYP), %KEY; \
871 + op %KEY, %statereg; \
872 + movaps 0x50(%KEYP), %KEY; \
873 + op %KEY, %statereg; \
874 + movaps 0x60(%KEYP), %KEY; \
875 + op %KEY, %statereg; \
876 + movaps 0x70(%KEYP), %KEY; \
877 + lastop %KEY, %statereg
878 +
879 +/*
880 + * Macros to run AES encryption rounds. Input must be prefilled in the
881 + * state register; output will be left there as well.
882 + * To run AES256, invoke all of these macros in sequence. To run AES192,
883 + * invoke only the -192 and -128 variants. To run AES128, invoke only the
884 + * -128 variant.
885 + */
886 +#define AES256_ENC_ROUNDS(statereg) \
887 + AES256_ROUNDS(aesenc, statereg)
888 +#define AES192_ENC_ROUNDS(statereg) \
889 + AES192_ROUNDS(aesenc, statereg)
890 +#define AES128_ENC_ROUNDS(statereg) \
891 + AES128_ROUNDS(aesenc, aesenclast, statereg)
892 +
893 +/* Same as the AES*_ENC_ROUNDS macros, but for decryption. */
894 +#define AES256_DEC_ROUNDS(statereg) \
895 + AES256_ROUNDS(aesdec, statereg)
896 +#define AES192_DEC_ROUNDS(statereg) \
897 + AES192_ROUNDS(aesdec, statereg)
898 +#define AES128_DEC_ROUNDS(statereg) \
899 + AES128_ROUNDS(aesdec, aesdeclast, statereg)
900 +
901 +
902 +/*
903 + * aes_encrypt_intel()
904 + * Encrypt a single block (in and out can overlap).
905 + *
906 + * For kernel code, caller is responsible for bracketing this call with
907 + * disabling kernel thread preemption and calling aes_accel_save/restore().
908 + *
909 + * Temporary register usage:
910 + * %xmm0 Key
911 + * %xmm8 State
912 + *
913 + * Original OpenSolaris Interface:
914 + * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
915 + * const uint32_t pt[4], uint32_t ct[4])
916 + *
917 + * Original Intel OpenSSL Interface:
918 + * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
919 + * const AES_KEY *key)
920 + */
921 +ENTRY_NP(aes_encrypt_intel)
922 + movups (%INP), %STATE0 / input
711 923 movaps (%KEYP), %KEY / key
924 +
712 925 #ifdef OPENSSL_INTERFACE
713 926 mov 240(%KEYP), %NROUNDS32 / round count
714 927 #else /* OpenSolaris Interface */
715 928 /* Round count is already present as P2 in %rsi/%esi */
716 929 #endif /* OPENSSL_INTERFACE */
717 930
718 - pxor %KEY, %STATE / round 0
931 + pxor %KEY, %STATE0 / round 0
719 932 lea 0x30(%KEYP), %KEYP
720 933 cmp $12, %NROUNDS
721 934 jb .Lenc128
722 935 lea 0x20(%KEYP), %KEYP
723 936 je .Lenc192
724 937
725 938 / AES 256
726 939 lea 0x20(%KEYP), %KEYP
727 - movaps -0x60(%KEYP), %KEY
728 - aesenc %KEY, %STATE
729 - movaps -0x50(%KEYP), %KEY
730 - aesenc %KEY, %STATE
940 + AES256_ENC_ROUNDS(STATE0)
731 941
732 942 .align 4
733 943 .Lenc192:
734 944 / AES 192 and 256
735 - movaps -0x40(%KEYP), %KEY
736 - aesenc %KEY, %STATE
737 - movaps -0x30(%KEYP), %KEY
738 - aesenc %KEY, %STATE
945 + AES192_ENC_ROUNDS(STATE0)
739 946
740 947 .align 4
741 948 .Lenc128:
742 949 / AES 128, 192, and 256
743 - movaps -0x20(%KEYP), %KEY
744 - aesenc %KEY, %STATE
745 - movaps -0x10(%KEYP), %KEY
746 - aesenc %KEY, %STATE
747 - movaps (%KEYP), %KEY
748 - aesenc %KEY, %STATE
749 - movaps 0x10(%KEYP), %KEY
750 - aesenc %KEY, %STATE
751 - movaps 0x20(%KEYP), %KEY
752 - aesenc %KEY, %STATE
753 - movaps 0x30(%KEYP), %KEY
754 - aesenc %KEY, %STATE
755 - movaps 0x40(%KEYP), %KEY
756 - aesenc %KEY, %STATE
757 - movaps 0x50(%KEYP), %KEY
758 - aesenc %KEY, %STATE
759 - movaps 0x60(%KEYP), %KEY
760 - aesenc %KEY, %STATE
761 - movaps 0x70(%KEYP), %KEY
762 - aesenclast %KEY, %STATE / last round
763 - movups %STATE, (%OUTP) / output
950 + AES128_ENC_ROUNDS(STATE0)
951 + movups %STATE0, (%OUTP) / output
764 952
765 - SET_TS_OR_POP_XMM0_XMM1(%r10)
766 953 ret
767 954 SET_SIZE(aes_encrypt_intel)
768 955
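The branching above merely selects how many aesenc rounds run before the final aesenclast. A hedged intrinsics sketch of the same structure (the flat ks[] indexing is an assumption for clarity; the real routine addresses the schedule relative to %KEYP):

#include <wmmintrin.h>

/* One block; nr = 10, 12 or 14 rounds. */
static __m128i
aes_enc_block(const __m128i *ks, int nr, __m128i st)
{
	st = _mm_xor_si128(st, ks[0]);		/* round 0 (whitening) */
	for (int r = 1; r < nr; r++)		/* rounds 1 .. nr - 1 */
		st = _mm_aesenc_si128(st, ks[r]);
	return (_mm_aesenclast_si128(st, ks[nr]));	/* last round */
}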
769 -
770 956 /*
771 957 * aes_decrypt_intel()
772 958 * Decrypt a single block (in and out can overlap).
773 959 *
774 - * For kernel code, caller is responsible for ensuring kpreempt_disable()
775 - * has been called. This is because %xmm registers are not saved/restored.
776 - * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
777 - * on entry. Otherwise, if TS is not set, save and restore %xmm registers
778 - * on the stack.
960 + * For kernel code, caller is responsible for bracketing this call with
961 + * disabling kernel thread preemption and calling aes_accel_save/restore().
779 962 *
780 963 * Temporary register usage:
781 964 * %xmm0 Key
782 965 * %xmm8 State
783 966 *
784 967 * Original OpenSolaris Interface:
785 968 * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
786 - * const uint32_t pt[4], uint32_t ct[4])/
969 + * const uint32_t pt[4], uint32_t ct[4])
787 970 *
788 971 * Original Intel OpenSSL Interface:
789 972 * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
790 973 * const AES_KEY *key);
791 974 */
792 975 ENTRY_NP(aes_decrypt_intel)
793 - CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
794 -
795 - movups (%INP), %STATE / input
976 + movups (%INP), %STATE0 / input
796 977 movaps (%KEYP), %KEY / key
978 +
797 979 #ifdef OPENSSL_INTERFACE
798 980 mov 240(%KEYP), %NROUNDS32 / round count
799 981 #else /* OpenSolaris Interface */
800 982 /* Round count is already present as P2 in %rsi/%esi */
801 983 #endif /* OPENSSL_INTERFACE */
802 984
803 - pxor %KEY, %STATE / round 0
985 + pxor %KEY, %STATE0 / round 0
804 986 lea 0x30(%KEYP), %KEYP
805 987 cmp $12, %NROUNDS
806 988 jb .Ldec128
807 989 lea 0x20(%KEYP), %KEYP
808 990 je .Ldec192
809 991
810 992 / AES 256
811 993 lea 0x20(%KEYP), %KEYP
812 - movaps -0x60(%KEYP), %KEY
813 - aesdec %KEY, %STATE
814 - movaps -0x50(%KEYP), %KEY
815 - aesdec %KEY, %STATE
994 + AES256_DEC_ROUNDS(STATE0)
816 995
817 996 .align 4
818 997 .Ldec192:
819 998 / AES 192 and 256
820 - movaps -0x40(%KEYP), %KEY
821 - aesdec %KEY, %STATE
822 - movaps -0x30(%KEYP), %KEY
823 - aesdec %KEY, %STATE
999 + AES192_DEC_ROUNDS(STATE0)
824 1000
825 1001 .align 4
826 1002 .Ldec128:
827 1003 / AES 128, 192, and 256
828 - movaps -0x20(%KEYP), %KEY
829 - aesdec %KEY, %STATE
830 - movaps -0x10(%KEYP), %KEY
831 - aesdec %KEY, %STATE
832 - movaps (%KEYP), %KEY
833 - aesdec %KEY, %STATE
834 - movaps 0x10(%KEYP), %KEY
835 - aesdec %KEY, %STATE
836 - movaps 0x20(%KEYP), %KEY
837 - aesdec %KEY, %STATE
838 - movaps 0x30(%KEYP), %KEY
839 - aesdec %KEY, %STATE
840 - movaps 0x40(%KEYP), %KEY
841 - aesdec %KEY, %STATE
842 - movaps 0x50(%KEYP), %KEY
843 - aesdec %KEY, %STATE
844 - movaps 0x60(%KEYP), %KEY
845 - aesdec %KEY, %STATE
846 - movaps 0x70(%KEYP), %KEY
847 - aesdeclast %KEY, %STATE / last round
848 - movups %STATE, (%OUTP) / output
1004 + AES128_DEC_ROUNDS(STATE0)
1005 + movups %STATE0, (%OUTP) / output
849 1006
850 - SET_TS_OR_POP_XMM0_XMM1(%r10)
851 1007 ret
852 1008 SET_SIZE(aes_decrypt_intel)
853 1009
1010 +/* Does a pipelined load of eight input blocks into our AES state registers. */
1011 +#define AES_LOAD_INPUT_8BLOCKS \
1012 + movups 0x00(%INP), %STATE0; \
1013 + movups 0x10(%INP), %STATE1; \
1014 + movups 0x20(%INP), %STATE2; \
1015 + movups 0x30(%INP), %STATE3; \
1016 + movups 0x40(%INP), %STATE4; \
1017 + movups 0x50(%INP), %STATE5; \
1018 + movups 0x60(%INP), %STATE6; \
1019 + movups 0x70(%INP), %STATE7;
1020 +
1021 +/* Does a pipelined store of eight AES state registers to the output. */
1022 +#define AES_STORE_OUTPUT_8BLOCKS \
1023 + movups %STATE0, 0x00(%OUTP); \
1024 + movups %STATE1, 0x10(%OUTP); \
1025 + movups %STATE2, 0x20(%OUTP); \
1026 + movups %STATE3, 0x30(%OUTP); \
1027 + movups %STATE4, 0x40(%OUTP); \
1028 + movups %STATE5, 0x50(%OUTP); \
1029 + movups %STATE6, 0x60(%OUTP); \
1030 + movups %STATE7, 0x70(%OUTP);
1031 +
1032 +/* Performs a pipelined AES instruction with the key on all state registers. */
1033 +#define AES_KEY_STATE_OP_8BLOCKS(op) \
1034 + op %KEY, %STATE0; \
1035 + op %KEY, %STATE1; \
1036 + op %KEY, %STATE2; \
1037 + op %KEY, %STATE3; \
1038 + op %KEY, %STATE4; \
1039 + op %KEY, %STATE5; \
1040 + op %KEY, %STATE6; \
1041 + op %KEY, %STATE7
1042 +
1043 +/* XOR all AES state regs with key to initiate encryption/decryption. */
1044 +#define AES_XOR_STATE_8BLOCKS \
1045 + AES_KEY_STATE_OP_8BLOCKS(pxor)
1046 +
1047 +/*
1048 + * Loads a round key from the key schedule offset `off' into the KEY
1049 + * register and performs `op' using the KEY on all 8 STATE registers.
1050 + */
1051 +#define AES_RND_8BLOCKS(op, off) \
1052 + movaps off(%KEYP), %KEY; \
1053 + AES_KEY_STATE_OP_8BLOCKS(op)
1054 +
1055 +/*
1056 + * void aes_encrypt_intel8(const uint32_t roundkeys[], int numrounds,
1057 + * const void *plaintext, void *ciphertext)
1058 + *
1059 + * Same as aes_encrypt_intel, but performs the encryption operation on
1060 + * 8 independent blocks in sequence, exploiting instruction pipelining.
1061 + * This function doesn't support the OpenSSL interface; it is meant for
1062 + * kernel use only.
1063 + */
1064 +ENTRY_NP(aes_encrypt_intel8)
1065 + AES_LOAD_INPUT_8BLOCKS / load input
1066 + movaps (%KEYP), %KEY / key
1067 + AES_XOR_STATE_8BLOCKS / round 0
1068 +
1069 + lea 0x30(%KEYP), %KEYP / point to key schedule
1070 + cmp $12, %NROUNDS / determine AES variant
1071 + jb .Lenc8_128
1072 + lea 0x20(%KEYP), %KEYP / AES192 has larger key schedule
1073 + je .Lenc8_192
1074 +
1075 + lea 0x20(%KEYP), %KEYP / AES256 has even larger key schedule
1076 + AES_RND_8BLOCKS(aesenc, -0x60) / AES256 R.1
1077 + AES_RND_8BLOCKS(aesenc, -0x50) / AES256 R.2
1078 +
1079 +.align 4
1080 +.Lenc8_192:
1081 + AES_RND_8BLOCKS(aesenc, -0x40) / AES192 R.1; AES256 R.3
1082 + AES_RND_8BLOCKS(aesenc, -0x30) / AES192 R.2; AES256 R.4
1083 +
1084 +.align 4
1085 +.Lenc8_128:
1086 + AES_RND_8BLOCKS(aesenc, -0x20) / AES128 R.1; AES192 R.3; AES256 R.5
1087 + AES_RND_8BLOCKS(aesenc, -0x10) / AES128 R.2; AES192 R.4; AES256 R.6
1088 + AES_RND_8BLOCKS(aesenc, 0x00) / AES128 R.3; AES192 R.5; AES256 R.7
1089 + AES_RND_8BLOCKS(aesenc, 0x10) / AES128 R.4; AES192 R.6; AES256 R.8
1090 + AES_RND_8BLOCKS(aesenc, 0x20) / AES128 R.5; AES192 R.7; AES256 R.9
1091 + AES_RND_8BLOCKS(aesenc, 0x30) / AES128 R.6; AES192 R.8; AES256 R.10
1092 + AES_RND_8BLOCKS(aesenc, 0x40) / AES128 R.7; AES192 R.9; AES256 R.11
1093 + AES_RND_8BLOCKS(aesenc, 0x50) / AES128 R.8; AES192 R.10; AES256 R.12
1094 + AES_RND_8BLOCKS(aesenc, 0x60) / AES128 R.9; AES192 R.11; AES256 R.13
1095 + AES_RND_8BLOCKS(aesenclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
1096 +
1097 + AES_STORE_OUTPUT_8BLOCKS / store output
1098 + ret
1099 + SET_SIZE(aes_encrypt_intel8)
1100 +
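The payoff of the 8-block form is latency hiding: each aesenc must wait for the previous round on the same block, but rounds on different blocks are independent, so interleaving eight streams keeps the AES unit busy. A hedged intrinsics sketch of the idea for AES-128:

#include <wmmintrin.h>

/* Eight independent blocks, round-major order as in the asm above. */
static void
aes128_encrypt8(const __m128i ks[11], __m128i blk[8])
{
	for (int b = 0; b < 8; b++)		/* round 0: whitening */
		blk[b] = _mm_xor_si128(blk[b], ks[0]);
	for (int r = 1; r < 10; r++)		/* rounds 1-9 */
		for (int b = 0; b < 8; b++)	/* independent streams */
			blk[b] = _mm_aesenc_si128(blk[b], ks[r]);
	for (int b = 0; b < 8; b++)		/* final round */
		blk[b] = _mm_aesenclast_si128(blk[b], ks[10]);
}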
1101 +
1102 +/*
1103 + * void aes_decrypt_intel8(const uint32_t roundkeys[], int numrounds,
1104 + * const void *ciphertext, void *plaintext)
1105 + *
1106 + * Same as aes_decrypt_intel, but performs the decryption operation on
1107 + * 8 independent blocks in sequence, exploiting instruction pipelining.
1108 + * This function doesn't support the OpenSSL interface; it is meant for
1109 + * kernel use only.
1110 + */
1111 +ENTRY_NP(aes_decrypt_intel8)
1112 + AES_LOAD_INPUT_8BLOCKS / load input
1113 + movaps (%KEYP), %KEY / key
1114 + AES_XOR_STATE_8BLOCKS / round 0
1115 +
1116 + lea 0x30(%KEYP), %KEYP / point to key schedule
1117 + cmp $12, %NROUNDS / determine AES variant
1118 + jb .Ldec8_128
1119 + lea 0x20(%KEYP), %KEYP / AES192 has larger key schedule
1120 + je .Ldec8_192
1121 +
1122 + lea 0x20(%KEYP), %KEYP / AES256 has even larger key schedule
1123 + AES_RND_8BLOCKS(aesdec, -0x60) / AES256 R.1
1124 + AES_RND_8BLOCKS(aesdec, -0x50) / AES256 R.2
1125 +
1126 +.align 4
1127 +.Ldec8_192:
1128 + AES_RND_8BLOCKS(aesdec, -0x40) / AES192 R.1; AES256 R.3
1129 + AES_RND_8BLOCKS(aesdec, -0x30) / AES192 R.2; AES256 R.4
1130 +
1131 +.align 4
1132 +.Ldec8_128:
1133 + AES_RND_8BLOCKS(aesdec, -0x20) / AES128 R.1; AES192 R.3; AES256 R.5
1134 + AES_RND_8BLOCKS(aesdec, -0x10) / AES128 R.2; AES192 R.4; AES256 R.6
1135 + AES_RND_8BLOCKS(aesdec, 0x00) / AES128 R.3; AES192 R.5; AES256 R.7
1136 + AES_RND_8BLOCKS(aesdec, 0x10) / AES128 R.4; AES192 R.6; AES256 R.8
1137 + AES_RND_8BLOCKS(aesdec, 0x20) / AES128 R.5; AES192 R.7; AES256 R.9
1138 + AES_RND_8BLOCKS(aesdec, 0x30) / AES128 R.6; AES192 R.8; AES256 R.10
1139 + AES_RND_8BLOCKS(aesdec, 0x40) / AES128 R.7; AES192 R.9; AES256 R.11
1140 + AES_RND_8BLOCKS(aesdec, 0x50) / AES128 R.8; AES192 R.10; AES256 R.12
1141 + AES_RND_8BLOCKS(aesdec, 0x60) / AES128 R.9; AES192 R.11; AES256 R.13
1142 + AES_RND_8BLOCKS(aesdeclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
1143 +
1144 + AES_STORE_OUTPUT_8BLOCKS / store output
1145 + ret
1146 + SET_SIZE(aes_decrypt_intel8)
1147 +
1148 +
1149 +/*
1150 + * This macro encapsulates the entire AES encryption algorithm for a
1151 + * single block, which is prefilled in statereg and which will be replaced
1152 + * by the encrypted output. The KEYP register must already point to the
1153 + * AES128 key schedule ("lea 0x30(%KEYP), %KEYP" from encryption
1154 + * function call) so that consecutive invocations of this macro are
1155 + * supported (KEYP is restored after each invocation).
1156 + */
1157 +#define AES_ENC(statereg, label_128, label_192, label_out) \
1158 + cmp $12, %NROUNDS; \
1159 + jb label_128; \
1160 + je label_192; \
1161 + /* AES 256 only */ \
1162 + lea 0x40(%KEYP), %KEYP; \
1163 + AES256_ENC_ROUNDS(statereg); \
1164 + AES192_ENC_ROUNDS(statereg); \
1165 + AES128_ENC_ROUNDS(statereg); \
1166 + lea -0x40(%KEYP), %KEYP; \
1167 + jmp label_out; \
1168 +.align 4; \
1169 +label_192: \
1170 + lea 0x20(%KEYP), %KEYP; \
1171 + /* AES 192 only */ \
1172 + AES192_ENC_ROUNDS(statereg); \
1173 + AES128_ENC_ROUNDS(statereg); \
1174 + lea -0x20(%KEYP), %KEYP; \
1175 + jmp label_out; \
1176 +.align 4; \
1177 +label_128: \
1178 + /* AES 128 only */ \
1179 + AES128_ENC_ROUNDS(statereg); \
1180 +.align 4; \
1181 +label_out:
1182 +
1183 +
1184 +/*
1185 + * void aes_encrypt_cbc_intel8(const uint32_t roundkeys[], int numrounds,
1186 + * const void *plaintext, void *ciphertext, const void *IV)
1187 + *
1188 + * Encrypts 8 consecutive AES blocks in CBC mode. Input and output
1189 + * may overlap. This provides a modest performance boost over invoking
1190 + * the encryption and XOR in separate functions because we can avoid
1191 + * copying the ciphertext block to and from memory between encryption
1192 + * and XOR calls.
1193 + */
1194 +#define CBC_IV r8 /* input - IV blk pointer */
1195 +#define CBC_IV_XMM xmm1 /* tmp IV location for alignment */
1196 +
1197 +ENTRY_NP(aes_encrypt_cbc_intel8)
1198 + AES_LOAD_INPUT_8BLOCKS / load input
1199 + movaps (%KEYP), %KEY / key
1200 + AES_XOR_STATE_8BLOCKS / round 0
1201 +
1202 + lea 0x30(%KEYP), %KEYP / point to key schedule
1203 + movdqu (%CBC_IV), %CBC_IV_XMM / load IV from unaligned memory
1204 + pxor %CBC_IV_XMM, %STATE0 / XOR IV with input block and encrypt
1205 + AES_ENC(STATE0, .Lenc_cbc_0_128, .Lenc_cbc_0_192, .Lenc_cbc_0_out)
1206 + pxor %STATE0, %STATE1
1207 + AES_ENC(STATE1, .Lenc_cbc_1_128, .Lenc_cbc_1_192, .Lenc_cbc_1_out)
1208 + pxor %STATE1, %STATE2
1209 + AES_ENC(STATE2, .Lenc_cbc_2_128, .Lenc_cbc_2_192, .Lenc_cbc_2_out)
1210 + pxor %STATE2, %STATE3
1211 + AES_ENC(STATE3, .Lenc_cbc_3_128, .Lenc_cbc_3_192, .Lenc_cbc_3_out)
1212 + pxor %STATE3, %STATE4
1213 + AES_ENC(STATE4, .Lenc_cbc_4_128, .Lenc_cbc_4_192, .Lenc_cbc_4_out)
1214 + pxor %STATE4, %STATE5
1215 + AES_ENC(STATE5, .Lenc_cbc_5_128, .Lenc_cbc_5_192, .Lenc_cbc_5_out)
1216 + pxor %STATE5, %STATE6
1217 + AES_ENC(STATE6, .Lenc_cbc_6_128, .Lenc_cbc_6_192, .Lenc_cbc_6_out)
1218 + pxor %STATE6, %STATE7
1219 + AES_ENC(STATE7, .Lenc_cbc_7_128, .Lenc_cbc_7_192, .Lenc_cbc_7_out)
1220 +
1221 + AES_STORE_OUTPUT_8BLOCKS / store output
1222 + ret
1223 + SET_SIZE(aes_encrypt_cbc_intel8)
1224 +
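Unlike the ECB and CTR paths, CBC encryption cannot be pipelined: block N's input depends on block N-1's ciphertext, which is why the function above only saves the memory round-trips between the XOR and encrypt steps. A hedged sketch of the chaining it performs (aes_enc_block() is the hypothetical single-block helper sketched after aes_encrypt_intel above):

#include <wmmintrin.h>

extern __m128i aes_enc_block(const __m128i *ks, int nr, __m128i st);

static void
cbc_encrypt8_sketch(const __m128i *ks, int nr, __m128i iv,
    const __m128i pt[8], __m128i ct[8])
{
	__m128i prev = iv;

	for (int i = 0; i < 8; i++) {
		/* Each block chains off the previous ciphertext. */
		prev = aes_enc_block(ks, nr, _mm_xor_si128(pt[i], prev));
		ct[i] = prev;
	}
}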
1225 +/*
1226 + * Prefills register state with counters suitable for the CTR encryption
1227 + * mode. The counter is assumed to consist of two portions:
1228 + * - A lower monotonically increasing 64-bit counter. If the caller wants
1229 + * a smaller counter, they are responsible for checking that it doesn't
1230 + * overflow between encryption calls.
1231 + * - An upper static "nonce" portion, in big endian, preloaded into the
1232 + * lower portion of an XMM register.
1233 + * This macro adds `ctridx' to the lower_LE counter, swaps the result to
1234 + * big endian and, by way of a temporary general-purpose register, loads
1235 + * the lower and upper counter portions into the target XMM result
1236 + * register, which can then be handed off to the encryption process.
1237 + */
1238 +#define PREP_CTR_BLOCKS(lower_LE, upper_BE_xmm, ctridx, tmpreg, resreg) \
1239 + lea ctridx(%lower_LE), %tmpreg; \
1240 + bswap %tmpreg; \
1241 + movq %tmpreg, %resreg; \
1242 + movlhps %upper_BE_xmm, %resreg; \
1243 + pshufd $0b01001110, %resreg, %resreg
1244 +
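In C terms, the macro assembles the following 16-byte big-endian counter block for block index i (a hedged sketch; __builtin_bswap64() stands in for the bswap instruction, and the movlhps/pshufd pair above does the equivalent register placement):

#include <stdint.h>
#include <string.h>

static void
make_ctr_block(uint64_t upper_BE, uint64_t lower_LE, uint64_t i,
    uint8_t blk[16])
{
	uint64_t lower_BE = __builtin_bswap64(lower_LE + i);

	(void) memcpy(blk, &upper_BE, 8);	/* static nonce half */
	(void) memcpy(blk + 8, &lower_BE, 8);	/* incrementing half */
}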
1245 +#define CTR_UPPER_BE r8 /* input - counter upper 64 bits (BE) */
1246 +#define CTR_UPPER_BE_XMM xmm1 /* tmp for upper counter bits */
1247 +#define CTR_LOWER_LE r9 /* input - counter lower 64 bits (LE) */
1248 +#define CTR_TMP0 rax /* tmp for lower 64 bit add & bswap */
1249 +#define CTR_TMP1 rbx /* tmp for lower 64 bit add & bswap */
1250 +#define CTR_TMP2 r10 /* tmp for lower 64 bit add & bswap */
1251 +#define CTR_TMP3 r11 /* tmp for lower 64 bit add & bswap */
1252 +#define CTR_TMP4 r12 /* tmp for lower 64 bit add & bswap */
1253 +#define CTR_TMP5 r13 /* tmp for lower 64 bit add & bswap */
1254 +#define CTR_TMP6 r14 /* tmp for lower 64 bit add & bswap */
1255 +#define CTR_TMP7 r15 /* tmp for lower 64 bit add & bswap */
1256 +
1257 +/*
1258 + * These are used in case CTR encryption input is unaligned before XORing.
1259 + * Must not overlap with any STATE[0-7] register.
1260 + */
1261 +#define TMP_INPUT0 xmm0
1262 +#define TMP_INPUT1 xmm1
1263 +#define TMP_INPUT2 xmm2
1264 +#define TMP_INPUT3 xmm3
1265 +#define TMP_INPUT4 xmm4
1266 +#define TMP_INPUT5 xmm5
1267 +#define TMP_INPUT6 xmm6
1268 +#define TMP_INPUT7 xmm7
1269 +
1270 +/*
1271 + * void aes_ctr_intel8(const uint32_t roundkeys[], int numrounds,
1272 + * const void *input, void *output, uint64_t counter_upper_BE,
1273 + * uint64_t counter_lower_LE)
1274 + *
1275 + * Runs AES on 8 consecutive blocks in counter mode (encryption and
1276 + * decryption in counter mode are the same).
1277 + */
1278 +ENTRY_NP(aes_ctr_intel8)
1279 + /* save caller's regs */
1280 + pushq %rbp
1281 + movq %rsp, %rbp
1282 + subq $0x38, %rsp
1283 + / CTR_TMP0 is rax, no need to save
1284 + movq %CTR_TMP1, -0x38(%rbp)
1285 + movq %CTR_TMP2, -0x30(%rbp)
1286 + movq %CTR_TMP3, -0x28(%rbp)
1287 + movq %CTR_TMP4, -0x20(%rbp)
1288 + movq %CTR_TMP5, -0x18(%rbp)
1289 + movq %CTR_TMP6, -0x10(%rbp)
1290 + movq %CTR_TMP7, -0x08(%rbp)
1291 +
1292 + /*
1293 + * CTR step 1: prepare big-endian formatted 128-bit counter values,
1294 + * placing the result in the AES-NI input state registers.
1295 + */
1296 + movq %CTR_UPPER_BE, %CTR_UPPER_BE_XMM
1297 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 0, CTR_TMP0, STATE0)
1298 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 1, CTR_TMP1, STATE1)
1299 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 2, CTR_TMP2, STATE2)
1300 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 3, CTR_TMP3, STATE3)
1301 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 4, CTR_TMP4, STATE4)
1302 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 5, CTR_TMP5, STATE5)
1303 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 6, CTR_TMP6, STATE6)
1304 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 7, CTR_TMP7, STATE7)
1305 +
1306 + /*
1307 + * CTR step 2: Encrypt the counters.
1308 + */
1309 + movaps (%KEYP), %KEY / key
1310 + AES_XOR_STATE_8BLOCKS / round 0
1311 +
1312 + /* Determine the AES variant we're going to compute */
1313 + lea 0x30(%KEYP), %KEYP / point to key schedule
1314 + cmp $12, %NROUNDS / determine AES variant
1315 + jb .Lctr8_128
1316 + lea 0x20(%KEYP), %KEYP / AES192 has larger key schedule
1317 + je .Lctr8_192
1318 +
1319 + /* AES 256 */
1320 + lea 0x20(%KEYP), %KEYP / AES256 has even larger key schedule
1321 + AES_RND_8BLOCKS(aesenc, -0x60) / AES256 R.1
1322 + AES_RND_8BLOCKS(aesenc, -0x50) / AES256 R.2
1323 +
1324 +.align 4
1325 +.Lctr8_192:
1326 + /* AES 192 and 256 */
1327 + AES_RND_8BLOCKS(aesenc, -0x40) / AES192 R.1; AES256 R.3
1328 + AES_RND_8BLOCKS(aesenc, -0x30) / AES192 R.2; AES256 R.4
1329 +
1330 +.align 4
1331 +.Lctr8_128:
1332 + /* AES 128, 192, and 256 */
1333 + AES_RND_8BLOCKS(aesenc, -0x20) / AES128 R.1; AES192 R.3; AES256 R.5
1334 + AES_RND_8BLOCKS(aesenc, -0x10) / AES128 R.2; AES192 R.4; AES256 R.6
1335 + AES_RND_8BLOCKS(aesenc, 0x00) / AES128 R.3; AES192 R.5; AES256 R.7
1336 + AES_RND_8BLOCKS(aesenc, 0x10) / AES128 R.4; AES192 R.6; AES256 R.8
1337 + AES_RND_8BLOCKS(aesenc, 0x20) / AES128 R.5; AES192 R.7; AES256 R.9
1338 + AES_RND_8BLOCKS(aesenc, 0x30) / AES128 R.6; AES192 R.8; AES256 R.10
1339 + AES_RND_8BLOCKS(aesenc, 0x40) / AES128 R.7; AES192 R.9; AES256 R.11
1340 + AES_RND_8BLOCKS(aesenc, 0x50) / AES128 R.8; AES192 R.10; AES256 R.12
1341 + AES_RND_8BLOCKS(aesenc, 0x60) / AES128 R.9; AES192 R.11; AES256 R.13
1342 + AES_RND_8BLOCKS(aesenclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
1343 +
1344 + /*
1345 + * CTR step 3: XOR input data blocks with encrypted counters to
1346 + * produce result.
1347 + */
1348 + mov %INP, %rax / pxor requires alignment, so check
1349 + andq $0xf, %rax
1350 + jnz .Lctr_input_unaligned
1351 + pxor 0x00(%INP), %STATE0
1352 + pxor 0x10(%INP), %STATE1
1353 + pxor 0x20(%INP), %STATE2
1354 + pxor 0x30(%INP), %STATE3
1355 + pxor 0x40(%INP), %STATE4
1356 + pxor 0x50(%INP), %STATE5
1357 + pxor 0x60(%INP), %STATE6
1358 + pxor 0x70(%INP), %STATE7
1359 + jmp .Lctr_out
1360 +
1361 +.align 4
1362 +.Lctr_input_unaligned:
1363 + movdqu 0x00(%INP), %TMP_INPUT0
1364 + movdqu 0x10(%INP), %TMP_INPUT1
1365 + movdqu 0x20(%INP), %TMP_INPUT2
1366 + movdqu 0x30(%INP), %TMP_INPUT3
1367 + movdqu 0x40(%INP), %TMP_INPUT4
1368 + movdqu 0x50(%INP), %TMP_INPUT5
1369 + movdqu 0x60(%INP), %TMP_INPUT6
1370 + movdqu 0x70(%INP), %TMP_INPUT7
1371 + pxor %TMP_INPUT0, %STATE0
1372 + pxor %TMP_INPUT1, %STATE1
1373 + pxor %TMP_INPUT2, %STATE2
1374 + pxor %TMP_INPUT3, %STATE3
1375 + pxor %TMP_INPUT4, %STATE4
1376 + pxor %TMP_INPUT5, %STATE5
1377 + pxor %TMP_INPUT6, %STATE6
1378 + pxor %TMP_INPUT7, %STATE7
1379 +
1380 +.align 4
1381 +.Lctr_out:
1382 + /*
1383 + * Step 4: Write out processed blocks to memory.
1384 + */
1385 + movdqu %STATE0, 0x00(%OUTP)
1386 + movdqu %STATE1, 0x10(%OUTP)
1387 + movdqu %STATE2, 0x20(%OUTP)
1388 + movdqu %STATE3, 0x30(%OUTP)
1389 + movdqu %STATE4, 0x40(%OUTP)
1390 + movdqu %STATE5, 0x50(%OUTP)
1391 + movdqu %STATE6, 0x60(%OUTP)
1392 + movdqu %STATE7, 0x70(%OUTP)
1393 +
1394 + /* restore caller's regs */
1395 + / CTR_TMP0 is rax, no need to restore
1396 + movq -0x38(%rbp), %CTR_TMP1
1397 + movq -0x30(%rbp), %CTR_TMP2
1398 + movq -0x28(%rbp), %CTR_TMP3
1399 + movq -0x20(%rbp), %CTR_TMP4
1400 + movq -0x18(%rbp), %CTR_TMP5
1401 + movq -0x10(%rbp), %CTR_TMP6
1402 + movq -0x08(%rbp), %CTR_TMP7
1403 + leave
1404 + ret
1405 + SET_SIZE(aes_ctr_intel8)
1406 +
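A hedged sketch of a caller driving this routine across a longer buffer (the driver is hypothetical; per the PREP_CTR_BLOCKS comment, guarding the low 64-bit counter against overflow between calls is the caller's job, and the FPU bracketing shown for aes_accel_save applies in the kernel):

#include <stddef.h>
#include <stdint.h>

void aes_ctr_intel8(const uint32_t roundkeys[], int numrounds,
    const void *input, void *output, uint64_t counter_upper_BE,
    uint64_t counter_lower_LE);

/* Whole multiples of 8 blocks (128 bytes) only. */
static void
ctr_crypt_sketch(const uint32_t rk[], int nr, const uint8_t *in,
    uint8_t *out, size_t nchunks, uint64_t upper_BE, uint64_t lower_LE)
{
	for (size_t i = 0; i < nchunks; i++) {
		aes_ctr_intel8(rk, nr, in + i * 128, out + i * 128,
		    upper_BE, lower_LE);
		lower_LE += 8;	/* 8 counter values consumed per call */
	}
}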
854 1407 #endif /* lint || __lint */