 * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
 *         uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
 *
 * typedef union {
 *         uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
 * } aes_ks_t;
 * typedef struct aes_key {
 *         aes_ks_t encr_ks, decr_ks;
 *         long double align128;
 *         int flags, nr, type;
 * } aes_key_t;
 *
 * Note: ks is the AES key schedule, Nr is the number of rounds, pt is
 * plain text, ct is cipher text, and MAX_AES_NR is 14.
 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
 *
 * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary.
 *
 * ====================================================================
 */
/*
 * Copyright 2015 by Saso Kiselkov. All rights reserved.
 */

#if defined(lint) || defined(__lint)

#include <sys/types.h>

/* ARGSUSED */
void
aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
    uint32_t ct[4]) {
}
/* ARGSUSED */
void
aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
    uint32_t pt[4]) {
}
/* ARGSUSED */
int
rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
    uint64_t keyBits) {
        return (0);
 * If CR0_TS was not set above, pop %xmm0 - %xmm6 off stack,
 * otherwise set CR0_TS.
 */
#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) \
        testq   $CR0_TS, tmpreg; \
        jnz     1f; \
        movaps  (%rsp), %xmm6; \
        movaps  16(%rsp), %xmm5; \
        movaps  32(%rsp), %xmm4; \
        movaps  48(%rsp), %xmm3; \
        movaps  64(%rsp), %xmm2; \
        movaps  80(%rsp), %xmm1; \
        movaps  96(%rsp), %xmm0; \
        jmp     2f; \
1: \
        STTS(tmpreg); \
2: \
        mov     %rbp, %rsp; \
        pop     %rbp

/*
 * void aes_accel_save(void *savestate);
 *
 * Saves all 16 XMM registers and CR0 to a temporary location pointed to
 * by the first argument and clears TS in CR0. This must be invoked before
 * executing any floating point operations inside the kernel (and kernel
 * thread preemption must be disabled as well). The memory region to which
 * all state is saved must be at least 16 x 128 bits plus 64 bits in size
 * and must be 128-bit aligned.
 */
ENTRY_NP(aes_accel_save)
        movq    %cr0, %rax
        movq    %rax, 0x100(%rdi)
        testq   $CR0_TS, %rax
        jnz     1f
        movaps  %xmm0, 0x00(%rdi)
        movaps  %xmm1, 0x10(%rdi)
        movaps  %xmm2, 0x20(%rdi)
        movaps  %xmm3, 0x30(%rdi)
        movaps  %xmm4, 0x40(%rdi)
        movaps  %xmm5, 0x50(%rdi)
        movaps  %xmm6, 0x60(%rdi)
        movaps  %xmm7, 0x70(%rdi)
        movaps  %xmm8, 0x80(%rdi)
        movaps  %xmm9, 0x90(%rdi)
        movaps  %xmm10, 0xa0(%rdi)
        movaps  %xmm11, 0xb0(%rdi)
        movaps  %xmm12, 0xc0(%rdi)
        movaps  %xmm13, 0xd0(%rdi)
        movaps  %xmm14, 0xe0(%rdi)
        movaps  %xmm15, 0xf0(%rdi)
        ret
1:
        PROTECTED_CLTS
        ret
SET_SIZE(aes_accel_save)

/*
 * void aes_accel_restore(void *savestate);
 *
 * Restores the saved XMM and CR0.TS state from aes_accel_save().
 */
ENTRY_NP(aes_accel_restore)
        mov     0x100(%rdi), %rax
        testq   $CR0_TS, %rax
        jnz     1f
        movaps  0x00(%rdi), %xmm0
        movaps  0x10(%rdi), %xmm1
        movaps  0x20(%rdi), %xmm2
        movaps  0x30(%rdi), %xmm3
        movaps  0x40(%rdi), %xmm4
        movaps  0x50(%rdi), %xmm5
        movaps  0x60(%rdi), %xmm6
        movaps  0x70(%rdi), %xmm7
        movaps  0x80(%rdi), %xmm8
        movaps  0x90(%rdi), %xmm9
        movaps  0xa0(%rdi), %xmm10
        movaps  0xb0(%rdi), %xmm11
        movaps  0xc0(%rdi), %xmm12
        movaps  0xd0(%rdi), %xmm13
        movaps  0xe0(%rdi), %xmm14
        movaps  0xf0(%rdi), %xmm15
        ret
1:
        STTS(%rax)
        ret
SET_SIZE(aes_accel_restore)
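
/*
 * A minimal usage sketch for the two routines above (illustrative only;
 * the buffer and surrounding names are hypothetical, not part of this
 * file). The save area needs 0x108 bytes (16 XMM registers plus the
 * saved %cr0 word at offset 0x100) and must be 16-byte aligned:
 *
 *         uint8_t fpu_save[0x108] __attribute__((aligned(16)));
 *
 *         kpreempt_disable();             // no preemption while FPU in use
 *         aes_accel_save(fpu_save);       // save XMM state, clear CR0.TS
 *         aes_encrypt_intel8(ks, nr, pt, ct);
 *         aes_accel_restore(fpu_save);    // put XMM state and CR0.TS back
 *         kpreempt_enable();
 */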

#else
#define PROTECTED_CLTS
#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
#define SET_TS_OR_POP_XMM0_XMM1(tmpreg)
#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)
#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg)
#endif  /* _KERNEL */


/*
 * _key_expansion_128(), _key_expansion_192a(), _key_expansion_192b(),
 * _key_expansion_256a(), _key_expansion_256b()
 *
 * Helper functions called by rijndael_key_setup_enc_intel().
 * Also used indirectly by rijndael_key_setup_dec_intel().
 *
 * Input:
 * %xmm0        User-provided cipher key
 * %xmm1        Round constant
 * Output:
        pxor    %xmm5, %xmm2

        movaps  %xmm0, (%rcx)
        add     $0x10, %rcx
        ret
SET_SIZE(_key_expansion_192b)

.align 16
_key_expansion_256b:
        pshufd  $0b10101010, %xmm1, %xmm1
        shufps  $0b00010000, %xmm2, %xmm4
        pxor    %xmm4, %xmm2
        shufps  $0b10001100, %xmm2, %xmm4
        pxor    %xmm4, %xmm2
        pxor    %xmm1, %xmm2
        movaps  %xmm2, (%rcx)
        add     $0x10, %rcx
        ret
SET_SIZE(_key_expansion_256b)
/*
 * void aes_copy_intel(const uint8_t *src, uint8_t *dst);
 *
 * Copies one unaligned 128-bit block from `src' to `dst'. The copy is
 * performed using FPU registers, so make sure FPU state is saved when
 * running this in the kernel.
 */
ENTRY_NP(aes_copy_intel)
        movdqu  (%rdi), %xmm0
        movdqu  %xmm0, (%rsi)
        ret
SET_SIZE(aes_copy_intel)

/*
 * void aes_xor_intel(const uint8_t *src, uint8_t *dst);
 *
 * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 * stores the result at `dst'. The XOR is performed using FPU registers,
 * so make sure FPU state is saved when running this in the kernel.
 */
ENTRY_NP(aes_xor_intel)
        movdqu  (%rdi), %xmm0
        movdqu  (%rsi), %xmm1
        pxor    %xmm1, %xmm0
        movdqu  %xmm0, (%rsi)
        ret
SET_SIZE(aes_xor_intel)

/*
 * void aes_xor_intel8(const uint8_t *src, uint8_t *dst);
 *
 * XORs eight pairs of consecutive unaligned 128-bit blocks from `src' and
 * `dst' and stores the results at `dst'. The XOR is performed using FPU
 * registers, so make sure FPU state is saved when running this in the kernel.
 */
ENTRY_NP(aes_xor_intel8)
        movdqu  0x00(%rdi), %xmm0
        movdqu  0x00(%rsi), %xmm1
        movdqu  0x10(%rdi), %xmm2
        movdqu  0x10(%rsi), %xmm3
        movdqu  0x20(%rdi), %xmm4
        movdqu  0x20(%rsi), %xmm5
        movdqu  0x30(%rdi), %xmm6
        movdqu  0x30(%rsi), %xmm7
        movdqu  0x40(%rdi), %xmm8
        movdqu  0x40(%rsi), %xmm9
        movdqu  0x50(%rdi), %xmm10
        movdqu  0x50(%rsi), %xmm11
        movdqu  0x60(%rdi), %xmm12
        movdqu  0x60(%rsi), %xmm13
        movdqu  0x70(%rdi), %xmm14
        movdqu  0x70(%rsi), %xmm15
        pxor    %xmm1, %xmm0
        pxor    %xmm3, %xmm2
        pxor    %xmm5, %xmm4
        pxor    %xmm7, %xmm6
        pxor    %xmm9, %xmm8
        pxor    %xmm11, %xmm10
        pxor    %xmm13, %xmm12
        pxor    %xmm15, %xmm14
        movdqu  %xmm0, 0x00(%rsi)
        movdqu  %xmm2, 0x10(%rsi)
        movdqu  %xmm4, 0x20(%rsi)
        movdqu  %xmm6, 0x30(%rsi)
        movdqu  %xmm8, 0x40(%rsi)
        movdqu  %xmm10, 0x50(%rsi)
        movdqu  %xmm12, 0x60(%rsi)
        movdqu  %xmm14, 0x70(%rsi)
        ret
SET_SIZE(aes_xor_intel8)
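
/*
 * For reference, a C sketch of what aes_xor_intel8() computes (an
 * illustration, not code from this file):
 *
 *         for (int i = 0; i < 128; i++)
 *                 dst[i] ^= src[i];
 *
 * The assembly does the same work sixteen bytes at a time in eight
 * independent register pairs, so the loads, pxors and stores can
 * overlap in the CPU pipeline.
 */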

/*
 * rijndael_key_setup_enc_intel()
 * Expand the cipher key into the encryption key schedule.
 *
 * For kernel code, caller is responsible for ensuring kpreempt_disable()
 * has been called. This is because %xmm registers are not saved/restored.
 * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
 * on entry. Otherwise, if TS is not set, save and restore %xmm registers
 * on the stack.
 *
 * OpenSolaris interface:
 * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
 *         uint64_t keyBits);
 * Return value is 0 on error, number of rounds on success.
 *
 * Original Intel OpenSSL interface:
 * int intel_AES_set_encrypt_key(const unsigned char *userKey,
 *         const int bits, AES_KEY *key);
 * Return value is non-zero on error, 0 on success.
 */

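/*
 * A hedged usage sketch of the OpenSolaris interface above (local names
 * are hypothetical). The round-key array must hold the largest schedule,
 * (MAX_AES_NR + 1) * 4 = 60 words, and should be 16-byte aligned because
 * the cipher routines load it with movaps:
 *
 *         uint32_t rk[60] __attribute__((aligned(16)));
 *         int nr = rijndael_key_setup_enc_intel(rk, cipherKey, 256);
 *         if (nr == 0)
 *                 return (error);         // bad key size or null argument
 *         // nr is now 10, 12, or 14
 */
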
.align 4
.Ldec_key_inv_loop:
        movaps  (%rcx), %xmm0
        / Convert an encryption round key to a form usable for decryption
        / with the "AES Inverse Mix Columns" instruction
        aesimc  %xmm0, %xmm1
        movaps  %xmm1, (%rcx)
        lea     0x10(%rcx), %rcx
        cmp     %ENDAESKEY, %rcx
        jnz     .Ldec_key_inv_loop

        SET_TS_OR_POP_XMM0_XMM1(%r10)

.Ldec_key_exit:
        / OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
        / OpenSSL: rax = 0 for OK, or non-zero for error
        ret
SET_SIZE(rijndael_key_setup_dec_intel)
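
/*
 * The loop above implements the "Equivalent Inverse Cipher" key conversion
 * from the Intel AES-NI documentation: the interior round keys of the
 * encryption schedule are run through AESIMC so they can be used with
 * aesdec. A rough C intrinsics model of one loop iteration (illustrative
 * only; this file does not use intrinsics):
 *
 *         #include <wmmintrin.h>
 *         __m128i rk = _mm_load_si128((__m128i *)keyp);
 *         _mm_store_si128((__m128i *)keyp, _mm_aesimc_si128(rk));
 */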


#ifdef OPENSSL_INTERFACE
#define aes_encrypt_intel intel_AES_encrypt
#define aes_decrypt_intel intel_AES_decrypt

#define INP     rdi     /* P1, 64 bits */
#define OUTP    rsi     /* P2, 64 bits */
#define KEYP    rdx     /* P3, 64 bits */

/* No NROUNDS parameter; offset 240 from KEYP is saved in %ecx: */
#define NROUNDS32       ecx     /* temporary, 32 bits */
#define NROUNDS cl      /* temporary, 8 bits */

#else   /* OpenSolaris Interface */
#define KEYP    rdi     /* P1, 64 bits */
#define NROUNDS esi     /* P2, 32 bits */
#define INP     rdx     /* P3, 64 bits */
#define OUTP    rcx     /* P4, 64 bits */
#define LENGTH  r8      /* P5, 64 bits */
#endif  /* OPENSSL_INTERFACE */

#define KEY     xmm0    /* temporary, 128 bits */
#define STATE0  xmm8    /* temporary, 128 bits */
#define STATE1  xmm9    /* temporary, 128 bits */
#define STATE2  xmm10   /* temporary, 128 bits */
#define STATE3  xmm11   /* temporary, 128 bits */
#define STATE4  xmm12   /* temporary, 128 bits */
#define STATE5  xmm13   /* temporary, 128 bits */
#define STATE6  xmm14   /* temporary, 128 bits */
#define STATE7  xmm15   /* temporary, 128 bits */

/*
 * Runs the first two rounds of AES256 on a state register. `op' should be
 * aesenc or aesdec.
 */
#define AES256_ROUNDS(op, statereg) \
        movaps  -0x60(%KEYP), %KEY; \
        op      %KEY, %statereg; \
        movaps  -0x50(%KEYP), %KEY; \
        op      %KEY, %statereg

/*
 * Runs the first two rounds of AES192, or the 3rd & 4th rounds of AES256,
 * on a state register. `op' should be aesenc or aesdec.
 */
#define AES192_ROUNDS(op, statereg) \
        movaps  -0x40(%KEYP), %KEY; \
        op      %KEY, %statereg; \
        movaps  -0x30(%KEYP), %KEY; \
        op      %KEY, %statereg

/*
 * Runs the full 10 rounds of AES128, or the last 10 rounds of AES192/AES256,
 * on a state register. `op' should be aesenc or aesdec and `lastop' should
 * be aesenclast or aesdeclast.
 */
#define AES128_ROUNDS(op, lastop, statereg) \
        movaps  -0x20(%KEYP), %KEY; \
        op      %KEY, %statereg; \
        movaps  -0x10(%KEYP), %KEY; \
        op      %KEY, %statereg; \
        movaps  (%KEYP), %KEY; \
        op      %KEY, %statereg; \
        movaps  0x10(%KEYP), %KEY; \
        op      %KEY, %statereg; \
        movaps  0x20(%KEYP), %KEY; \
        op      %KEY, %statereg; \
        movaps  0x30(%KEYP), %KEY; \
        op      %KEY, %statereg; \
        movaps  0x40(%KEYP), %KEY; \
        op      %KEY, %statereg; \
        movaps  0x50(%KEYP), %KEY; \
        op      %KEY, %statereg; \
        movaps  0x60(%KEYP), %KEY; \
        op      %KEY, %statereg; \
        movaps  0x70(%KEYP), %KEY; \
        lastop  %KEY, %statereg

/*
 * Macros to run AES encryption rounds. Input must be prefilled in the state
 * register; output will be left there as well.
 * To run AES256, invoke all of these macros in sequence. To run AES192,
 * invoke only the -192 and -128 variants. To run AES128, invoke only the
 * -128 variant. (See the worked example after these definitions.)
 */
#define AES256_ENC_ROUNDS(statereg) \
        AES256_ROUNDS(aesenc, statereg)
#define AES192_ENC_ROUNDS(statereg) \
        AES192_ROUNDS(aesenc, statereg)
#define AES128_ENC_ROUNDS(statereg) \
        AES128_ROUNDS(aesenc, aesenclast, statereg)

/* Same as the AES*_ENC_ROUNDS macros, but for decryption. */
#define AES256_DEC_ROUNDS(statereg) \
        AES256_ROUNDS(aesdec, statereg)
#define AES192_DEC_ROUNDS(statereg) \
        AES192_ROUNDS(aesdec, statereg)
#define AES128_DEC_ROUNDS(statereg) \
        AES128_ROUNDS(aesdec, aesdeclast, statereg)
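
/*
 * Worked example (illustrative): a full AES-256 encryption of the block
 * in %xmm8, with %KEYP already advanced to the AES-256 position in the
 * key schedule, is
 *
 *         AES256_ENC_ROUNDS(xmm8)
 *         AES192_ENC_ROUNDS(xmm8)
 *         AES128_ENC_ROUNDS(xmm8)
 *
 * while AES-128 of the same block is just AES128_ENC_ROUNDS(xmm8).
 */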


/*
 * aes_encrypt_intel()
 * Encrypt a single block (in and out can overlap).
 *
 * For kernel code, the caller is responsible for disabling kernel thread
 * preemption and for bracketing this call with aes_accel_save()/
 * aes_accel_restore().
 *
 * Temporary register usage:
 * %xmm0        Key
 * %xmm8        State
 *
 * Original OpenSolaris Interface:
 * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
 *         const uint32_t pt[4], uint32_t ct[4])
 *
 * Original Intel OpenSSL Interface:
 * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
 *         const AES_KEY *key)
 */
ENTRY_NP(aes_encrypt_intel)
        movups  (%INP), %STATE0         / input
        movaps  (%KEYP), %KEY           / key

#ifdef OPENSSL_INTERFACE
        mov     240(%KEYP), %NROUNDS32  / round count
#else   /* OpenSolaris Interface */
        /* Round count is already present as P2 in %rsi/%esi */
#endif  /* OPENSSL_INTERFACE */

        pxor    %KEY, %STATE0           / round 0
        lea     0x30(%KEYP), %KEYP
        cmp     $12, %NROUNDS
        jb      .Lenc128
        lea     0x20(%KEYP), %KEYP
        je      .Lenc192

        / AES 256
        lea     0x20(%KEYP), %KEYP
        AES256_ENC_ROUNDS(STATE0)

        .align 4
.Lenc192:
        / AES 192 and 256
        AES192_ENC_ROUNDS(STATE0)

        .align 4
.Lenc128:
        / AES 128, 192, and 256
        AES128_ENC_ROUNDS(STATE0)
        movups  %STATE0, (%OUTP)        / output

        ret
SET_SIZE(aes_encrypt_intel)

/*
 * aes_decrypt_intel()
 * Decrypt a single block (in and out can overlap).
 *
 * For kernel code, the caller is responsible for disabling kernel thread
 * preemption and for bracketing this call with aes_accel_save()/
 * aes_accel_restore().
 *
 * Temporary register usage:
 * %xmm0        Key
 * %xmm8        State
 *
 * Original OpenSolaris Interface:
 * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
 *         const uint32_t ct[4], uint32_t pt[4])
 *
 * Original Intel OpenSSL Interface:
 * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
 *         const AES_KEY *key);
 */
ENTRY_NP(aes_decrypt_intel)
        movups  (%INP), %STATE0         / input
        movaps  (%KEYP), %KEY           / key

#ifdef OPENSSL_INTERFACE
        mov     240(%KEYP), %NROUNDS32  / round count
#else   /* OpenSolaris Interface */
        /* Round count is already present as P2 in %rsi/%esi */
#endif  /* OPENSSL_INTERFACE */

        pxor    %KEY, %STATE0           / round 0
        lea     0x30(%KEYP), %KEYP
        cmp     $12, %NROUNDS
        jb      .Ldec128
        lea     0x20(%KEYP), %KEYP
        je      .Ldec192

        / AES 256
        lea     0x20(%KEYP), %KEYP
        AES256_DEC_ROUNDS(STATE0)

        .align 4
.Ldec192:
        / AES 192 and 256
        AES192_DEC_ROUNDS(STATE0)

        .align 4
.Ldec128:
        / AES 128, 192, and 256
        AES128_DEC_ROUNDS(STATE0)
        movups  %STATE0, (%OUTP)        / output

        ret
SET_SIZE(aes_decrypt_intel)
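
/*
 * A hedged round-trip sketch of the OpenSolaris single-block interface
 * (variable names are hypothetical; the FPU bracketing described above
 * is omitted for brevity):
 *
 *         uint32_t e_ks[60] __attribute__((aligned(16)));
 *         uint32_t d_ks[60] __attribute__((aligned(16)));
 *         int nr = rijndael_key_setup_enc_intel(e_ks, cipherKey, 128);
 *         (void) rijndael_key_setup_dec_intel(d_ks, cipherKey, 128);
 *         aes_encrypt_intel((const aes_ks_t *)e_ks, nr, pt, ct);
 *         aes_decrypt_intel((const aes_ks_t *)d_ks, nr, ct, pt); // pt back
 */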

/* Does a pipelined load of eight input blocks into our AES state registers. */
#define AES_LOAD_INPUT_8BLOCKS \
        movups  0x00(%INP), %STATE0; \
        movups  0x10(%INP), %STATE1; \
        movups  0x20(%INP), %STATE2; \
        movups  0x30(%INP), %STATE3; \
        movups  0x40(%INP), %STATE4; \
        movups  0x50(%INP), %STATE5; \
        movups  0x60(%INP), %STATE6; \
        movups  0x70(%INP), %STATE7

/* Does a pipelined store of eight AES state registers to the output. */
#define AES_STORE_OUTPUT_8BLOCKS \
        movups  %STATE0, 0x00(%OUTP); \
        movups  %STATE1, 0x10(%OUTP); \
        movups  %STATE2, 0x20(%OUTP); \
        movups  %STATE3, 0x30(%OUTP); \
        movups  %STATE4, 0x40(%OUTP); \
        movups  %STATE5, 0x50(%OUTP); \
        movups  %STATE6, 0x60(%OUTP); \
        movups  %STATE7, 0x70(%OUTP)

/* Performs a pipelined AES instruction with the key on all state registers. */
#define AES_KEY_STATE_OP_8BLOCKS(op) \
        op      %KEY, %STATE0; \
        op      %KEY, %STATE1; \
        op      %KEY, %STATE2; \
        op      %KEY, %STATE3; \
        op      %KEY, %STATE4; \
        op      %KEY, %STATE5; \
        op      %KEY, %STATE6; \
        op      %KEY, %STATE7

/* XOR all AES state regs with the key to initiate encryption/decryption. */
#define AES_XOR_STATE_8BLOCKS \
        AES_KEY_STATE_OP_8BLOCKS(pxor)

/*
 * Loads a round key from the key schedule offset `off' into the KEY
 * register and performs `op' using the KEY on all 8 STATE registers.
 */
#define AES_RND_8BLOCKS(op, off) \
        movaps  off(%KEYP), %KEY; \
        AES_KEY_STATE_OP_8BLOCKS(op)

/*
 * void aes_encrypt_intel8(const uint32_t roundkeys[], int numrounds,
 *         const void *plaintext, void *ciphertext)
 *
 * Same as aes_encrypt_intel(), but performs the encryption operation on
 * 8 independent blocks in sequence, exploiting instruction pipelining.
 * This function doesn't support the OpenSSL interface; it's only meant
 * for kernel use.
 */
ENTRY_NP(aes_encrypt_intel8)
        AES_LOAD_INPUT_8BLOCKS          / load input
        movaps  (%KEYP), %KEY           / key
        AES_XOR_STATE_8BLOCKS           / round 0

        lea     0x30(%KEYP), %KEYP      / point to key schedule
        cmp     $12, %NROUNDS           / determine AES variant
        jb      .Lenc8_128
        lea     0x20(%KEYP), %KEYP      / AES192 has a larger key schedule
        je      .Lenc8_192

        lea     0x20(%KEYP), %KEYP      / AES256 has an even larger key schedule
        AES_RND_8BLOCKS(aesenc, -0x60)  / AES256 R.1
        AES_RND_8BLOCKS(aesenc, -0x50)  / AES256 R.2

        .align 4
.Lenc8_192:
        AES_RND_8BLOCKS(aesenc, -0x40)  / AES192 R.1; AES256 R.3
        AES_RND_8BLOCKS(aesenc, -0x30)  / AES192 R.2; AES256 R.4

        .align 4
.Lenc8_128:
        AES_RND_8BLOCKS(aesenc, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
        AES_RND_8BLOCKS(aesenc, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
        AES_RND_8BLOCKS(aesenc, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
        AES_RND_8BLOCKS(aesenc, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
        AES_RND_8BLOCKS(aesenc, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
        AES_RND_8BLOCKS(aesenc, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
        AES_RND_8BLOCKS(aesenc, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
        AES_RND_8BLOCKS(aesenc, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
        AES_RND_8BLOCKS(aesenc, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
        AES_RND_8BLOCKS(aesenclast, 0x70) / AES128 R.10; AES192 R.12; AES256 R.14

        AES_STORE_OUTPUT_8BLOCKS        / store output
        ret
SET_SIZE(aes_encrypt_intel8)
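
/*
 * A sketch of how a caller might drive the 8-block routine over a large
 * buffer (hypothetical loop, not code from this file): full 128-byte
 * chunks go through aes_encrypt_intel8() and the tail falls back to the
 * single-block routine.
 *
 *         while (len >= 8 * 16) {
 *                 aes_encrypt_intel8(ks, nr, in, out);
 *                 in += 128; out += 128; len -= 128;
 *         }
 *         while (len >= 16) {
 *                 aes_encrypt_intel((const aes_ks_t *)ks, nr,
 *                     (const uint32_t *)in, (uint32_t *)out);
 *                 in += 16; out += 16; len -= 16;
 *         }
 */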


/*
 * void aes_decrypt_intel8(const uint32_t roundkeys[], int numrounds,
 *         const void *ciphertext, void *plaintext)
 *
 * Same as aes_decrypt_intel(), but performs the decryption operation on
 * 8 independent blocks in sequence, exploiting instruction pipelining.
 * This function doesn't support the OpenSSL interface; it's only meant
 * for kernel use.
 */
ENTRY_NP(aes_decrypt_intel8)
        AES_LOAD_INPUT_8BLOCKS          / load input
        movaps  (%KEYP), %KEY           / key
        AES_XOR_STATE_8BLOCKS           / round 0

        lea     0x30(%KEYP), %KEYP      / point to key schedule
        cmp     $12, %NROUNDS           / determine AES variant
        jb      .Ldec8_128
        lea     0x20(%KEYP), %KEYP      / AES192 has a larger key schedule
        je      .Ldec8_192

        lea     0x20(%KEYP), %KEYP      / AES256 has an even larger key schedule
        AES_RND_8BLOCKS(aesdec, -0x60)  / AES256 R.1
        AES_RND_8BLOCKS(aesdec, -0x50)  / AES256 R.2

        .align 4
.Ldec8_192:
        AES_RND_8BLOCKS(aesdec, -0x40)  / AES192 R.1; AES256 R.3
        AES_RND_8BLOCKS(aesdec, -0x30)  / AES192 R.2; AES256 R.4

        .align 4
.Ldec8_128:
        AES_RND_8BLOCKS(aesdec, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
        AES_RND_8BLOCKS(aesdec, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
        AES_RND_8BLOCKS(aesdec, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
        AES_RND_8BLOCKS(aesdec, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
        AES_RND_8BLOCKS(aesdec, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
        AES_RND_8BLOCKS(aesdec, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
        AES_RND_8BLOCKS(aesdec, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
        AES_RND_8BLOCKS(aesdec, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
        AES_RND_8BLOCKS(aesdec, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
        AES_RND_8BLOCKS(aesdeclast, 0x70) / AES128 R.10; AES192 R.12; AES256 R.14

        AES_STORE_OUTPUT_8BLOCKS        / store output
        ret
SET_SIZE(aes_decrypt_intel8)


/*
 * This macro encapsulates the entire AES encryption algorithm for a single
 * block, which is prefilled in statereg and which will be replaced by
 * the encrypted output. The KEYP register must already point to the
 * AES128 key schedule (the "lea 0x30(%KEYP), %KEYP" from the encryption
 * function call) so that consecutive invocations of this macro are
 * supported (KEYP is restored after each invocation).
 */
#define AES_ENC(statereg, label_128, label_192, label_out) \
        cmp     $12, %NROUNDS; \
        jb      label_128; \
        je      label_192; \
        /* AES 256 only */ \
        lea     0x40(%KEYP), %KEYP; \
        AES256_ENC_ROUNDS(statereg); \
        AES192_ENC_ROUNDS(statereg); \
        AES128_ENC_ROUNDS(statereg); \
        lea     -0x40(%KEYP), %KEYP; \
        jmp     label_out; \
        .align 4; \
label_192: \
        lea     0x20(%KEYP), %KEYP; \
        /* AES 192 only */ \
        AES192_ENC_ROUNDS(statereg); \
        AES128_ENC_ROUNDS(statereg); \
        lea     -0x20(%KEYP), %KEYP; \
        jmp     label_out; \
        .align 4; \
label_128: \
        /* AES 128 only */ \
        AES128_ENC_ROUNDS(statereg); \
        .align 4; \
label_out:

1183
1184 /*
1185 * void aes_encrypt_cbc_intel8(const uint32_t roundkeys[], int numrounds,
1186 * const void *plaintext, void *ciphertext, const void *IV)
1187 *
1188 * Encrypts 8 consecutive AES blocks in the CBC mode. Input and output
1189 * may overlap. This provides a modest performance boost over invoking
1190 * the encryption and XOR in separate functions because we can avoid
1191 * copying the ciphertext block to and from memory between encryption
1192 * and XOR calls.
1193 */
1194 #define CBC_IV r8 /* input - IV blk pointer */
1195 #define CBC_IV_XMM xmm1 /* tmp IV location for alignment */
1196
1197 ENTRY_NP(aes_encrypt_cbc_intel8)
1198 AES_LOAD_INPUT_8BLOCKS / load input
1199 movaps (%KEYP), %KEY / key
1200 AES_XOR_STATE_8BLOCKS / round 0
1201
1202 lea 0x30(%KEYP), %KEYP / point to key schedule
1203 movdqu (%CBC_IV), %CBC_IV_XMM / load IV from unaligned memory
1204 pxor %CBC_IV_XMM, %STATE0 / XOR IV with input block and encrypt
1205 AES_ENC(STATE0, .Lenc_cbc_0_128, .Lenc_cbc_0_192, .Lenc_cbc_0_out)
1206 pxor %STATE0, %STATE1
1207 AES_ENC(STATE1, .Lenc_cbc_1_128, .Lenc_cbc_1_192, .Lenc_cbc_1_out)
1208 pxor %STATE1, %STATE2
1209 AES_ENC(STATE2, .Lenc_cbc_2_128, .Lenc_cbc_2_192, .Lenc_cbc_2_out)
1210 pxor %STATE2, %STATE3
1211 AES_ENC(STATE3, .Lenc_cbc_3_128, .Lenc_cbc_3_192, .Lenc_cbc_3_out)
1212 pxor %STATE3, %STATE4
1213 AES_ENC(STATE4, .Lenc_cbc_4_128, .Lenc_cbc_4_192, .Lenc_cbc_4_out)
1214 pxor %STATE4, %STATE5
1215 AES_ENC(STATE5, .Lenc_cbc_5_128, .Lenc_cbc_5_192, .Lenc_cbc_5_out)
1216 pxor %STATE5, %STATE6
1217 AES_ENC(STATE6, .Lenc_cbc_6_128, .Lenc_cbc_6_192, .Lenc_cbc_6_out)
1218 pxor %STATE6, %STATE7
1219 AES_ENC(STATE7, .Lenc_cbc_7_128, .Lenc_cbc_7_192, .Lenc_cbc_7_out)
1220
1221 AES_STORE_OUTPUT_8BLOCKS / store output
1222 ret
1223 SET_SIZE(aes_encrypt_cbc_intel8)
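
/*
 * In C terms the function above computes (sketch):
 *
 *         c[0] = E(p[0] ^ IV)
 *         c[i] = E(p[i] ^ c[i - 1])       // i = 1 .. 7
 *
 * Each block's input depends on the previous block's ciphertext, so the
 * eight AES_ENC() invocations must run serially; unlike the ECB- and
 * CTR-style routines there is no 8-way pipelining within one call.
 */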

/*
 * Prefills register state with counters suitable for the CTR encryption
 * mode. The counter is assumed to consist of two portions:
 * - A lower, monotonically increasing 64-bit counter. If the caller wants
 *   a smaller counter, they are responsible for checking that it doesn't
 *   overflow between encryption calls.
 * - An upper static "nonce" portion, in big endian, preloaded into the
 *   lower portion of an XMM register.
 * This macro adds `ctridx' to the lower_LE counter, swaps it to big
 * endian and, by way of a temporary general-purpose register, loads the
 * lower and upper counter portions into a target XMM result register,
 * which can then be handed off to the encryption process.
 */
#define PREP_CTR_BLOCKS(lower_LE, upper_BE_xmm, ctridx, tmpreg, resreg) \
        lea     ctridx(%lower_LE), %tmpreg; \
        bswap   %tmpreg; \
        movq    %tmpreg, %resreg; \
        movlhps %upper_BE_xmm, %resreg; \
        pshufd  $0b01001110, %resreg, %resreg

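/*
 * In C terms, the counter block produced for index `ctridx' lays out in
 * memory as follows (sketch; htobe64() stands in for the bswap):
 *
 *         memcpy(&block[0], &upper_BE, 8);        // static big-endian nonce
 *         uint64_t ctr = htobe64(lower_LE + ctridx);
 *         memcpy(&block[8], &ctr, 8);             // incrementing counter part
 */
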
#define CTR_UPPER_BE            r8      /* input - counter upper 64 bits (BE) */
#define CTR_UPPER_BE_XMM        xmm1    /* tmp for upper counter bits */
#define CTR_LOWER_LE            r9      /* input - counter lower 64 bits (LE) */
#define CTR_TMP0        rax     /* tmp for lower 64-bit add & bswap */
#define CTR_TMP1        rbx     /* tmp for lower 64-bit add & bswap */
#define CTR_TMP2        r10     /* tmp for lower 64-bit add & bswap */
#define CTR_TMP3        r11     /* tmp for lower 64-bit add & bswap */
#define CTR_TMP4        r12     /* tmp for lower 64-bit add & bswap */
#define CTR_TMP5        r13     /* tmp for lower 64-bit add & bswap */
#define CTR_TMP6        r14     /* tmp for lower 64-bit add & bswap */
#define CTR_TMP7        r15     /* tmp for lower 64-bit add & bswap */

/*
 * These are used when the CTR encryption input is unaligned; the input is
 * staged here before XORing. Must not overlap with any STATE[0-7] register.
 */
#define TMP_INPUT0      xmm0
#define TMP_INPUT1      xmm1
#define TMP_INPUT2      xmm2
#define TMP_INPUT3      xmm3
#define TMP_INPUT4      xmm4
#define TMP_INPUT5      xmm5
#define TMP_INPUT6      xmm6
#define TMP_INPUT7      xmm7

/*
 * void aes_ctr_intel8(const uint32_t roundkeys[], int numrounds,
 *         const void *input, void *output, uint64_t counter_upper_BE,
 *         uint64_t counter_lower_LE)
 *
 * Runs AES on 8 consecutive blocks in counter mode (encryption and
 * decryption in counter mode are the same).
 */
ENTRY_NP(aes_ctr_intel8)
        /* save caller's regs */
        pushq   %rbp
        movq    %rsp, %rbp
        subq    $0x38, %rsp
        / CTR_TMP0 is rax, no need to save
        movq    %CTR_TMP1, -0x38(%rbp)
        movq    %CTR_TMP2, -0x30(%rbp)
        movq    %CTR_TMP3, -0x28(%rbp)
        movq    %CTR_TMP4, -0x20(%rbp)
        movq    %CTR_TMP5, -0x18(%rbp)
        movq    %CTR_TMP6, -0x10(%rbp)
        movq    %CTR_TMP7, -0x08(%rbp)

        /*
         * CTR step 1: prepare big-endian formatted 128-bit counter values,
         * placing the result in the AES-NI input state registers.
         */
        movq    %CTR_UPPER_BE, %CTR_UPPER_BE_XMM
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 0, CTR_TMP0, STATE0)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 1, CTR_TMP1, STATE1)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 2, CTR_TMP2, STATE2)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 3, CTR_TMP3, STATE3)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 4, CTR_TMP4, STATE4)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 5, CTR_TMP5, STATE5)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 6, CTR_TMP6, STATE6)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 7, CTR_TMP7, STATE7)

        /*
         * CTR step 2: Encrypt the counters.
         */
        movaps  (%KEYP), %KEY           / key
        AES_XOR_STATE_8BLOCKS           / round 0

        /* Determine the AES variant we're going to compute */
        lea     0x30(%KEYP), %KEYP      / point to key schedule
        cmp     $12, %NROUNDS           / determine AES variant
        jb      .Lctr8_128
        lea     0x20(%KEYP), %KEYP      / AES192 has a larger key schedule
        je      .Lctr8_192

        /* AES 256 */
        lea     0x20(%KEYP), %KEYP      / AES256 has an even larger key schedule
        AES_RND_8BLOCKS(aesenc, -0x60)  / AES256 R.1
        AES_RND_8BLOCKS(aesenc, -0x50)  / AES256 R.2

        .align 4
.Lctr8_192:
        /* AES 192 and 256 */
        AES_RND_8BLOCKS(aesenc, -0x40)  / AES192 R.1; AES256 R.3
        AES_RND_8BLOCKS(aesenc, -0x30)  / AES192 R.2; AES256 R.4

        .align 4
.Lctr8_128:
        /* AES 128, 192, and 256 */
        AES_RND_8BLOCKS(aesenc, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
        AES_RND_8BLOCKS(aesenc, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
        AES_RND_8BLOCKS(aesenc, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
        AES_RND_8BLOCKS(aesenc, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
        AES_RND_8BLOCKS(aesenc, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
        AES_RND_8BLOCKS(aesenc, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
        AES_RND_8BLOCKS(aesenc, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
        AES_RND_8BLOCKS(aesenc, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
        AES_RND_8BLOCKS(aesenc, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
        AES_RND_8BLOCKS(aesenclast, 0x70) / AES128 R.10; AES192 R.12; AES256 R.14

        /*
         * CTR step 3: XOR input data blocks with encrypted counters to
         * produce the result.
         */
        mov     %INP, %rax              / pxor requires alignment, so check
        andq    $0xf, %rax
        jnz     .Lctr_input_unaligned
        pxor    0x00(%INP), %STATE0
        pxor    0x10(%INP), %STATE1
        pxor    0x20(%INP), %STATE2
        pxor    0x30(%INP), %STATE3
        pxor    0x40(%INP), %STATE4
        pxor    0x50(%INP), %STATE5
        pxor    0x60(%INP), %STATE6
        pxor    0x70(%INP), %STATE7
        jmp     .Lctr_out

        .align 4
.Lctr_input_unaligned:
        movdqu  0x00(%INP), %TMP_INPUT0
        movdqu  0x10(%INP), %TMP_INPUT1
        movdqu  0x20(%INP), %TMP_INPUT2
        movdqu  0x30(%INP), %TMP_INPUT3
        movdqu  0x40(%INP), %TMP_INPUT4
        movdqu  0x50(%INP), %TMP_INPUT5
        movdqu  0x60(%INP), %TMP_INPUT6
        movdqu  0x70(%INP), %TMP_INPUT7
        pxor    %TMP_INPUT0, %STATE0
        pxor    %TMP_INPUT1, %STATE1
        pxor    %TMP_INPUT2, %STATE2
        pxor    %TMP_INPUT3, %STATE3
        pxor    %TMP_INPUT4, %STATE4
        pxor    %TMP_INPUT5, %STATE5
        pxor    %TMP_INPUT6, %STATE6
        pxor    %TMP_INPUT7, %STATE7

        .align 4
.Lctr_out:
        /*
         * CTR step 4: Write out the processed blocks to memory.
         */
        movdqu  %STATE0, 0x00(%OUTP)
        movdqu  %STATE1, 0x10(%OUTP)
        movdqu  %STATE2, 0x20(%OUTP)
        movdqu  %STATE3, 0x30(%OUTP)
        movdqu  %STATE4, 0x40(%OUTP)
        movdqu  %STATE5, 0x50(%OUTP)
        movdqu  %STATE6, 0x60(%OUTP)
        movdqu  %STATE7, 0x70(%OUTP)

        /* restore caller's regs */
        / CTR_TMP0 is rax, no need to restore
        movq    -0x38(%rbp), %CTR_TMP1
        movq    -0x30(%rbp), %CTR_TMP2
        movq    -0x28(%rbp), %CTR_TMP3
        movq    -0x20(%rbp), %CTR_TMP4
        movq    -0x18(%rbp), %CTR_TMP5
        movq    -0x10(%rbp), %CTR_TMP6
        movq    -0x08(%rbp), %CTR_TMP7
        leave
        ret
SET_SIZE(aes_ctr_intel8)
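
/*
 * A hedged usage sketch (hypothetical caller; kernel FPU bracketing
 * omitted): since CTR mode only ever runs the AES encryption rounds,
 * the same call both encrypts and decrypts, and the caller advances
 * the low counter by 8 for every 128-byte chunk.
 *
 *         uint64_t lower = initial_ctr;
 *         for (; len >= 128; len -= 128, in += 128, out += 128) {
 *                 aes_ctr_intel8(ks, nr, in, out, nonce_BE, lower);
 *                 lower += 8;
 *         }
 */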
1406
1407 #endif /* lint || __lint */