4896 Performance improvements for KCF AES modes


 131  *      typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
 132  *               uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
 133  *
 134  *      typedef union {
 135  *              uint32_t        ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
 136  *      } aes_ks_t;
 137  *      typedef struct aes_key {
 138  *              aes_ks_t        encr_ks, decr_ks;
 139  *              long double     align128;
 140  *              int             flags, nr, type;
 141  *      } aes_key_t;
 142  *
 143  * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
 144  * ct is cipher text, and MAX_AES_NR is 14.
 145  * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
 146  *
 147  * Note2: aes_ks_t must be aligned on a 0 mod 128-bit (16-byte) boundary.
 148  *
 149  * ====================================================================
 150  */
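
For reference, a minimal C rendering of the structures described above (a
sketch; MAX_AES_NB is assumed to be 4, the AES block size in 32-bit words):

#include <stdint.h>

#define MAX_AES_NR      14      /* AES-256 uses 14 rounds */
#define MAX_AES_NB      4       /* AES block size in 32-bit words (assumed) */

typedef union {
        uint32_t        ks32[(MAX_AES_NR + 1) * MAX_AES_NB];
} aes_ks_t;

typedef struct aes_key {
        aes_ks_t        encr_ks, decr_ks;       /* key schedules */
        long double     align128;       /* forces 128-bit alignment */
        int             flags, nr, type;        /* nr = number of rounds */
} aes_key_t;
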
 151 
 152 #if defined(lint) || defined(__lint)
 153 
 154 #include <sys/types.h>
 155 
 156 /* ARGSUSED */
 157 void
 158 aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
 159     uint32_t ct[4]) {
 160 }
 161 /* ARGSUSED */
 162 void
 163 aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
 164     uint32_t pt[4]) {
 165 }
 166 /* ARGSUSED */
 167 int
 168 rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
 169     uint64_t keyBits) {
 170         return (0);

 264          * If CR0_TS was not set above, pop %xmm0 - %xmm6 off stack,
 265          * otherwise set CR0_TS.
 266          */
 267 #define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) \
 268         testq   $CR0_TS, tmpreg; \
 269         jnz     1f; \
 270         movaps  (%rsp), %xmm6; \
 271         movaps  16(%rsp), %xmm5; \
 272         movaps  32(%rsp), %xmm4; \
 273         movaps  48(%rsp), %xmm3; \
 274         movaps  64(%rsp), %xmm2; \
 275         movaps  80(%rsp), %xmm1; \
 276         movaps  96(%rsp), %xmm0; \
 277         jmp     2f; \
 278 1: \
 279         STTS(tmpreg); \
 280 2: \
 281         mov     %rbp, %rsp; \
 282         pop     %rbp
 283 
 284 
 285 #else
 286 #define PROTECTED_CLTS
 287 #define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
 288 #define SET_TS_OR_POP_XMM0_XMM1(tmpreg)
 289 #define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)
 290 #define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg)
 291 #endif  /* _KERNEL */
 292 
 293 
 294 /*
 295  * _key_expansion_128(), _key_expansion_192a(), _key_expansion_192b(),
 296  * _key_expansion_256a(), _key_expansion_256b()
 297  *
 298  * Helper functions called by rijndael_key_setup_enc_intel().
 299  * Also used indirectly by rijndael_key_setup_dec_intel().
 300  *
 301  * Input:
 302  * %xmm0        User-provided cipher key
 303  * %xmm1        Round constant
 304  * Output:

 361         pxor    %xmm5, %xmm2
 362 
 363         movaps  %xmm0, (%rcx)
 364         add     $0x10, %rcx
 365         ret
 366         SET_SIZE(_key_expansion_192b)
 367 
 368 .align 16
 369 _key_expansion_256b:
 370         pshufd  $0b10101010, %xmm1, %xmm1
 371         shufps  $0b00010000, %xmm2, %xmm4
 372         pxor    %xmm4, %xmm2
 373         shufps  $0b10001100, %xmm2, %xmm4
 374         pxor    %xmm4, %xmm2
 375         pxor    %xmm1, %xmm2
 376         movaps  %xmm2, (%rcx)
 377         add     $0x10, %rcx
 378         ret
 379         SET_SIZE(_key_expansion_256b)
 380 
 381 
 382 /*
 383  * rijndael_key_setup_enc_intel()
 384  * Expand the cipher key into the encryption key schedule.
 385  *
 386  * For kernel code, caller is responsible for ensuring kpreempt_disable()
 387  * has been called.  This is because %xmm registers are not saved/restored.
 388  * If CR0.TS is set on entry, it is cleared on entry and set again on
 389  * exit; otherwise the %xmm registers are saved to and restored from
 390  * the stack.
 391  *
 392  * OpenSolaris interface:
 393  * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
 394  *      uint64_t keyBits);
 395  * Return value is 0 on error, number of rounds on success.
 396  *
 397  * Original Intel OpenSSL interface:
 398  * int intel_AES_set_encrypt_key(const unsigned char *userKey,
 399  *      const int bits, AES_KEY *key);
 400  * Return value is non-zero on error, 0 on success.
 401  */
 402 
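
A hedged usage sketch of the contract above (illumos kernel context assumed;
kpreempt_disable()/kpreempt_enable() are the kernel preemption controls, and
the schedule buffer must be 128-bit aligned because movaps is used on it):

static int
example_expand_key(const uint32_t cipherKey[8], aes_ks_t *ks)
{
        int nr;

        kpreempt_disable();     /* %xmm registers are not saved/restored */
        nr = rijndael_key_setup_enc_intel(ks->ks32, cipherKey, 256);
        kpreempt_enable();

        return (nr);            /* 10, 12, or 14 rounds; 0 on error */
}
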

 642 .align 4
 643 .Ldec_key_inv_loop:
 644         movaps  (%rcx), %xmm0
 645         / Convert an encryption round key to a form usable for decryption
 646         / with the "AES Inverse Mix Columns" instruction
 647         aesimc  %xmm0, %xmm1
 648         movaps  %xmm1, (%rcx)
 649         lea     0x10(%rcx), %rcx
 650         cmp     %ENDAESKEY, %rcx
 651         jnz     .Ldec_key_inv_loop
 652 
 653         SET_TS_OR_POP_XMM0_XMM1(%r10)
 654 
 655 .Ldec_key_exit:
 656         / OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
 657         / OpenSSL: rax = 0 for OK, or non-zero for error
 658         ret
 659         SET_SIZE(rijndael_key_setup_dec_intel)
 660 
 661 
 662 /*
 663  * aes_encrypt_intel()
 664  * Encrypt a single block (in and out can overlap).
 665  *
 666  * For kernel code, caller is responsible for ensuring kpreempt_disable()
 667  * has been called.  This is because %xmm registers are not saved/restored.
 668  * If CR0.TS is set on entry, it is cleared on entry and set again on
 669  * exit; otherwise the %xmm registers are saved to and restored from
 670  * the stack.
 671  *
 672  * Temporary register usage:
 673  * %xmm0        State
 674  * %xmm1        Key
 675  *
 676  * Original OpenSolaris Interface:
 677  * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
 678  *      const uint32_t pt[4], uint32_t ct[4])
 679  *
 680  * Original Intel OpenSSL Interface:
 681  * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
 682  *      const AES_KEY *key)
 683  */
 684 
 685 #ifdef  OPENSSL_INTERFACE
 686 #define aes_encrypt_intel       intel_AES_encrypt
 687 #define aes_decrypt_intel       intel_AES_decrypt
 688 
 689 #define INP             rdi     /* P1, 64 bits */
 690 #define OUTP            rsi     /* P2, 64 bits */
 691 #define KEYP            rdx     /* P3, 64 bits */
 692 
 693 /* No NROUNDS parameter--offset 240 from KEYP saved in %ecx:  */
 694 #define NROUNDS32       ecx     /* temporary, 32 bits */
 695 #define NROUNDS         cl      /* temporary,  8 bits */
 696 
 697 #else   /* OpenSolaris Interface */
 698 #define KEYP            rdi     /* P1, 64 bits */
 699 #define NROUNDS         esi     /* P2, 32 bits */
 700 #define INP             rdx     /* P3, 64 bits */
 701 #define OUTP            rcx     /* P4, 64 bits */
 702 #endif  /* OPENSSL_INTERFACE */
 703 
 704 #define STATE           xmm0    /* temporary, 128 bits */
 705 #define KEY             xmm1    /* temporary, 128 bits */
 706 
 707 ENTRY_NP(aes_encrypt_intel)
 708         CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
 709 
 710         movups  (%INP), %STATE                  / input
 711         movaps  (%KEYP), %KEY                   / key
 712 #ifdef  OPENSSL_INTERFACE
 713         mov     240(%KEYP), %NROUNDS32          / round count
 714 #else   /* OpenSolaris Interface */
 715         /* Round count is already present as P2 in %rsi/%esi */
 716 #endif  /* OPENSSL_INTERFACE */
 717 
 718         pxor    %KEY, %STATE                    / round 0
 719         lea     0x30(%KEYP), %KEYP
 720         cmp     $12, %NROUNDS
 721         jb      .Lenc128
 722         lea     0x20(%KEYP), %KEYP
 723         je      .Lenc192
 724 
 725         / AES 256
 726         lea     0x20(%KEYP), %KEYP
 727         movaps  -0x60(%KEYP), %KEY
 728         aesenc  %KEY, %STATE
 729         movaps  -0x50(%KEYP), %KEY
 730         aesenc  %KEY, %STATE
 731 
 732 .align 4
 733 .Lenc192:
 734         / AES 192 and 256
 735         movaps  -0x40(%KEYP), %KEY
 736         aesenc  %KEY, %STATE
 737         movaps  -0x30(%KEYP), %KEY
 738         aesenc  %KEY, %STATE
 739 
 740 .align 4
 741 .Lenc128:
 742         / AES 128, 192, and 256
 743         movaps  -0x20(%KEYP), %KEY
 744         aesenc  %KEY, %STATE
 745         movaps  -0x10(%KEYP), %KEY
 746         aesenc  %KEY, %STATE
 747         movaps  (%KEYP), %KEY
 748         aesenc  %KEY, %STATE
 749         movaps  0x10(%KEYP), %KEY
 750         aesenc  %KEY, %STATE
 751         movaps  0x20(%KEYP), %KEY
 752         aesenc  %KEY, %STATE
 753         movaps  0x30(%KEYP), %KEY
 754         aesenc  %KEY, %STATE
 755         movaps  0x40(%KEYP), %KEY
 756         aesenc  %KEY, %STATE
 757         movaps  0x50(%KEYP), %KEY
 758         aesenc  %KEY, %STATE
 759         movaps  0x60(%KEYP), %KEY
 760         aesenc  %KEY, %STATE
 761         movaps  0x70(%KEYP), %KEY
 762         aesenclast       %KEY, %STATE           / last round
 763         movups  %STATE, (%OUTP)                 / output
 764 
 765         SET_TS_OR_POP_XMM0_XMM1(%r10)
 766         ret
 767         SET_SIZE(aes_encrypt_intel)
 768 
 769 
 770 /*
 771  * aes_decrypt_intel()
 772  * Decrypt a single block (in and out can overlap).
 773  *
 774  * For kernel code, caller is responsible for ensuring kpreempt_disable()
 775  * has been called.  This is because %xmm registers are not saved/restored.
 776  * If CR0.TS is set on entry, it is cleared on entry and set again on
 777  * exit; otherwise the %xmm registers are saved to and restored from
 778  * the stack.
 779  *
 780  * Temporary register usage:
 781  * %xmm0        State
 782  * %xmm1        Key
 783  *
 784  * Original OpenSolaris Interface:
 785  * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
 786  *      const uint32_t ct[4], uint32_t pt[4])
 787  *
 788  * Original Intel OpenSSL Interface:
 789  * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
 790  *      const AES_KEY *key);
 791  */
 792 ENTRY_NP(aes_decrypt_intel)
 793         CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
 794 
 795         movups  (%INP), %STATE                  / input
 796         movaps  (%KEYP), %KEY                   / key
 797 #ifdef  OPENSSL_INTERFACE
 798         mov     240(%KEYP), %NROUNDS32          / round count
 799 #else   /* OpenSolaris Interface */
 800         /* Round count is already present as P2 in %rsi/%esi */
 801 #endif  /* OPENSSL_INTERFACE */
 802 
 803         pxor    %KEY, %STATE                    / round 0
 804         lea     0x30(%KEYP), %KEYP
 805         cmp     $12, %NROUNDS
 806         jb      .Ldec128
 807         lea     0x20(%KEYP), %KEYP
 808         je      .Ldec192
 809 
 810         / AES 256
 811         lea     0x20(%KEYP), %KEYP
 812         movaps  -0x60(%KEYP), %KEY
 813         aesdec  %KEY, %STATE
 814         movaps  -0x50(%KEYP), %KEY
 815         aesdec  %KEY, %STATE
 816 
 817 .align 4
 818 .Ldec192:
 819         / AES 192 and 256
 820         movaps  -0x40(%KEYP), %KEY
 821         aesdec  %KEY, %STATE
 822         movaps  -0x30(%KEYP), %KEY
 823         aesdec  %KEY, %STATE
 824 
 825 .align 4
 826 .Ldec128:
 827         / AES 128, 192, and 256
 828         movaps  -0x20(%KEYP), %KEY
 829         aesdec  %KEY, %STATE
 830         movaps  -0x10(%KEYP), %KEY
 831         aesdec  %KEY, %STATE
 832         movaps  (%KEYP), %KEY
 833         aesdec  %KEY, %STATE
 834         movaps  0x10(%KEYP), %KEY
 835         aesdec  %KEY, %STATE
 836         movaps  0x20(%KEYP), %KEY
 837         aesdec  %KEY, %STATE
 838         movaps  0x30(%KEYP), %KEY
 839         aesdec  %KEY, %STATE
 840         movaps  0x40(%KEYP), %KEY
 841         aesdec  %KEY, %STATE
 842         movaps  0x50(%KEYP), %KEY
 843         aesdec  %KEY, %STATE
 844         movaps  0x60(%KEYP), %KEY
 845         aesdec  %KEY, %STATE
 846         movaps  0x70(%KEYP), %KEY
 847         aesdeclast      %KEY, %STATE            / last round
 848         movups  %STATE, (%OUTP)                 / output
 849 
 850         SET_TS_OR_POP_XMM0_XMM1(%r10)
 851         ret
 852         SET_SIZE(aes_decrypt_intel)
 853 
 854 #endif  /* lint || __lint */


 131  *      typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
 132  *               uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
 133  *
 134  *      typedef union {
 135  *              uint32_t        ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
 136  *      } aes_ks_t;
 137  *      typedef struct aes_key {
 138  *              aes_ks_t        encr_ks, decr_ks;
 139  *              long double     align128;
 140  *              int             flags, nr, type;
 141  *      } aes_key_t;
 142  *
 143  * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
 144  * ct is cipher text, and MAX_AES_NR is 14.
 145  * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
 146  *
 147  * Note2: aes_ks_t must be aligned on a 0 mod 128-bit (16-byte) boundary.
 148  *
 149  * ====================================================================
 150  */
 151 /*
 152  * Copyright 2015 by Saso Kiselkov. All rights reserved.
 153  */
 154 
 155 #if defined(lint) || defined(__lint)
 156 
 157 #include <sys/types.h>
 158 
 159 /* ARGSUSED */
 160 void
 161 aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
 162     uint32_t ct[4]) {
 163 }
 164 /* ARGSUSED */
 165 void
 166 aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
 167     uint32_t pt[4]) {
 168 }
 169 /* ARGSUSED */
 170 int
 171 rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
 172     uint64_t keyBits) {
 173         return (0);

 267          * If CR0_TS was not set above, pop %xmm0 - %xmm6 off stack,
 268          * otherwise set CR0_TS.
 269          */
 270 #define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) \
 271         testq   $CR0_TS, tmpreg; \
 272         jnz     1f; \
 273         movaps  (%rsp), %xmm6; \
 274         movaps  16(%rsp), %xmm5; \
 275         movaps  32(%rsp), %xmm4; \
 276         movaps  48(%rsp), %xmm3; \
 277         movaps  64(%rsp), %xmm2; \
 278         movaps  80(%rsp), %xmm1; \
 279         movaps  96(%rsp), %xmm0; \
 280         jmp     2f; \
 281 1: \
 282         STTS(tmpreg); \
 283 2: \
 284         mov     %rbp, %rsp; \
 285         pop     %rbp
 286 
 287 /*
 288  * void aes_accel_save(void *savestate);
 289  *
 290  * Saves all 16 XMM registers and CR0 to a temporary location pointed to
 291  * by the first argument and clears TS in CR0. This must be invoked before
 292  * executing any floating point operations inside the kernel (and kernel
 293  * thread preemption must be disabled as well). The memory region to which
 294  * all state is saved must be at least 16 x 128 bits plus 64 bits long
 295  * (264 bytes) and must be 128-bit aligned.
 296  */
 297 ENTRY_NP(aes_accel_save)
 298         movq    %cr0, %rax
 299         movq    %rax, 0x100(%rdi)
 300         testq   $CR0_TS, %rax
 301         jnz     1f
 302         movaps  %xmm0, 0x00(%rdi)
 303         movaps  %xmm1, 0x10(%rdi)
 304         movaps  %xmm2, 0x20(%rdi)
 305         movaps  %xmm3, 0x30(%rdi)
 306         movaps  %xmm4, 0x40(%rdi)
 307         movaps  %xmm5, 0x50(%rdi)
 308         movaps  %xmm6, 0x60(%rdi)
 309         movaps  %xmm7, 0x70(%rdi)
 310         movaps  %xmm8, 0x80(%rdi)
 311         movaps  %xmm9, 0x90(%rdi)
 312         movaps  %xmm10, 0xa0(%rdi)
 313         movaps  %xmm11, 0xb0(%rdi)
 314         movaps  %xmm12, 0xc0(%rdi)
 315         movaps  %xmm13, 0xd0(%rdi)
 316         movaps  %xmm14, 0xe0(%rdi)
 317         movaps  %xmm15, 0xf0(%rdi)
 318         ret
 319 1:
 320         PROTECTED_CLTS
 321         ret
 322         SET_SIZE(aes_accel_save)
 323 
 324 /*
 325  * void aes_accel_restore(void *savestate);
 326  *
 327  * Restores the saved XMM and CR0.TS state from aes_accel_save.
 328  */
 329 ENTRY_NP(aes_accel_restore)
 330         mov     0x100(%rdi), %rax
 331         testq   $CR0_TS, %rax
 332         jnz     1f
 333         movaps  0x00(%rdi), %xmm0
 334         movaps  0x10(%rdi), %xmm1
 335         movaps  0x20(%rdi), %xmm2
 336         movaps  0x30(%rdi), %xmm3
 337         movaps  0x40(%rdi), %xmm4
 338         movaps  0x50(%rdi), %xmm5
 339         movaps  0x60(%rdi), %xmm6
 340         movaps  0x70(%rdi), %xmm7
 341         movaps  0x80(%rdi), %xmm8
 342         movaps  0x90(%rdi), %xmm9
 343         movaps  0xa0(%rdi), %xmm10
 344         movaps  0xb0(%rdi), %xmm11
 345         movaps  0xc0(%rdi), %xmm12
 346         movaps  0xd0(%rdi), %xmm13
 347         movaps  0xe0(%rdi), %xmm14
 348         movaps  0xf0(%rdi), %xmm15
 349         ret
 350 1:
 351         STTS(%rax)
 352         ret
 353         SET_SIZE(aes_accel_restore)
 354 
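
A sketch of how a kernel caller might bracket AES-NI work with these two
routines (the save-area layout follows the stores above: 16 XMM slots at
offsets 0x00-0xf0 plus CR0 at 0x100, 264 bytes in all; the struct and
function names here are illustrative, not part of the source):

/* 16 x 128-bit XMM slots followed by a 64-bit CR0 slot, 16-byte aligned. */
typedef struct aes_accel_state {
        uint8_t         xmm[16][16];
        uint64_t        cr0;
} __attribute__((aligned(16))) aes_accel_state_t;

static void
example_accel_section(void)
{
        aes_accel_state_t st;

        kpreempt_disable();             /* no preemption around FPU use */
        aes_accel_save(&st);
        /* ... AES-NI work (aes_encrypt_intel8() etc.) goes here ... */
        aes_accel_restore(&st);
        kpreempt_enable();
}
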
 355 #else
 356 #define PROTECTED_CLTS
 357 #define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
 358 #define SET_TS_OR_POP_XMM0_XMM1(tmpreg)
 359 #define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)
 360 #define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg)
 361 #endif  /* _KERNEL */
 362 
 363 
 364 /*
 365  * _key_expansion_128(), _key_expansion_192a(), _key_expansion_192b(),
 366  * _key_expansion_256a(), _key_expansion_256b()
 367  *
 368  * Helper functions called by rijndael_key_setup_enc_intel().
 369  * Also used indirectly by rijndael_key_setup_dec_intel().
 370  *
 371  * Input:
 372  * %xmm0        User-provided cipher key
 373  * %xmm1        Round constant
 374  * Output:

 431         pxor    %xmm5, %xmm2
 432 
 433         movaps  %xmm0, (%rcx)
 434         add     $0x10, %rcx
 435         ret
 436         SET_SIZE(_key_expansion_192b)
 437 
 438 .align 16
 439 _key_expansion_256b:
 440         pshufd  $0b10101010, %xmm1, %xmm1
 441         shufps  $0b00010000, %xmm2, %xmm4
 442         pxor    %xmm4, %xmm2
 443         shufps  $0b10001100, %xmm2, %xmm4
 444         pxor    %xmm4, %xmm2
 445         pxor    %xmm1, %xmm2
 446         movaps  %xmm2, (%rcx)
 447         add     $0x10, %rcx
 448         ret
 449         SET_SIZE(_key_expansion_256b)
 450 
 451 /*
 452  * void aes_copy_intel(const uint8_t *src, uint8_t *dst);
 453  *
 454  * Copies one unaligned 128-bit block from `src' to `dst'. The copy is
 455  * performed using FPU registers, so make sure FPU state is saved when
 456  * running this in the kernel.
 457  */
 458 ENTRY_NP(aes_copy_intel)
 459         movdqu  (%rdi), %xmm0
 460         movdqu  %xmm0, (%rsi)
 461         ret
 462         SET_SIZE(aes_copy_intel)
 463 
 464 /*
 465  * void aes_xor_intel(const uint8_t *src, uint8_t *dst);
 466  *
 467  * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 468  * stores the result at `dst'. The XOR is performed using FPU registers,
 469  * so make sure FPU state is saved when running this in the kernel.
 470  */
 471 ENTRY_NP(aes_xor_intel)
 472         movdqu  (%rdi), %xmm0
 473         movdqu  (%rsi), %xmm1
 474         pxor    %xmm1, %xmm0
 475         movdqu  %xmm0, (%rsi)
 476         ret
 477         SET_SIZE(aes_xor_intel)
 478 
 479 /*
 480  * void aes_xor_intel8(const uint8_t *src, uint8_t *dst);
 481  *
 482  * XORs eight pairs of consecutive unaligned 128-bit blocks from `src' and
 483  * `dst' and stores the results at `dst'. The XOR is performed using FPU
 484  * registers, so make sure FPU state is saved when running this in the kernel.
 485  */
 486 ENTRY_NP(aes_xor_intel8)
 487         movdqu  0x00(%rdi), %xmm0
 488         movdqu  0x00(%rsi), %xmm1
 489         movdqu  0x10(%rdi), %xmm2
 490         movdqu  0x10(%rsi), %xmm3
 491         movdqu  0x20(%rdi), %xmm4
 492         movdqu  0x20(%rsi), %xmm5
 493         movdqu  0x30(%rdi), %xmm6
 494         movdqu  0x30(%rsi), %xmm7
 495         movdqu  0x40(%rdi), %xmm8
 496         movdqu  0x40(%rsi), %xmm9
 497         movdqu  0x50(%rdi), %xmm10
 498         movdqu  0x50(%rsi), %xmm11
 499         movdqu  0x60(%rdi), %xmm12
 500         movdqu  0x60(%rsi), %xmm13
 501         movdqu  0x70(%rdi), %xmm14
 502         movdqu  0x70(%rsi), %xmm15
 503         pxor    %xmm1, %xmm0
 504         pxor    %xmm3, %xmm2
 505         pxor    %xmm5, %xmm4
 506         pxor    %xmm7, %xmm6
 507         pxor    %xmm9, %xmm8
 508         pxor    %xmm11, %xmm10
 509         pxor    %xmm13, %xmm12
 510         pxor    %xmm15, %xmm14
 511         movdqu  %xmm0, 0x00(%rsi)
 512         movdqu  %xmm2, 0x10(%rsi)
 513         movdqu  %xmm4, 0x20(%rsi)
 514         movdqu  %xmm6, 0x30(%rsi)
 515         movdqu  %xmm8, 0x40(%rsi)
 516         movdqu  %xmm10, 0x50(%rsi)
 517         movdqu  %xmm12, 0x60(%rsi)
 518         movdqu  %xmm14, 0x70(%rsi)
 519         ret
 520         SET_SIZE(aes_xor_intel8)
 521 
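
In portable C, the three helpers above have these reference semantics (a
sketch; the assembly versions exist to keep the data in XMM registers and to
pipeline the eight loads, XORs, and stores):

#include <stdint.h>
#include <string.h>

/* aes_copy_intel(): one unaligned 16-byte block copy. */
static void
aes_copy_ref(const uint8_t *src, uint8_t *dst)
{
        (void) memcpy(dst, src, 16);
}

/* aes_xor_intel(): dst ^= src over one 16-byte block. */
static void
aes_xor_ref(const uint8_t *src, uint8_t *dst)
{
        for (int i = 0; i < 16; i++)
                dst[i] ^= src[i];
}

/* aes_xor_intel8(): dst ^= src over eight consecutive 16-byte blocks. */
static void
aes_xor8_ref(const uint8_t *src, uint8_t *dst)
{
        for (int i = 0; i < 128; i++)
                dst[i] ^= src[i];
}
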
 522 /*
 523  * rijndael_key_setup_enc_intel()
 524  * Expand the cipher key into the encryption key schedule.
 525  *
 526  * For kernel code, caller is responsible for ensuring kpreempt_disable()
 527  * has been called.  This is because %xmm registers are not saved/restored.
 528  * If CR0.TS is set on entry, it is cleared on entry and set again on
 529  * exit; otherwise the %xmm registers are saved to and restored from
 530  * the stack.
 531  *
 532  * OpenSolaris interface:
 533  * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
 534  *      uint64_t keyBits);
 535  * Return value is 0 on error, number of rounds on success.
 536  *
 537  * Original Intel OpenSSL interface:
 538  * int intel_AES_set_encrypt_key(const unsigned char *userKey,
 539  *      const int bits, AES_KEY *key);
 540  * Return value is non-zero on error, 0 on success.
 541  */
 542 

 782 .align 4
 783 .Ldec_key_inv_loop:
 784         movaps  (%rcx), %xmm0
 785         / Convert an encryption round key to a form usable for decryption
 786         / with the "AES Inverse Mix Columns" instruction
 787         aesimc  %xmm0, %xmm1
 788         movaps  %xmm1, (%rcx)
 789         lea     0x10(%rcx), %rcx
 790         cmp     %ENDAESKEY, %rcx
 791         jnz     .Ldec_key_inv_loop
 792 
 793         SET_TS_OR_POP_XMM0_XMM1(%r10)
 794 
 795 .Ldec_key_exit:
 796         / OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
 797         / OpenSSL: rax = 0 for OK, or non-zero for error
 798         ret
 799         SET_SIZE(rijndael_key_setup_dec_intel)
 800 
 801 
 802 #ifdef  OPENSSL_INTERFACE
 803 #define aes_encrypt_intel       intel_AES_encrypt
 804 #define aes_decrypt_intel       intel_AES_decrypt
 805 
 806 #define INP             rdi     /* P1, 64 bits */
 807 #define OUTP            rsi     /* P2, 64 bits */
 808 #define KEYP            rdx     /* P3, 64 bits */
 809 
 810 /* No NROUNDS parameter--offset 240 from KEYP saved in %ecx:  */
 811 #define NROUNDS32       ecx     /* temporary, 32 bits */
 812 #define NROUNDS         cl      /* temporary,  8 bits */
 813 
 814 #else   /* OpenSolaris Interface */
 815 #define KEYP            rdi     /* P1, 64 bits */
 816 #define NROUNDS         esi     /* P2, 32 bits */
 817 #define INP             rdx     /* P3, 64 bits */
 818 #define OUTP            rcx     /* P4, 64 bits */
 819 #define LENGTH          r8      /* P5, 64 bits */
 820 #endif  /* OPENSSL_INTERFACE */
 821 
 822 #define KEY             xmm0    /* temporary, 128 bits */
 823 #define STATE0          xmm8    /* temporary, 128 bits */
 824 #define STATE1          xmm9    /* temporary, 128 bits */
 825 #define STATE2          xmm10   /* temporary, 128 bits */
 826 #define STATE3          xmm11   /* temporary, 128 bits */
 827 #define STATE4          xmm12   /* temporary, 128 bits */
 828 #define STATE5          xmm13   /* temporary, 128 bits */
 829 #define STATE6          xmm14   /* temporary, 128 bits */
 830 #define STATE7          xmm15   /* temporary, 128 bits */
 831 
 832 /*
 833  * Runs the first two rounds of AES256 on a state register. `op' should be
 834  * aesenc or aesdec.
 835  */
 836 #define AES256_ROUNDS(op, statereg)     \
 837         movaps  -0x60(%KEYP), %KEY;     \
 838         op      %KEY, %statereg;        \
 839         movaps  -0x50(%KEYP), %KEY;     \
 840         op      %KEY, %statereg
 841 
 842 /*
 843  * Runs the first two rounds of AES192, or the 3rd & 4th rounds of AES256 on
 844  * a state register. `op' should be aesenc or aesdec.
 845  */
 846 #define AES192_ROUNDS(op, statereg)     \
 847         movaps  -0x40(%KEYP), %KEY;     \
 848         op      %KEY, %statereg;        \
 849         movaps  -0x30(%KEYP), %KEY;     \
 850         op      %KEY, %statereg
 851 
 852 /*
 853  * Runs the full 10 rounds of AES128, or the last 10 rounds of AES192/AES256
 854  * on a state register. `op' should be aesenc or aesdec and `lastop' should
 855  * be aesenclast or aesdeclast.
 856  */
 857 #define AES128_ROUNDS(op, lastop, statereg) \
 858         movaps  -0x20(%KEYP), %KEY;     \
 859         op      %KEY, %statereg;        \
 860         movaps  -0x10(%KEYP), %KEY;     \
 861         op      %KEY, %statereg;        \
 862         movaps  (%KEYP), %KEY;          \
 863         op      %KEY, %statereg;        \
 864         movaps  0x10(%KEYP), %KEY;      \
 865         op      %KEY, %statereg;        \
 866         movaps  0x20(%KEYP), %KEY;      \
 867         op      %KEY, %statereg;        \
 868         movaps  0x30(%KEYP), %KEY;      \
 869         op      %KEY, %statereg;        \
 870         movaps  0x40(%KEYP), %KEY;      \
 871         op      %KEY, %statereg;        \
 872         movaps  0x50(%KEYP), %KEY;      \
 873         op      %KEY, %statereg;        \
 874         movaps  0x60(%KEYP), %KEY;      \
 875         op      %KEY, %statereg;        \
 876         movaps  0x70(%KEYP), %KEY;      \
 877         lastop  %KEY, %statereg
 878 
 879 /*
 880  * Macros to run AES encryption rounds. Input must be prefilled in the
 881  * state register; output will be left there as well.
 882  * To run AES256, invoke all of these macros in sequence. To run AES192,
 883  * invoke only the -192 and -128 variants. To run AES128, invoke only the
 884  * -128 variant.
 885  */
 886 #define AES256_ENC_ROUNDS(statereg) \
 887         AES256_ROUNDS(aesenc, statereg)
 888 #define AES192_ENC_ROUNDS(statereg) \
 889         AES192_ROUNDS(aesenc, statereg)
 890 #define AES128_ENC_ROUNDS(statereg) \
 891         AES128_ROUNDS(aesenc, aesenclast, statereg)
 892 
 893 /* Same as the AES*_ENC_ROUNDS macros, but for decryption. */
 894 #define AES256_DEC_ROUNDS(statereg) \
 895         AES256_ROUNDS(aesdec, statereg)
 896 #define AES192_DEC_ROUNDS(statereg) \
 897         AES192_ROUNDS(aesdec, statereg)
 898 #define AES128_DEC_ROUNDS(statereg) \
 899         AES128_ROUNDS(aesdec, aesdeclast, statereg)
 900 
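
The same round ladder, expressed with C AES-NI intrinsics for clarity (a
structural sketch only; `ks' is assumed to hold the nr + 1 expanded round
keys in order, and the compiler needs -maes):

#include <wmmintrin.h>          /* AES-NI intrinsics */

/*
 * Round 0 is a plain XOR with the first round key, rounds 1..nr-1 use
 * AESENC, and the final round uses AESENCLAST, mirroring the macros above
 * (for decryption, substitute AESDEC/AESDECLAST on the inverted schedule).
 */
static __m128i
aes_encrypt_block_ref(const __m128i *ks, int nr, __m128i block)
{
        block = _mm_xor_si128(block, ks[0]);
        for (int i = 1; i < nr; i++)
                block = _mm_aesenc_si128(block, ks[i]);
        return (_mm_aesenclast_si128(block, ks[nr]));
}
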
 901 
 902 /*
 903  * aes_encrypt_intel()
 904  * Encrypt a single block (in and out can overlap).
 905  *
 906  * For kernel code, the caller is responsible for disabling kernel thread
 907  * preemption and bracketing this call with aes_accel_save()/restore().
 908  *
 909  * Temporary register usage:
 910  * %xmm0        Key
 911  * %xmm8        State
 912  *
 913  * Original OpenSolaris Interface:
 914  * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
 915  *      const uint32_t pt[4], uint32_t ct[4])
 916  *
 917  * Original Intel OpenSSL Interface:
 918  * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
 919  *      const AES_KEY *key)
 920  */
 921 ENTRY_NP(aes_encrypt_intel)
 922         movups  (%INP), %STATE0                 / input
 923         movaps  (%KEYP), %KEY                   / key
 924 
 925 #ifdef  OPENSSL_INTERFACE
 926         mov     240(%KEYP), %NROUNDS32          / round count
 927 #else   /* OpenSolaris Interface */
 928         /* Round count is already present as P2 in %rsi/%esi */
 929 #endif  /* OPENSSL_INTERFACE */
 930 
 931         pxor    %KEY, %STATE0                   / round 0
 932         lea     0x30(%KEYP), %KEYP
 933         cmp     $12, %NROUNDS
 934         jb      .Lenc128
 935         lea     0x20(%KEYP), %KEYP
 936         je      .Lenc192
 937 
 938         / AES 256
 939         lea     0x20(%KEYP), %KEYP
 940         AES256_ENC_ROUNDS(STATE0)
 941 
 942 .align 4
 943 .Lenc192:
 944         / AES 192 and 256
 945         AES192_ENC_ROUNDS(STATE0)
 946 
 947 .align 4
 948 .Lenc128:
 949         / AES 128, 192, and 256
 950         AES128_ENC_ROUNDS(STATE0)
 951         movups  %STATE0, (%OUTP)                / output
 952 
 953         ret
 954         SET_SIZE(aes_encrypt_intel)
 955 
 956 /*
 957  * aes_decrypt_intel()
 958  * Decrypt a single block (in and out can overlap).
 959  *
 960  * For kernel code, the caller is responsible for disabling kernel thread
 961  * preemption and bracketing this call with aes_accel_save()/restore().
 962  *
 963  * Temporary register usage:
 964  * %xmm0        Key
 965  * %xmm8        State
 966  *
 967  * Original OpenSolaris Interface:
 968  * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
 969  *      const uint32_t ct[4], uint32_t pt[4])
 970  *
 971  * Original Intel OpenSSL Interface:
 972  * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
 973  *      const AES_KEY *key);
 974  */
 975 ENTRY_NP(aes_decrypt_intel)
 976         movups  (%INP), %STATE0                 / input
 977         movaps  (%KEYP), %KEY                   / key
 978 
 979 #ifdef  OPENSSL_INTERFACE
 980         mov     240(%KEYP), %NROUNDS32          / round count
 981 #else   /* OpenSolaris Interface */
 982         /* Round count is already present as P2 in %rsi/%esi */
 983 #endif  /* OPENSSL_INTERFACE */
 984 
 985         pxor    %KEY, %STATE0                   / round 0
 986         lea     0x30(%KEYP), %KEYP
 987         cmp     $12, %NROUNDS
 988         jb      .Ldec128
 989         lea     0x20(%KEYP), %KEYP
 990         je      .Ldec192
 991 
 992         / AES 256
 993         lea     0x20(%KEYP), %KEYP
 994         AES256_DEC_ROUNDS(STATE0)
 995 
 996 .align 4
 997 .Ldec192:
 998         / AES 192 and 256
 999         AES192_DEC_ROUNDS(STATE0)
1000 
1001 .align 4
1002 .Ldec128:
1003         / AES 128, 192, and 256
1004         AES128_DEC_ROUNDS(STATE0)
1005         movups  %STATE0, (%OUTP)                / output
1006 
1007         ret
1008         SET_SIZE(aes_decrypt_intel)
1009 
1010 /* Does a pipelined load of eight input blocks into our AES state registers. */
1011 #define AES_LOAD_INPUT_8BLOCKS          \
1012         movups  0x00(%INP), %STATE0;    \
1013         movups  0x10(%INP), %STATE1;    \
1014         movups  0x20(%INP), %STATE2;    \
1015         movups  0x30(%INP), %STATE3;    \
1016         movups  0x40(%INP), %STATE4;    \
1017         movups  0x50(%INP), %STATE5;    \
1018         movups  0x60(%INP), %STATE6;    \
1019         movups  0x70(%INP), %STATE7;
1020 
1021 /* Does a pipelined store of eight AES state registers to the output. */
1022 #define AES_STORE_OUTPUT_8BLOCKS        \
1023         movups  %STATE0, 0x00(%OUTP);   \
1024         movups  %STATE1, 0x10(%OUTP);   \
1025         movups  %STATE2, 0x20(%OUTP);   \
1026         movups  %STATE3, 0x30(%OUTP);   \
1027         movups  %STATE4, 0x40(%OUTP);   \
1028         movups  %STATE5, 0x50(%OUTP);   \
1029         movups  %STATE6, 0x60(%OUTP);   \
1030         movups  %STATE7, 0x70(%OUTP);
1031 
1032 /* Performs a pipelined AES instruction with the key on all state registers. */
1033 #define AES_KEY_STATE_OP_8BLOCKS(op)    \
1034         op      %KEY, %STATE0;          \
1035         op      %KEY, %STATE1;          \
1036         op      %KEY, %STATE2;          \
1037         op      %KEY, %STATE3;          \
1038         op      %KEY, %STATE4;          \
1039         op      %KEY, %STATE5;          \
1040         op      %KEY, %STATE6;          \
1041         op      %KEY, %STATE7
1042 
1043 /* XOR all AES state regs with key to initiate encryption/decryption. */
1044 #define AES_XOR_STATE_8BLOCKS           \
1045         AES_KEY_STATE_OP_8BLOCKS(pxor)
1046 
1047 /*
1048  * Loads a round key from the key schedule offset `off' into the KEY
1049  * register and performs `op' using the KEY on all 8 STATE registers.
1050  */
1051 #define AES_RND_8BLOCKS(op, off)        \
1052         movaps  off(%KEYP), %KEY;       \
1053         AES_KEY_STATE_OP_8BLOCKS(op)
1054 
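
The pipelining idea behind these macros, in C intrinsics form (continuing
the sketch above): each round key is loaded once and applied to all eight
independent states before the next key is touched, letting the CPU overlap
the multi-cycle AESENC latencies:

/* One AESENC round across 8 independent states sharing one round key. */
static void
aes_round_8blocks_ref(__m128i st[8], __m128i key)
{
        for (int i = 0; i < 8; i++)
                st[i] = _mm_aesenc_si128(st[i], key);
}
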
1055 /*
1056  * void aes_encrypt_intel8(const uint32_t roundkeys[], int numrounds,
1057  *      const void *plaintext, void *ciphertext)
1058  *
1059  * Same as aes_encrypt_intel, but performs the encryption operation on
1060  * 8 independent blocks in sequence, exploiting instruction pipelining.
1061  * This function doesn't support the OpenSSL interface; it's only meant
1062  * for kernel use.
1063  */
1064 ENTRY_NP(aes_encrypt_intel8)
1065         AES_LOAD_INPUT_8BLOCKS          / load input
1066         movaps  (%KEYP), %KEY           / key
1067         AES_XOR_STATE_8BLOCKS           / round 0
1068 
1069         lea     0x30(%KEYP), %KEYP      / point to key schedule
1070         cmp     $12, %NROUNDS           / determine AES variant
1071         jb      .Lenc8_128
1072         lea     0x20(%KEYP), %KEYP      / AES192 has larger key schedule
1073         je      .Lenc8_192
1074 
1075         lea     0x20(%KEYP), %KEYP      / AES256 has even larger key schedule
1076         AES_RND_8BLOCKS(aesenc, -0x60)  / AES256 R.1
1077         AES_RND_8BLOCKS(aesenc, -0x50)  / AES256 R.2
1078 
1079 .align 4
1080 .Lenc8_192:
1081         AES_RND_8BLOCKS(aesenc, -0x40)  / AES192 R.1; AES256 R.3
1082         AES_RND_8BLOCKS(aesenc, -0x30)  / AES192 R.2; AES256 R.4
1083 
1084 .align 4
1085 .Lenc8_128:
1086         AES_RND_8BLOCKS(aesenc, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
1087         AES_RND_8BLOCKS(aesenc, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
1088         AES_RND_8BLOCKS(aesenc, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
1089         AES_RND_8BLOCKS(aesenc, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
1090         AES_RND_8BLOCKS(aesenc, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
1091         AES_RND_8BLOCKS(aesenc, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
1092         AES_RND_8BLOCKS(aesenc, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
1093         AES_RND_8BLOCKS(aesenc, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
1094         AES_RND_8BLOCKS(aesenc, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
1095         AES_RND_8BLOCKS(aesenclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
1096 
1097         AES_STORE_OUTPUT_8BLOCKS        / store output
1098         ret
1099         SET_SIZE(aes_encrypt_intel8)
1100 
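
A hedged usage sketch (kernel context, FPU state saved per aes_accel_save()
above; the aes_key_t fields are the ones from the header comment at the top
of this file):

/* Encrypt eight consecutive 16-byte blocks (128 bytes) in one call. */
static void
example_encrypt_8(const aes_key_t *key, const uint8_t in[128],
    uint8_t out[128])
{
        aes_encrypt_intel8(key->encr_ks.ks32, key->nr, in, out);
}
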
1101 
1102 /*
1103  * void aes_decrypt_intel8(const uint32_t roundkeys[], int numrounds,
1104  *      const void *ciphertext, void *plaintext)
1105  *
1106  * Same as aes_decrypt_intel, but performs the decryption operation on
1107  * 8 independent blocks in sequence, exploiting instruction pipelining.
1108  * This function doesn't support the OpenSSL interface; it's only meant
1109  * for kernel use.
1110  */
1111 ENTRY_NP(aes_decrypt_intel8)
1112         AES_LOAD_INPUT_8BLOCKS          / load input
1113         movaps  (%KEYP), %KEY           / key
1114         AES_XOR_STATE_8BLOCKS           / round 0
1115 
1116         lea     0x30(%KEYP), %KEYP      / point to key schedule
1117         cmp     $12, %NROUNDS           / determine AES variant
1118         jb      .Ldec8_128
1119         lea     0x20(%KEYP), %KEYP      / AES192 has larger key schedule
1120         je      .Ldec8_192
1121 
1122         lea     0x20(%KEYP), %KEYP      / AES256 has even larger key schedule
1123         AES_RND_8BLOCKS(aesdec, -0x60)  / AES256 R.1
1124         AES_RND_8BLOCKS(aesdec, -0x50)  / AES256 R.2
1125 
1126 .align 4
1127 .Ldec8_192:
1128         AES_RND_8BLOCKS(aesdec, -0x40)  / AES192 R.1; AES256 R.3
1129         AES_RND_8BLOCKS(aesdec, -0x30)  / AES192 R.2; AES256 R.4
1130 
1131 .align 4
1132 .Ldec8_128:
1133         AES_RND_8BLOCKS(aesdec, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
1134         AES_RND_8BLOCKS(aesdec, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
1135         AES_RND_8BLOCKS(aesdec, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
1136         AES_RND_8BLOCKS(aesdec, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
1137         AES_RND_8BLOCKS(aesdec, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
1138         AES_RND_8BLOCKS(aesdec, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
1139         AES_RND_8BLOCKS(aesdec, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
1140         AES_RND_8BLOCKS(aesdec, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
1141         AES_RND_8BLOCKS(aesdec, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
1142         AES_RND_8BLOCKS(aesdeclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
1143 
1144         AES_STORE_OUTPUT_8BLOCKS        / store output
1145         ret
1146         SET_SIZE(aes_decrypt_intel8)
1147 
1148 
1149 /*
1150  * This macro encapsulates the entire AES encryption algorithm for a
1151  * single block, which is prefilled in statereg and overwritten with
1152  * the encrypted output. The KEYP register must already point to the
1153  * AES128 key schedule ("lea 0x30(%KEYP), %KEYP" in the encryption
1154  * function call) so that consecutive invocations of this macro are
1155  * supported (KEYP is restored after each invocation).
1156  */
1157 #define AES_ENC(statereg, label_128, label_192, label_out)      \
1158         cmp     $12, %NROUNDS;                                  \
1159         jb      label_128;                                      \
1160         je      label_192;                                      \
1161         /* AES 256 only */                                      \
1162         lea     0x40(%KEYP), %KEYP;                             \
1163         AES256_ENC_ROUNDS(statereg);                            \
1164         AES192_ENC_ROUNDS(statereg);                            \
1165         AES128_ENC_ROUNDS(statereg);                            \
1166         lea     -0x40(%KEYP), %KEYP;                            \
1167         jmp     label_out;                                      \
1168 .align 4;                                                       \
1169 label_192:                                                      \
1170         lea     0x20(%KEYP), %KEYP;                             \
1171         /* AES 192 only */                                      \
1172         AES192_ENC_ROUNDS(statereg);                            \
1173         AES128_ENC_ROUNDS(statereg);                            \
1174         lea     -0x20(%KEYP), %KEYP;                            \
1175         jmp     label_out;                                      \
1176 .align 4;                                                       \
1177 label_128:                                                      \
1178         /* AES 128 only */                                      \
1179         AES128_ENC_ROUNDS(statereg);                            \
1180 .align 4;                                                       \
1181 label_out:
1182 
1183 
1184 /*
1185  * void aes_encrypt_cbc_intel8(const uint32_t roundkeys[], int numrounds,
1186  *      const void *plaintext, void *ciphertext, const void *IV)
1187  *
1188  * Encrypts 8 consecutive AES blocks in CBC mode. Input and output
1189  * may overlap. This provides a modest performance boost over invoking
1190  * the encryption and XOR in separate functions because we can avoid
1191  * copying the ciphertext block to and from memory between encryption
1192  * and XOR calls.
1193  */
1194 #define CBC_IV                  r8      /* input - IV blk pointer */
1195 #define CBC_IV_XMM              xmm1    /* tmp IV location for alignment */
1196 
1197 ENTRY_NP(aes_encrypt_cbc_intel8)
1198         AES_LOAD_INPUT_8BLOCKS          / load input
1199         movaps  (%KEYP), %KEY           / key
1200         AES_XOR_STATE_8BLOCKS           / round 0
1201 
1202         lea     0x30(%KEYP), %KEYP      / point to key schedule
1203         movdqu  (%CBC_IV), %CBC_IV_XMM  / load IV from unaligned memory
1204         pxor    %CBC_IV_XMM, %STATE0    / XOR IV with input block and encrypt
1205         AES_ENC(STATE0, .Lenc_cbc_0_128, .Lenc_cbc_0_192, .Lenc_cbc_0_out)
1206         pxor    %STATE0, %STATE1
1207         AES_ENC(STATE1, .Lenc_cbc_1_128, .Lenc_cbc_1_192, .Lenc_cbc_1_out)
1208         pxor    %STATE1, %STATE2
1209         AES_ENC(STATE2, .Lenc_cbc_2_128, .Lenc_cbc_2_192, .Lenc_cbc_2_out)
1210         pxor    %STATE2, %STATE3
1211         AES_ENC(STATE3, .Lenc_cbc_3_128, .Lenc_cbc_3_192, .Lenc_cbc_3_out)
1212         pxor    %STATE3, %STATE4
1213         AES_ENC(STATE4, .Lenc_cbc_4_128, .Lenc_cbc_4_192, .Lenc_cbc_4_out)
1214         pxor    %STATE4, %STATE5
1215         AES_ENC(STATE5, .Lenc_cbc_5_128, .Lenc_cbc_5_192, .Lenc_cbc_5_out)
1216         pxor    %STATE5, %STATE6
1217         AES_ENC(STATE6, .Lenc_cbc_6_128, .Lenc_cbc_6_192, .Lenc_cbc_6_out)
1218         pxor    %STATE6, %STATE7
1219         AES_ENC(STATE7, .Lenc_cbc_7_128, .Lenc_cbc_7_192, .Lenc_cbc_7_out)
1220 
1221         AES_STORE_OUTPUT_8BLOCKS        / store output
1222         ret
1223         SET_SIZE(aes_encrypt_cbc_intel8)
1224 
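
The chaining this function implements, as reference C (a sketch building on
aes_encrypt_block_ref() from earlier; note that XORing the chaining value
after the round-0 key XOR, as the assembly does, is equivalent because XOR
commutes):

/* C[i] = E(P[i] ^ C[i-1]), with C[-1] = IV. */
static void
cbc_encrypt_8_ref(const __m128i *ks, int nr, const __m128i in[8],
    __m128i out[8], __m128i iv)
{
        __m128i prev = iv;

        for (int i = 0; i < 8; i++) {
                prev = aes_encrypt_block_ref(ks, nr,
                    _mm_xor_si128(in[i], prev));
                out[i] = prev;
        }
}
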
1225 /*
1226  * Prefills register state with counters suitable for the CTR encryption
1227  * mode. The counter is assumed to consist of two portions:
1228  * - A lower monotonically increasing 64-bit counter. If the caller wants
1229  *   a smaller counter, they are responsible for checking that it doesn't
1230  *   overflow between encryption calls.
1231  * - An upper static "nonce" portion, in big endian, preloaded into the
1232  *   lower portion of an XMM register.
1233  * This macro adds `ctridx' to the lower_LE counter and swaps the result
1234  * to big endian; then, by way of a temporary general-purpose register, it
1235  * loads the lower and upper counter portions into a target XMM result
1236  * register, which can then be handed off to the encryption process.
1237  */
1238 #define PREP_CTR_BLOCKS(lower_LE, upper_BE_xmm, ctridx, tmpreg, resreg) \
1239         lea     ctridx(%lower_LE), %tmpreg;                             \
1240         bswap   %tmpreg;                                                \
1241         movq    %tmpreg, %resreg;                                       \
1242         movlhps %upper_BE_xmm, %resreg;                                 \
1243         pshufd  $0b01001110, %resreg, %resreg
1244 
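
In C terms, the macro assembles each 128-bit big-endian counter block like
this (a sketch; __builtin_bswap64() stands in for the bswap instruction, and
`upper_be' is assumed to already hold big-endian bytes, as described above):

#include <stdint.h>
#include <string.h>

/*
 * Counter block #ctridx: bytes 0-7 are the static big-endian nonce, bytes
 * 8-15 are the incremented lower counter, swapped to big endian.
 */
static void
prep_ctr_block_ref(uint64_t upper_be, uint64_t lower_le, uint64_t ctridx,
    uint8_t block[16])
{
        uint64_t ctr_be = __builtin_bswap64(lower_le + ctridx);

        (void) memcpy(block, &upper_be, 8);
        (void) memcpy(block + 8, &ctr_be, 8);
}
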
1245 #define CTR_UPPER_BE            r8      /* input - counter upper 64 bits (BE) */
1246 #define CTR_UPPER_BE_XMM        xmm1    /* tmp for upper counter bits */
1247 #define CTR_LOWER_LE            r9      /* input - counter lower 64 bits (LE) */
1248 #define CTR_TMP0                rax     /* tmp for lower 64 bit add & bswap */
1249 #define CTR_TMP1                rbx     /* tmp for lower 64 bit add & bswap */
1250 #define CTR_TMP2                r10     /* tmp for lower 64 bit add & bswap */
1251 #define CTR_TMP3                r11     /* tmp for lower 64 bit add & bswap */
1252 #define CTR_TMP4                r12     /* tmp for lower 64 bit add & bswap */
1253 #define CTR_TMP5                r13     /* tmp for lower 64 bit add & bswap */
1254 #define CTR_TMP6                r14     /* tmp for lower 64 bit add & bswap */
1255 #define CTR_TMP7                r15     /* tmp for lower 64 bit add & bswap */
1256 
1257 /*
1258  * These are used to stage CTR encryption input that is unaligned before
1259  * XORing. They must not overlap with any STATE[0-7] register.
1260  */
1261 #define TMP_INPUT0      xmm0
1262 #define TMP_INPUT1      xmm1
1263 #define TMP_INPUT2      xmm2
1264 #define TMP_INPUT3      xmm3
1265 #define TMP_INPUT4      xmm4
1266 #define TMP_INPUT5      xmm5
1267 #define TMP_INPUT6      xmm6
1268 #define TMP_INPUT7      xmm7
1269 
1270 /*
1271  * void aes_ctr_intel8(const uint32_t roundkeys[], int numrounds,
1272  *      const void *input, void *output, uint64_t counter_upper_BE,
1273  *      uint64_t counter_lower_LE)
1274  *
1275  * Runs AES on 8 consecutive blocks in counter mode (encryption and
1276  * decryption in counter mode are the same).
1277  */
1278 ENTRY_NP(aes_ctr_intel8)
1279         /* save caller's regs */
1280         pushq   %rbp
1281         movq    %rsp, %rbp
1282         subq    $0x38, %rsp
1283         / CTR_TMP0 is rax, no need to save
1284         movq    %CTR_TMP1, -0x38(%rbp)
1285         movq    %CTR_TMP2, -0x30(%rbp)
1286         movq    %CTR_TMP3, -0x28(%rbp)
1287         movq    %CTR_TMP4, -0x20(%rbp)
1288         movq    %CTR_TMP5, -0x18(%rbp)
1289         movq    %CTR_TMP6, -0x10(%rbp)
1290         movq    %CTR_TMP7, -0x08(%rbp)
1291 
1292         /*
1293          * CTR step 1: prepare big-endian formatted 128-bit counter values,
1294          * placing the result in the AES-NI input state registers.
1295          */
1296         movq    %CTR_UPPER_BE, %CTR_UPPER_BE_XMM
1297         PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 0, CTR_TMP0, STATE0)
1298         PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 1, CTR_TMP1, STATE1)
1299         PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 2, CTR_TMP2, STATE2)
1300         PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 3, CTR_TMP3, STATE3)
1301         PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 4, CTR_TMP4, STATE4)
1302         PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 5, CTR_TMP5, STATE5)
1303         PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 6, CTR_TMP6, STATE6)
1304         PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 7, CTR_TMP7, STATE7)
1305 
1306         /*
1307          * CTR step 2: Encrypt the counters.
1308          */
1309         movaps  (%KEYP), %KEY           / key
1310         AES_XOR_STATE_8BLOCKS           / round 0
1311 
1312         /* Determine the AES variant we're going to compute */
1313         lea     0x30(%KEYP), %KEYP      / point to key schedule
1314         cmp     $12, %NROUNDS           / determine AES variant
1315         jb      .Lctr8_128
1316         lea     0x20(%KEYP), %KEYP      / AES192 has larger key schedule
1317         je      .Lctr8_192
1318 
1319         /* AES 256 */
1320         lea     0x20(%KEYP), %KEYP      / AES256 has even larger key schedule
1321         AES_RND_8BLOCKS(aesenc, -0x60)  / AES256 R.1
1322         AES_RND_8BLOCKS(aesenc, -0x50)  / AES256 R.2
1323 
1324 .align 4
1325 .Lctr8_192:
1326         /* AES 192 and 256 */
1327         AES_RND_8BLOCKS(aesenc, -0x40)  / AES192 R.1; AES256 R.3
1328         AES_RND_8BLOCKS(aesenc, -0x30)  / AES192 R.2; AES256 R.4
1329 
1330 .align 4
1331 .Lctr8_128:
1332         /* AES 128, 192, and 256 */
1333         AES_RND_8BLOCKS(aesenc, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
1334         AES_RND_8BLOCKS(aesenc, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
1335         AES_RND_8BLOCKS(aesenc, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
1336         AES_RND_8BLOCKS(aesenc, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
1337         AES_RND_8BLOCKS(aesenc, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
1338         AES_RND_8BLOCKS(aesenc, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
1339         AES_RND_8BLOCKS(aesenc, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
1340         AES_RND_8BLOCKS(aesenc, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
1341         AES_RND_8BLOCKS(aesenc, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
1342         AES_RND_8BLOCKS(aesenclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
1343 
1344         /*
1345          * CTR step 3: XOR input data blocks with encrypted counters to
1346          * produce result.
1347          */
1348         mov     %INP, %rax              / pxor requires alignment, so check
1349         andq    $0xf, %rax
1350         jnz     .Lctr_input_unaligned
1351         pxor    0x00(%INP), %STATE0
1352         pxor    0x10(%INP), %STATE1
1353         pxor    0x20(%INP), %STATE2
1354         pxor    0x30(%INP), %STATE3
1355         pxor    0x40(%INP), %STATE4
1356         pxor    0x50(%INP), %STATE5
1357         pxor    0x60(%INP), %STATE6
1358         pxor    0x70(%INP), %STATE7
1359         jmp     .Lctr_out
1360 
1361 .align 4
1362 .Lctr_input_unaligned:
1363         movdqu  0x00(%INP), %TMP_INPUT0
1364         movdqu  0x10(%INP), %TMP_INPUT1
1365         movdqu  0x20(%INP), %TMP_INPUT2
1366         movdqu  0x30(%INP), %TMP_INPUT3
1367         movdqu  0x40(%INP), %TMP_INPUT4
1368         movdqu  0x50(%INP), %TMP_INPUT5
1369         movdqu  0x60(%INP), %TMP_INPUT6
1370         movdqu  0x70(%INP), %TMP_INPUT7
1371         pxor    %TMP_INPUT0, %STATE0
1372         pxor    %TMP_INPUT1, %STATE1
1373         pxor    %TMP_INPUT2, %STATE2
1374         pxor    %TMP_INPUT3, %STATE3
1375         pxor    %TMP_INPUT4, %STATE4
1376         pxor    %TMP_INPUT5, %STATE5
1377         pxor    %TMP_INPUT6, %STATE6
1378         pxor    %TMP_INPUT7, %STATE7
1379 
1380 .align 4
1381 .Lctr_out:
1382         /*
1383          * CTR step 4: Write out processed blocks to memory.
1384          */
1385         movdqu  %STATE0, 0x00(%OUTP)
1386         movdqu  %STATE1, 0x10(%OUTP)
1387         movdqu  %STATE2, 0x20(%OUTP)
1388         movdqu  %STATE3, 0x30(%OUTP)
1389         movdqu  %STATE4, 0x40(%OUTP)
1390         movdqu  %STATE5, 0x50(%OUTP)
1391         movdqu  %STATE6, 0x60(%OUTP)
1392         movdqu  %STATE7, 0x70(%OUTP)
1393 
1394         /* restore caller's regs */
1395         / CTR_TMP0 is rax, no need to restore
1396         movq    -0x38(%rbp), %CTR_TMP1
1397         movq    -0x30(%rbp), %CTR_TMP2
1398         movq    -0x28(%rbp), %CTR_TMP3
1399         movq    -0x20(%rbp), %CTR_TMP4
1400         movq    -0x18(%rbp), %CTR_TMP5
1401         movq    -0x10(%rbp), %CTR_TMP6
1402         movq    -0x08(%rbp), %CTR_TMP7
1403         leave
1404         ret
1405         SET_SIZE(aes_ctr_intel8)
1406 
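
A hedged usage sketch: CTR-processing a buffer whose length is a multiple of
128 bytes, in 8-block strides (encryption and decryption are the same
operation; the caller owns counter-overflow checks, as noted above):

static void
example_ctr(const aes_key_t *key, const uint8_t *in, uint8_t *out,
    size_t len, uint64_t nonce_be, uint64_t ctr_le)
{
        for (size_t off = 0; off < len; off += 128, ctr_le += 8)
                aes_ctr_intel8(key->encr_ks.ks32, key->nr,
                    in + off, out + off, nonce_be, ctr_le);
}
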
1407 #endif  /* lint || __lint */