4896 Performance improvements for KCF AES modes

          --- old/usr/src/common/crypto/aes/amd64/aes_intel.s
          +++ new/usr/src/common/crypto/aes/amd64/aes_intel.s
(140 lines elided)
 141  141   *      } aes_key_t;
 142  142   *
 143  143   * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
 144  144   * ct is crypto text, and MAX_AES_NR is 14.
 145  145   * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
 146  146   *
 147  147   * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary.
 148  148   *
 149  149   * ====================================================================
 150  150   */
      151 +/*
      152 + * Copyright 2015 by Saso Kiselkov. All rights reserved.
      153 + */
 151  154  
 152  155  #if defined(lint) || defined(__lint)
 153  156  
 154  157  #include <sys/types.h>
 155  158  
 156  159  /* ARGSUSED */
 157  160  void
 158  161  aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
 159  162      uint32_t ct[4]) {
 160  163  }
(113 lines elided)
 274  277          movaps  64(%rsp), %xmm2; \
 275  278          movaps  80(%rsp), %xmm1; \
 276  279          movaps  96(%rsp), %xmm0; \
 277  280          jmp     2f; \
 278  281  1: \
 279  282          STTS(tmpreg); \
 280  283  2: \
 281  284          mov     %rbp, %rsp; \
 282  285          pop     %rbp
 283  286  
      287 +/*
      288 + * void aes_accel_save(void *savestate);
      289 + *
       290 + * Saves CR0 and, if the FPU was in use (CR0.TS clear), all 16 XMM
       291 + * registers to the save area pointed to by the first argument, then
       292 + * clears TS in CR0. This must be invoked before executing any floating
       293 + * point operations inside the kernel (and kernel thread preemption
       294 + * must be disabled as well). The memory region to which all state is
       295 + * saved must be at least 16x 128-bit + 64-bit long and 128-bit aligned.
      296 + */
      297 +ENTRY_NP(aes_accel_save)
      298 +        movq    %cr0, %rax
      299 +        movq    %rax, 0x100(%rdi)
      300 +        testq   $CR0_TS, %rax
      301 +        jnz     1f
      302 +        movaps  %xmm0, 0x00(%rdi)
      303 +        movaps  %xmm1, 0x10(%rdi)
      304 +        movaps  %xmm2, 0x20(%rdi)
      305 +        movaps  %xmm3, 0x30(%rdi)
      306 +        movaps  %xmm4, 0x40(%rdi)
      307 +        movaps  %xmm5, 0x50(%rdi)
      308 +        movaps  %xmm6, 0x60(%rdi)
      309 +        movaps  %xmm7, 0x70(%rdi)
      310 +        movaps  %xmm8, 0x80(%rdi)
      311 +        movaps  %xmm9, 0x90(%rdi)
      312 +        movaps  %xmm10, 0xa0(%rdi)
      313 +        movaps  %xmm11, 0xb0(%rdi)
      314 +        movaps  %xmm12, 0xc0(%rdi)
      315 +        movaps  %xmm13, 0xd0(%rdi)
      316 +        movaps  %xmm14, 0xe0(%rdi)
      317 +        movaps  %xmm15, 0xf0(%rdi)
      318 +        ret
      319 +1:
      320 +        PROTECTED_CLTS
      321 +        ret
      322 +        SET_SIZE(aes_accel_save)
 284  323  
      324 +/*
      325 + * void aes_accel_restore(void *savestate);
      326 + *
       327 + * Restores the XMM and CR0.TS state saved by aes_accel_save().
      328 + */
      329 +ENTRY_NP(aes_accel_restore)
      330 +        mov     0x100(%rdi), %rax
      331 +        testq   $CR0_TS, %rax
      332 +        jnz     1f
      333 +        movaps  0x00(%rdi), %xmm0
      334 +        movaps  0x10(%rdi), %xmm1
      335 +        movaps  0x20(%rdi), %xmm2
      336 +        movaps  0x30(%rdi), %xmm3
      337 +        movaps  0x40(%rdi), %xmm4
      338 +        movaps  0x50(%rdi), %xmm5
      339 +        movaps  0x60(%rdi), %xmm6
      340 +        movaps  0x70(%rdi), %xmm7
      341 +        movaps  0x80(%rdi), %xmm8
      342 +        movaps  0x90(%rdi), %xmm9
      343 +        movaps  0xa0(%rdi), %xmm10
      344 +        movaps  0xb0(%rdi), %xmm11
      345 +        movaps  0xc0(%rdi), %xmm12
      346 +        movaps  0xd0(%rdi), %xmm13
      347 +        movaps  0xe0(%rdi), %xmm14
      348 +        movaps  0xf0(%rdi), %xmm15
      349 +        ret
      350 +1:
      351 +        STTS(%rax)
      352 +        ret
      353 +        SET_SIZE(aes_accel_restore)
      354 +
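
A minimal caller sketch of the save/restore bracketing described above. This is a sketch only: aes_encrypt_one_block() and the aes_accel_state_t layout are hypothetical (the layout simply follows the save-area description in the aes_accel_save() comment), while kpreempt_disable()/kpreempt_enable() are the standard kernel preemption controls mentioned there.

        /*
         * Hypothetical wrapper: bracket kernel FPU use around the
         * AES-NI entry points.
         */
        typedef struct aes_accel_state {
                uint8_t         xmm[16][16];    /* XMM0-XMM15 save slots */
                uint64_t        cr0;            /* saved CR0 (offset 0x100) */
        } __attribute__((aligned(16))) aes_accel_state_t;

        static void
        aes_encrypt_one_block(const uint32_t rk[], int Nr,
            const uint32_t pt[4], uint32_t ct[4])
        {
                aes_accel_state_t st;

                kpreempt_disable();     /* stay on-CPU while FPU is live */
                aes_accel_save(&st);
                aes_encrypt_intel(rk, Nr, pt, ct);
                aes_accel_restore(&st);
                kpreempt_enable();
        }
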
 285  355  #else
 286  356  #define PROTECTED_CLTS
 287  357  #define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
 288  358  #define SET_TS_OR_POP_XMM0_XMM1(tmpreg)
 289  359  #define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)
 290  360  #define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg)
 291  361  #endif  /* _KERNEL */
 292  362  
 293  363  
 294  364  /*
(76 lines elided)
 371  441          shufps  $0b00010000, %xmm2, %xmm4
 372  442          pxor    %xmm4, %xmm2
 373  443          shufps  $0b10001100, %xmm2, %xmm4
 374  444          pxor    %xmm4, %xmm2
 375  445          pxor    %xmm1, %xmm2
 376  446          movaps  %xmm2, (%rcx)
 377  447          add     $0x10, %rcx
 378  448          ret
 379  449          SET_SIZE(_key_expansion_256b)
 380  450  
      451 +/*
      452 + * void aes_copy_intel(const uint8_t *src, uint8_t *dst);
      453 + *
      454 + * Copies one unaligned 128-bit block from `src' to `dst'. The copy is
      455 + * performed using FPU registers, so make sure FPU state is saved when
      456 + * running this in the kernel.
      457 + */
      458 +ENTRY_NP(aes_copy_intel)
      459 +        movdqu  (%rdi), %xmm0
      460 +        movdqu  %xmm0, (%rsi)
      461 +        ret
      462 +        SET_SIZE(aes_copy_intel)
 381  463  
 382  464  /*
      465 + * void aes_xor_intel(const uint8_t *src, uint8_t *dst);
      466 + *
      467 + * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
      468 + * stores the result at `dst'. The XOR is performed using FPU registers,
      469 + * so make sure FPU state is saved when running this in the kernel.
      470 + */
      471 +ENTRY_NP(aes_xor_intel)
      472 +        movdqu  (%rdi), %xmm0
      473 +        movdqu  (%rsi), %xmm1
      474 +        pxor    %xmm1, %xmm0
      475 +        movdqu  %xmm0, (%rsi)
      476 +        ret
      477 +        SET_SIZE(aes_xor_intel)
      478 +
      479 +/*
      480 + * void aes_xor_intel8(const uint8_t *src, uint8_t *dst);
      481 + *
      482 + * XORs eight pairs of consecutive unaligned 128-bit blocks from `src' and
       483 + * `dst' and stores the results at `dst'. The XOR is performed using FPU
      484 + * registers, so make sure FPU state is saved when running this in the kernel.
      485 + */
      486 +ENTRY_NP(aes_xor_intel8)
      487 +        movdqu  0x00(%rdi), %xmm0
      488 +        movdqu  0x00(%rsi), %xmm1
      489 +        movdqu  0x10(%rdi), %xmm2
      490 +        movdqu  0x10(%rsi), %xmm3
      491 +        movdqu  0x20(%rdi), %xmm4
      492 +        movdqu  0x20(%rsi), %xmm5
      493 +        movdqu  0x30(%rdi), %xmm6
      494 +        movdqu  0x30(%rsi), %xmm7
      495 +        movdqu  0x40(%rdi), %xmm8
      496 +        movdqu  0x40(%rsi), %xmm9
      497 +        movdqu  0x50(%rdi), %xmm10
      498 +        movdqu  0x50(%rsi), %xmm11
      499 +        movdqu  0x60(%rdi), %xmm12
      500 +        movdqu  0x60(%rsi), %xmm13
      501 +        movdqu  0x70(%rdi), %xmm14
      502 +        movdqu  0x70(%rsi), %xmm15
      503 +        pxor    %xmm1, %xmm0
      504 +        pxor    %xmm3, %xmm2
      505 +        pxor    %xmm5, %xmm4
      506 +        pxor    %xmm7, %xmm6
      507 +        pxor    %xmm9, %xmm8
      508 +        pxor    %xmm11, %xmm10
      509 +        pxor    %xmm13, %xmm12
      510 +        pxor    %xmm15, %xmm14
      511 +        movdqu  %xmm0, 0x00(%rsi)
      512 +        movdqu  %xmm2, 0x10(%rsi)
      513 +        movdqu  %xmm4, 0x20(%rsi)
      514 +        movdqu  %xmm6, 0x30(%rsi)
      515 +        movdqu  %xmm8, 0x40(%rsi)
      516 +        movdqu  %xmm10, 0x50(%rsi)
      517 +        movdqu  %xmm12, 0x60(%rsi)
      518 +        movdqu  %xmm14, 0x70(%rsi)
      519 +        ret
      520 +        SET_SIZE(aes_xor_intel8)
      521 +
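
For reference, a byte-level C sketch of what aes_xor_intel8() computes; the assembly does the same with unaligned 128-bit XMM loads and stores (aes_xor8_ref() is a hypothetical name):

        /* Sketch: dst[i] ^= src[i] over eight consecutive 16-byte blocks. */
        static void
        aes_xor8_ref(const uint8_t *src, uint8_t *dst)
        {
                int i;

                for (i = 0; i < 8 * 16; i++)
                        dst[i] ^= src[i];
        }
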
      522 +/*
 383  523   * rijndael_key_setup_enc_intel()
 384  524   * Expand the cipher key into the encryption key schedule.
 385  525   *
 386  526   * For kernel code, caller is responsible for ensuring kpreempt_disable()
 387  527   * has been called.  This is because %xmm registers are not saved/restored.
 388  528   * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
 389  529   * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
 390  530   * on the stack.
 391  531   *
 392  532   * OpenSolaris interface:
(259 lines elided)
 652  792  
 653  793          SET_TS_OR_POP_XMM0_XMM1(%r10)
 654  794  
 655  795  .Ldec_key_exit:
 656  796          / OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
 657  797          / OpenSSL: rax = 0 for OK, or non-zero for error
 658  798          ret
 659  799          SET_SIZE(rijndael_key_setup_dec_intel)
 660  800  
 661  801  
 662      -/*
 663      - * aes_encrypt_intel()
 664      - * Encrypt a single block (in and out can overlap).
 665      - *
 666      - * For kernel code, caller is responsible for ensuring kpreempt_disable()
 667      - * has been called.  This is because %xmm registers are not saved/restored.
 668      - * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
 669      - * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
 670      - * on the stack.
 671      - *
 672      - * Temporary register usage:
 673      - * %xmm0        State
 674      - * %xmm1        Key
 675      - *
 676      - * Original OpenSolaris Interface:
 677      - * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
 678      - *      const uint32_t pt[4], uint32_t ct[4])
 679      - *
 680      - * Original Intel OpenSSL Interface:
 681      - * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
 682      - *      const AES_KEY *key)
 683      - */
 684      -
 685  802  #ifdef  OPENSSL_INTERFACE
 686  803  #define aes_encrypt_intel       intel_AES_encrypt
 687  804  #define aes_decrypt_intel       intel_AES_decrypt
 688  805  
 689  806  #define INP             rdi     /* P1, 64 bits */
 690  807  #define OUTP            rsi     /* P2, 64 bits */
 691  808  #define KEYP            rdx     /* P3, 64 bits */
 692  809  
 693  810  /* No NROUNDS parameter--offset 240 from KEYP saved in %ecx:  */
 694  811  #define NROUNDS32       ecx     /* temporary, 32 bits */
 695  812  #define NROUNDS         cl      /* temporary,  8 bits */
 696  813  
 697  814  #else   /* OpenSolaris Interface */
 698  815  #define KEYP            rdi     /* P1, 64 bits */
 699  816  #define NROUNDS         esi     /* P2, 32 bits */
 700  817  #define INP             rdx     /* P3, 64 bits */
 701  818  #define OUTP            rcx     /* P4, 64 bits */
      819 +#define LENGTH          r8      /* P5, 64 bits */
 702  820  #endif  /* OPENSSL_INTERFACE */
 703  821  
 704      -#define STATE           xmm0    /* temporary, 128 bits */
 705      -#define KEY             xmm1    /* temporary, 128 bits */
      822 +#define KEY             xmm0    /* temporary, 128 bits */
      823 +#define STATE0          xmm8    /* temporary, 128 bits */
      824 +#define STATE1          xmm9    /* temporary, 128 bits */
      825 +#define STATE2          xmm10   /* temporary, 128 bits */
      826 +#define STATE3          xmm11   /* temporary, 128 bits */
      827 +#define STATE4          xmm12   /* temporary, 128 bits */
      828 +#define STATE5          xmm13   /* temporary, 128 bits */
      829 +#define STATE6          xmm14   /* temporary, 128 bits */
      830 +#define STATE7          xmm15   /* temporary, 128 bits */
 706  831  
 707      -ENTRY_NP(aes_encrypt_intel)
 708      -        CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
      832 +/*
      833 + * Runs the first two rounds of AES256 on a state register. `op' should be
      834 + * aesenc or aesdec.
      835 + */
      836 +#define AES256_ROUNDS(op, statereg)     \
      837 +        movaps  -0x60(%KEYP), %KEY;     \
      838 +        op      %KEY, %statereg;        \
      839 +        movaps  -0x50(%KEYP), %KEY;     \
      840 +        op      %KEY, %statereg
 709  841  
 710      -        movups  (%INP), %STATE                  / input
      842 +/*
       843 + * Runs the first two rounds of AES192, or the 3rd & 4th rounds of AES256,
      844 + * a state register. `op' should be aesenc or aesdec.
      845 + */
      846 +#define AES192_ROUNDS(op, statereg)     \
      847 +        movaps  -0x40(%KEYP), %KEY;     \
      848 +        op      %KEY, %statereg;        \
      849 +        movaps  -0x30(%KEYP), %KEY;     \
      850 +        op      %KEY, %statereg
      851 +
      852 +/*
      853 + * Runs the full 10 rounds of AES128, or the last 10 rounds of AES192/AES256
      854 + * on a state register. `op' should be aesenc or aesdec and `lastop' should
      855 + * be aesenclast or aesdeclast.
      856 + */
      857 +#define AES128_ROUNDS(op, lastop, statereg) \
      858 +        movaps  -0x20(%KEYP), %KEY;     \
      859 +        op      %KEY, %statereg;        \
      860 +        movaps  -0x10(%KEYP), %KEY;     \
      861 +        op      %KEY, %statereg;        \
      862 +        movaps  (%KEYP), %KEY;          \
      863 +        op      %KEY, %statereg;        \
      864 +        movaps  0x10(%KEYP), %KEY;      \
      865 +        op      %KEY, %statereg;        \
      866 +        movaps  0x20(%KEYP), %KEY;      \
      867 +        op      %KEY, %statereg;        \
      868 +        movaps  0x30(%KEYP), %KEY;      \
      869 +        op      %KEY, %statereg;        \
      870 +        movaps  0x40(%KEYP), %KEY;      \
      871 +        op      %KEY, %statereg;        \
      872 +        movaps  0x50(%KEYP), %KEY;      \
      873 +        op      %KEY, %statereg;        \
      874 +        movaps  0x60(%KEYP), %KEY;      \
      875 +        op      %KEY, %statereg;        \
      876 +        movaps  0x70(%KEYP), %KEY;      \
      877 +        lastop  %KEY, %statereg
      878 +
      879 +/*
       880 + * Macros to run AES encryption rounds. The input must be prefilled in the
       881 + * state register; the output will be left there as well.
      882 + * To run AES256, invoke all of these macros in sequence. To run AES192,
      883 + * invoke only the -192 and -128 variants. To run AES128, invoke only the
      884 + * -128 variant.
      885 + */
      886 +#define AES256_ENC_ROUNDS(statereg) \
      887 +        AES256_ROUNDS(aesenc, statereg)
      888 +#define AES192_ENC_ROUNDS(statereg) \
      889 +        AES192_ROUNDS(aesenc, statereg)
      890 +#define AES128_ENC_ROUNDS(statereg) \
      891 +        AES128_ROUNDS(aesenc, aesenclast, statereg)
      892 +
      893 +/* Same as the AES*_ENC_ROUNDS macros, but for decryption. */
      894 +#define AES256_DEC_ROUNDS(statereg) \
      895 +        AES256_ROUNDS(aesdec, statereg)
      896 +#define AES192_DEC_ROUNDS(statereg) \
      897 +        AES192_ROUNDS(aesdec, statereg)
      898 +#define AES128_DEC_ROUNDS(statereg) \
      899 +        AES128_ROUNDS(aesdec, aesdeclast, statereg)
      900 +
      901 +
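The composition rule these macros encode (all three variants in sequence for AES256, the -192 and -128 ones for AES192, only the -128 one for AES128) mirrors the standard AES round loop. A C-level sketch, where block_t, block_xor(), aes_round() and aes_lastround() are hypothetical stand-ins for a 128-bit block, pxor, aesenc and aesenclast:

        /* Sketch of the round structure; ks[0..Nr] are 128-bit round keys. */
        static block_t
        aes_encrypt_block_ref(const block_t ks[], int Nr, block_t state)
        {
                int i;

                state = block_xor(state, ks[0]);        /* round 0 */
                for (i = 1; i < Nr; i++)
                        state = aes_round(state, ks[i]); /* aesenc */
                return (aes_lastround(state, ks[Nr]));  /* aesenclast */
        }
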
      902 +/*
      903 + * aes_encrypt_intel()
      904 + * Encrypt a single block (in and out can overlap).
      905 + *
       906 + * For kernel code, the caller is responsible for disabling kernel thread
       907 + * preemption and bracketing this call with aes_accel_save()/aes_accel_restore().
      908 + *
      909 + * Temporary register usage:
      910 + * %xmm0        Key
      911 + * %xmm8        State
      912 + *
      913 + * Original OpenSolaris Interface:
      914 + * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
      915 + *      const uint32_t pt[4], uint32_t ct[4])
      916 + *
      917 + * Original Intel OpenSSL Interface:
      918 + * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
      919 + *      const AES_KEY *key)
      920 + */
      921 +ENTRY_NP(aes_encrypt_intel)
      922 +        movups  (%INP), %STATE0                 / input
 711  923          movaps  (%KEYP), %KEY                   / key
      924 +
 712  925  #ifdef  OPENSSL_INTERFACE
 713  926          mov     240(%KEYP), %NROUNDS32          / round count
 714  927  #else   /* OpenSolaris Interface */
 715  928          /* Round count is already present as P2 in %rsi/%esi */
 716  929  #endif  /* OPENSSL_INTERFACE */
 717  930  
 718      -        pxor    %KEY, %STATE                    / round 0
      931 +        pxor    %KEY, %STATE0                   / round 0
 719  932          lea     0x30(%KEYP), %KEYP
 720  933          cmp     $12, %NROUNDS
 721  934          jb      .Lenc128
 722  935          lea     0x20(%KEYP), %KEYP
 723  936          je      .Lenc192
 724  937  
 725  938          / AES 256
 726  939          lea     0x20(%KEYP), %KEYP
 727      -        movaps  -0x60(%KEYP), %KEY
 728      -        aesenc  %KEY, %STATE
 729      -        movaps  -0x50(%KEYP), %KEY
 730      -        aesenc  %KEY, %STATE
      940 +        AES256_ENC_ROUNDS(STATE0)
 731  941  
 732  942  .align 4
 733  943  .Lenc192:
 734  944          / AES 192 and 256
 735      -        movaps  -0x40(%KEYP), %KEY
 736      -        aesenc  %KEY, %STATE
 737      -        movaps  -0x30(%KEYP), %KEY
 738      -        aesenc  %KEY, %STATE
      945 +        AES192_ENC_ROUNDS(STATE0)
 739  946  
 740  947  .align 4
 741  948  .Lenc128:
 742  949          / AES 128, 192, and 256
 743      -        movaps  -0x20(%KEYP), %KEY
 744      -        aesenc  %KEY, %STATE
 745      -        movaps  -0x10(%KEYP), %KEY
 746      -        aesenc  %KEY, %STATE
 747      -        movaps  (%KEYP), %KEY
 748      -        aesenc  %KEY, %STATE
 749      -        movaps  0x10(%KEYP), %KEY
 750      -        aesenc  %KEY, %STATE
 751      -        movaps  0x20(%KEYP), %KEY
 752      -        aesenc  %KEY, %STATE
 753      -        movaps  0x30(%KEYP), %KEY
 754      -        aesenc  %KEY, %STATE
 755      -        movaps  0x40(%KEYP), %KEY
 756      -        aesenc  %KEY, %STATE
 757      -        movaps  0x50(%KEYP), %KEY
 758      -        aesenc  %KEY, %STATE
 759      -        movaps  0x60(%KEYP), %KEY
 760      -        aesenc  %KEY, %STATE
 761      -        movaps  0x70(%KEYP), %KEY
 762      -        aesenclast       %KEY, %STATE           / last round
 763      -        movups  %STATE, (%OUTP)                 / output
      950 +        AES128_ENC_ROUNDS(STATE0)
      951 +        movups  %STATE0, (%OUTP)                / output
 764  952  
 765      -        SET_TS_OR_POP_XMM0_XMM1(%r10)
 766  953          ret
 767  954          SET_SIZE(aes_encrypt_intel)
 768  955  
 769      -
 770  956  /*
 771  957   * aes_decrypt_intel()
 772  958   * Decrypt a single block (in and out can overlap).
 773  959   *
 774      - * For kernel code, caller is responsible for ensuring kpreempt_disable()
 775      - * has been called.  This is because %xmm registers are not saved/restored.
 776      - * Clear and set the CR0.TS bit on entry and exit, respectively,  if TS is set
 777      - * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
 778      - * on the stack.
       960 + * For kernel code, the caller is responsible for disabling kernel thread
       961 + * preemption and bracketing this call with aes_accel_save()/aes_accel_restore().
 779  962   *
 780  963   * Temporary register usage:
 781  964   * %xmm0        Key
 782  965   * %xmm8        State
 783  966   *
 784  967   * Original OpenSolaris Interface:
 785  968   * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
 786      - *      const uint32_t pt[4], uint32_t ct[4])/
      969 + *      const uint32_t pt[4], uint32_t ct[4])
 787  970   *
 788  971   * Original Intel OpenSSL Interface:
 789  972   * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
 790  973   *      const AES_KEY *key);
 791  974   */
 792  975  ENTRY_NP(aes_decrypt_intel)
 793      -        CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
 794      -
 795      -        movups  (%INP), %STATE                  / input
      976 +        movups  (%INP), %STATE0                 / input
 796  977          movaps  (%KEYP), %KEY                   / key
      978 +
 797  979  #ifdef  OPENSSL_INTERFACE
 798  980          mov     240(%KEYP), %NROUNDS32          / round count
 799  981  #else   /* OpenSolaris Interface */
 800  982          /* Round count is already present as P2 in %rsi/%esi */
 801  983  #endif  /* OPENSSL_INTERFACE */
 802  984  
 803      -        pxor    %KEY, %STATE                    / round 0
      985 +        pxor    %KEY, %STATE0                   / round 0
 804  986          lea     0x30(%KEYP), %KEYP
 805  987          cmp     $12, %NROUNDS
 806  988          jb      .Ldec128
 807  989          lea     0x20(%KEYP), %KEYP
 808  990          je      .Ldec192
 809  991  
 810  992          / AES 256
 811  993          lea     0x20(%KEYP), %KEYP
 812      -        movaps  -0x60(%KEYP), %KEY
 813      -        aesdec  %KEY, %STATE
 814      -        movaps  -0x50(%KEYP), %KEY
 815      -        aesdec  %KEY, %STATE
      994 +        AES256_DEC_ROUNDS(STATE0)
 816  995  
 817  996  .align 4
 818  997  .Ldec192:
 819  998          / AES 192 and 256
 820      -        movaps  -0x40(%KEYP), %KEY
 821      -        aesdec  %KEY, %STATE
 822      -        movaps  -0x30(%KEYP), %KEY
 823      -        aesdec  %KEY, %STATE
      999 +        AES192_DEC_ROUNDS(STATE0)
 824 1000  
 825 1001  .align 4
 826 1002  .Ldec128:
 827 1003          / AES 128, 192, and 256
 828      -        movaps  -0x20(%KEYP), %KEY
 829      -        aesdec  %KEY, %STATE
 830      -        movaps  -0x10(%KEYP), %KEY
 831      -        aesdec  %KEY, %STATE
 832      -        movaps  (%KEYP), %KEY
 833      -        aesdec  %KEY, %STATE
 834      -        movaps  0x10(%KEYP), %KEY
 835      -        aesdec  %KEY, %STATE
 836      -        movaps  0x20(%KEYP), %KEY
 837      -        aesdec  %KEY, %STATE
 838      -        movaps  0x30(%KEYP), %KEY
 839      -        aesdec  %KEY, %STATE
 840      -        movaps  0x40(%KEYP), %KEY
 841      -        aesdec  %KEY, %STATE
 842      -        movaps  0x50(%KEYP), %KEY
 843      -        aesdec  %KEY, %STATE
 844      -        movaps  0x60(%KEYP), %KEY
 845      -        aesdec  %KEY, %STATE
 846      -        movaps  0x70(%KEYP), %KEY
 847      -        aesdeclast      %KEY, %STATE            / last round
 848      -        movups  %STATE, (%OUTP)                 / output
     1004 +        AES128_DEC_ROUNDS(STATE0)
     1005 +        movups  %STATE0, (%OUTP)                / output
 849 1006  
 850      -        SET_TS_OR_POP_XMM0_XMM1(%r10)
 851 1007          ret
 852 1008          SET_SIZE(aes_decrypt_intel)
 853 1009  
     1010 +/* Does a pipelined load of eight input blocks into our AES state registers. */
     1011 +#define AES_LOAD_INPUT_8BLOCKS          \
     1012 +        movups  0x00(%INP), %STATE0;    \
     1013 +        movups  0x10(%INP), %STATE1;    \
     1014 +        movups  0x20(%INP), %STATE2;    \
     1015 +        movups  0x30(%INP), %STATE3;    \
     1016 +        movups  0x40(%INP), %STATE4;    \
     1017 +        movups  0x50(%INP), %STATE5;    \
     1018 +        movups  0x60(%INP), %STATE6;    \
     1019 +        movups  0x70(%INP), %STATE7;
     1020 +
     1021 +/* Does a pipelined store of eight AES state registers to the output. */
     1022 +#define AES_STORE_OUTPUT_8BLOCKS        \
     1023 +        movups  %STATE0, 0x00(%OUTP);   \
     1024 +        movups  %STATE1, 0x10(%OUTP);   \
     1025 +        movups  %STATE2, 0x20(%OUTP);   \
     1026 +        movups  %STATE3, 0x30(%OUTP);   \
     1027 +        movups  %STATE4, 0x40(%OUTP);   \
     1028 +        movups  %STATE5, 0x50(%OUTP);   \
     1029 +        movups  %STATE6, 0x60(%OUTP);   \
     1030 +        movups  %STATE7, 0x70(%OUTP);
     1031 +
     1032 +/* Performs a pipelined AES instruction with the key on all state registers. */
     1033 +#define AES_KEY_STATE_OP_8BLOCKS(op)    \
     1034 +        op      %KEY, %STATE0;          \
     1035 +        op      %KEY, %STATE1;          \
     1036 +        op      %KEY, %STATE2;          \
     1037 +        op      %KEY, %STATE3;          \
     1038 +        op      %KEY, %STATE4;          \
     1039 +        op      %KEY, %STATE5;          \
     1040 +        op      %KEY, %STATE6;          \
     1041 +        op      %KEY, %STATE7
     1042 +
     1043 +/* XOR all AES state regs with key to initiate encryption/decryption. */
     1044 +#define AES_XOR_STATE_8BLOCKS           \
     1045 +        AES_KEY_STATE_OP_8BLOCKS(pxor)
     1046 +
     1047 +/*
     1048 + * Loads a round key from the key schedule offset `off' into the KEY
     1049 + * register and performs `op' using the KEY on all 8 STATE registers.
     1050 + */
     1051 +#define AES_RND_8BLOCKS(op, off)        \
     1052 +        movaps  off(%KEYP), %KEY;       \
     1053 +        AES_KEY_STATE_OP_8BLOCKS(op)
     1054 +
     1055 +/*
     1056 + * void aes_encrypt_intel8(const uint32_t roundkeys[], int numrounds,
     1057 + *      const void *plaintext, void *ciphertext)
     1058 + *
     1059 + * Same as aes_encrypt_intel, but performs the encryption operation on
     1060 + * 8 independent blocks in sequence, exploiting instruction pipelining.
      1061 + * This function doesn't support the OpenSSL interface; it's meant
      1062 + * for kernel use only.
     1063 + */
     1064 +ENTRY_NP(aes_encrypt_intel8)
     1065 +        AES_LOAD_INPUT_8BLOCKS          / load input
     1066 +        movaps  (%KEYP), %KEY           / key
     1067 +        AES_XOR_STATE_8BLOCKS           / round 0
     1068 +
     1069 +        lea     0x30(%KEYP), %KEYP      / point to key schedule
     1070 +        cmp     $12, %NROUNDS           / determine AES variant
     1071 +        jb      .Lenc8_128
     1072 +        lea     0x20(%KEYP), %KEYP      / AES192 has larger key schedule
     1073 +        je      .Lenc8_192
     1074 +
     1075 +        lea     0x20(%KEYP), %KEYP      / AES256 has even larger key schedule
     1076 +        AES_RND_8BLOCKS(aesenc, -0x60)  / AES256 R.1
     1077 +        AES_RND_8BLOCKS(aesenc, -0x50)  / AES256 R.2
     1078 +
     1079 +.align 4
     1080 +.Lenc8_192:
     1081 +        AES_RND_8BLOCKS(aesenc, -0x40)  / AES192 R.1; AES256 R.3
     1082 +        AES_RND_8BLOCKS(aesenc, -0x30)  / AES192 R.2; AES256 R.4
     1083 +
     1084 +.align 4
     1085 +.Lenc8_128:
     1086 +        AES_RND_8BLOCKS(aesenc, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
     1087 +        AES_RND_8BLOCKS(aesenc, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
     1088 +        AES_RND_8BLOCKS(aesenc, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
     1089 +        AES_RND_8BLOCKS(aesenc, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
     1090 +        AES_RND_8BLOCKS(aesenc, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
     1091 +        AES_RND_8BLOCKS(aesenc, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
     1092 +        AES_RND_8BLOCKS(aesenc, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
     1093 +        AES_RND_8BLOCKS(aesenc, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
     1094 +        AES_RND_8BLOCKS(aesenc, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
     1095 +        AES_RND_8BLOCKS(aesenclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
     1096 +
     1097 +        AES_STORE_OUTPUT_8BLOCKS        / store output
     1098 +        ret
     1099 +        SET_SIZE(aes_encrypt_intel8)
     1100 +
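
A hypothetical bulk caller for the 8-block entry point (aes_ecb_encrypt() is an illustrative name, not part of this change; FPU bracketing and preemption control, as described for the single-block functions, are omitted for brevity):

        /*
         * Sketch: full 128-byte chunks go through the pipelined entry
         * point; the single-block function handles the tail.
         */
        static void
        aes_ecb_encrypt(const uint32_t rk[], int Nr, const uint8_t *in,
            uint8_t *out, size_t len)
        {
                for (; len >= 8 * 16; in += 8 * 16, out += 8 * 16,
                    len -= 8 * 16)
                        aes_encrypt_intel8(rk, Nr, in, out);
                for (; len >= 16; in += 16, out += 16, len -= 16)
                        aes_encrypt_intel(rk, Nr, (const uint32_t *)in,
                            (uint32_t *)out);
        }
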
     1101 +
     1102 +/*
     1103 + * void aes_decrypt_intel8(const uint32_t roundkeys[], int numrounds,
     1104 + *      const void *ciphertext, void *plaintext)
     1105 + *
     1106 + * Same as aes_decrypt_intel, but performs the decryption operation on
     1107 + * 8 independent blocks in sequence, exploiting instruction pipelining.
      1108 + * This function doesn't support the OpenSSL interface; it's meant
      1109 + * for kernel use only.
     1110 + */
     1111 +ENTRY_NP(aes_decrypt_intel8)
     1112 +        AES_LOAD_INPUT_8BLOCKS          / load input
     1113 +        movaps  (%KEYP), %KEY           / key
     1114 +        AES_XOR_STATE_8BLOCKS           / round 0
     1115 +
     1116 +        lea     0x30(%KEYP), %KEYP      / point to key schedule
     1117 +        cmp     $12, %NROUNDS           / determine AES variant
     1118 +        jb      .Ldec8_128
     1119 +        lea     0x20(%KEYP), %KEYP      / AES192 has larger key schedule
     1120 +        je      .Ldec8_192
     1121 +
     1122 +        lea     0x20(%KEYP), %KEYP      / AES256 has even larger key schedule
     1123 +        AES_RND_8BLOCKS(aesdec, -0x60)  / AES256 R.1
     1124 +        AES_RND_8BLOCKS(aesdec, -0x50)  / AES256 R.2
     1125 +
     1126 +.align 4
     1127 +.Ldec8_192:
     1128 +        AES_RND_8BLOCKS(aesdec, -0x40)  / AES192 R.1; AES256 R.3
     1129 +        AES_RND_8BLOCKS(aesdec, -0x30)  / AES192 R.2; AES256 R.4
     1130 +
     1131 +.align 4
     1132 +.Ldec8_128:
     1133 +        AES_RND_8BLOCKS(aesdec, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
     1134 +        AES_RND_8BLOCKS(aesdec, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
     1135 +        AES_RND_8BLOCKS(aesdec, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
     1136 +        AES_RND_8BLOCKS(aesdec, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
     1137 +        AES_RND_8BLOCKS(aesdec, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
     1138 +        AES_RND_8BLOCKS(aesdec, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
     1139 +        AES_RND_8BLOCKS(aesdec, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
     1140 +        AES_RND_8BLOCKS(aesdec, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
     1141 +        AES_RND_8BLOCKS(aesdec, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
     1142 +        AES_RND_8BLOCKS(aesdeclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
     1143 +
     1144 +        AES_STORE_OUTPUT_8BLOCKS        / store output
     1145 +        ret
     1146 +        SET_SIZE(aes_decrypt_intel8)
     1147 +
     1148 +
     1149 +/*
      1150 + * This macro encapsulates the entire AES encryption algorithm for a
      1151 + * single block: the input is prefilled in statereg and is replaced
      1152 + * there by the encrypted output. The KEYP register must already point
      1153 + * to the AES128 key schedule (via the "lea 0x30(%KEYP), %KEYP" in the
      1154 + * function entry sequence) so that consecutive invocations of this
      1155 + * macro are supported (KEYP is restored after each invocation).
     1156 + */
     1157 +#define AES_ENC(statereg, label_128, label_192, label_out)      \
     1158 +        cmp     $12, %NROUNDS;                                  \
     1159 +        jb      label_128;                                      \
     1160 +        je      label_192;                                      \
     1161 +        /* AES 256 only */                                      \
     1162 +        lea     0x40(%KEYP), %KEYP;                             \
     1163 +        AES256_ENC_ROUNDS(statereg);                            \
     1164 +        AES192_ENC_ROUNDS(statereg);                            \
     1165 +        AES128_ENC_ROUNDS(statereg);                            \
     1166 +        lea     -0x40(%KEYP), %KEYP;                            \
     1167 +        jmp     label_out;                                      \
     1168 +.align 4;                                                       \
     1169 +label_192:                                                      \
     1170 +        lea     0x20(%KEYP), %KEYP;                             \
     1171 +        /* AES 192 only */                                      \
     1172 +        AES192_ENC_ROUNDS(statereg);                            \
     1173 +        AES128_ENC_ROUNDS(statereg);                            \
     1174 +        lea     -0x20(%KEYP), %KEYP;                            \
     1175 +        jmp     label_out;                                      \
     1176 +.align 4;                                                       \
     1177 +label_128:                                                      \
     1178 +        /* AES 128 only */                                      \
     1179 +        AES128_ENC_ROUNDS(statereg);                            \
     1180 +.align 4;                                                       \
     1181 +label_out:
     1182 +
     1183 +
     1184 +/*
     1185 + * void aes_encrypt_cbc_intel8(const uint32_t roundkeys[], int numrounds,
     1186 + *      const void *plaintext, void *ciphertext, const void *IV)
     1187 + *
      1188 + * Encrypts 8 consecutive AES blocks in CBC mode. Input and output
     1189 + * may overlap. This provides a modest performance boost over invoking
     1190 + * the encryption and XOR in separate functions because we can avoid
     1191 + * copying the ciphertext block to and from memory between encryption
     1192 + * and XOR calls.
     1193 + */
     1194 +#define CBC_IV                  r8      /* input - IV blk pointer */
     1195 +#define CBC_IV_XMM              xmm1    /* tmp IV location for alignment */
     1196 +
     1197 +ENTRY_NP(aes_encrypt_cbc_intel8)
     1198 +        AES_LOAD_INPUT_8BLOCKS          / load input
     1199 +        movaps  (%KEYP), %KEY           / key
     1200 +        AES_XOR_STATE_8BLOCKS           / round 0
     1201 +
     1202 +        lea     0x30(%KEYP), %KEYP      / point to key schedule
     1203 +        movdqu  (%CBC_IV), %CBC_IV_XMM  / load IV from unaligned memory
     1204 +        pxor    %CBC_IV_XMM, %STATE0    / XOR IV with input block and encrypt
     1205 +        AES_ENC(STATE0, .Lenc_cbc_0_128, .Lenc_cbc_0_192, .Lenc_cbc_0_out)
     1206 +        pxor    %STATE0, %STATE1
     1207 +        AES_ENC(STATE1, .Lenc_cbc_1_128, .Lenc_cbc_1_192, .Lenc_cbc_1_out)
     1208 +        pxor    %STATE1, %STATE2
     1209 +        AES_ENC(STATE2, .Lenc_cbc_2_128, .Lenc_cbc_2_192, .Lenc_cbc_2_out)
     1210 +        pxor    %STATE2, %STATE3
     1211 +        AES_ENC(STATE3, .Lenc_cbc_3_128, .Lenc_cbc_3_192, .Lenc_cbc_3_out)
     1212 +        pxor    %STATE3, %STATE4
     1213 +        AES_ENC(STATE4, .Lenc_cbc_4_128, .Lenc_cbc_4_192, .Lenc_cbc_4_out)
     1214 +        pxor    %STATE4, %STATE5
     1215 +        AES_ENC(STATE5, .Lenc_cbc_5_128, .Lenc_cbc_5_192, .Lenc_cbc_5_out)
     1216 +        pxor    %STATE5, %STATE6
     1217 +        AES_ENC(STATE6, .Lenc_cbc_6_128, .Lenc_cbc_6_192, .Lenc_cbc_6_out)
     1218 +        pxor    %STATE6, %STATE7
     1219 +        AES_ENC(STATE7, .Lenc_cbc_7_128, .Lenc_cbc_7_192, .Lenc_cbc_7_out)
     1220 +
     1221 +        AES_STORE_OUTPUT_8BLOCKS        / store output
     1222 +        ret
     1223 +        SET_SIZE(aes_encrypt_cbc_intel8)
     1224 +
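The chaining this function implements, as a C-level sketch (aes_encrypt_block_ref() and block_xor() are the hypothetical helpers from the earlier sketch). Because each ciphertext block feeds the next block's XOR, CBC encryption cannot be pipelined across blocks; fusing the XOR with the rounds only saves the memory round-trips, hence the modest boost noted above:

        /* Sketch: C[i] = E(P[i] ^ C[i-1]), with C[-1] = IV. */
        static void
        aes_cbc_encrypt8_ref(const block_t ks[], int Nr, const block_t in[8],
            block_t out[8], block_t iv)
        {
                int i;

                for (i = 0; i < 8; i++) {
                        iv = aes_encrypt_block_ref(ks, Nr,
                            block_xor(in[i], iv));
                        out[i] = iv;
                }
        }
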
     1225 +/*
     1226 + * Prefills register state with counters suitable for the CTR encryption
     1227 + * mode. The counter is assumed to consist of two portions:
     1228 + * - A lower monotonically increasing 64-bit counter. If the caller wants
     1229 + *   a smaller counter, they are responsible for checking that it doesn't
     1230 + *   overflow between encryption calls.
     1231 + * - An upper static "nonce" portion, in big endian, preloaded into the
     1232 + *   lower portion of an XMM register.
     1233 + * This macro adds `ctridx' to the lower_LE counter, swaps it to big
      1234 + * endian and, by way of a temporary general-purpose register, loads the
     1235 + * lower and upper counter portions into a target XMM result register,
     1236 + * which can then be handed off to the encryption process.
     1237 + */
     1238 +#define PREP_CTR_BLOCKS(lower_LE, upper_BE_xmm, ctridx, tmpreg, resreg) \
     1239 +        lea     ctridx(%lower_LE), %tmpreg;                             \
     1240 +        bswap   %tmpreg;                                                \
     1241 +        movq    %tmpreg, %resreg;                                       \
     1242 +        movlhps %upper_BE_xmm, %resreg;                                 \
     1243 +        pshufd  $0b01001110, %resreg, %resreg
     1244 +
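What PREP_CTR_BLOCKS builds for block `ctridx', as a C-level sketch (make_ctr_block() is a hypothetical name; htonll() byte-swaps on little-endian hosts):

        /*
         * Sketch: a big-endian 128-bit counter block with the static BE
         * nonce in the upper 64 bits and the byte-swapped incrementing
         * counter in the lower 64 bits.
         */
        static void
        make_ctr_block(uint64_t upper_be, uint64_t lower_le, uint64_t ctridx,
            uint8_t blk[16])
        {
                uint64_t ctr_be = htonll(lower_le + ctridx);    /* bswap */

                (void) memcpy(blk, &upper_be, 8);       /* nonce half */
                (void) memcpy(blk + 8, &ctr_be, 8);     /* counter half */
        }
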
     1245 +#define CTR_UPPER_BE            r8      /* input - counter upper 64 bits (BE) */
     1246 +#define CTR_UPPER_BE_XMM        xmm1    /* tmp for upper counter bits */
     1247 +#define CTR_LOWER_LE            r9      /* input - counter lower 64 bits (LE) */
     1248 +#define CTR_TMP0                rax     /* tmp for lower 64 bit add & bswap */
     1249 +#define CTR_TMP1                rbx     /* tmp for lower 64 bit add & bswap */
     1250 +#define CTR_TMP2                r10     /* tmp for lower 64 bit add & bswap */
     1251 +#define CTR_TMP3                r11     /* tmp for lower 64 bit add & bswap */
     1252 +#define CTR_TMP4                r12     /* tmp for lower 64 bit add & bswap */
     1253 +#define CTR_TMP5                r13     /* tmp for lower 64 bit add & bswap */
     1254 +#define CTR_TMP6                r14     /* tmp for lower 64 bit add & bswap */
     1255 +#define CTR_TMP7                r15     /* tmp for lower 64 bit add & bswap */
     1256 +
     1257 +/*
      1258 + * These are used when the CTR encryption input is unaligned and must be
      1259 + * XORed via temporaries. They must not overlap with any STATE[0-7] register.
     1260 + */
     1261 +#define TMP_INPUT0      xmm0
     1262 +#define TMP_INPUT1      xmm1
     1263 +#define TMP_INPUT2      xmm2
     1264 +#define TMP_INPUT3      xmm3
     1265 +#define TMP_INPUT4      xmm4
     1266 +#define TMP_INPUT5      xmm5
     1267 +#define TMP_INPUT6      xmm6
     1268 +#define TMP_INPUT7      xmm7
     1269 +
     1270 +/*
     1271 + * void aes_ctr_intel8(const uint32_t roundkeys[], int numrounds,
     1272 + *      const void *input, void *output, uint64_t counter_upper_BE,
     1273 + *      uint64_t counter_lower_LE)
     1274 + *
     1275 + * Runs AES on 8 consecutive blocks in counter mode (encryption and
     1276 + * decryption in counter mode are the same).
     1277 + */
     1278 +ENTRY_NP(aes_ctr_intel8)
     1279 +        /* save caller's regs */
     1280 +        pushq   %rbp
     1281 +        movq    %rsp, %rbp
     1282 +        subq    $0x38, %rsp
     1283 +        / CTR_TMP0 is rax, no need to save
     1284 +        movq    %CTR_TMP1, -0x38(%rbp)
     1285 +        movq    %CTR_TMP2, -0x30(%rbp)
     1286 +        movq    %CTR_TMP3, -0x28(%rbp)
     1287 +        movq    %CTR_TMP4, -0x20(%rbp)
     1288 +        movq    %CTR_TMP5, -0x18(%rbp)
     1289 +        movq    %CTR_TMP6, -0x10(%rbp)
     1290 +        movq    %CTR_TMP7, -0x08(%rbp)
     1291 +
     1292 +        /*
     1293 +         * CTR step 1: prepare big-endian formatted 128-bit counter values,
     1294 +         * placing the result in the AES-NI input state registers.
     1295 +         */
     1296 +        movq    %CTR_UPPER_BE, %CTR_UPPER_BE_XMM
     1297 +        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 0, CTR_TMP0, STATE0)
     1298 +        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 1, CTR_TMP1, STATE1)
     1299 +        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 2, CTR_TMP2, STATE2)
     1300 +        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 3, CTR_TMP3, STATE3)
     1301 +        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 4, CTR_TMP4, STATE4)
     1302 +        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 5, CTR_TMP5, STATE5)
     1303 +        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 6, CTR_TMP6, STATE6)
     1304 +        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 7, CTR_TMP7, STATE7)
     1305 +
     1306 +        /*
     1307 +         * CTR step 2: Encrypt the counters.
     1308 +         */
     1309 +        movaps  (%KEYP), %KEY           / key
     1310 +        AES_XOR_STATE_8BLOCKS           / round 0
     1311 +
     1312 +        /* Determine the AES variant we're going to compute */
     1313 +        lea     0x30(%KEYP), %KEYP      / point to key schedule
     1314 +        cmp     $12, %NROUNDS           / determine AES variant
     1315 +        jb      .Lctr8_128
     1316 +        lea     0x20(%KEYP), %KEYP      / AES192 has larger key schedule
     1317 +        je      .Lctr8_192
     1318 +
     1319 +        /* AES 256 */
     1320 +        lea     0x20(%KEYP), %KEYP      / AES256 has even larger key schedule
     1321 +        AES_RND_8BLOCKS(aesenc, -0x60)  / AES256 R.1
     1322 +        AES_RND_8BLOCKS(aesenc, -0x50)  / AES256 R.2
     1323 +
     1324 +.align 4
     1325 +.Lctr8_192:
     1326 +        /* AES 192 and 256 */
     1327 +        AES_RND_8BLOCKS(aesenc, -0x40)  / AES192 R.1; AES256 R.3
     1328 +        AES_RND_8BLOCKS(aesenc, -0x30)  / AES192 R.2; AES256 R.4
     1329 +
     1330 +.align 4
     1331 +.Lctr8_128:
     1332 +        /* AES 128, 192, and 256 */
     1333 +        AES_RND_8BLOCKS(aesenc, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
     1334 +        AES_RND_8BLOCKS(aesenc, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
     1335 +        AES_RND_8BLOCKS(aesenc, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
     1336 +        AES_RND_8BLOCKS(aesenc, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
     1337 +        AES_RND_8BLOCKS(aesenc, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
     1338 +        AES_RND_8BLOCKS(aesenc, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
     1339 +        AES_RND_8BLOCKS(aesenc, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
     1340 +        AES_RND_8BLOCKS(aesenc, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
     1341 +        AES_RND_8BLOCKS(aesenc, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
     1342 +        AES_RND_8BLOCKS(aesenclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
     1343 +
     1344 +        /*
     1345 +         * CTR step 3: XOR input data blocks with encrypted counters to
     1346 +         * produce result.
     1347 +         */
     1348 +        mov     %INP, %rax              / pxor requires alignment, so check
     1349 +        andq    $0xf, %rax
     1350 +        jnz     .Lctr_input_unaligned
     1351 +        pxor    0x00(%INP), %STATE0
     1352 +        pxor    0x10(%INP), %STATE1
     1353 +        pxor    0x20(%INP), %STATE2
     1354 +        pxor    0x30(%INP), %STATE3
     1355 +        pxor    0x40(%INP), %STATE4
     1356 +        pxor    0x50(%INP), %STATE5
     1357 +        pxor    0x60(%INP), %STATE6
     1358 +        pxor    0x70(%INP), %STATE7
     1359 +        jmp     .Lctr_out
     1360 +
     1361 +.align 4
     1362 +.Lctr_input_unaligned:
     1363 +        movdqu  0x00(%INP), %TMP_INPUT0
     1364 +        movdqu  0x10(%INP), %TMP_INPUT1
     1365 +        movdqu  0x20(%INP), %TMP_INPUT2
     1366 +        movdqu  0x30(%INP), %TMP_INPUT3
     1367 +        movdqu  0x40(%INP), %TMP_INPUT4
     1368 +        movdqu  0x50(%INP), %TMP_INPUT5
     1369 +        movdqu  0x60(%INP), %TMP_INPUT6
     1370 +        movdqu  0x70(%INP), %TMP_INPUT7
     1371 +        pxor    %TMP_INPUT0, %STATE0
     1372 +        pxor    %TMP_INPUT1, %STATE1
     1373 +        pxor    %TMP_INPUT2, %STATE2
     1374 +        pxor    %TMP_INPUT3, %STATE3
     1375 +        pxor    %TMP_INPUT4, %STATE4
     1376 +        pxor    %TMP_INPUT5, %STATE5
     1377 +        pxor    %TMP_INPUT6, %STATE6
     1378 +        pxor    %TMP_INPUT7, %STATE7
     1379 +
     1380 +.align 4
     1381 +.Lctr_out:
     1382 +        /*
      1383 +         * CTR step 4: write out the processed blocks to memory.
     1384 +         */
     1385 +        movdqu  %STATE0, 0x00(%OUTP)
     1386 +        movdqu  %STATE1, 0x10(%OUTP)
     1387 +        movdqu  %STATE2, 0x20(%OUTP)
     1388 +        movdqu  %STATE3, 0x30(%OUTP)
     1389 +        movdqu  %STATE4, 0x40(%OUTP)
     1390 +        movdqu  %STATE5, 0x50(%OUTP)
     1391 +        movdqu  %STATE6, 0x60(%OUTP)
     1392 +        movdqu  %STATE7, 0x70(%OUTP)
     1393 +
     1394 +        /* restore caller's regs */
     1395 +        / CTR_TMP0 is rax, no need to restore
     1396 +        movq    -0x38(%rbp), %CTR_TMP1
     1397 +        movq    -0x30(%rbp), %CTR_TMP2
     1398 +        movq    -0x28(%rbp), %CTR_TMP3
     1399 +        movq    -0x20(%rbp), %CTR_TMP4
     1400 +        movq    -0x18(%rbp), %CTR_TMP5
     1401 +        movq    -0x10(%rbp), %CTR_TMP6
     1402 +        movq    -0x08(%rbp), %CTR_TMP7
     1403 +        leave
     1404 +        ret
     1405 +        SET_SIZE(aes_ctr_intel8)
     1406 +
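A hypothetical driver loop for the CTR entry point (aes_ctr_crypt() is an illustrative name): the low 64-bit counter advances by 8 per call, and checking a narrower counter for overflow remains the caller's job, per the PREP_CTR_BLOCKS comment:

        /*
         * Sketch: CTR over whole 128-byte chunks; a real caller would
         * handle the sub-chunk tail separately.
         */
        static void
        aes_ctr_crypt(const uint32_t rk[], int Nr, const uint8_t *in,
            uint8_t *out, size_t len, uint64_t upper_be, uint64_t lower_le)
        {
                for (; len >= 8 * 16; in += 8 * 16, out += 8 * 16,
                    len -= 8 * 16, lower_le += 8)
                        aes_ctr_intel8(rk, Nr, in, out, upper_be, lower_le);
        }
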
 854 1407  #endif  /* lint || __lint */