4896 Performance improvements for KCF AES modes
--- old/usr/src/common/crypto/aes/amd64/aes_intel.s
+++ new/usr/src/common/crypto/aes/amd64/aes_intel.s
1 1 /*
2 2 * ====================================================================
3 3 * Written by Intel Corporation for the OpenSSL project to add support
4 4 * for Intel AES-NI instructions. Rights for redistribution and usage
5 5 * in source and binary forms are granted according to the OpenSSL
6 6 * license.
7 7 *
8 8 * Author: Huang Ying <ying.huang at intel dot com>
9 9 * Vinodh Gopal <vinodh.gopal at intel dot com>
10 10 * Kahraman Akdemir
11 11 *
12 12 * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
13 13 * instructions that are going to be introduced in the next generation
14 14 * of Intel processor, as of 2009. These instructions enable fast and
15 15 * secure data encryption and decryption, using the Advanced Encryption
16 16 * Standard (AES), defined by FIPS Publication number 197. The
17 17 * architecture introduces six instructions that offer full hardware
18 18 * support for AES. Four of them support high performance data
19 19 * encryption and decryption, and the other two instructions support
20 20 * the AES key expansion procedure.
21 21 * ====================================================================
22 22 */
23 23
24 24 /*
25 25 * ====================================================================
26 26 * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved.
27 27 *
28 28 * Redistribution and use in source and binary forms, with or without
29 29 * modification, are permitted provided that the following conditions
30 30 * are met:
31 31 *
32 32 * 1. Redistributions of source code must retain the above copyright
33 33 * notice, this list of conditions and the following disclaimer.
34 34 *
35 35 * 2. Redistributions in binary form must reproduce the above copyright
36 36 * notice, this list of conditions and the following disclaimer in
37 37 * the documentation and/or other materials provided with the
38 38 * distribution.
39 39 *
40 40 * 3. All advertising materials mentioning features or use of this
41 41 * software must display the following acknowledgment:
42 42 * "This product includes software developed by the OpenSSL Project
43 43 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
44 44 *
45 45 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
46 46 * endorse or promote products derived from this software without
47 47 * prior written permission. For written permission, please contact
48 48 * openssl-core@openssl.org.
49 49 *
50 50 * 5. Products derived from this software may not be called "OpenSSL"
51 51 * nor may "OpenSSL" appear in their names without prior written
52 52 * permission of the OpenSSL Project.
53 53 *
54 54 * 6. Redistributions of any form whatsoever must retain the following
55 55 * acknowledgment:
56 56 * "This product includes software developed by the OpenSSL Project
57 57 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
58 58 *
59 59 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
60 60 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
62 62 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
63 63 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
64 64 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
65 65 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
66 66 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
67 67 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
68 68 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
69 69 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
70 70 * OF THE POSSIBILITY OF SUCH DAMAGE.
71 71 * ====================================================================
72 72 */
73 73
74 74 /*
75 75 * ====================================================================
76 76 * OpenSolaris OS modifications
77 77 *
78 78 * This source originates as files aes-intel.S and eng_aesni_asm.pl, in
79 79 * patches sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by
80 80 * Huang Ying of Intel to the openssl-dev mailing list under the subject
81 81 * of "Add support to Intel AES-NI instruction set for x86_64 platform".
82 82 *
83 83 * This OpenSolaris version has these major changes from the original source:
84 84 *
85 85 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
86 86 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
87 87 * definitions for lint.
88 88 *
89 89 * 2. Formatted code, added comments, and added #includes and #defines.
90 90 *
91 91 * 3. If bit CR0.TS is set, clear the TS bit after calling
92 92 * kpreempt_disable() and set it again before calling kpreempt_enable().
93 93 * If the TS bit is not set, save and restore the %xmm registers at the
94 94 * beginning and end of function calls (%xmm* registers are not saved
95 95 * and restored during kernel thread preemption).
96 96 *
97 97 * 4. Renamed functions, reordered parameters, and changed return value
98 98 * to match OpenSolaris:
99 99 *
100 100 * OpenSSL interface:
101 101 * int intel_AES_set_encrypt_key(const unsigned char *userKey,
102 102 * const int bits, AES_KEY *key);
103 103 * int intel_AES_set_decrypt_key(const unsigned char *userKey,
104 104 * const int bits, AES_KEY *key);
105 105 * Return values for above are non-zero on error, 0 on success.
106 106 *
107 107 * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
108 108 * const AES_KEY *key);
109 109 * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
110 110 * const AES_KEY *key);
111 111 * typedef struct aes_key_st {
112 112 * unsigned int rd_key[4 *(AES_MAXNR + 1)];
113 113 * int rounds;
114 114 * unsigned int pad[3];
115 115 * } AES_KEY;
116 116 * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules
117 117 * (ks32) instead of 64-bit (ks64)).
118 118 * Number of rounds (aka round count) is at offset 240 of AES_KEY.
119 119 *
120 120 * OpenSolaris OS interface (#ifdefs removed for readability):
121 121 * int rijndael_key_setup_dec_intel(uint32_t rk[],
122 122 * const uint32_t cipherKey[], uint64_t keyBits);
123 123 * int rijndael_key_setup_enc_intel(uint32_t rk[],
124 124 * const uint32_t cipherKey[], uint64_t keyBits);
125 125 * Return values for above are 0 on error, number of rounds on success.
126 126 *
127 127 * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
128 128 * const uint32_t pt[4], uint32_t ct[4]);
129 129 * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
130 130 * const uint32_t pt[4], uint32_t ct[4]);
131 131 * typedef union {
132 132 * uint64_t ks64[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
133 133 * uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
134 134 * } aes_ks_t;
135 135 *
137 137 * typedef struct aes_key {
138 138 * aes_ks_t encr_ks, decr_ks;
139 139 * long double align128;
140 140 * int flags, nr, type;
141 141 * } aes_key_t;
142 142 *
143 143 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plaintext,
144 144 * ct is ciphertext, and MAX_AES_NR is 14.
145 145 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
146 146 *
147 147 * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary.
148 148 *
149 149 * ====================================================================
150 150 */
151 +/*
152 + * Copyright 2015 by Saso Kiselkov. All rights reserved.
153 + */
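To make the OpenSolaris calling convention documented above concrete, here is a hedged sketch of a round-trip through key setup and single-block encryption (user-level style; the wrapper function is hypothetical, and the prototypes are transcribed from the comments and lint stubs in this file):

#include <stdint.h>

#define	MAX_AES_NR	14	/* per the note above */
#define	MAX_AES_NB	4

typedef union {
	uint64_t ks64[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
	uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
} aes_ks_t;

int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
    uint64_t keyBits);
void aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
    uint32_t ct[4]);

/* Hypothetical wrapper: AES-256, one block. */
static int
encrypt_one_block(const uint32_t key[8], const uint32_t pt[4], uint32_t ct[4])
{
	aes_ks_t ks __attribute__((aligned(128)));	/* see Note2 above */
	int nr = rijndael_key_setup_enc_intel(ks.ks32, key, 256);

	if (nr == 0)	/* 0 on error, number of rounds on success */
		return (-1);
	aes_encrypt_intel(ks.ks32, nr, pt, ct);
	return (0);
}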
151 154
152 155 #if defined(lint) || defined(__lint)
153 156
154 157 #include <sys/types.h>
155 158
156 159 /* ARGSUSED */
157 160 void
158 161 aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
159 162 uint32_t ct[4]) {
160 163 }
161 164 /* ARGSUSED */
162 165 void
163 166 aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
164 167 uint32_t pt[4]) {
165 168 }
166 169 /* ARGSUSED */
167 170 int
168 171 rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
169 172 uint64_t keyBits) {
170 173 return (0);
171 174 }
172 175 /* ARGSUSED */
173 176 int
174 177 rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
175 178 uint64_t keyBits) {
176 179 return (0);
177 180 }
178 181
179 182
180 183 #else /* lint */
181 184
182 185 #include <sys/asm_linkage.h>
183 186 #include <sys/controlregs.h>
184 187 #ifdef _KERNEL
185 188 #include <sys/machprivregs.h>
186 189 #endif
187 190
188 191 #ifdef _KERNEL
189 192 /*
190 193 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is,
191 194 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
192 195 * uses it to pass P2 to syscall.
193 196 * This also occurs with the STTS macro, but we don't care if
194 197 * P2 (%rsi) is modified just before function exit.
195 198 * The CLTS and STTS macros push and pop P1 (%rdi) already.
196 199 */
197 200 #ifdef __xpv
198 201 #define PROTECTED_CLTS \
199 202 push %rsi; \
200 203 CLTS; \
201 204 pop %rsi
202 205 #else
203 206 #define PROTECTED_CLTS \
204 207 CLTS
205 208 #endif /* __xpv */
206 209
207 210 #define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg) \
208 211 push %rbp; \
209 212 mov %rsp, %rbp; \
210 213 movq %cr0, tmpreg; \
211 214 testq $CR0_TS, tmpreg; \
212 215 jnz 1f; \
213 216 and $-XMM_ALIGN, %rsp; \
214 217 sub $[XMM_SIZE * 2], %rsp; \
215 218 movaps %xmm0, 16(%rsp); \
216 219 movaps %xmm1, (%rsp); \
217 220 jmp 2f; \
218 221 1: \
219 222 PROTECTED_CLTS; \
220 223 2:
221 224
222 225 /*
223 226 * If CR0_TS was not set above, pop %xmm0 and %xmm1 off stack,
224 227 * otherwise set CR0_TS.
225 228 */
226 229 #define SET_TS_OR_POP_XMM0_XMM1(tmpreg) \
227 230 testq $CR0_TS, tmpreg; \
228 231 jnz 1f; \
229 232 movaps (%rsp), %xmm1; \
230 233 movaps 16(%rsp), %xmm0; \
231 234 jmp 2f; \
232 235 1: \
233 236 STTS(tmpreg); \
234 237 2: \
235 238 mov %rbp, %rsp; \
236 239 pop %rbp
237 240
238 241 /*
239 242 * If CR0_TS is not set, align stack (with push %rbp) and push
240 243 * %xmm0 - %xmm6 on stack, otherwise clear CR0_TS
241 244 */
242 245 #define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg) \
243 246 push %rbp; \
244 247 mov %rsp, %rbp; \
245 248 movq %cr0, tmpreg; \
246 249 testq $CR0_TS, tmpreg; \
247 250 jnz 1f; \
248 251 and $-XMM_ALIGN, %rsp; \
249 252 sub $[XMM_SIZE * 7], %rsp; \
250 253 movaps %xmm0, 96(%rsp); \
251 254 movaps %xmm1, 80(%rsp); \
252 255 movaps %xmm2, 64(%rsp); \
253 256 movaps %xmm3, 48(%rsp); \
254 257 movaps %xmm4, 32(%rsp); \
255 258 movaps %xmm5, 16(%rsp); \
256 259 movaps %xmm6, (%rsp); \
257 260 jmp 2f; \
258 261 1: \
259 262 PROTECTED_CLTS; \
260 263 2:
261 264
262 265
263 266 /*
264 267 * If CR0_TS was not set above, pop %xmm0 - %xmm6 off stack,
265 268 * otherwise set CR0_TS.
266 269 */
267 270 #define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) \
268 271 testq $CR0_TS, tmpreg; \
269 272 jnz 1f; \
270 273 movaps (%rsp), %xmm6; \
271 274 movaps 16(%rsp), %xmm5; \
272 275 movaps 32(%rsp), %xmm4; \
273 276 movaps 48(%rsp), %xmm3; \
274 277 movaps 64(%rsp), %xmm2; \
275 278 movaps 80(%rsp), %xmm1; \
276 279 movaps 96(%rsp), %xmm0; \
277 280 jmp 2f; \
278 281 1: \
279 282 STTS(tmpreg); \
280 283 2: \
281 284 mov %rbp, %rsp; \
282 285 pop %rbp
283 286
287 +/*
288 + * void aes_accel_save(void *savestate);
289 + *
290 + * Saves all 16 XMM registers and CR0 to a temporary location pointed to
291 + * by the first argument and clears TS in CR0. This must be invoked before
292 + * executing any floating point operations inside the kernel (and kernel
293 + * thread preemption must be disabled as well). The memory region to which
294 + * all state is saved must be at least 264 bytes long (16 x 128-bit XMM
295 + * slots plus a 64-bit slot for CR0) and must be 128-bit aligned.
296 + */
297 +ENTRY_NP(aes_accel_save)
298 + movq %cr0, %rax
299 + movq %rax, 0x100(%rdi)
300 + testq $CR0_TS, %rax
301 + jnz 1f
302 + movaps %xmm0, 0x00(%rdi)
303 + movaps %xmm1, 0x10(%rdi)
304 + movaps %xmm2, 0x20(%rdi)
305 + movaps %xmm3, 0x30(%rdi)
306 + movaps %xmm4, 0x40(%rdi)
307 + movaps %xmm5, 0x50(%rdi)
308 + movaps %xmm6, 0x60(%rdi)
309 + movaps %xmm7, 0x70(%rdi)
310 + movaps %xmm8, 0x80(%rdi)
311 + movaps %xmm9, 0x90(%rdi)
312 + movaps %xmm10, 0xa0(%rdi)
313 + movaps %xmm11, 0xb0(%rdi)
314 + movaps %xmm12, 0xc0(%rdi)
315 + movaps %xmm13, 0xd0(%rdi)
316 + movaps %xmm14, 0xe0(%rdi)
317 + movaps %xmm15, 0xf0(%rdi)
318 + ret
319 +1:
320 + PROTECTED_CLTS
321 + ret
322 + SET_SIZE(aes_accel_save)
284 323
324 +/*
325 + * void aes_accel_restore(void *savestate);
326 + *
327 + * Restores the saved XMM and CR0.TS state from aes_accel_save.
328 + */
329 +ENTRY_NP(aes_accel_restore)
330 + mov 0x100(%rdi), %rax
331 + testq $CR0_TS, %rax
332 + jnz 1f
333 + movaps 0x00(%rdi), %xmm0
334 + movaps 0x10(%rdi), %xmm1
335 + movaps 0x20(%rdi), %xmm2
336 + movaps 0x30(%rdi), %xmm3
337 + movaps 0x40(%rdi), %xmm4
338 + movaps 0x50(%rdi), %xmm5
339 + movaps 0x60(%rdi), %xmm6
340 + movaps 0x70(%rdi), %xmm7
341 + movaps 0x80(%rdi), %xmm8
342 + movaps 0x90(%rdi), %xmm9
343 + movaps 0xa0(%rdi), %xmm10
344 + movaps 0xb0(%rdi), %xmm11
345 + movaps 0xc0(%rdi), %xmm12
346 + movaps 0xd0(%rdi), %xmm13
347 + movaps 0xe0(%rdi), %xmm14
348 + movaps 0xf0(%rdi), %xmm15
349 + ret
350 +1:
351 + STTS(%rax)
352 + ret
353 + SET_SIZE(aes_accel_restore)
354 +
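For kernel callers, the pair above replaces the per-call CLEAR_TS/SET_TS macros: save once, run as many AES calls as needed, restore once. A hedged sketch of the required bracketing (the wrapper is hypothetical; the 264-byte size follows from the sixteen 16-byte XMM slots plus the CR0 word stored at offset 0x100 above):

#include <stdint.h>
#include <sys/systm.h>		/* assumed home of kpreempt_disable() */

void aes_accel_save(void *savestate);
void aes_accel_restore(void *savestate);
void aes_encrypt_intel8(const uint32_t rk[], int nr, const void *pt, void *ct);

static void
encrypt8_fpu_safe(const uint32_t rk[], int nr, const void *pt, void *ct)
{
	uint8_t save[264] __attribute__((aligned(16)));	/* 16 XMM + CR0 */

	kpreempt_disable();	/* no thread switch while FPU state is live */
	aes_accel_save(save);	/* stash %xmm0-%xmm15 and CR0.TS */
	aes_encrypt_intel8(rk, nr, pt, ct);
	aes_accel_restore(save);
	kpreempt_enable();
}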
285 355 #else
286 356 #define PROTECTED_CLTS
287 357 #define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
288 358 #define SET_TS_OR_POP_XMM0_XMM1(tmpreg)
289 359 #define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)
290 360 #define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg)
291 361 #endif /* _KERNEL */
292 362
293 363
294 364 /*
295 365 * _key_expansion_128(), * _key_expansion_192a(), _key_expansion_192b(),
296 366 * _key_expansion_256a(), _key_expansion_256b()
297 367 *
298 368 * Helper functions called by rijndael_key_setup_enc_intel().
299 369 * Also used indirectly by rijndael_key_setup_dec_intel().
300 370 *
301 371 * Input:
302 372 * %xmm0 User-provided cipher key
303 373 * %xmm1 Round constant
304 374 * Output:
305 375 * (%rcx) AES key
306 376 */
307 377
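For readers who know the intrinsics form better: with %xmm4 pre-zeroed, the shufps/pxor sequences below are equivalent in effect to the classic three-shift formulation of a key-schedule step. A hedged sketch for the 128-bit case (not the kernel's code):

#include <wmmintrin.h>	/* AES-NI intrinsics */

/* One AES-128 round-key step; kg is the aeskeygenassist output. */
static __m128i
key_expansion_128_sketch(__m128i key, __m128i kg)
{
	kg = _mm_shuffle_epi32(kg, 0xff);	/* broadcast rot/sub word */
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	return (_mm_xor_si128(key, kg));
}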
308 378 .align 16
309 379 _key_expansion_128:
310 380 _key_expansion_256a:
311 381 pshufd $0b11111111, %xmm1, %xmm1
312 382 shufps $0b00010000, %xmm0, %xmm4
313 383 pxor %xmm4, %xmm0
314 384 shufps $0b10001100, %xmm0, %xmm4
315 385 pxor %xmm4, %xmm0
316 386 pxor %xmm1, %xmm0
317 387 movaps %xmm0, (%rcx)
318 388 add $0x10, %rcx
319 389 ret
320 390 SET_SIZE(_key_expansion_128)
321 391 SET_SIZE(_key_expansion_256a)
322 392
323 393 .align 16
324 394 _key_expansion_192a:
325 395 pshufd $0b01010101, %xmm1, %xmm1
326 396 shufps $0b00010000, %xmm0, %xmm4
327 397 pxor %xmm4, %xmm0
328 398 shufps $0b10001100, %xmm0, %xmm4
329 399 pxor %xmm4, %xmm0
330 400 pxor %xmm1, %xmm0
331 401
332 402 movaps %xmm2, %xmm5
333 403 movaps %xmm2, %xmm6
334 404 pslldq $4, %xmm5
335 405 pshufd $0b11111111, %xmm0, %xmm3
336 406 pxor %xmm3, %xmm2
337 407 pxor %xmm5, %xmm2
338 408
339 409 movaps %xmm0, %xmm1
340 410 shufps $0b01000100, %xmm0, %xmm6
341 411 movaps %xmm6, (%rcx)
342 412 shufps $0b01001110, %xmm2, %xmm1
343 413 movaps %xmm1, 0x10(%rcx)
344 414 add $0x20, %rcx
345 415 ret
346 416 SET_SIZE(_key_expansion_192a)
347 417
348 418 .align 16
349 419 _key_expansion_192b:
350 420 pshufd $0b01010101, %xmm1, %xmm1
351 421 shufps $0b00010000, %xmm0, %xmm4
352 422 pxor %xmm4, %xmm0
353 423 shufps $0b10001100, %xmm0, %xmm4
354 424 pxor %xmm4, %xmm0
355 425 pxor %xmm1, %xmm0
356 426
357 427 movaps %xmm2, %xmm5
358 428 pslldq $4, %xmm5
359 429 pshufd $0b11111111, %xmm0, %xmm3
360 430 pxor %xmm3, %xmm2
361 431 pxor %xmm5, %xmm2
362 432
363 433 movaps %xmm0, (%rcx)
364 434 add $0x10, %rcx
365 435 ret
366 436 SET_SIZE(_key_expansion_192b)
367 437
368 438 .align 16
369 439 _key_expansion_256b:
370 440 pshufd $0b10101010, %xmm1, %xmm1
371 441 shufps $0b00010000, %xmm2, %xmm4
372 442 pxor %xmm4, %xmm2
373 443 shufps $0b10001100, %xmm2, %xmm4
374 444 pxor %xmm4, %xmm2
375 445 pxor %xmm1, %xmm2
376 446 movaps %xmm2, (%rcx)
377 447 add $0x10, %rcx
378 448 ret
379 449 SET_SIZE(_key_expansion_256b)
380 450
451 +/*
452 + * void aes_copy_intel(const uint8_t *src, uint8_t *dst);
453 + *
454 + * Copies one unaligned 128-bit block from `src' to `dst'. The copy is
455 + * performed using FPU registers, so make sure FPU state is saved when
456 + * running this in the kernel.
457 + */
458 +ENTRY_NP(aes_copy_intel)
459 + movdqu (%rdi), %xmm0
460 + movdqu %xmm0, (%rsi)
461 + ret
462 + SET_SIZE(aes_copy_intel)
381 463
382 464 /*
465 + * void aes_xor_intel(const uint8_t *src, uint8_t *dst);
466 + *
467 + * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
468 + * stores the result at `dst'. The XOR is performed using FPU registers,
469 + * so make sure FPU state is saved when running this in the kernel.
470 + */
471 +ENTRY_NP(aes_xor_intel)
472 + movdqu (%rdi), %xmm0
473 + movdqu (%rsi), %xmm1
474 + pxor %xmm1, %xmm0
475 + movdqu %xmm0, (%rsi)
476 + ret
477 + SET_SIZE(aes_xor_intel)
478 +
479 +/*
480 + * void aes_xor_intel8(const uint8_t *src, uint8_t *dst);
481 + *
482 + * XORs eight pairs of consecutive unaligned 128-bit blocks from `src' and
483 + * `dst' and stores the results at `dst'. The XOR is performed using FPU
484 + * registers, so make sure FPU state is saved when running this in the kernel.
485 + */
486 +ENTRY_NP(aes_xor_intel8)
487 + movdqu 0x00(%rdi), %xmm0
488 + movdqu 0x00(%rsi), %xmm1
489 + movdqu 0x10(%rdi), %xmm2
490 + movdqu 0x10(%rsi), %xmm3
491 + movdqu 0x20(%rdi), %xmm4
492 + movdqu 0x20(%rsi), %xmm5
493 + movdqu 0x30(%rdi), %xmm6
494 + movdqu 0x30(%rsi), %xmm7
495 + movdqu 0x40(%rdi), %xmm8
496 + movdqu 0x40(%rsi), %xmm9
497 + movdqu 0x50(%rdi), %xmm10
498 + movdqu 0x50(%rsi), %xmm11
499 + movdqu 0x60(%rdi), %xmm12
500 + movdqu 0x60(%rsi), %xmm13
501 + movdqu 0x70(%rdi), %xmm14
502 + movdqu 0x70(%rsi), %xmm15
503 + pxor %xmm1, %xmm0
504 + pxor %xmm3, %xmm2
505 + pxor %xmm5, %xmm4
506 + pxor %xmm7, %xmm6
507 + pxor %xmm9, %xmm8
508 + pxor %xmm11, %xmm10
509 + pxor %xmm13, %xmm12
510 + pxor %xmm15, %xmm14
511 + movdqu %xmm0, 0x00(%rsi)
512 + movdqu %xmm2, 0x10(%rsi)
513 + movdqu %xmm4, 0x20(%rsi)
514 + movdqu %xmm6, 0x30(%rsi)
515 + movdqu %xmm8, 0x40(%rsi)
516 + movdqu %xmm10, 0x50(%rsi)
517 + movdqu %xmm12, 0x60(%rsi)
518 + movdqu %xmm14, 0x70(%rsi)
519 + ret
520 + SET_SIZE(aes_xor_intel8)
521 +
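In plain C, the block helpers above have the following semantics (a sketch of effect only; the real routines do the work entirely in XMM registers, which is what lets callers avoid extra memory traffic):

#include <stdint.h>
#include <string.h>

/* aes_copy_intel: copy one 128-bit block. */
static void
aes_copy_sketch(const uint8_t *src, uint8_t *dst)
{
	(void) memcpy(dst, src, 16);
}

/* aes_xor_intel8: XOR eight consecutive 128-bit blocks into dst. */
static void
aes_xor8_sketch(const uint8_t *src, uint8_t *dst)
{
	for (int i = 0; i < 8 * 16; i++)
		dst[i] ^= src[i];
}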
522 +/*
383 523 * rijndael_key_setup_enc_intel()
384 524 * Expand the cipher key into the encryption key schedule.
385 525 *
386 526 * For kernel code, caller is responsible for ensuring kpreempt_disable()
387 527 * has been called. This is because %xmm registers are not saved/restored.
388 528 * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
389 529 * on entry. Otherwise, if TS is not set, save and restore %xmm registers
390 530 * on the stack.
391 531 *
392 532 * OpenSolaris interface:
393 533 * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
394 534 * uint64_t keyBits);
395 535 * Return value is 0 on error, number of rounds on success.
396 536 *
397 537 * Original Intel OpenSSL interface:
398 538 * int intel_AES_set_encrypt_key(const unsigned char *userKey,
399 539 * const int bits, AES_KEY *key);
400 540 * Return value is non-zero on error, 0 on success.
401 541 */
402 542
403 543 #ifdef OPENSSL_INTERFACE
404 544 #define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key
405 545 #define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key
406 546
407 547 #define USERCIPHERKEY rdi /* P1, 64 bits */
408 548 #define KEYSIZE32 esi /* P2, 32 bits */
409 549 #define KEYSIZE64 rsi /* P2, 64 bits */
410 550 #define AESKEY rdx /* P3, 64 bits */
411 551
412 552 #else /* OpenSolaris Interface */
413 553 #define AESKEY rdi /* P1, 64 bits */
414 554 #define USERCIPHERKEY rsi /* P2, 64 bits */
415 555 #define KEYSIZE32 edx /* P3, 32 bits */
416 556 #define KEYSIZE64 rdx /* P3, 64 bits */
417 557 #endif /* OPENSSL_INTERFACE */
418 558
419 559 #define ROUNDS32 KEYSIZE32 /* temp */
420 560 #define ROUNDS64 KEYSIZE64 /* temp */
421 561 #define ENDAESKEY USERCIPHERKEY /* temp */
422 562
423 563
424 564 ENTRY_NP(rijndael_key_setup_enc_intel)
425 565 CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(%r10)
426 566
427 567 / NULL pointer sanity check
428 568 test %USERCIPHERKEY, %USERCIPHERKEY
429 569 jz .Lenc_key_invalid_param
430 570 test %AESKEY, %AESKEY
431 571 jz .Lenc_key_invalid_param
432 572
433 573 movups (%USERCIPHERKEY), %xmm0 / user key (first 16 bytes)
434 574 movaps %xmm0, (%AESKEY)
435 575 lea 0x10(%AESKEY), %rcx / key addr
436 576 pxor %xmm4, %xmm4 / xmm4 is assumed 0 in _key_expansion_x
437 577
438 578 cmp $256, %KEYSIZE32
439 579 jnz .Lenc_key192
440 580
441 581 / AES 256: 14 rounds in encryption key schedule
442 582 #ifdef OPENSSL_INTERFACE
443 583 mov $14, %ROUNDS32
444 584 movl %ROUNDS32, 240(%AESKEY) / key.rounds = 14
445 585 #endif /* OPENSSL_INTERFACE */
446 586
447 587 movups 0x10(%USERCIPHERKEY), %xmm2 / other user key (2nd 16 bytes)
448 588 movaps %xmm2, (%rcx)
449 589 add $0x10, %rcx
450 590
451 591 aeskeygenassist $0x1, %xmm2, %xmm1 / expand the key
452 592 call _key_expansion_256a
453 593 aeskeygenassist $0x1, %xmm0, %xmm1
454 594 call _key_expansion_256b
455 595 aeskeygenassist $0x2, %xmm2, %xmm1 / expand the key
456 596 call _key_expansion_256a
457 597 aeskeygenassist $0x2, %xmm0, %xmm1
458 598 call _key_expansion_256b
459 599 aeskeygenassist $0x4, %xmm2, %xmm1 / expand the key
460 600 call _key_expansion_256a
461 601 aeskeygenassist $0x4, %xmm0, %xmm1
462 602 call _key_expansion_256b
463 603 aeskeygenassist $0x8, %xmm2, %xmm1 / expand the key
464 604 call _key_expansion_256a
465 605 aeskeygenassist $0x8, %xmm0, %xmm1
466 606 call _key_expansion_256b
467 607 aeskeygenassist $0x10, %xmm2, %xmm1 / expand the key
468 608 call _key_expansion_256a
469 609 aeskeygenassist $0x10, %xmm0, %xmm1
470 610 call _key_expansion_256b
471 611 aeskeygenassist $0x20, %xmm2, %xmm1 / expand the key
472 612 call _key_expansion_256a
473 613 aeskeygenassist $0x20, %xmm0, %xmm1
474 614 call _key_expansion_256b
475 615 aeskeygenassist $0x40, %xmm2, %xmm1 / expand the key
476 616 call _key_expansion_256a
477 617
478 618 SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
479 619 #ifdef OPENSSL_INTERFACE
480 620 xor %rax, %rax / return 0 (OK)
481 621 #else /* OpenSolaris Interface */
482 622 mov $14, %rax / return # rounds = 14
483 623 #endif
484 624 ret
485 625
486 626 .align 4
487 627 .Lenc_key192:
488 628 cmp $192, %KEYSIZE32
489 629 jnz .Lenc_key128
490 630
491 631 / AES 192: 12 rounds in encryption key schedule
492 632 #ifdef OPENSSL_INTERFACE
493 633 mov $12, %ROUNDS32
494 634 movl %ROUNDS32, 240(%AESKEY) / key.rounds = 12
495 635 #endif /* OPENSSL_INTERFACE */
496 636
497 637 movq 0x10(%USERCIPHERKEY), %xmm2 / other user key
498 638 aeskeygenassist $0x1, %xmm2, %xmm1 / expand the key
499 639 call _key_expansion_192a
500 640 aeskeygenassist $0x2, %xmm2, %xmm1 / expand the key
501 641 call _key_expansion_192b
502 642 aeskeygenassist $0x4, %xmm2, %xmm1 / expand the key
503 643 call _key_expansion_192a
504 644 aeskeygenassist $0x8, %xmm2, %xmm1 / expand the key
505 645 call _key_expansion_192b
506 646 aeskeygenassist $0x10, %xmm2, %xmm1 / expand the key
507 647 call _key_expansion_192a
508 648 aeskeygenassist $0x20, %xmm2, %xmm1 / expand the key
509 649 call _key_expansion_192b
510 650 aeskeygenassist $0x40, %xmm2, %xmm1 / expand the key
511 651 call _key_expansion_192a
512 652 aeskeygenassist $0x80, %xmm2, %xmm1 / expand the key
513 653 call _key_expansion_192b
514 654
515 655 SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
516 656 #ifdef OPENSSL_INTERFACE
517 657 xor %rax, %rax / return 0 (OK)
518 658 #else /* OpenSolaris Interface */
519 659 mov $12, %rax / return # rounds = 12
520 660 #endif
521 661 ret
522 662
523 663 .align 4
524 664 .Lenc_key128:
525 665 cmp $128, %KEYSIZE32
526 666 jnz .Lenc_key_invalid_key_bits
527 667
528 668 / AES 128: 10 rounds in encryption key schedule
529 669 #ifdef OPENSSL_INTERFACE
530 670 mov $10, %ROUNDS32
531 671 movl %ROUNDS32, 240(%AESKEY) / key.rounds = 10
532 672 #endif /* OPENSSL_INTERFACE */
533 673
534 674 aeskeygenassist $0x1, %xmm0, %xmm1 / expand the key
535 675 call _key_expansion_128
536 676 aeskeygenassist $0x2, %xmm0, %xmm1 / expand the key
537 677 call _key_expansion_128
538 678 aeskeygenassist $0x4, %xmm0, %xmm1 / expand the key
539 679 call _key_expansion_128
540 680 aeskeygenassist $0x8, %xmm0, %xmm1 / expand the key
541 681 call _key_expansion_128
542 682 aeskeygenassist $0x10, %xmm0, %xmm1 / expand the key
543 683 call _key_expansion_128
544 684 aeskeygenassist $0x20, %xmm0, %xmm1 / expand the key
545 685 call _key_expansion_128
546 686 aeskeygenassist $0x40, %xmm0, %xmm1 / expand the key
547 687 call _key_expansion_128
548 688 aeskeygenassist $0x80, %xmm0, %xmm1 / expand the key
549 689 call _key_expansion_128
550 690 aeskeygenassist $0x1b, %xmm0, %xmm1 / expand the key
551 691 call _key_expansion_128
552 692 aeskeygenassist $0x36, %xmm0, %xmm1 / expand the key
553 693 call _key_expansion_128
554 694
555 695 SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
556 696 #ifdef OPENSSL_INTERFACE
557 697 xor %rax, %rax / return 0 (OK)
558 698 #else /* OpenSolaris Interface */
559 699 mov $10, %rax / return # rounds = 10
560 700 #endif
561 701 ret
562 702
563 703 .Lenc_key_invalid_param:
564 704 #ifdef OPENSSL_INTERFACE
565 705 SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
566 706 mov $-1, %rax / user key or AES key pointer is NULL
567 707 ret
568 708 #else
569 709 /* FALLTHROUGH */
570 710 #endif /* OPENSSL_INTERFACE */
571 711
572 712 .Lenc_key_invalid_key_bits:
573 713 SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
574 714 #ifdef OPENSSL_INTERFACE
575 715 mov $-2, %rax / keysize is invalid
576 716 #else /* OpenSolaris Interface */
577 717 xor %rax, %rax / a key pointer is NULL or invalid keysize
578 718 #endif /* OPENSSL_INTERFACE */
579 719
580 720 ret
581 721 SET_SIZE(rijndael_key_setup_enc_intel)
582 722
583 723
584 724 /*
585 725 * rijndael_key_setup_dec_intel()
586 726 * Expand the cipher key into the decryption key schedule.
587 727 *
588 728 * For kernel code, caller is responsible for ensuring kpreempt_disable()
589 729 * has been called. This is because %xmm registers are not saved/restored.
590 730 * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
591 731 * on entry. Otherwise, if TS is not set, save and restore %xmm registers
592 732 * on the stack.
593 733 *
594 734 * OpenSolaris interface:
595 735 * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
596 736 * uint64_t keyBits);
597 737 * Return value is 0 on error, number of rounds on success.
598 738 * P1->P2, P2->P3, P3->P1
599 739 *
600 740 * Original Intel OpenSSL interface:
601 741 * int intel_AES_set_decrypt_key(const unsigned char *userKey,
602 742 * const int bits, AES_KEY *key);
603 743 * Return value is non-zero on error, 0 on success.
604 744 */
605 745 ENTRY_NP(rijndael_key_setup_dec_intel)
606 746 / Generate round keys used for encryption
607 747 call rijndael_key_setup_enc_intel
608 748 test %rax, %rax
609 749 #ifdef OPENSSL_INTERFACE
610 750 jnz .Ldec_key_exit / Failed if returned non-0
611 751 #else /* OpenSolaris Interface */
612 752 jz .Ldec_key_exit / Failed if returned 0
613 753 #endif /* OPENSSL_INTERFACE */
614 754
615 755 CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
616 756
617 757 /*
618 758 * Convert round keys used for encryption
619 759 * to a form usable for decryption
620 760 */
621 761 #ifndef OPENSSL_INTERFACE /* OpenSolaris Interface */
622 762 mov %rax, %ROUNDS64 / set # rounds (10, 12, or 14)
623 763 / (already set for OpenSSL)
624 764 #endif
625 765
626 766 lea 0x10(%AESKEY), %rcx / key addr
627 767 shl $4, %ROUNDS32
628 768 add %AESKEY, %ROUNDS64
629 769 mov %ROUNDS64, %ENDAESKEY
630 770
631 771 .align 4
632 772 .Ldec_key_reorder_loop:
633 773 movaps (%AESKEY), %xmm0
634 774 movaps (%ROUNDS64), %xmm1
635 775 movaps %xmm0, (%ROUNDS64)
636 776 movaps %xmm1, (%AESKEY)
637 777 lea 0x10(%AESKEY), %AESKEY
638 778 lea -0x10(%ROUNDS64), %ROUNDS64
639 779 cmp %AESKEY, %ROUNDS64
640 780 ja .Ldec_key_reorder_loop
641 781
642 782 .align 4
643 783 .Ldec_key_inv_loop:
644 784 movaps (%rcx), %xmm0
645 785 / Convert an encryption round key to a form usable for decryption
646 786 / with the "AES Inverse Mix Columns" instruction
647 787 aesimc %xmm0, %xmm1
648 788 movaps %xmm1, (%rcx)
649 789 lea 0x10(%rcx), %rcx
650 790 cmp %ENDAESKEY, %rcx
651 791 jnz .Ldec_key_inv_loop
652 792
653 793 SET_TS_OR_POP_XMM0_XMM1(%r10)
654 794
655 795 .Ldec_key_exit:
656 796 / OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
657 797 / OpenSSL: rax = 0 for OK, or non-zero for error
658 798 ret
659 799 SET_SIZE(rijndael_key_setup_dec_intel)
660 800
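The two loops above implement the standard equivalent-inverse-cipher conversion: reverse the order of the round keys, then run aesimc over every key except the outermost two. A hedged intrinsics sketch of the same transformation:

#include <wmmintrin.h>

/* Sketch: nr rounds means nr + 1 round keys, ks[0] .. ks[nr]. */
static void
enc_to_dec_schedule(__m128i ks[], int nr)
{
	/* Swap the schedule end for end. */
	for (int i = 0, j = nr; i < j; i++, j--) {
		__m128i t = ks[i];
		ks[i] = ks[j];
		ks[j] = t;
	}
	/* InvMixColumns every key except the first and the last. */
	for (int i = 1; i < nr; i++)
		ks[i] = _mm_aesimc_si128(ks[i]);
}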
661 801
662 -/*
663 - * aes_encrypt_intel()
664 - * Encrypt a single block (in and out can overlap).
665 - *
666 - * For kernel code, caller is responsible for ensuring kpreempt_disable()
667 - * has been called. This is because %xmm registers are not saved/restored.
668 - * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
669 - * on entry. Otherwise, if TS is not set, save and restore %xmm registers
670 - * on the stack.
671 - *
672 - * Temporary register usage:
673 - * %xmm0 State
674 - * %xmm1 Key
675 - *
676 - * Original OpenSolaris Interface:
677 - * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
678 - * const uint32_t pt[4], uint32_t ct[4])
679 - *
680 - * Original Intel OpenSSL Interface:
681 - * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
682 - * const AES_KEY *key)
683 - */
684 -
685 802 #ifdef OPENSSL_INTERFACE
686 803 #define aes_encrypt_intel intel_AES_encrypt
687 804 #define aes_decrypt_intel intel_AES_decrypt
688 805
689 806 #define INP rdi /* P1, 64 bits */
690 807 #define OUTP rsi /* P2, 64 bits */
691 808 #define KEYP rdx /* P3, 64 bits */
692 809
693 810 /* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */
694 811 #define NROUNDS32 ecx /* temporary, 32 bits */
695 812 #define NROUNDS cl /* temporary, 8 bits */
696 813
697 814 #else /* OpenSolaris Interface */
698 815 #define KEYP rdi /* P1, 64 bits */
699 816 #define NROUNDS esi /* P2, 32 bits */
700 817 #define INP rdx /* P3, 64 bits */
701 818 #define OUTP rcx /* P4, 64 bits */
819 +#define LENGTH r8 /* P5, 64 bits */
702 820 #endif /* OPENSSL_INTERFACE */
703 821
704 -#define STATE xmm0 /* temporary, 128 bits */
705 -#define KEY xmm1 /* temporary, 128 bits */
822 +#define KEY xmm0 /* temporary, 128 bits */
823 +#define STATE0 xmm8 /* temporary, 128 bits */
824 +#define STATE1 xmm9 /* temporary, 128 bits */
825 +#define STATE2 xmm10 /* temporary, 128 bits */
826 +#define STATE3 xmm11 /* temporary, 128 bits */
827 +#define STATE4 xmm12 /* temporary, 128 bits */
828 +#define STATE5 xmm13 /* temporary, 128 bits */
829 +#define STATE6 xmm14 /* temporary, 128 bits */
830 +#define STATE7 xmm15 /* temporary, 128 bits */
706 831
707 -ENTRY_NP(aes_encrypt_intel)
708 - CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
832 +/*
833 + * Runs the first two rounds of AES256 on a state register. `op' should be
834 + * aesenc or aesdec.
835 + */
836 +#define AES256_ROUNDS(op, statereg) \
837 + movaps -0x60(%KEYP), %KEY; \
838 + op %KEY, %statereg; \
839 + movaps -0x50(%KEYP), %KEY; \
840 + op %KEY, %statereg
709 841
710 - movups (%INP), %STATE / input
842 +/*
843 + * Runs the first two rounds of AES192, or the 3rd & 4th round of AES256 on
844 + * a state register. `op' should be aesenc or aesdec.
845 + */
846 +#define AES192_ROUNDS(op, statereg) \
847 + movaps -0x40(%KEYP), %KEY; \
848 + op %KEY, %statereg; \
849 + movaps -0x30(%KEYP), %KEY; \
850 + op %KEY, %statereg
851 +
852 +/*
853 + * Runs the full 10 rounds of AES128, or the last 10 rounds of AES192/AES256
854 + * on a state register. `op' should be aesenc or aesdec and `lastop' should
855 + * be aesenclast or aesdeclast.
856 + */
857 +#define AES128_ROUNDS(op, lastop, statereg) \
858 + movaps -0x20(%KEYP), %KEY; \
859 + op %KEY, %statereg; \
860 + movaps -0x10(%KEYP), %KEY; \
861 + op %KEY, %statereg; \
862 + movaps (%KEYP), %KEY; \
863 + op %KEY, %statereg; \
864 + movaps 0x10(%KEYP), %KEY; \
865 + op %KEY, %statereg; \
866 + movaps 0x20(%KEYP), %KEY; \
867 + op %KEY, %statereg; \
868 + movaps 0x30(%KEYP), %KEY; \
869 + op %KEY, %statereg; \
870 + movaps 0x40(%KEYP), %KEY; \
871 + op %KEY, %statereg; \
872 + movaps 0x50(%KEYP), %KEY; \
873 + op %KEY, %statereg; \
874 + movaps 0x60(%KEYP), %KEY; \
875 + op %KEY, %statereg; \
876 + movaps 0x70(%KEYP), %KEY; \
877 + lastop %KEY, %statereg
878 +
879 +/*
880 + * Macros to run AES encryption rounds. Input must be prefilled in the
881 + * state register; output will be left there as well.
882 + * To run AES256, invoke all of these macros in sequence. To run AES192,
883 + * invoke only the -192 and -128 variants. To run AES128, invoke only the
884 + * -128 variant.
885 + */
886 +#define AES256_ENC_ROUNDS(statereg) \
887 + AES256_ROUNDS(aesenc, statereg)
888 +#define AES192_ENC_ROUNDS(statereg) \
889 + AES192_ROUNDS(aesenc, statereg)
890 +#define AES128_ENC_ROUNDS(statereg) \
891 + AES128_ROUNDS(aesenc, aesenclast, statereg)
892 +
893 +/* Same as the AES*_ENC_ROUNDS macros, but for decryption. */
894 +#define AES256_DEC_ROUNDS(statereg) \
895 + AES256_ROUNDS(aesdec, statereg)
896 +#define AES192_DEC_ROUNDS(statereg) \
897 + AES192_ROUNDS(aesdec, statereg)
898 +#define AES128_DEC_ROUNDS(statereg) \
899 + AES128_ROUNDS(aesdec, aesdeclast, statereg)
900 +
901 +
902 +/*
903 + * aes_encrypt_intel()
904 + * Encrypt a single block (in and out can overlap).
905 + *
906 + * For kernel code, caller is responsible for bracketing this call with
907 + * disabling kernel thread preemption and calling aes_accel_save/restore().
908 + *
909 + * Temporary register usage:
910 + * %xmm0 Key
911 + * %xmm8 State
912 + *
913 + * Original OpenSolaris Interface:
914 + * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
915 + * const uint32_t pt[4], uint32_t ct[4])
916 + *
917 + * Original Intel OpenSSL Interface:
918 + * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
919 + * const AES_KEY *key)
920 + */
921 +ENTRY_NP(aes_encrypt_intel)
922 + movups (%INP), %STATE0 / input
711 923 movaps (%KEYP), %KEY / key
924 +
712 925 #ifdef OPENSSL_INTERFACE
713 926 mov 240(%KEYP), %NROUNDS32 / round count
714 927 #else /* OpenSolaris Interface */
715 928 /* Round count is already present as P2 in %rsi/%esi */
716 929 #endif /* OPENSSL_INTERFACE */
717 930
718 - pxor %KEY, %STATE / round 0
931 + pxor %KEY, %STATE0 / round 0
719 932 lea 0x30(%KEYP), %KEYP
720 933 cmp $12, %NROUNDS
721 934 jb .Lenc128
722 935 lea 0x20(%KEYP), %KEYP
723 936 je .Lenc192
724 937
725 938 / AES 256
726 939 lea 0x20(%KEYP), %KEYP
727 - movaps -0x60(%KEYP), %KEY
728 - aesenc %KEY, %STATE
729 - movaps -0x50(%KEYP), %KEY
730 - aesenc %KEY, %STATE
940 + AES256_ENC_ROUNDS(STATE0)
731 941
732 942 .align 4
733 943 .Lenc192:
734 944 / AES 192 and 256
735 - movaps -0x40(%KEYP), %KEY
736 - aesenc %KEY, %STATE
737 - movaps -0x30(%KEYP), %KEY
738 - aesenc %KEY, %STATE
945 + AES192_ENC_ROUNDS(STATE0)
739 946
740 947 .align 4
741 948 .Lenc128:
742 949 / AES 128, 192, and 256
743 - movaps -0x20(%KEYP), %KEY
744 - aesenc %KEY, %STATE
745 - movaps -0x10(%KEYP), %KEY
746 - aesenc %KEY, %STATE
747 - movaps (%KEYP), %KEY
748 - aesenc %KEY, %STATE
749 - movaps 0x10(%KEYP), %KEY
750 - aesenc %KEY, %STATE
751 - movaps 0x20(%KEYP), %KEY
752 - aesenc %KEY, %STATE
753 - movaps 0x30(%KEYP), %KEY
754 - aesenc %KEY, %STATE
755 - movaps 0x40(%KEYP), %KEY
756 - aesenc %KEY, %STATE
757 - movaps 0x50(%KEYP), %KEY
758 - aesenc %KEY, %STATE
759 - movaps 0x60(%KEYP), %KEY
760 - aesenc %KEY, %STATE
761 - movaps 0x70(%KEYP), %KEY
762 - aesenclast %KEY, %STATE / last round
763 - movups %STATE, (%OUTP) / output
950 + AES128_ENC_ROUNDS(STATE0)
951 + movups %STATE0, (%OUTP) / output
764 952
765 - SET_TS_OR_POP_XMM0_XMM1(%r10)
766 953 ret
767 954 SET_SIZE(aes_encrypt_intel)
768 955
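The branching above merely selects how many aesenc rounds run before the final aesenclast. A hedged intrinsics sketch of the same structure (the flat ks[] indexing is an assumption for clarity; the real routine addresses the schedule relative to %KEYP):

#include <wmmintrin.h>

/* One block; nr = 10, 12 or 14 rounds. */
static __m128i
aes_enc_block(const __m128i *ks, int nr, __m128i st)
{
	st = _mm_xor_si128(st, ks[0]);		/* round 0 (whitening) */
	for (int r = 1; r < nr; r++)		/* rounds 1 .. nr - 1 */
		st = _mm_aesenc_si128(st, ks[r]);
	return (_mm_aesenclast_si128(st, ks[nr]));	/* last round */
}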
769 -
770 956 /*
771 957 * aes_decrypt_intel()
772 958 * Decrypt a single block (in and out can overlap).
773 959 *
774 - * For kernel code, caller is responsible for ensuring kpreempt_disable()
775 - * has been called. This is because %xmm registers are not saved/restored.
776 - * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
777 - * on entry. Otherwise, if TS is not set, save and restore %xmm registers
778 - * on the stack.
960 + * For kernel code, caller is responsible for bracketing this call with
961 + * disabling kernel thread preemption and calling aes_accel_save/restore().
779 962 *
780 963 * Temporary register usage:
781 964 * %xmm0 Key
782 965 * %xmm8 State
783 966 *
784 967 * Original OpenSolaris Interface:
785 968 * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
786 - * const uint32_t pt[4], uint32_t ct[4])/
969 + * const uint32_t pt[4], uint32_t ct[4])
787 970 *
788 971 * Original Intel OpenSSL Interface:
789 972 * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
790 973 * const AES_KEY *key);
791 974 */
792 975 ENTRY_NP(aes_decrypt_intel)
793 - CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
794 -
795 - movups (%INP), %STATE / input
976 + movups (%INP), %STATE0 / input
796 977 movaps (%KEYP), %KEY / key
978 +
797 979 #ifdef OPENSSL_INTERFACE
798 980 mov 240(%KEYP), %NROUNDS32 / round count
799 981 #else /* OpenSolaris Interface */
800 982 /* Round count is already present as P2 in %rsi/%esi */
801 983 #endif /* OPENSSL_INTERFACE */
802 984
803 - pxor %KEY, %STATE / round 0
985 + pxor %KEY, %STATE0 / round 0
804 986 lea 0x30(%KEYP), %KEYP
805 987 cmp $12, %NROUNDS
806 988 jb .Ldec128
807 989 lea 0x20(%KEYP), %KEYP
808 990 je .Ldec192
809 991
810 992 / AES 256
811 993 lea 0x20(%KEYP), %KEYP
812 - movaps -0x60(%KEYP), %KEY
813 - aesdec %KEY, %STATE
814 - movaps -0x50(%KEYP), %KEY
815 - aesdec %KEY, %STATE
994 + AES256_DEC_ROUNDS(STATE0)
816 995
817 996 .align 4
818 997 .Ldec192:
819 998 / AES 192 and 256
820 - movaps -0x40(%KEYP), %KEY
821 - aesdec %KEY, %STATE
822 - movaps -0x30(%KEYP), %KEY
823 - aesdec %KEY, %STATE
999 + AES192_DEC_ROUNDS(STATE0)
824 1000
825 1001 .align 4
826 1002 .Ldec128:
827 1003 / AES 128, 192, and 256
828 - movaps -0x20(%KEYP), %KEY
829 - aesdec %KEY, %STATE
830 - movaps -0x10(%KEYP), %KEY
831 - aesdec %KEY, %STATE
832 - movaps (%KEYP), %KEY
833 - aesdec %KEY, %STATE
834 - movaps 0x10(%KEYP), %KEY
835 - aesdec %KEY, %STATE
836 - movaps 0x20(%KEYP), %KEY
837 - aesdec %KEY, %STATE
838 - movaps 0x30(%KEYP), %KEY
839 - aesdec %KEY, %STATE
840 - movaps 0x40(%KEYP), %KEY
841 - aesdec %KEY, %STATE
842 - movaps 0x50(%KEYP), %KEY
843 - aesdec %KEY, %STATE
844 - movaps 0x60(%KEYP), %KEY
845 - aesdec %KEY, %STATE
846 - movaps 0x70(%KEYP), %KEY
847 - aesdeclast %KEY, %STATE / last round
848 - movups %STATE, (%OUTP) / output
1004 + AES128_DEC_ROUNDS(STATE0)
1005 + movups %STATE0, (%OUTP) / output
849 1006
850 - SET_TS_OR_POP_XMM0_XMM1(%r10)
851 1007 ret
852 1008 SET_SIZE(aes_decrypt_intel)
853 1009
1010 +/* Does a pipelined load of eight input blocks into our AES state registers. */
1011 +#define AES_LOAD_INPUT_8BLOCKS \
1012 + movups 0x00(%INP), %STATE0; \
1013 + movups 0x10(%INP), %STATE1; \
1014 + movups 0x20(%INP), %STATE2; \
1015 + movups 0x30(%INP), %STATE3; \
1016 + movups 0x40(%INP), %STATE4; \
1017 + movups 0x50(%INP), %STATE5; \
1018 + movups 0x60(%INP), %STATE6; \
1019 + movups 0x70(%INP), %STATE7;
1020 +
1021 +/* Does a pipelined store of eight AES state registers to the output. */
1022 +#define AES_STORE_OUTPUT_8BLOCKS \
1023 + movups %STATE0, 0x00(%OUTP); \
1024 + movups %STATE1, 0x10(%OUTP); \
1025 + movups %STATE2, 0x20(%OUTP); \
1026 + movups %STATE3, 0x30(%OUTP); \
1027 + movups %STATE4, 0x40(%OUTP); \
1028 + movups %STATE5, 0x50(%OUTP); \
1029 + movups %STATE6, 0x60(%OUTP); \
1030 + movups %STATE7, 0x70(%OUTP);
1031 +
1032 +/* Performs a pipelined AES instruction with the key on all state registers. */
1033 +#define AES_KEY_STATE_OP_8BLOCKS(op) \
1034 + op %KEY, %STATE0; \
1035 + op %KEY, %STATE1; \
1036 + op %KEY, %STATE2; \
1037 + op %KEY, %STATE3; \
1038 + op %KEY, %STATE4; \
1039 + op %KEY, %STATE5; \
1040 + op %KEY, %STATE6; \
1041 + op %KEY, %STATE7
1042 +
1043 +/* XOR all AES state regs with key to initiate encryption/decryption. */
1044 +#define AES_XOR_STATE_8BLOCKS \
1045 + AES_KEY_STATE_OP_8BLOCKS(pxor)
1046 +
1047 +/*
1048 + * Loads a round key from the key schedule offset `off' into the KEY
1049 + * register and performs `op' using the KEY on all 8 STATE registers.
1050 + */
1051 +#define AES_RND_8BLOCKS(op, off) \
1052 + movaps off(%KEYP), %KEY; \
1053 + AES_KEY_STATE_OP_8BLOCKS(op)
1054 +
1055 +/*
1056 + * void aes_encrypt_intel8(const uint32_t roundkeys[], int numrounds,
1057 + * const void *plaintext, void *ciphertext)
1058 + *
1059 + * Same as aes_encrypt_intel, but performs the encryption operation on
1060 + * 8 independent blocks in sequence, exploiting instruction pipelining.
1061 + * This function doesn't support the OpenSSL interface; it is meant for
1062 + * kernel use only.
1063 + */
1064 +ENTRY_NP(aes_encrypt_intel8)
1065 + AES_LOAD_INPUT_8BLOCKS / load input
1066 + movaps (%KEYP), %KEY / key
1067 + AES_XOR_STATE_8BLOCKS / round 0
1068 +
1069 + lea 0x30(%KEYP), %KEYP / point to key schedule
1070 + cmp $12, %NROUNDS / determine AES variant
1071 + jb .Lenc8_128
1072 + lea 0x20(%KEYP), %KEYP / AES192 has larger key schedule
1073 + je .Lenc8_192
1074 +
1075 + lea 0x20(%KEYP), %KEYP / AES256 has even larger key schedule
1076 + AES_RND_8BLOCKS(aesenc, -0x60) / AES256 R.1
1077 + AES_RND_8BLOCKS(aesenc, -0x50) / AES256 R.2
1078 +
1079 +.align 4
1080 +.Lenc8_192:
1081 + AES_RND_8BLOCKS(aesenc, -0x40) / AES192 R.1; AES256 R.3
1082 + AES_RND_8BLOCKS(aesenc, -0x30) / AES192 R.2; AES256 R.4
1083 +
1084 +.align 4
1085 +.Lenc8_128:
1086 + AES_RND_8BLOCKS(aesenc, -0x20) / AES128 R.1; AES192 R.3; AES256 R.5
1087 + AES_RND_8BLOCKS(aesenc, -0x10) / AES128 R.2; AES192 R.4; AES256 R.6
1088 + AES_RND_8BLOCKS(aesenc, 0x00) / AES128 R.3; AES192 R.5; AES256 R.7
1089 + AES_RND_8BLOCKS(aesenc, 0x10) / AES128 R.4; AES192 R.6; AES256 R.8
1090 + AES_RND_8BLOCKS(aesenc, 0x20) / AES128 R.5; AES192 R.7; AES256 R.9
1091 + AES_RND_8BLOCKS(aesenc, 0x30) / AES128 R.6; AES192 R.8; AES256 R.10
1092 + AES_RND_8BLOCKS(aesenc, 0x40) / AES128 R.7; AES192 R.9; AES256 R.11
1093 + AES_RND_8BLOCKS(aesenc, 0x50) / AES128 R.8; AES192 R.10; AES256 R.12
1094 + AES_RND_8BLOCKS(aesenc, 0x60) / AES128 R.9; AES192 R.11; AES256 R.13
1095 + AES_RND_8BLOCKS(aesenclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
1096 +
1097 + AES_STORE_OUTPUT_8BLOCKS / store output
1098 + ret
1099 + SET_SIZE(aes_encrypt_intel8)
1100 +
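The payoff of the 8-block form is latency hiding: each aesenc must wait for the previous round on the same block, but rounds on different blocks are independent, so interleaving eight streams keeps the AES unit busy. A hedged intrinsics sketch of the idea for AES-128:

#include <wmmintrin.h>

/* Eight independent blocks, round-major order as in the asm above. */
static void
aes128_encrypt8(const __m128i ks[11], __m128i blk[8])
{
	for (int b = 0; b < 8; b++)		/* round 0: whitening */
		blk[b] = _mm_xor_si128(blk[b], ks[0]);
	for (int r = 1; r < 10; r++)		/* rounds 1-9 */
		for (int b = 0; b < 8; b++)	/* independent streams */
			blk[b] = _mm_aesenc_si128(blk[b], ks[r]);
	for (int b = 0; b < 8; b++)		/* final round */
		blk[b] = _mm_aesenclast_si128(blk[b], ks[10]);
}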
1101 +
1102 +/*
1103 + * void aes_decrypt_intel8(const uint32_t roundkeys[], int numrounds,
1104 + * const void *ciphertext, void *plaintext)
1105 + *
1106 + * Same as aes_decrypt_intel, but performs the decryption operation on
1107 + * 8 independent blocks in sequence, exploiting instruction pipelining.
1108 + * This function doesn't support the OpenSSL interface; it is meant for
1109 + * kernel use only.
1110 + */
1111 +ENTRY_NP(aes_decrypt_intel8)
1112 + AES_LOAD_INPUT_8BLOCKS / load input
1113 + movaps (%KEYP), %KEY / key
1114 + AES_XOR_STATE_8BLOCKS / round 0
1115 +
1116 + lea 0x30(%KEYP), %KEYP / point to key schedule
1117 + cmp $12, %NROUNDS / determine AES variant
1118 + jb .Ldec8_128
1119 + lea 0x20(%KEYP), %KEYP / AES192 has larger key schedule
1120 + je .Ldec8_192
1121 +
1122 + lea 0x20(%KEYP), %KEYP / AES256 has even larger key schedule
1123 + AES_RND_8BLOCKS(aesdec, -0x60) / AES256 R.1
1124 + AES_RND_8BLOCKS(aesdec, -0x50) / AES256 R.2
1125 +
1126 +.align 4
1127 +.Ldec8_192:
1128 + AES_RND_8BLOCKS(aesdec, -0x40) / AES192 R.1; AES256 R.3
1129 + AES_RND_8BLOCKS(aesdec, -0x30) / AES192 R.2; AES256 R.4
1130 +
1131 +.align 4
1132 +.Ldec8_128:
1133 + AES_RND_8BLOCKS(aesdec, -0x20) / AES128 R.1; AES192 R.3; AES256 R.5
1134 + AES_RND_8BLOCKS(aesdec, -0x10) / AES128 R.2; AES192 R.4; AES256 R.6
1135 + AES_RND_8BLOCKS(aesdec, 0x00) / AES128 R.3; AES192 R.5; AES256 R.7
1136 + AES_RND_8BLOCKS(aesdec, 0x10) / AES128 R.4; AES192 R.6; AES256 R.8
1137 + AES_RND_8BLOCKS(aesdec, 0x20) / AES128 R.5; AES192 R.7; AES256 R.9
1138 + AES_RND_8BLOCKS(aesdec, 0x30) / AES128 R.6; AES192 R.8; AES256 R.10
1139 + AES_RND_8BLOCKS(aesdec, 0x40) / AES128 R.7; AES192 R.9; AES256 R.11
1140 + AES_RND_8BLOCKS(aesdec, 0x50) / AES128 R.8; AES192 R.10; AES256 R.12
1141 + AES_RND_8BLOCKS(aesdec, 0x60) / AES128 R.9; AES192 R.11; AES256 R.13
1142 + AES_RND_8BLOCKS(aesdeclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
1143 +
1144 + AES_STORE_OUTPUT_8BLOCKS / store output
1145 + ret
1146 + SET_SIZE(aes_decrypt_intel8)
1147 +
1148 +
1149 +/*
1150 + * This macro encapsulates the entire AES encryption algorithm for a
1151 + * single block, which is prefilled in statereg and which will be replaced
1152 + * by the encrypted output. The KEYP register must already point to the
1153 + * AES128 key schedule ("lea 0x30(%KEYP), %KEYP" from encryption
1154 + * function call) so that consecutive invocations of this macro are
1155 + * supported (KEYP is restored after each invocation).
1156 + */
1157 +#define AES_ENC(statereg, label_128, label_192, label_out) \
1158 + cmp $12, %NROUNDS; \
1159 + jb label_128; \
1160 + je label_192; \
1161 + /* AES 256 only */ \
1162 + lea 0x40(%KEYP), %KEYP; \
1163 + AES256_ENC_ROUNDS(statereg); \
1164 + AES192_ENC_ROUNDS(statereg); \
1165 + AES128_ENC_ROUNDS(statereg); \
1166 + lea -0x40(%KEYP), %KEYP; \
1167 + jmp label_out; \
1168 +.align 4; \
1169 +label_192: \
1170 + lea 0x20(%KEYP), %KEYP; \
1171 + /* AES 192 only */ \
1172 + AES192_ENC_ROUNDS(statereg); \
1173 + AES128_ENC_ROUNDS(statereg); \
1174 + lea -0x20(%KEYP), %KEYP; \
1175 + jmp label_out; \
1176 +.align 4; \
1177 +label_128: \
1178 + /* AES 128 only */ \
1179 + AES128_ENC_ROUNDS(statereg); \
1180 +.align 4; \
1181 +label_out:
1182 +
1183 +
1184 +/*
1185 + * void aes_encrypt_cbc_intel8(const uint32_t roundkeys[], int numrounds,
1186 + * const void *plaintext, void *ciphertext, const void *IV)
1187 + *
1188 + * Encrypts 8 consecutive AES blocks in CBC mode. Input and output
1189 + * may overlap. This provides a modest performance boost over invoking
1190 + * the encryption and XOR in separate functions because we can avoid
1191 + * copying the ciphertext block to and from memory between encryption
1192 + * and XOR calls.
1193 + */
1194 +#define CBC_IV r8 /* input - IV blk pointer */
1195 +#define CBC_IV_XMM xmm1 /* tmp IV location for alignment */
1196 +
1197 +ENTRY_NP(aes_encrypt_cbc_intel8)
1198 + AES_LOAD_INPUT_8BLOCKS / load input
1199 + movaps (%KEYP), %KEY / key
1200 + AES_XOR_STATE_8BLOCKS / round 0
1201 +
1202 + lea 0x30(%KEYP), %KEYP / point to key schedule
1203 + movdqu (%CBC_IV), %CBC_IV_XMM / load IV from unaligned memory
1204 + pxor %CBC_IV_XMM, %STATE0 / XOR IV with input block and encrypt
1205 + AES_ENC(STATE0, .Lenc_cbc_0_128, .Lenc_cbc_0_192, .Lenc_cbc_0_out)
1206 + pxor %STATE0, %STATE1
1207 + AES_ENC(STATE1, .Lenc_cbc_1_128, .Lenc_cbc_1_192, .Lenc_cbc_1_out)
1208 + pxor %STATE1, %STATE2
1209 + AES_ENC(STATE2, .Lenc_cbc_2_128, .Lenc_cbc_2_192, .Lenc_cbc_2_out)
1210 + pxor %STATE2, %STATE3
1211 + AES_ENC(STATE3, .Lenc_cbc_3_128, .Lenc_cbc_3_192, .Lenc_cbc_3_out)
1212 + pxor %STATE3, %STATE4
1213 + AES_ENC(STATE4, .Lenc_cbc_4_128, .Lenc_cbc_4_192, .Lenc_cbc_4_out)
1214 + pxor %STATE4, %STATE5
1215 + AES_ENC(STATE5, .Lenc_cbc_5_128, .Lenc_cbc_5_192, .Lenc_cbc_5_out)
1216 + pxor %STATE5, %STATE6
1217 + AES_ENC(STATE6, .Lenc_cbc_6_128, .Lenc_cbc_6_192, .Lenc_cbc_6_out)
1218 + pxor %STATE6, %STATE7
1219 + AES_ENC(STATE7, .Lenc_cbc_7_128, .Lenc_cbc_7_192, .Lenc_cbc_7_out)
1220 +
1221 + AES_STORE_OUTPUT_8BLOCKS / store output
1222 + ret
1223 + SET_SIZE(aes_encrypt_cbc_intel8)
1224 +
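Unlike the ECB and CTR paths, CBC encryption cannot be pipelined: block N's input depends on block N-1's ciphertext, which is why the function above only saves the memory round-trips between the XOR and encrypt steps. A hedged sketch of the chaining it performs (aes_enc_block() is the hypothetical single-block helper sketched after aes_encrypt_intel above):

#include <wmmintrin.h>

extern __m128i aes_enc_block(const __m128i *ks, int nr, __m128i st);

static void
cbc_encrypt8_sketch(const __m128i *ks, int nr, __m128i iv,
    const __m128i pt[8], __m128i ct[8])
{
	__m128i prev = iv;

	for (int i = 0; i < 8; i++) {
		/* Each block chains off the previous ciphertext. */
		prev = aes_enc_block(ks, nr, _mm_xor_si128(pt[i], prev));
		ct[i] = prev;
	}
}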
1225 +/*
1226 + * Prefills register state with counters suitable for the CTR encryption
1227 + * mode. The counter is assumed to consist of two portions:
1228 + * - A lower monotonically increasing 64-bit counter. If the caller wants
1229 + * a smaller counter, they are responsible for checking that it doesn't
1230 + * overflow between encryption calls.
1231 + * - An upper static "nonce" portion, in big endian, preloaded into the
1232 + * lower portion of an XMM register.
1233 + * This macro adds `ctridx' to the lower_LE counter, swaps the result to
1234 + * big endian and, by way of a temporary general-purpose register, loads
1235 + * the lower and upper counter portions into the target XMM result
1236 + * register, which can then be handed off to the encryption process.
1237 + */
1238 +#define PREP_CTR_BLOCKS(lower_LE, upper_BE_xmm, ctridx, tmpreg, resreg) \
1239 + lea ctridx(%lower_LE), %tmpreg; \
1240 + bswap %tmpreg; \
1241 + movq %tmpreg, %resreg; \
1242 + movlhps %upper_BE_xmm, %resreg; \
1243 + pshufd $0b01001110, %resreg, %resreg
1244 +
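In C terms, the macro assembles the following 16-byte big-endian counter block for block index i (a hedged sketch; __builtin_bswap64() stands in for the bswap instruction, and the movlhps/pshufd pair above does the equivalent register placement):

#include <stdint.h>
#include <string.h>

static void
make_ctr_block(uint64_t upper_BE, uint64_t lower_LE, uint64_t i,
    uint8_t blk[16])
{
	uint64_t lower_BE = __builtin_bswap64(lower_LE + i);

	(void) memcpy(blk, &upper_BE, 8);	/* static nonce half */
	(void) memcpy(blk + 8, &lower_BE, 8);	/* incrementing half */
}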
1245 +#define CTR_UPPER_BE r8 /* input - counter upper 64 bits (BE) */
1246 +#define CTR_UPPER_BE_XMM xmm1 /* tmp for upper counter bits */
1247 +#define CTR_LOWER_LE r9 /* input - counter lower 64 bits (LE) */
1248 +#define CTR_TMP0 rax /* tmp for lower 64 bit add & bswap */
1249 +#define CTR_TMP1 rbx /* tmp for lower 64 bit add & bswap */
1250 +#define CTR_TMP2 r10 /* tmp for lower 64 bit add & bswap */
1251 +#define CTR_TMP3 r11 /* tmp for lower 64 bit add & bswap */
1252 +#define CTR_TMP4 r12 /* tmp for lower 64 bit add & bswap */
1253 +#define CTR_TMP5 r13 /* tmp for lower 64 bit add & bswap */
1254 +#define CTR_TMP6 r14 /* tmp for lower 64 bit add & bswap */
1255 +#define CTR_TMP7 r15 /* tmp for lower 64 bit add & bswap */
1256 +
1257 +/*
1258 + * These are used in case CTR encryption input is unaligned before XORing.
1259 + * Must not overlap with any STATE[0-7] register.
1260 + */
1261 +#define TMP_INPUT0 xmm0
1262 +#define TMP_INPUT1 xmm1
1263 +#define TMP_INPUT2 xmm2
1264 +#define TMP_INPUT3 xmm3
1265 +#define TMP_INPUT4 xmm4
1266 +#define TMP_INPUT5 xmm5
1267 +#define TMP_INPUT6 xmm6
1268 +#define TMP_INPUT7 xmm7
1269 +
1270 +/*
1271 + * void aes_ctr_intel8(const uint32_t roundkeys[], int numrounds,
1272 + * const void *input, void *output, uint64_t counter_upper_BE,
1273 + * uint64_t counter_lower_LE)
1274 + *
1275 + * Runs AES on 8 consecutive blocks in counter mode (encryption and
1276 + * decryption in counter mode are the same).
1277 + */
1278 +ENTRY_NP(aes_ctr_intel8)
1279 + /* save caller's regs */
1280 + pushq %rbp
1281 + movq %rsp, %rbp
1282 + subq $0x38, %rsp
1283 + / CTR_TMP0 is rax, no need to save
1284 + movq %CTR_TMP1, -0x38(%rbp)
1285 + movq %CTR_TMP2, -0x30(%rbp)
1286 + movq %CTR_TMP3, -0x28(%rbp)
1287 + movq %CTR_TMP4, -0x20(%rbp)
1288 + movq %CTR_TMP5, -0x18(%rbp)
1289 + movq %CTR_TMP6, -0x10(%rbp)
1290 + movq %CTR_TMP7, -0x08(%rbp)
1291 +
1292 + /*
1293 + * CTR step 1: prepare big-endian formatted 128-bit counter values,
1294 + * placing the result in the AES-NI input state registers.
1295 + */
1296 + movq %CTR_UPPER_BE, %CTR_UPPER_BE_XMM
1297 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 0, CTR_TMP0, STATE0)
1298 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 1, CTR_TMP1, STATE1)
1299 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 2, CTR_TMP2, STATE2)
1300 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 3, CTR_TMP3, STATE3)
1301 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 4, CTR_TMP4, STATE4)
1302 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 5, CTR_TMP5, STATE5)
1303 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 6, CTR_TMP6, STATE6)
1304 + PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 7, CTR_TMP7, STATE7)
1305 +
1306 + /*
1307 + * CTR step 2: Encrypt the counters.
1308 + */
1309 + movaps (%KEYP), %KEY / key
1310 + AES_XOR_STATE_8BLOCKS / round 0
1311 +
1312 + /* Determine the AES variant we're going to compute */
1313 + lea 0x30(%KEYP), %KEYP / point to key schedule
1314 + cmp $12, %NROUNDS / determine AES variant
1315 + jb .Lctr8_128
1316 + lea 0x20(%KEYP), %KEYP / AES192 has larger key schedule
1317 + je .Lctr8_192
1318 +
1319 + /* AES 256 */
1320 + lea 0x20(%KEYP), %KEYP / AES256 has even larger key schedule
1321 + AES_RND_8BLOCKS(aesenc, -0x60) / AES256 R.1
1322 + AES_RND_8BLOCKS(aesenc, -0x50) / AES256 R.2
1323 +
1324 +.align 4
1325 +.Lctr8_192:
1326 + /* AES 192 and 256 */
1327 + AES_RND_8BLOCKS(aesenc, -0x40) / AES192 R.1; AES256 R.3
1328 + AES_RND_8BLOCKS(aesenc, -0x30) / AES192 R.2; AES256 R.4
1329 +
1330 +.align 4
1331 +.Lctr8_128:
1332 + /* AES 128, 192, and 256 */
1333 + AES_RND_8BLOCKS(aesenc, -0x20) / AES128 R.1; AES192 R.3; AES256 R.5
1334 + AES_RND_8BLOCKS(aesenc, -0x10) / AES128 R.2; AES192 R.4; AES256 R.6
1335 + AES_RND_8BLOCKS(aesenc, 0x00) / AES128 R.3; AES192 R.5; AES256 R.7
1336 + AES_RND_8BLOCKS(aesenc, 0x10) / AES128 R.4; AES192 R.6; AES256 R.8
1337 + AES_RND_8BLOCKS(aesenc, 0x20) / AES128 R.5; AES192 R.7; AES256 R.9
1338 + AES_RND_8BLOCKS(aesenc, 0x30) / AES128 R.6; AES192 R.8; AES256 R.10
1339 + AES_RND_8BLOCKS(aesenc, 0x40) / AES128 R.7; AES192 R.9; AES256 R.11
1340 + AES_RND_8BLOCKS(aesenc, 0x50) / AES128 R.8; AES192 R.10; AES256 R.12
1341 + AES_RND_8BLOCKS(aesenc, 0x60) / AES128 R.9; AES192 R.11; AES256 R.13
1342 + AES_RND_8BLOCKS(aesenclast, 0x70)/ AES128 R.10; AES192 R.12; AES256 R.14
1343 +
1344 + /*
1345 + * CTR step 3: XOR input data blocks with encrypted counters to
1346 + * produce result.
1347 + */
1348 + mov %INP, %rax / pxor requires alignment, so check
1349 + andq $0xf, %rax
1350 + jnz .Lctr_input_unaligned
1351 + pxor 0x00(%INP), %STATE0
1352 + pxor 0x10(%INP), %STATE1
1353 + pxor 0x20(%INP), %STATE2
1354 + pxor 0x30(%INP), %STATE3
1355 + pxor 0x40(%INP), %STATE4
1356 + pxor 0x50(%INP), %STATE5
1357 + pxor 0x60(%INP), %STATE6
1358 + pxor 0x70(%INP), %STATE7
1359 + jmp .Lctr_out
1360 +
1361 +.align 4
1362 +.Lctr_input_unaligned:
1363 + movdqu 0x00(%INP), %TMP_INPUT0
1364 + movdqu 0x10(%INP), %TMP_INPUT1
1365 + movdqu 0x20(%INP), %TMP_INPUT2
1366 + movdqu 0x30(%INP), %TMP_INPUT3
1367 + movdqu 0x40(%INP), %TMP_INPUT4
1368 + movdqu 0x50(%INP), %TMP_INPUT5
1369 + movdqu 0x60(%INP), %TMP_INPUT6
1370 + movdqu 0x70(%INP), %TMP_INPUT7
1371 + pxor %TMP_INPUT0, %STATE0
1372 + pxor %TMP_INPUT1, %STATE1
1373 + pxor %TMP_INPUT2, %STATE2
1374 + pxor %TMP_INPUT3, %STATE3
1375 + pxor %TMP_INPUT4, %STATE4
1376 + pxor %TMP_INPUT5, %STATE5
1377 + pxor %TMP_INPUT6, %STATE6
1378 + pxor %TMP_INPUT7, %STATE7
1379 +
1380 +.align 4
1381 +.Lctr_out:
1382 + /*
1383 + * Step 4: Write out processed blocks to memory.
1384 + */
1385 + movdqu %STATE0, 0x00(%OUTP)
1386 + movdqu %STATE1, 0x10(%OUTP)
1387 + movdqu %STATE2, 0x20(%OUTP)
1388 + movdqu %STATE3, 0x30(%OUTP)
1389 + movdqu %STATE4, 0x40(%OUTP)
1390 + movdqu %STATE5, 0x50(%OUTP)
1391 + movdqu %STATE6, 0x60(%OUTP)
1392 + movdqu %STATE7, 0x70(%OUTP)
1393 +
1394 + /* restore caller's regs */
1395 + / CTR_TMP0 is rax, no need to restore
1396 + movq -0x38(%rbp), %CTR_TMP1
1397 + movq -0x30(%rbp), %CTR_TMP2
1398 + movq -0x28(%rbp), %CTR_TMP3
1399 + movq -0x20(%rbp), %CTR_TMP4
1400 + movq -0x18(%rbp), %CTR_TMP5
1401 + movq -0x10(%rbp), %CTR_TMP6
1402 + movq -0x08(%rbp), %CTR_TMP7
1403 + leave
1404 + ret
1405 + SET_SIZE(aes_ctr_intel8)
1406 +
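A hedged sketch of a caller driving this routine across a longer buffer (the driver is hypothetical; per the PREP_CTR_BLOCKS comment, guarding the low 64-bit counter against overflow between calls is the caller's job, and the FPU bracketing shown for aes_accel_save applies in the kernel):

#include <stddef.h>
#include <stdint.h>

void aes_ctr_intel8(const uint32_t roundkeys[], int numrounds,
    const void *input, void *output, uint64_t counter_upper_BE,
    uint64_t counter_lower_LE);

/* Whole multiples of 8 blocks (128 bytes) only. */
static void
ctr_crypt_sketch(const uint32_t rk[], int nr, const uint8_t *in,
    uint8_t *out, size_t nchunks, uint64_t upper_BE, uint64_t lower_LE)
{
	for (size_t i = 0; i < nchunks; i++) {
		aes_ctr_intel8(rk, nr, in + i * 128, out + i * 128,
		    upper_BE, lower_LE);
		lower_LE += 8;	/* 8 counter values consumed per call */
	}
}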
854 1407 #endif /* lint || __lint */