/*
 * ====================================================================
 * Written by Intel Corporation for the OpenSSL project to add support
 * for Intel AES-NI instructions. Rights for redistribution and usage
 * in source and binary forms are granted according to the OpenSSL
 * license.
 *
 * Author: Huang Ying <ying.huang at intel dot com>
 *         Vinodh Gopal <vinodh.gopal at intel dot com>
 *         Kahraman Akdemir
 *
 * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
 * instructions that are going to be introduced in the next generation
 * of Intel processors, as of 2009. These instructions enable fast and
 * secure data encryption and decryption, using the Advanced Encryption
 * Standard (AES), defined by FIPS Publication number 197. The
 * architecture introduces six instructions that offer full hardware
 * support for AES. Four of them support high performance data
 * encryption and decryption, and the other two instructions support
 * the AES key expansion procedure.
 * ====================================================================
 */

/*
 * ====================================================================
 * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

/*
 * ====================================================================
 * OpenSolaris OS modifications
 *
 * This source originates as files aes-intel.S and eng_aesni_asm.pl, in
 * patches sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by
 * Huang Ying of Intel to the openssl-dev mailing list under the subject
 * of "Add support to Intel AES-NI instruction set for x86_64 platform".
 *
 * This OpenSolaris version has these major changes from the original source:
 *
 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
 * definitions for lint.
 *
 * 2. Formatted code, added comments, and added #includes and #defines.
 *
 * 3. If bit CR0.TS is set, clear the TS bit on entry and set it again on
 * exit (bracketed by calls to kpreempt_disable() and kpreempt_enable()).
 * If the TS bit is not set, save and restore the %xmm registers at the
 * beginning and end of function calls (%xmm* registers are not saved and
 * restored during kernel thread preemption).
 *
 * 4. Renamed functions, reordered parameters, and changed return value
 * to match OpenSolaris:
 *
 * OpenSSL interface:
 *	int intel_AES_set_encrypt_key(const unsigned char *userKey,
 *	    const int bits, AES_KEY *key);
 *	int intel_AES_set_decrypt_key(const unsigned char *userKey,
 *	    const int bits, AES_KEY *key);
 *	Return values for above are non-zero on error, 0 on success.
 *
 *	void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
 *	    const AES_KEY *key);
 *	void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
 *	    const AES_KEY *key);
 *	typedef struct aes_key_st {
 *		unsigned int	rd_key[4 * (AES_MAXNR + 1)];
 *		int		rounds;
 *		unsigned int	pad[3];
 *	} AES_KEY;
 *	Note: AES_LONG is undefined (that is, Intel uses 32-bit key
 *	schedules (ks32) instead of 64-bit (ks64)).
 *	Number of rounds (aka round count) is at offset 240 of AES_KEY.
 *
 * OpenSolaris OS interface (#ifdefs removed for readability):
 *	int rijndael_key_setup_dec_intel(uint32_t rk[],
 *	    const uint32_t cipherKey[], uint64_t keyBits);
 *	int rijndael_key_setup_enc_intel(uint32_t rk[],
 *	    const uint32_t cipherKey[], uint64_t keyBits);
 *	Return values for above are 0 on error, number of rounds on success.
 *
 *	void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
 *	    const uint32_t pt[4], uint32_t ct[4]);
 *	void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
 *	    const uint32_t pt[4], uint32_t ct[4]);
 *	typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
 *		uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
 *
 *	typedef union {
 *		uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
 *	} aes_ks_t;
 *	typedef struct aes_key {
 *		aes_ks_t	encr_ks, decr_ks;
 *		long double	align128;
 *		int		flags, nr, type;
 *	} aes_key_t;
 *
 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plaintext,
 * ct is ciphertext, and MAX_AES_NR is 14.
 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
 *
 * Note2: aes_ks_t must be aligned on a 0 mod 128-bit (16-byte) boundary.
 *
 * ====================================================================
 */
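
/*
 * For orientation, a hedged C sketch of calling the OpenSolaris
 * interface above (`cipherkey' is a hypothetical variable used only for
 * illustration; kernel FPU state handling is omitted here and shown
 * with aes_accel_save()/aes_accel_restore() further below):
 *
 *	uint32_t ks[(14 + 1) * 4] __attribute__((aligned(16)));
 *	uint32_t pt[4], ct[4];
 *	int nr;
 *
 *	nr = rijndael_key_setup_enc_intel(ks, cipherkey, 128);
 *	if (nr != 0)	/ 0 indicates failure; else nr = round count
 *		aes_encrypt_intel((const aes_ks_t *)ks, nr, pt, ct);
 */
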
/*
 * Copyright 2015 by Saso Kiselkov. All rights reserved.
 */

#if defined(lint) || defined(__lint)

#include <sys/types.h>

/* ARGSUSED */
void
aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
    uint32_t ct[4]) {
}
/* ARGSUSED */
void
aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
    uint32_t pt[4]) {
}
/* ARGSUSED */
int
rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
    uint64_t keyBits) {
        return (0);
}
/* ARGSUSED */
int
rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
    uint64_t keyBits) {
        return (0);
}


#else   /* lint */

#include <sys/asm_linkage.h>
#include <sys/controlregs.h>
#ifdef _KERNEL
#include <sys/machprivregs.h>
#endif

#ifdef _KERNEL
/*
 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is,
 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
 * uses it to pass P2 to syscall.
 * This also occurs with the STTS macro, but we don't care if
 * P2 (%rsi) is modified just before function exit.
 * The CLTS and STTS macros push and pop P1 (%rdi) already.
 */
#ifdef __xpv
#define PROTECTED_CLTS \
        push %rsi; \
        CLTS; \
        pop %rsi
#else
#define PROTECTED_CLTS \
        CLTS
#endif  /* __xpv */

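/*
 * If CR0_TS is not set, align stack (with push %rbp) and push
 * %xmm0 and %xmm1 on stack, otherwise clear CR0_TS.
 */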
#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg) \
        push %rbp; \
        mov %rsp, %rbp; \
        movq %cr0, tmpreg; \
        testq $CR0_TS, tmpreg; \
        jnz 1f; \
        and $-XMM_ALIGN, %rsp; \
        sub $[XMM_SIZE * 2], %rsp; \
        movaps %xmm0, 16(%rsp); \
        movaps %xmm1, (%rsp); \
        jmp 2f; \
1: \
        PROTECTED_CLTS; \
2:

/*
 * If CR0_TS was not set above, pop %xmm0 and %xmm1 off stack,
 * otherwise set CR0_TS.
 */
#define SET_TS_OR_POP_XMM0_XMM1(tmpreg) \
        testq $CR0_TS, tmpreg; \
        jnz 1f; \
        movaps (%rsp), %xmm1; \
        movaps 16(%rsp), %xmm0; \
        jmp 2f; \
1: \
        STTS(tmpreg); \
2: \
        mov %rbp, %rsp; \
        pop %rbp
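
/*
 * Typical usage pattern for the macro pair above (a sketch; the actual
 * callers are the key-setup functions later in this file). The same
 * temporary register must be passed to both macros, since it carries
 * the saved CR0 value from one to the other and must not be clobbered
 * in between:
 *
 *	CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
 *	... code that uses %xmm0 and %xmm1 ...
 *	SET_TS_OR_POP_XMM0_XMM1(%r10)
 */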

/*
 * If CR0_TS is not set, align stack (with push %rbp) and push
 * %xmm0 - %xmm6 on stack, otherwise clear CR0_TS
 */
#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg) \
        push %rbp; \
        mov %rsp, %rbp; \
        movq %cr0, tmpreg; \
        testq $CR0_TS, tmpreg; \
        jnz 1f; \
        and $-XMM_ALIGN, %rsp; \
        sub $[XMM_SIZE * 7], %rsp; \
        movaps %xmm0, 96(%rsp); \
        movaps %xmm1, 80(%rsp); \
        movaps %xmm2, 64(%rsp); \
        movaps %xmm3, 48(%rsp); \
        movaps %xmm4, 32(%rsp); \
        movaps %xmm5, 16(%rsp); \
        movaps %xmm6, (%rsp); \
        jmp 2f; \
1: \
        PROTECTED_CLTS; \
2:


/*
 * If CR0_TS was not set above, pop %xmm0 - %xmm6 off stack,
 * otherwise set CR0_TS.
 */
#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) \
        testq $CR0_TS, tmpreg; \
        jnz 1f; \
        movaps (%rsp), %xmm6; \
        movaps 16(%rsp), %xmm5; \
        movaps 32(%rsp), %xmm4; \
        movaps 48(%rsp), %xmm3; \
        movaps 64(%rsp), %xmm2; \
        movaps 80(%rsp), %xmm1; \
        movaps 96(%rsp), %xmm0; \
        jmp 2f; \
1: \
        STTS(tmpreg); \
2: \
        mov %rbp, %rsp; \
        pop %rbp

/*
 * void aes_accel_save(void *savestate);
 *
 * Saves all 16 XMM registers and CR0 to a temporary location pointed to
 * by the first argument and clears TS in CR0. This must be invoked before
 * executing any floating point operations inside the kernel (and kernel
 * thread preemption must be disabled as well). The memory region to which
 * all state is saved must be at least 16 x 128 bits plus 64 bits
 * (0x108 bytes) long and must be 128-bit (16-byte) aligned.
 */
ENTRY_NP(aes_accel_save)
        movq %cr0, %rax
        movq %rax, 0x100(%rdi)
        testq $CR0_TS, %rax
        jnz 1f
        movaps %xmm0, 0x00(%rdi)
        movaps %xmm1, 0x10(%rdi)
        movaps %xmm2, 0x20(%rdi)
        movaps %xmm3, 0x30(%rdi)
        movaps %xmm4, 0x40(%rdi)
        movaps %xmm5, 0x50(%rdi)
        movaps %xmm6, 0x60(%rdi)
        movaps %xmm7, 0x70(%rdi)
        movaps %xmm8, 0x80(%rdi)
        movaps %xmm9, 0x90(%rdi)
        movaps %xmm10, 0xa0(%rdi)
        movaps %xmm11, 0xb0(%rdi)
        movaps %xmm12, 0xc0(%rdi)
        movaps %xmm13, 0xd0(%rdi)
        movaps %xmm14, 0xe0(%rdi)
        movaps %xmm15, 0xf0(%rdi)
        ret
1:
        PROTECTED_CLTS
        ret
SET_SIZE(aes_accel_save)

/*
 * void aes_accel_restore(void *savestate);
 *
 * Restores the saved XMM and CR0.TS state from aes_accel_save.
 */
ENTRY_NP(aes_accel_restore)
        mov 0x100(%rdi), %rax
        testq $CR0_TS, %rax
        jnz 1f
        movaps 0x00(%rdi), %xmm0
        movaps 0x10(%rdi), %xmm1
        movaps 0x20(%rdi), %xmm2
        movaps 0x30(%rdi), %xmm3
        movaps 0x40(%rdi), %xmm4
        movaps 0x50(%rdi), %xmm5
        movaps 0x60(%rdi), %xmm6
        movaps 0x70(%rdi), %xmm7
        movaps 0x80(%rdi), %xmm8
        movaps 0x90(%rdi), %xmm9
        movaps 0xa0(%rdi), %xmm10
        movaps 0xb0(%rdi), %xmm11
        movaps 0xc0(%rdi), %xmm12
        movaps 0xd0(%rdi), %xmm13
        movaps 0xe0(%rdi), %xmm14
        movaps 0xf0(%rdi), %xmm15
        ret
1:
        STTS(%rax)
        ret
SET_SIZE(aes_accel_restore)
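
/*
 * A hedged sketch of how a kernel caller might bracket AES-NI work with
 * the two functions above, per the comments (`savestate' is a
 * hypothetical, suitably sized buffer):
 *
 *	uint8_t savestate[16 * 16 + 8] __attribute__((aligned(16)));
 *
 *	kpreempt_disable();
 *	aes_accel_save(savestate);
 *	... AES-NI work (key setup, encryption, decryption) ...
 *	aes_accel_restore(savestate);
 *	kpreempt_enable();
 */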

#else
#define PROTECTED_CLTS
#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
#define SET_TS_OR_POP_XMM0_XMM1(tmpreg)
#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)
#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg)
#endif  /* _KERNEL */


/*
 * _key_expansion_128(), _key_expansion_192a(), _key_expansion_192b(),
 * _key_expansion_256a(), _key_expansion_256b()
 *
 * Helper functions called by rijndael_key_setup_enc_intel().
 * Also used indirectly by rijndael_key_setup_dec_intel().
 *
 * Input:
 *	%xmm0	User-provided cipher key
 *	%xmm1	Round constant
 * Output:
 *	(%rcx)	AES key
 */
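
/*
 * For reference, a sketch of the AES-128 expansion step that
 * _key_expansion_128 computes, in FIPS-197 style pseudo-C (w[0..3] is
 * the previous round key; aeskeygenassist delivers
 * SubWord(RotWord(w[3])) ^ Rcon in the dword that the pshufd below
 * broadcasts across %xmm1):
 *
 *	t = SubWord(RotWord(w[3])) ^ Rcon;
 *	w[0] ^= t;
 *	w[1] ^= w[0];
 *	w[2] ^= w[1];
 *	w[3] ^= w[2];
 *
 * The shufps/pxor pairs compute the cumulative XOR of the four words
 * entirely within the XMM register file.
 */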

        .align 16
_key_expansion_128:
_key_expansion_256a:
        pshufd $0b11111111, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0
        movaps %xmm0, (%rcx)
        add $0x10, %rcx
        ret
SET_SIZE(_key_expansion_128)
SET_SIZE(_key_expansion_256a)

        .align 16
_key_expansion_192a:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0

        movaps %xmm2, %xmm5
        movaps %xmm2, %xmm6
        pslldq $4, %xmm5
        pshufd $0b11111111, %xmm0, %xmm3
        pxor %xmm3, %xmm2
        pxor %xmm5, %xmm2

        movaps %xmm0, %xmm1
        shufps $0b01000100, %xmm0, %xmm6
        movaps %xmm6, (%rcx)
        shufps $0b01001110, %xmm2, %xmm1
        movaps %xmm1, 0x10(%rcx)
        add $0x20, %rcx
        ret
SET_SIZE(_key_expansion_192a)

        .align 16
_key_expansion_192b:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0

        movaps %xmm2, %xmm5
        pslldq $4, %xmm5
        pshufd $0b11111111, %xmm0, %xmm3
        pxor %xmm3, %xmm2
        pxor %xmm5, %xmm2

        movaps %xmm0, (%rcx)
        add $0x10, %rcx
        ret
SET_SIZE(_key_expansion_192b)

        .align 16
_key_expansion_256b:
        pshufd $0b10101010, %xmm1, %xmm1
        shufps $0b00010000, %xmm2, %xmm4
        pxor %xmm4, %xmm2
        shufps $0b10001100, %xmm2, %xmm4
        pxor %xmm4, %xmm2
        pxor %xmm1, %xmm2
        movaps %xmm2, (%rcx)
        add $0x10, %rcx
        ret
SET_SIZE(_key_expansion_256b)

/*
 * void aes_copy_intel(const uint8_t *src, uint8_t *dst);
 *
 * Copies one unaligned 128-bit block from `src' to `dst'. The copy is
 * performed using FPU registers, so make sure FPU state is saved when
 * running this in the kernel.
 */
ENTRY_NP(aes_copy_intel)
        movdqu (%rdi), %xmm0
        movdqu %xmm0, (%rsi)
        ret
SET_SIZE(aes_copy_intel)

/*
 * void aes_xor_intel(const uint8_t *src, uint8_t *dst);
 *
 * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 * stores the result at `dst'. The XOR is performed using FPU registers,
 * so make sure FPU state is saved when running this in the kernel.
 */
ENTRY_NP(aes_xor_intel)
        movdqu (%rdi), %xmm0
        movdqu (%rsi), %xmm1
        pxor %xmm1, %xmm0
        movdqu %xmm0, (%rsi)
        ret
SET_SIZE(aes_xor_intel)

/*
 * void aes_xor_intel8(const uint8_t *src, uint8_t *dst);
 *
 * XORs eight pairs of consecutive unaligned 128-bit blocks from `src' and
 * `dst' and stores the results at `dst'. The XOR is performed using FPU
 * registers, so make sure FPU state is saved when running this in the kernel.
 */
ENTRY_NP(aes_xor_intel8)
        movdqu 0x00(%rdi), %xmm0
        movdqu 0x00(%rsi), %xmm1
        movdqu 0x10(%rdi), %xmm2
        movdqu 0x10(%rsi), %xmm3
        movdqu 0x20(%rdi), %xmm4
        movdqu 0x20(%rsi), %xmm5
        movdqu 0x30(%rdi), %xmm6
        movdqu 0x30(%rsi), %xmm7
        movdqu 0x40(%rdi), %xmm8
        movdqu 0x40(%rsi), %xmm9
        movdqu 0x50(%rdi), %xmm10
        movdqu 0x50(%rsi), %xmm11
        movdqu 0x60(%rdi), %xmm12
        movdqu 0x60(%rsi), %xmm13
        movdqu 0x70(%rdi), %xmm14
        movdqu 0x70(%rsi), %xmm15
        pxor %xmm1, %xmm0
        pxor %xmm3, %xmm2
        pxor %xmm5, %xmm4
        pxor %xmm7, %xmm6
        pxor %xmm9, %xmm8
        pxor %xmm11, %xmm10
        pxor %xmm13, %xmm12
        pxor %xmm15, %xmm14
        movdqu %xmm0, 0x00(%rsi)
        movdqu %xmm2, 0x10(%rsi)
        movdqu %xmm4, 0x20(%rsi)
        movdqu %xmm6, 0x30(%rsi)
        movdqu %xmm8, 0x40(%rsi)
        movdqu %xmm10, 0x50(%rsi)
        movdqu %xmm12, 0x60(%rsi)
        movdqu %xmm14, 0x70(%rsi)
        ret
SET_SIZE(aes_xor_intel8)
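
/*
 * For clarity, a sketch of the C equivalent of aes_xor_intel8:
 *
 *	for (int i = 0; i < 8 * 16; i++)
 *		dst[i] ^= src[i];
 *
 * The assembly version simply performs this 128 bits at a time, with
 * all loads issued up front to exploit pipelining.
 */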

/*
 * rijndael_key_setup_enc_intel()
 * Expand the cipher key into the encryption key schedule.
 *
 * For kernel code, caller is responsible for ensuring kpreempt_disable()
 * has been called. This is because %xmm registers are not saved/restored.
 * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
 * on entry. Otherwise, if TS is not set, save and restore %xmm registers
 * on the stack.
 *
 * OpenSolaris interface:
 * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
 *	uint64_t keyBits);
 * Return value is 0 on error, number of rounds on success.
 *
 * Original Intel OpenSSL interface:
 * int intel_AES_set_encrypt_key(const unsigned char *userKey,
 *	const int bits, AES_KEY *key);
 * Return value is non-zero on error, 0 on success.
 */

#ifdef OPENSSL_INTERFACE
#define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key
#define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key

#define USERCIPHERKEY rdi       /* P1, 64 bits */
#define KEYSIZE32 esi           /* P2, 32 bits */
#define KEYSIZE64 rsi           /* P2, 64 bits */
#define AESKEY rdx              /* P3, 64 bits */

#else   /* OpenSolaris Interface */
#define AESKEY rdi              /* P1, 64 bits */
#define USERCIPHERKEY rsi       /* P2, 64 bits */
#define KEYSIZE32 edx           /* P3, 32 bits */
#define KEYSIZE64 rdx           /* P3, 64 bits */
#endif  /* OPENSSL_INTERFACE */

#define ROUNDS32 KEYSIZE32      /* temp */
#define ROUNDS64 KEYSIZE64      /* temp */
#define ENDAESKEY USERCIPHERKEY /* temp */


ENTRY_NP(rijndael_key_setup_enc_intel)
        CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(%r10)

        / NULL pointer sanity check
        test %USERCIPHERKEY, %USERCIPHERKEY
        jz .Lenc_key_invalid_param
        test %AESKEY, %AESKEY
        jz .Lenc_key_invalid_param

        movups (%USERCIPHERKEY), %xmm0  / user key (first 16 bytes)
        movaps %xmm0, (%AESKEY)
        lea 0x10(%AESKEY), %rcx         / key addr
        pxor %xmm4, %xmm4               / xmm4 is assumed 0 in _key_expansion_x

        cmp $256, %KEYSIZE32
        jnz .Lenc_key192

        / AES 256: 14 rounds in encryption key schedule
#ifdef OPENSSL_INTERFACE
        mov $14, %ROUNDS32
        movl %ROUNDS32, 240(%AESKEY)    / key.rounds = 14
#endif  /* OPENSSL_INTERFACE */

        movups 0x10(%USERCIPHERKEY), %xmm2      / other user key (2nd 16 bytes)
        movaps %xmm2, (%rcx)
        add $0x10, %rcx

        aeskeygenassist $0x1, %xmm2, %xmm1      / expand the key
        call _key_expansion_256a
        aeskeygenassist $0x1, %xmm0, %xmm1
        call _key_expansion_256b
        aeskeygenassist $0x2, %xmm2, %xmm1      / expand the key
        call _key_expansion_256a
        aeskeygenassist $0x2, %xmm0, %xmm1
        call _key_expansion_256b
        aeskeygenassist $0x4, %xmm2, %xmm1      / expand the key
        call _key_expansion_256a
        aeskeygenassist $0x4, %xmm0, %xmm1
        call _key_expansion_256b
        aeskeygenassist $0x8, %xmm2, %xmm1      / expand the key
        call _key_expansion_256a
        aeskeygenassist $0x8, %xmm0, %xmm1
        call _key_expansion_256b
        aeskeygenassist $0x10, %xmm2, %xmm1     / expand the key
        call _key_expansion_256a
        aeskeygenassist $0x10, %xmm0, %xmm1
        call _key_expansion_256b
        aeskeygenassist $0x20, %xmm2, %xmm1     / expand the key
        call _key_expansion_256a
        aeskeygenassist $0x20, %xmm0, %xmm1
        call _key_expansion_256b
        aeskeygenassist $0x40, %xmm2, %xmm1     / expand the key
        call _key_expansion_256a

        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef OPENSSL_INTERFACE
        xor %rax, %rax                  / return 0 (OK)
#else   /* OpenSolaris Interface */
        mov $14, %rax                   / return # rounds = 14
#endif
        ret

        .align 4
.Lenc_key192:
        cmp $192, %KEYSIZE32
        jnz .Lenc_key128

        / AES 192: 12 rounds in encryption key schedule
#ifdef OPENSSL_INTERFACE
        mov $12, %ROUNDS32
        movl %ROUNDS32, 240(%AESKEY)    / key.rounds = 12
#endif  /* OPENSSL_INTERFACE */

        movq 0x10(%USERCIPHERKEY), %xmm2        / other user key
        aeskeygenassist $0x1, %xmm2, %xmm1      / expand the key
        call _key_expansion_192a
        aeskeygenassist $0x2, %xmm2, %xmm1      / expand the key
        call _key_expansion_192b
        aeskeygenassist $0x4, %xmm2, %xmm1      / expand the key
        call _key_expansion_192a
        aeskeygenassist $0x8, %xmm2, %xmm1      / expand the key
        call _key_expansion_192b
        aeskeygenassist $0x10, %xmm2, %xmm1     / expand the key
        call _key_expansion_192a
        aeskeygenassist $0x20, %xmm2, %xmm1     / expand the key
        call _key_expansion_192b
        aeskeygenassist $0x40, %xmm2, %xmm1     / expand the key
        call _key_expansion_192a
        aeskeygenassist $0x80, %xmm2, %xmm1     / expand the key
        call _key_expansion_192b

        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef OPENSSL_INTERFACE
        xor %rax, %rax                  / return 0 (OK)
#else   /* OpenSolaris Interface */
        mov $12, %rax                   / return # rounds = 12
#endif
        ret

        .align 4
.Lenc_key128:
        cmp $128, %KEYSIZE32
        jnz .Lenc_key_invalid_key_bits

        / AES 128: 10 rounds in encryption key schedule
#ifdef OPENSSL_INTERFACE
        mov $10, %ROUNDS32
        movl %ROUNDS32, 240(%AESKEY)    / key.rounds = 10
#endif  /* OPENSSL_INTERFACE */

        aeskeygenassist $0x1, %xmm0, %xmm1      / expand the key
        call _key_expansion_128
        aeskeygenassist $0x2, %xmm0, %xmm1      / expand the key
        call _key_expansion_128
        aeskeygenassist $0x4, %xmm0, %xmm1      / expand the key
        call _key_expansion_128
        aeskeygenassist $0x8, %xmm0, %xmm1      / expand the key
        call _key_expansion_128
        aeskeygenassist $0x10, %xmm0, %xmm1     / expand the key
        call _key_expansion_128
        aeskeygenassist $0x20, %xmm0, %xmm1     / expand the key
        call _key_expansion_128
        aeskeygenassist $0x40, %xmm0, %xmm1     / expand the key
        call _key_expansion_128
        aeskeygenassist $0x80, %xmm0, %xmm1     / expand the key
        call _key_expansion_128
        aeskeygenassist $0x1b, %xmm0, %xmm1     / expand the key
        call _key_expansion_128
        aeskeygenassist $0x36, %xmm0, %xmm1     / expand the key
        call _key_expansion_128

        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef OPENSSL_INTERFACE
        xor %rax, %rax                  / return 0 (OK)
#else   /* OpenSolaris Interface */
        mov $10, %rax                   / return # rounds = 10
#endif
        ret

.Lenc_key_invalid_param:
#ifdef OPENSSL_INTERFACE
        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
        mov $-1, %rax                   / user key or AES key pointer is NULL
        ret
#else
        /* FALLTHROUGH */
#endif  /* OPENSSL_INTERFACE */

.Lenc_key_invalid_key_bits:
        SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef OPENSSL_INTERFACE
        mov $-2, %rax                   / keysize is invalid
#else   /* OpenSolaris Interface */
        xor %rax, %rax                  / a key pointer is NULL or invalid keysize
#endif  /* OPENSSL_INTERFACE */

        ret
SET_SIZE(rijndael_key_setup_enc_intel)


/*
 * rijndael_key_setup_dec_intel()
 * Expand the cipher key into the decryption key schedule.
 *
 * For kernel code, caller is responsible for ensuring kpreempt_disable()
 * has been called. This is because %xmm registers are not saved/restored.
 * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
 * on entry. Otherwise, if TS is not set, save and restore %xmm registers
 * on the stack.
 *
 * OpenSolaris interface:
 * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
 *	uint64_t keyBits);
 * Return value is 0 on error, number of rounds on success.
 * P1->P2, P2->P3, P3->P1
 *
 * Original Intel OpenSSL interface:
 * int intel_AES_set_decrypt_key(const unsigned char *userKey,
 *	const int bits, AES_KEY *key);
 * Return value is non-zero on error, 0 on success.
 */
ENTRY_NP(rijndael_key_setup_dec_intel)
        / Generate round keys used for encryption
        call rijndael_key_setup_enc_intel
        test %rax, %rax
#ifdef OPENSSL_INTERFACE
        jnz .Ldec_key_exit      / Failed if returned non-0
#else   /* OpenSolaris Interface */
        jz .Ldec_key_exit       / Failed if returned 0
#endif  /* OPENSSL_INTERFACE */

        CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)

        /*
         * Convert round keys used for encryption
         * to a form usable for decryption
         */
#ifndef OPENSSL_INTERFACE       /* OpenSolaris Interface */
        mov %rax, %ROUNDS64     / set # rounds (10, 12, or 14)
                                / (already set for OpenSSL)
#endif

        lea 0x10(%AESKEY), %rcx / key addr
        shl $4, %ROUNDS32
        add %AESKEY, %ROUNDS64
        mov %ROUNDS64, %ENDAESKEY

        .align 4
.Ldec_key_reorder_loop:
        movaps (%AESKEY), %xmm0
        movaps (%ROUNDS64), %xmm1
        movaps %xmm0, (%ROUNDS64)
        movaps %xmm1, (%AESKEY)
        lea 0x10(%AESKEY), %AESKEY
        lea -0x10(%ROUNDS64), %ROUNDS64
        cmp %AESKEY, %ROUNDS64
        ja .Ldec_key_reorder_loop

        .align 4
.Ldec_key_inv_loop:
        movaps (%rcx), %xmm0
        / Convert an encryption round key to a form usable for decryption
        / with the "AES Inverse Mix Columns" instruction
        aesimc %xmm0, %xmm1
        movaps %xmm1, (%rcx)
        lea 0x10(%rcx), %rcx
        cmp %ENDAESKEY, %rcx
        jnz .Ldec_key_inv_loop

        SET_TS_OR_POP_XMM0_XMM1(%r10)

.Ldec_key_exit:
        / OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
        / OpenSSL: rax = 0 for OK, or non-zero for error
        ret
SET_SIZE(rijndael_key_setup_dec_intel)
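
/*
 * Conceptually (a sketch of the conversion performed above, per the
 * FIPS-197 "Equivalent Inverse Cipher"):
 *
 *	reverse the order of the Nr + 1 round keys;
 *	for (i = 1; i < Nr; i++)	/ all but first and last
 *		rk[i] = InvMixColumns(rk[i]);	/ the aesimc instruction
 *
 * so the decryption code below can walk the converted schedule with the
 * same forward round structure as encryption.
 */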


#ifdef OPENSSL_INTERFACE
#define aes_encrypt_intel intel_AES_encrypt
#define aes_decrypt_intel intel_AES_decrypt

#define INP rdi         /* P1, 64 bits */
#define OUTP rsi        /* P2, 64 bits */
#define KEYP rdx        /* P3, 64 bits */

/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */
#define NROUNDS32 ecx   /* temporary, 32 bits */
#define NROUNDS cl      /* temporary, 8 bits */

#else   /* OpenSolaris Interface */
#define KEYP rdi        /* P1, 64 bits */
#define NROUNDS esi     /* P2, 32 bits */
#define INP rdx         /* P3, 64 bits */
#define OUTP rcx        /* P4, 64 bits */
#define LENGTH r8       /* P5, 64 bits */
#endif  /* OPENSSL_INTERFACE */

#define KEY xmm0        /* temporary, 128 bits */
#define STATE0 xmm8     /* temporary, 128 bits */
#define STATE1 xmm9     /* temporary, 128 bits */
#define STATE2 xmm10    /* temporary, 128 bits */
#define STATE3 xmm11    /* temporary, 128 bits */
#define STATE4 xmm12    /* temporary, 128 bits */
#define STATE5 xmm13    /* temporary, 128 bits */
#define STATE6 xmm14    /* temporary, 128 bits */
#define STATE7 xmm15    /* temporary, 128 bits */

/*
 * Runs the first two rounds of AES256 on a state register. `op' should be
 * aesenc or aesdec.
 */
#define AES256_ROUNDS(op, statereg) \
        movaps -0x60(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps -0x50(%KEYP), %KEY; \
        op %KEY, %statereg

/*
 * Runs the first two rounds of AES192, or the 3rd & 4th rounds of AES256 on
 * a state register. `op' should be aesenc or aesdec.
 */
#define AES192_ROUNDS(op, statereg) \
        movaps -0x40(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps -0x30(%KEYP), %KEY; \
        op %KEY, %statereg

/*
 * Runs the full 10 rounds of AES128, or the last 10 rounds of AES192/AES256
 * on a state register. `op' should be aesenc or aesdec and `lastop' should
 * be aesenclast or aesdeclast.
 */
#define AES128_ROUNDS(op, lastop, statereg) \
        movaps -0x20(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps -0x10(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps (%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps 0x10(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps 0x20(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps 0x30(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps 0x40(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps 0x50(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps 0x60(%KEYP), %KEY; \
        op %KEY, %statereg; \
        movaps 0x70(%KEYP), %KEY; \
        lastop %KEY, %statereg

/*
 * Macros to run AES encryption rounds. Input must be prefilled in state
 * register - output will be left there as well.
 * To run AES256, invoke all of these macros in sequence. To run AES192,
 * invoke only the -192 and -128 variants. To run AES128, invoke only the
 * -128 variant.
 */
#define AES256_ENC_ROUNDS(statereg) \
        AES256_ROUNDS(aesenc, statereg)
#define AES192_ENC_ROUNDS(statereg) \
        AES192_ROUNDS(aesenc, statereg)
#define AES128_ENC_ROUNDS(statereg) \
        AES128_ROUNDS(aesenc, aesenclast, statereg)

/* Same as the AES*_ENC_ROUNDS macros, but for decryption. */
#define AES256_DEC_ROUNDS(statereg) \
        AES256_ROUNDS(aesdec, statereg)
#define AES192_DEC_ROUNDS(statereg) \
        AES192_ROUNDS(aesdec, statereg)
#define AES128_DEC_ROUNDS(statereg) \
        AES128_ROUNDS(aesdec, aesdeclast, statereg)

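/*
 * Key schedule offset map used by the functions below (a summary of the
 * biasing, derived from the lea adjustments at each entry point): KEYP
 * is advanced 0x30 (AES128), 0x50 (AES192) or 0x70 (AES256) bytes past
 * the schedule base, so the final round key is always at 0x70(%KEYP)
 * and round 1 begins at -0x20, -0x40 or -0x60 respectively. This is
 * what lets all three variants share the AES128_*_ROUNDS tail above.
 */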

/*
 * aes_encrypt_intel()
 * Encrypt a single block (in and out can overlap).
 *
 * For kernel code, caller is responsible for bracketing this call with
 * disabling kernel thread preemption and calling aes_accel_save/restore().
 *
 * Temporary register usage:
 *	%xmm0	Key
 *	%xmm8	State
 *
 * Original OpenSolaris Interface:
 * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
 *	const uint32_t pt[4], uint32_t ct[4])
 *
 * Original Intel OpenSSL Interface:
 * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
 *	const AES_KEY *key)
 */
ENTRY_NP(aes_encrypt_intel)
        movups (%INP), %STATE0          / input
        movaps (%KEYP), %KEY            / key

#ifdef OPENSSL_INTERFACE
        mov 240(%KEYP), %NROUNDS32      / round count
#else   /* OpenSolaris Interface */
        /* Round count is already present as P2 in %rsi/%esi */
#endif  /* OPENSSL_INTERFACE */

        pxor %KEY, %STATE0              / round 0
        lea 0x30(%KEYP), %KEYP
        cmp $12, %NROUNDS
        jb .Lenc128
        lea 0x20(%KEYP), %KEYP
        je .Lenc192

        / AES 256
        lea 0x20(%KEYP), %KEYP
        AES256_ENC_ROUNDS(STATE0)

        .align 4
.Lenc192:
        / AES 192 and 256
        AES192_ENC_ROUNDS(STATE0)

        .align 4
.Lenc128:
        / AES 128, 192, and 256
        AES128_ENC_ROUNDS(STATE0)
        movups %STATE0, (%OUTP)         / output

        ret
SET_SIZE(aes_encrypt_intel)

/*
 * aes_decrypt_intel()
 * Decrypt a single block (in and out can overlap).
 *
 * For kernel code, caller is responsible for bracketing this call with
 * disabling kernel thread preemption and calling aes_accel_save/restore().
 *
 * Temporary register usage:
 *	%xmm0	Key
 *	%xmm8	State
 *
 * Original OpenSolaris Interface:
 * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
 *	const uint32_t ct[4], uint32_t pt[4])
 *
 * Original Intel OpenSSL Interface:
 * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
 *	const AES_KEY *key);
 */
ENTRY_NP(aes_decrypt_intel)
        movups (%INP), %STATE0          / input
        movaps (%KEYP), %KEY            / key

#ifdef OPENSSL_INTERFACE
        mov 240(%KEYP), %NROUNDS32      / round count
#else   /* OpenSolaris Interface */
        /* Round count is already present as P2 in %rsi/%esi */
#endif  /* OPENSSL_INTERFACE */

        pxor %KEY, %STATE0              / round 0
        lea 0x30(%KEYP), %KEYP
        cmp $12, %NROUNDS
        jb .Ldec128
        lea 0x20(%KEYP), %KEYP
        je .Ldec192

        / AES 256
        lea 0x20(%KEYP), %KEYP
        AES256_DEC_ROUNDS(STATE0)

        .align 4
.Ldec192:
        / AES 192 and 256
        AES192_DEC_ROUNDS(STATE0)

        .align 4
.Ldec128:
        / AES 128, 192, and 256
        AES128_DEC_ROUNDS(STATE0)
        movups %STATE0, (%OUTP)         / output

        ret
SET_SIZE(aes_decrypt_intel)

/* Does a pipelined load of eight input blocks into our AES state registers. */
#define AES_LOAD_INPUT_8BLOCKS \
        movups 0x00(%INP), %STATE0; \
        movups 0x10(%INP), %STATE1; \
        movups 0x20(%INP), %STATE2; \
        movups 0x30(%INP), %STATE3; \
        movups 0x40(%INP), %STATE4; \
        movups 0x50(%INP), %STATE5; \
        movups 0x60(%INP), %STATE6; \
        movups 0x70(%INP), %STATE7;

/* Does a pipelined store of eight AES state registers to the output. */
#define AES_STORE_OUTPUT_8BLOCKS \
        movups %STATE0, 0x00(%OUTP); \
        movups %STATE1, 0x10(%OUTP); \
        movups %STATE2, 0x20(%OUTP); \
        movups %STATE3, 0x30(%OUTP); \
        movups %STATE4, 0x40(%OUTP); \
        movups %STATE5, 0x50(%OUTP); \
        movups %STATE6, 0x60(%OUTP); \
        movups %STATE7, 0x70(%OUTP);

/* Performs a pipelined AES instruction with the key on all state registers. */
#define AES_KEY_STATE_OP_8BLOCKS(op) \
        op %KEY, %STATE0; \
        op %KEY, %STATE1; \
        op %KEY, %STATE2; \
        op %KEY, %STATE3; \
        op %KEY, %STATE4; \
        op %KEY, %STATE5; \
        op %KEY, %STATE6; \
        op %KEY, %STATE7

/* XOR all AES state regs with key to initiate encryption/decryption. */
#define AES_XOR_STATE_8BLOCKS \
        AES_KEY_STATE_OP_8BLOCKS(pxor)

/*
 * Loads a round key from the key schedule offset `off' into the KEY
 * register and performs `op' using the KEY on all 8 STATE registers.
 */
#define AES_RND_8BLOCKS(op, off) \
        movaps off(%KEYP), %KEY; \
        AES_KEY_STATE_OP_8BLOCKS(op)

/*
 * void aes_encrypt_intel8(const uint32_t roundkeys[], int numrounds,
 *	const void *plaintext, void *ciphertext)
 *
 * Same as aes_encrypt_intel, but performs the encryption operation on
 * 8 independent blocks in sequence, exploiting instruction pipelining.
 * This function doesn't support the OpenSSL interface, it's only meant
 * for kernel use.
 */
ENTRY_NP(aes_encrypt_intel8)
        AES_LOAD_INPUT_8BLOCKS          / load input
        movaps (%KEYP), %KEY            / key
        AES_XOR_STATE_8BLOCKS           / round 0

        lea 0x30(%KEYP), %KEYP          / point to key schedule
        cmp $12, %NROUNDS               / determine AES variant
        jb .Lenc8_128
        lea 0x20(%KEYP), %KEYP          / AES192 has larger key schedule
        je .Lenc8_192

        lea 0x20(%KEYP), %KEYP          / AES256 has even larger key schedule
        AES_RND_8BLOCKS(aesenc, -0x60)  / AES256 R.1
        AES_RND_8BLOCKS(aesenc, -0x50)  / AES256 R.2

        .align 4
.Lenc8_192:
        AES_RND_8BLOCKS(aesenc, -0x40)  / AES192 R.1; AES256 R.3
        AES_RND_8BLOCKS(aesenc, -0x30)  / AES192 R.2; AES256 R.4

        .align 4
.Lenc8_128:
        AES_RND_8BLOCKS(aesenc, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
        AES_RND_8BLOCKS(aesenc, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
        AES_RND_8BLOCKS(aesenc, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
        AES_RND_8BLOCKS(aesenc, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
        AES_RND_8BLOCKS(aesenc, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
        AES_RND_8BLOCKS(aesenc, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
        AES_RND_8BLOCKS(aesenc, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
        AES_RND_8BLOCKS(aesenc, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
        AES_RND_8BLOCKS(aesenc, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
        AES_RND_8BLOCKS(aesenclast, 0x70) / AES128 R.10; AES192 R.12; AES256 R.14

        AES_STORE_OUTPUT_8BLOCKS        / store output
        ret
SET_SIZE(aes_encrypt_intel8)
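
/*
 * Hypothetical kernel usage of the 8-block entry points (a sketch; the
 * FPU bracket is as documented for aes_accel_save/restore, and `nr',
 * `in', `out' and `len' are illustrative names):
 *
 *	aes_accel_save(savestate);
 *	while (len >= 8 * 16) {
 *		aes_encrypt_intel8(ks->ks32, nr, in, out);
 *		in += 8 * 16; out += 8 * 16; len -= 8 * 16;
 *	}
 *	/ remaining blocks go through aes_encrypt_intel one at a time
 *	aes_accel_restore(savestate);
 */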


/*
 * void aes_decrypt_intel8(const uint32_t roundkeys[], int numrounds,
 *	const void *ciphertext, void *plaintext)
 *
 * Same as aes_decrypt_intel, but performs the decryption operation on
 * 8 independent blocks in sequence, exploiting instruction pipelining.
 * This function doesn't support the OpenSSL interface, it's only meant
 * for kernel use.
 */
ENTRY_NP(aes_decrypt_intel8)
        AES_LOAD_INPUT_8BLOCKS          / load input
        movaps (%KEYP), %KEY            / key
        AES_XOR_STATE_8BLOCKS           / round 0

        lea 0x30(%KEYP), %KEYP          / point to key schedule
        cmp $12, %NROUNDS               / determine AES variant
        jb .Ldec8_128
        lea 0x20(%KEYP), %KEYP          / AES192 has larger key schedule
        je .Ldec8_192

        lea 0x20(%KEYP), %KEYP          / AES256 has even larger key schedule
        AES_RND_8BLOCKS(aesdec, -0x60)  / AES256 R.1
        AES_RND_8BLOCKS(aesdec, -0x50)  / AES256 R.2

        .align 4
.Ldec8_192:
        AES_RND_8BLOCKS(aesdec, -0x40)  / AES192 R.1; AES256 R.3
        AES_RND_8BLOCKS(aesdec, -0x30)  / AES192 R.2; AES256 R.4

        .align 4
.Ldec8_128:
        AES_RND_8BLOCKS(aesdec, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
        AES_RND_8BLOCKS(aesdec, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
        AES_RND_8BLOCKS(aesdec, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
        AES_RND_8BLOCKS(aesdec, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
        AES_RND_8BLOCKS(aesdec, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
        AES_RND_8BLOCKS(aesdec, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
        AES_RND_8BLOCKS(aesdec, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
        AES_RND_8BLOCKS(aesdec, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
        AES_RND_8BLOCKS(aesdec, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
        AES_RND_8BLOCKS(aesdeclast, 0x70) / AES128 R.10; AES192 R.12; AES256 R.14

        AES_STORE_OUTPUT_8BLOCKS        / store output
        ret
SET_SIZE(aes_decrypt_intel8)


/*
 * This macro encapsulates the entire AES encryption algorithm for a
 * single block, which is prefilled in statereg and which will be
 * replaced by the encrypted output. The KEYP register must already
 * point to the AES128 key schedule ("lea 0x30(%KEYP), %KEYP" from the
 * encryption function call) so that consecutive invocations of this
 * macro are supported (KEYP is restored after each invocation).
 * Since cpp macros cannot generate unique local labels, each invocation
 * must supply its own label_128/label_192/label_out names.
 */
#define AES_ENC(statereg, label_128, label_192, label_out) \
        cmp $12, %NROUNDS; \
        jb label_128; \
        je label_192; \
        /* AES 256 only */ \
        lea 0x40(%KEYP), %KEYP; \
        AES256_ENC_ROUNDS(statereg); \
        AES192_ENC_ROUNDS(statereg); \
        AES128_ENC_ROUNDS(statereg); \
        lea -0x40(%KEYP), %KEYP; \
        jmp label_out; \
        .align 4; \
label_192: \
        lea 0x20(%KEYP), %KEYP; \
        /* AES 192 only */ \
        AES192_ENC_ROUNDS(statereg); \
        AES128_ENC_ROUNDS(statereg); \
        lea -0x20(%KEYP), %KEYP; \
        jmp label_out; \
        .align 4; \
label_128: \
        /* AES 128 only */ \
        AES128_ENC_ROUNDS(statereg); \
        .align 4; \
label_out:


/*
 * void aes_encrypt_cbc_intel8(const uint32_t roundkeys[], int numrounds,
 *	const void *plaintext, void *ciphertext, const void *IV)
 *
 * Encrypts 8 consecutive AES blocks in the CBC mode. Input and output
 * may overlap. This provides a modest performance boost over invoking
 * the encryption and XOR in separate functions because we can avoid
 * copying the ciphertext block to and from memory between encryption
 * and XOR calls.
 */
#define CBC_IV r8               /* input - IV blk pointer */
#define CBC_IV_XMM xmm1         /* tmp IV location for alignment */

ENTRY_NP(aes_encrypt_cbc_intel8)
        AES_LOAD_INPUT_8BLOCKS          / load input
        movaps (%KEYP), %KEY            / key
        AES_XOR_STATE_8BLOCKS           / round 0

        lea 0x30(%KEYP), %KEYP          / point to key schedule
        movdqu (%CBC_IV), %CBC_IV_XMM   / load IV from unaligned memory
        pxor %CBC_IV_XMM, %STATE0       / XOR IV with input block and encrypt
        AES_ENC(STATE0, .Lenc_cbc_0_128, .Lenc_cbc_0_192, .Lenc_cbc_0_out)
        pxor %STATE0, %STATE1
        AES_ENC(STATE1, .Lenc_cbc_1_128, .Lenc_cbc_1_192, .Lenc_cbc_1_out)
        pxor %STATE1, %STATE2
        AES_ENC(STATE2, .Lenc_cbc_2_128, .Lenc_cbc_2_192, .Lenc_cbc_2_out)
        pxor %STATE2, %STATE3
        AES_ENC(STATE3, .Lenc_cbc_3_128, .Lenc_cbc_3_192, .Lenc_cbc_3_out)
        pxor %STATE3, %STATE4
        AES_ENC(STATE4, .Lenc_cbc_4_128, .Lenc_cbc_4_192, .Lenc_cbc_4_out)
        pxor %STATE4, %STATE5
        AES_ENC(STATE5, .Lenc_cbc_5_128, .Lenc_cbc_5_192, .Lenc_cbc_5_out)
        pxor %STATE5, %STATE6
        AES_ENC(STATE6, .Lenc_cbc_6_128, .Lenc_cbc_6_192, .Lenc_cbc_6_out)
        pxor %STATE6, %STATE7
        AES_ENC(STATE7, .Lenc_cbc_7_128, .Lenc_cbc_7_192, .Lenc_cbc_7_out)

        AES_STORE_OUTPUT_8BLOCKS        / store output
        ret
SET_SIZE(aes_encrypt_cbc_intel8)
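
/*
 * The chaining above corresponds to this sketch (the round-0
 * AddRoundKey XOR commutes with the IV/ciphertext XOR, which is why it
 * can be applied to all eight blocks up front):
 *
 *	c[0] = E(p[0] ^ IV);
 *	c[i] = E(p[i] ^ c[i - 1]);	/ i = 1 .. 7
 *
 * Each block depends on the previous ciphertext, so unlike ECB/CTR the
 * eight encryptions cannot be overlapped; the win here is keeping the
 * chain in registers.
 */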

/*
 * Prefills register state with counters suitable for the CTR encryption
 * mode. The counter is assumed to consist of two portions:
 * - A lower monotonically increasing 64-bit counter. If the caller wants
 *   a smaller counter, they are responsible for checking that it doesn't
 *   overflow between encryption calls.
 * - An upper static "nonce" portion, in big endian, preloaded into the
 *   lower portion of an XMM register.
 * This macro adds `ctridx' to the lower_LE counter and swaps it to big
 * endian; by way of a temporary general-purpose register it then loads
 * the lower and upper counter portions into a target XMM result
 * register, which can be handed off to the encryption process.
 */
#define PREP_CTR_BLOCKS(lower_LE, upper_BE_xmm, ctridx, tmpreg, resreg) \
        lea ctridx(%lower_LE), %tmpreg; \
        bswap %tmpreg; \
        movq %tmpreg, %resreg; \
        movlhps %upper_BE_xmm, %resreg; \
        pshufd $0b01001110, %resreg, %resreg
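
/*
 * E.g. (a sketch) with the nonce N preloaded into upper_BE_xmm, lower
 * counter c and ctridx i, the result register ends up holding the
 * counter block laid out in memory order as N (bytes 0-7) followed by
 * the big-endian value c + i (bytes 8-15), i.e. a 128-bit big-endian
 * counter ready for encryption.
 */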

#define CTR_UPPER_BE r8         /* input - counter upper 64 bits (BE) */
#define CTR_UPPER_BE_XMM xmm1   /* tmp for upper counter bits */
#define CTR_LOWER_LE r9         /* input - counter lower 64 bits (LE) */
#define CTR_TMP0 rax            /* tmp for lower 64 bit add & bswap */
#define CTR_TMP1 rbx            /* tmp for lower 64 bit add & bswap */
#define CTR_TMP2 r10            /* tmp for lower 64 bit add & bswap */
#define CTR_TMP3 r11            /* tmp for lower 64 bit add & bswap */
#define CTR_TMP4 r12            /* tmp for lower 64 bit add & bswap */
#define CTR_TMP5 r13            /* tmp for lower 64 bit add & bswap */
#define CTR_TMP6 r14            /* tmp for lower 64 bit add & bswap */
#define CTR_TMP7 r15            /* tmp for lower 64 bit add & bswap */

/*
 * These hold the CTR encryption input when it is unaligned and must be
 * loaded before XORing. Must not overlap with any STATE[0-7] register.
 */
#define TMP_INPUT0 xmm0
#define TMP_INPUT1 xmm1
#define TMP_INPUT2 xmm2
#define TMP_INPUT3 xmm3
#define TMP_INPUT4 xmm4
#define TMP_INPUT5 xmm5
#define TMP_INPUT6 xmm6
#define TMP_INPUT7 xmm7

/*
 * void aes_ctr_intel8(const uint32_t roundkeys[], int numrounds,
 *	const void *input, void *output, uint64_t counter_upper_BE,
 *	uint64_t counter_lower_LE)
 *
 * Runs AES on 8 consecutive blocks in counter mode (encryption and
 * decryption in counter mode are the same).
 */
ENTRY_NP(aes_ctr_intel8)
        /* save caller's regs */
        pushq %rbp
        movq %rsp, %rbp
        subq $0x38, %rsp
        / CTR_TMP0 is rax, no need to save
        movq %CTR_TMP1, -0x38(%rbp)
        movq %CTR_TMP2, -0x30(%rbp)
        movq %CTR_TMP3, -0x28(%rbp)
        movq %CTR_TMP4, -0x20(%rbp)
        movq %CTR_TMP5, -0x18(%rbp)
        movq %CTR_TMP6, -0x10(%rbp)
        movq %CTR_TMP7, -0x08(%rbp)

        /*
         * CTR step 1: prepare big-endian formatted 128-bit counter values,
         * placing the result in the AES-NI input state registers.
         */
        movq %CTR_UPPER_BE, %CTR_UPPER_BE_XMM
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 0, CTR_TMP0, STATE0)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 1, CTR_TMP1, STATE1)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 2, CTR_TMP2, STATE2)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 3, CTR_TMP3, STATE3)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 4, CTR_TMP4, STATE4)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 5, CTR_TMP5, STATE5)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 6, CTR_TMP6, STATE6)
        PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 7, CTR_TMP7, STATE7)

        /*
         * CTR step 2: Encrypt the counters.
         */
        movaps (%KEYP), %KEY            / key
        AES_XOR_STATE_8BLOCKS           / round 0

        /* Determine the AES variant we're going to compute */
        lea 0x30(%KEYP), %KEYP          / point to key schedule
        cmp $12, %NROUNDS               / determine AES variant
        jb .Lctr8_128
        lea 0x20(%KEYP), %KEYP          / AES192 has larger key schedule
        je .Lctr8_192

        /* AES 256 */
        lea 0x20(%KEYP), %KEYP          / AES256 has even larger key schedule
        AES_RND_8BLOCKS(aesenc, -0x60)  / AES256 R.1
        AES_RND_8BLOCKS(aesenc, -0x50)  / AES256 R.2

        .align 4
.Lctr8_192:
        /* AES 192 and 256 */
        AES_RND_8BLOCKS(aesenc, -0x40)  / AES192 R.1; AES256 R.3
        AES_RND_8BLOCKS(aesenc, -0x30)  / AES192 R.2; AES256 R.4

        .align 4
.Lctr8_128:
        /* AES 128, 192, and 256 */
        AES_RND_8BLOCKS(aesenc, -0x20)  / AES128 R.1; AES192 R.3; AES256 R.5
        AES_RND_8BLOCKS(aesenc, -0x10)  / AES128 R.2; AES192 R.4; AES256 R.6
        AES_RND_8BLOCKS(aesenc, 0x00)   / AES128 R.3; AES192 R.5; AES256 R.7
        AES_RND_8BLOCKS(aesenc, 0x10)   / AES128 R.4; AES192 R.6; AES256 R.8
        AES_RND_8BLOCKS(aesenc, 0x20)   / AES128 R.5; AES192 R.7; AES256 R.9
        AES_RND_8BLOCKS(aesenc, 0x30)   / AES128 R.6; AES192 R.8; AES256 R.10
        AES_RND_8BLOCKS(aesenc, 0x40)   / AES128 R.7; AES192 R.9; AES256 R.11
        AES_RND_8BLOCKS(aesenc, 0x50)   / AES128 R.8; AES192 R.10; AES256 R.12
        AES_RND_8BLOCKS(aesenc, 0x60)   / AES128 R.9; AES192 R.11; AES256 R.13
        AES_RND_8BLOCKS(aesenclast, 0x70) / AES128 R.10; AES192 R.12; AES256 R.14

        /*
         * CTR step 3: XOR input data blocks with encrypted counters to
         * produce result.
         */
        mov %INP, %rax                  / pxor requires alignment, so check
        andq $0xf, %rax
        jnz .Lctr_input_unaligned
        pxor 0x00(%INP), %STATE0
        pxor 0x10(%INP), %STATE1
        pxor 0x20(%INP), %STATE2
        pxor 0x30(%INP), %STATE3
        pxor 0x40(%INP), %STATE4
        pxor 0x50(%INP), %STATE5
        pxor 0x60(%INP), %STATE6
        pxor 0x70(%INP), %STATE7
        jmp .Lctr_out

        .align 4
.Lctr_input_unaligned:
        movdqu 0x00(%INP), %TMP_INPUT0
        movdqu 0x10(%INP), %TMP_INPUT1
        movdqu 0x20(%INP), %TMP_INPUT2
        movdqu 0x30(%INP), %TMP_INPUT3
        movdqu 0x40(%INP), %TMP_INPUT4
        movdqu 0x50(%INP), %TMP_INPUT5
        movdqu 0x60(%INP), %TMP_INPUT6
        movdqu 0x70(%INP), %TMP_INPUT7
        pxor %TMP_INPUT0, %STATE0
        pxor %TMP_INPUT1, %STATE1
        pxor %TMP_INPUT2, %STATE2
        pxor %TMP_INPUT3, %STATE3
        pxor %TMP_INPUT4, %STATE4
        pxor %TMP_INPUT5, %STATE5
        pxor %TMP_INPUT6, %STATE6
        pxor %TMP_INPUT7, %STATE7

        .align 4
.Lctr_out:
        /*
         * CTR step 4: Write out processed blocks to memory.
         */
        movdqu %STATE0, 0x00(%OUTP)
        movdqu %STATE1, 0x10(%OUTP)
        movdqu %STATE2, 0x20(%OUTP)
        movdqu %STATE3, 0x30(%OUTP)
        movdqu %STATE4, 0x40(%OUTP)
        movdqu %STATE5, 0x50(%OUTP)
        movdqu %STATE6, 0x60(%OUTP)
        movdqu %STATE7, 0x70(%OUTP)

        /* restore caller's regs */
        / CTR_TMP0 is rax, no need to restore
        movq -0x38(%rbp), %CTR_TMP1
        movq -0x30(%rbp), %CTR_TMP2
        movq -0x28(%rbp), %CTR_TMP3
        movq -0x20(%rbp), %CTR_TMP4
        movq -0x18(%rbp), %CTR_TMP5
        movq -0x10(%rbp), %CTR_TMP6
        movq -0x08(%rbp), %CTR_TMP7
        leave
        ret
SET_SIZE(aes_ctr_intel8)
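
/*
 * C model of what one aes_ctr_intel8 call computes (a sketch; E_k is
 * the block cipher under the expanded key and `ctr' the lower counter):
 *
 *	for (i = 0; i < 8; i++)
 *		out[i] = in[i] ^ E_k(nonce_BE || bswap64(ctr + i));
 *
 * A caller processing longer streams would advance the lower counter by
 * 8 between calls and handle any tail blocks separately.
 */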

#endif  /* lint || __lint */