/*
 * ====================================================================
 * Written by Intel Corporation for the OpenSSL project to add support
 * for Intel AES-NI instructions. Rights for redistribution and usage
 * in source and binary forms are granted according to the OpenSSL
 * license.
 *
 * Author: Huang Ying <ying.huang at intel dot com>
 *         Vinodh Gopal <vinodh.gopal at intel dot com>
 *         Kahraman Akdemir
 *
 * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
 * instructions that are going to be introduced in the next generation
 * of Intel processors, as of 2009. These instructions enable fast and
 * secure data encryption and decryption, using the Advanced Encryption
 * Standard (AES), defined by FIPS Publication number 197. The
 * architecture introduces six instructions that offer full hardware
 * support for AES. Four of them support high performance data
 * encryption and decryption, and the other two instructions support
 * the AES key expansion procedure.
 * ====================================================================
 */

/*
 * ====================================================================
 * Copyright (c) 1998-2008 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

/*
 * ====================================================================
 * OpenSolaris OS modifications
 *
 * This source originates as files aes-intel.S and eng_aesni_asm.pl, in
 * patches sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by
 * Huang Ying of Intel to the openssl-dev mailing list under the subject
 * of "Add support to Intel AES-NI instruction set for x86_64 platform".
 *
 * This OpenSolaris version has these major changes from the original source:
 *
 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
 * definitions for lint.
 *
 * 2. Formatted code, added comments, and added #includes and #defines.
 *
 * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
 * calling kpreempt_disable() and kpreempt_enable().
 * If the TS bit is not set, save and restore %xmm registers at the beginning
 * and end of function calls (%xmm* registers are not saved and restored
 * during kernel thread preemption).
 *
 * 4. Renamed functions, reordered parameters, and changed return value
 * to match OpenSolaris:
 *
 * OpenSSL interface:
 *	int intel_AES_set_encrypt_key(const unsigned char *userKey,
 *		const int bits, AES_KEY *key);
 *	int intel_AES_set_decrypt_key(const unsigned char *userKey,
 *		const int bits, AES_KEY *key);
 *	Return values for above are non-zero on error, 0 on success.
 *
 *	void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
 *		const AES_KEY *key);
 *	void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
 *		const AES_KEY *key);
 *	typedef struct aes_key_st {
 *		unsigned int	rd_key[4 * (AES_MAXNR + 1)];
 *		int		rounds;
 *		unsigned int	pad[3];
 *	} AES_KEY;
 *	Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules
 *	(ks32) instead of 64-bit (ks64)).
 *	Number of rounds (aka round count) is at offset 240 of AES_KEY.
 *
 * OpenSolaris OS interface (#ifdefs removed for readability):
 *	int rijndael_key_setup_dec_intel(uint32_t rk[],
 *		const uint32_t cipherKey[], uint64_t keyBits);
 *	int rijndael_key_setup_enc_intel(uint32_t rk[],
 *		const uint32_t cipherKey[], uint64_t keyBits);
 *	Return values for above are 0 on error, number of rounds on success.
 *
 *	void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
 *		const uint32_t pt[4], uint32_t ct[4]);
 *	void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
 *		const uint32_t ct[4], uint32_t pt[4]);
 *	typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
 *		uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
 *
 *	typedef union {
 *		uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
 *	} aes_ks_t;
 *	typedef struct aes_key {
 *		aes_ks_t	encr_ks, decr_ks;
 *		long double	align128;
 *		int		flags, nr, type;
 *	} aes_key_t;
 *
 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
 * ct is cipher text, and MAX_AES_NR is 14.
 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
 *
 * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary.
 *
 * ====================================================================
 */

/*
 * Copyright 2015 by Saso Kiselkov. All rights reserved.
 */
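/*
 * Illustrative only (not part of the original sources): a minimal C
 * sketch of how a caller might drive the OpenSolaris interface above.
 * The variable names and the alignment attribute are assumptions for
 * the example, and error handling is reduced to a bare return.
 *
 *	uint32_t rk[(14 + 1) * 4] __attribute__((aligned(16)));
 *	uint32_t key[8] = { 0 };	// 256-bit cipher key
 *	uint32_t pt[4] = { 0 }, ct[4];
 *	int nr;
 *
 *	nr = rijndael_key_setup_enc_intel(rk, key, 256);
 *	if (nr == 0)
 *		return;			// 0 signals a setup error
 *	aes_encrypt_intel(rk, nr, pt, ct);
 */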

#if defined(lint) || defined(__lint)

#include <sys/types.h>

/* ARGSUSED */
void
aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
    uint32_t ct[4]) {
}
/* ARGSUSED */
void
aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
    uint32_t pt[4]) {
}
/* ARGSUSED */
int
rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
    uint64_t keyBits) {
	return (0);
}
/* ARGSUSED */
int
rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
    uint64_t keyBits) {
	return (0);
}

#else	/* lint */

#include <sys/asm_linkage.h>
#include <sys/controlregs.h>
#ifdef _KERNEL
#include <sys/machprivregs.h>
#endif

#ifdef _KERNEL
/*
 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv.  That is,
 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
 * uses it to pass P2 to syscall.
 * This also occurs with the STTS macro, but we don't care if
 * P2 (%rsi) is modified just before function exit.
 * The CLTS and STTS macros push and pop P1 (%rdi) already.
 */
#ifdef __xpv
#define	PROTECTED_CLTS \
	push	%rsi; \
	CLTS; \
	pop	%rsi
#else
#define	PROTECTED_CLTS \
	CLTS
#endif	/* __xpv */

/*
 * If CR0_TS is not set, align the stack (with push %rbp) and push
 * %xmm0 and %xmm1 on the stack, otherwise clear CR0_TS.
 */
#define	CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg) \
	push	%rbp; \
	mov	%rsp, %rbp; \
	movq	%cr0, tmpreg; \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	and	$-XMM_ALIGN, %rsp; \
	sub	$[XMM_SIZE * 2], %rsp; \
	movaps	%xmm0, 16(%rsp); \
	movaps	%xmm1, (%rsp); \
	jmp	2f; \
1: \
	PROTECTED_CLTS; \
2:

/*
 * If CR0_TS was not set above, pop %xmm0 and %xmm1 off the stack,
 * otherwise set CR0_TS.
 */
#define	SET_TS_OR_POP_XMM0_XMM1(tmpreg) \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	movaps	(%rsp), %xmm1; \
	movaps	16(%rsp), %xmm0; \
	jmp	2f; \
1: \
	STTS(tmpreg); \
2: \
	mov	%rbp, %rsp; \
	pop	%rbp

/*
 * If CR0_TS is not set, align the stack (with push %rbp) and push
 * %xmm0 - %xmm6 on the stack, otherwise clear CR0_TS.
 */
#define	CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg) \
	push	%rbp; \
	mov	%rsp, %rbp; \
	movq	%cr0, tmpreg; \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	and	$-XMM_ALIGN, %rsp; \
	sub	$[XMM_SIZE * 7], %rsp; \
	movaps	%xmm0, 96(%rsp); \
	movaps	%xmm1, 80(%rsp); \
	movaps	%xmm2, 64(%rsp); \
	movaps	%xmm3, 48(%rsp); \
	movaps	%xmm4, 32(%rsp); \
	movaps	%xmm5, 16(%rsp); \
	movaps	%xmm6, (%rsp); \
	jmp	2f; \
1: \
	PROTECTED_CLTS; \
2:

/*
 * If CR0_TS was not set above, pop %xmm0 - %xmm6 off the stack,
 * otherwise set CR0_TS.
 */
#define	SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	movaps	(%rsp), %xmm6; \
	movaps	16(%rsp), %xmm5; \
	movaps	32(%rsp), %xmm4; \
	movaps	48(%rsp), %xmm3; \
	movaps	64(%rsp), %xmm2; \
	movaps	80(%rsp), %xmm1; \
	movaps	96(%rsp), %xmm0; \
	jmp	2f; \
1: \
	STTS(tmpreg); \
2: \
	mov	%rbp, %rsp; \
	pop	%rbp
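/*
 * In rough C terms (a sketch only; getcr0()/setcr0() stand in for the
 * movq %cr0 accesses and STTS, and save_xmm_regs()/restore_xmm_regs()
 * stand in for the register moves above), the macro pairs behave like:
 *
 *	ulong_t cr0 = getcr0();
 *	if (cr0 & CR0_TS)
 *		clts();			// FPU unused: just clear TS
 *	else
 *		save_xmm_regs(stack);	// FPU live: preserve caller's state
 *	// ... AES-NI work ...
 *	if (cr0 & CR0_TS)
 *		setcr0(cr0);		// re-set TS (STTS)
 *	else
 *		restore_xmm_regs(stack);
 */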
/*
 * void aes_accel_save(void *savestate);
 *
 * Saves all 16 XMM registers and CR0 to a temporary location pointed to
 * by the first argument and clears TS in CR0. This must be invoked before
 * executing any floating point operations inside the kernel (and kernel
 * thread preemption must be disabled as well). The memory region to which
 * all state is saved must be at least 16x 128-bit + 64-bit long and must
 * be 128-bit aligned.
 */
ENTRY_NP(aes_accel_save)
	movq	%cr0, %rax
	movq	%rax, 0x100(%rdi)
	testq	$CR0_TS, %rax
	jnz	1f
	movaps	%xmm0, 0x00(%rdi)
	movaps	%xmm1, 0x10(%rdi)
	movaps	%xmm2, 0x20(%rdi)
	movaps	%xmm3, 0x30(%rdi)
	movaps	%xmm4, 0x40(%rdi)
	movaps	%xmm5, 0x50(%rdi)
	movaps	%xmm6, 0x60(%rdi)
	movaps	%xmm7, 0x70(%rdi)
	movaps	%xmm8, 0x80(%rdi)
	movaps	%xmm9, 0x90(%rdi)
	movaps	%xmm10, 0xa0(%rdi)
	movaps	%xmm11, 0xb0(%rdi)
	movaps	%xmm12, 0xc0(%rdi)
	movaps	%xmm13, 0xd0(%rdi)
	movaps	%xmm14, 0xe0(%rdi)
	movaps	%xmm15, 0xf0(%rdi)
	ret
1:
	PROTECTED_CLTS
	ret
	SET_SIZE(aes_accel_save)

/*
 * void aes_accel_restore(void *savestate);
 *
 * Restores the saved XMM and CR0.TS state from aes_accel_save.
 */
ENTRY_NP(aes_accel_restore)
	mov	0x100(%rdi), %rax
	testq	$CR0_TS, %rax
	jnz	1f
	movaps	0x00(%rdi), %xmm0
	movaps	0x10(%rdi), %xmm1
	movaps	0x20(%rdi), %xmm2
	movaps	0x30(%rdi), %xmm3
	movaps	0x40(%rdi), %xmm4
	movaps	0x50(%rdi), %xmm5
	movaps	0x60(%rdi), %xmm6
	movaps	0x70(%rdi), %xmm7
	movaps	0x80(%rdi), %xmm8
	movaps	0x90(%rdi), %xmm9
	movaps	0xa0(%rdi), %xmm10
	movaps	0xb0(%rdi), %xmm11
	movaps	0xc0(%rdi), %xmm12
	movaps	0xd0(%rdi), %xmm13
	movaps	0xe0(%rdi), %xmm14
	movaps	0xf0(%rdi), %xmm15
	ret
1:
	STTS(%rax)
	ret
	SET_SIZE(aes_accel_restore)

#else
#define	PROTECTED_CLTS
#define	CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
#define	SET_TS_OR_POP_XMM0_XMM1(tmpreg)
#define	CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)
#define	SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg)
#endif	/* _KERNEL */
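/*
 * Illustrative kernel-side usage (a sketch, not from the original
 * sources): per the comments above, the save area must hold 16 XMM
 * registers plus CR0 (0x108 bytes) and be 16-byte aligned, and the
 * bracket must run with preemption disabled. The buffer name is an
 * assumption for the example.
 *
 *	uint8_t fpu_save[16 * 16 + 8] __attribute__((aligned(16)));
 *
 *	kpreempt_disable();
 *	aes_accel_save(fpu_save);
 *	// ... any AES-NI/XMM work, e.g. aes_encrypt_intel(...) ...
 *	aes_accel_restore(fpu_save);
 *	kpreempt_enable();
 */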
/*
 * _key_expansion_128(), _key_expansion_192a(), _key_expansion_192b(),
 * _key_expansion_256a(), _key_expansion_256b()
 *
 * Helper functions called by rijndael_key_setup_enc_intel().
 * Also used indirectly by rijndael_key_setup_dec_intel().
 *
 * Input:
 *	%xmm0	User-provided cipher key
 *	%xmm1	Round constant
 * Output:
 *	(%rcx)	AES key
 */

.align 16
_key_expansion_128:
_key_expansion_256a:
	pshufd	$0b11111111, %xmm1, %xmm1
	shufps	$0b00010000, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0b10001100, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	pxor	%xmm1, %xmm0
	movaps	%xmm0, (%rcx)
	add	$0x10, %rcx
	ret
	SET_SIZE(_key_expansion_128)
	SET_SIZE(_key_expansion_256a)

.align 16
_key_expansion_192a:
	pshufd	$0b01010101, %xmm1, %xmm1
	shufps	$0b00010000, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0b10001100, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	pxor	%xmm1, %xmm0

	movaps	%xmm2, %xmm5
	movaps	%xmm2, %xmm6
	pslldq	$4, %xmm5
	pshufd	$0b11111111, %xmm0, %xmm3
	pxor	%xmm3, %xmm2
	pxor	%xmm5, %xmm2

	movaps	%xmm0, %xmm1
	shufps	$0b01000100, %xmm0, %xmm6
	movaps	%xmm6, (%rcx)
	shufps	$0b01001110, %xmm2, %xmm1
	movaps	%xmm1, 0x10(%rcx)
	add	$0x20, %rcx
	ret
	SET_SIZE(_key_expansion_192a)

.align 16
_key_expansion_192b:
	pshufd	$0b01010101, %xmm1, %xmm1
	shufps	$0b00010000, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0b10001100, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	pxor	%xmm1, %xmm0

	movaps	%xmm2, %xmm5
	pslldq	$4, %xmm5
	pshufd	$0b11111111, %xmm0, %xmm3
	pxor	%xmm3, %xmm2
	pxor	%xmm5, %xmm2

	movaps	%xmm0, (%rcx)
	add	$0x10, %rcx
	ret
	SET_SIZE(_key_expansion_192b)

.align 16
_key_expansion_256b:
	pshufd	$0b10101010, %xmm1, %xmm1
	shufps	$0b00010000, %xmm2, %xmm4
	pxor	%xmm4, %xmm2
	shufps	$0b10001100, %xmm2, %xmm4
	pxor	%xmm4, %xmm2
	pxor	%xmm1, %xmm2
	movaps	%xmm2, (%rcx)
	add	$0x10, %rcx
	ret
	SET_SIZE(_key_expansion_256b)

/*
 * void aes_copy_intel(const uint8_t *src, uint8_t *dst);
 *
 * Copies one unaligned 128-bit block from `src' to `dst'. The copy is
 * performed using FPU registers, so make sure FPU state is saved when
 * running this in the kernel.
 */
ENTRY_NP(aes_copy_intel)
	movdqu	(%rdi), %xmm0
	movdqu	%xmm0, (%rsi)
	ret
	SET_SIZE(aes_copy_intel)

/*
 * void aes_xor_intel(const uint8_t *src, uint8_t *dst);
 *
 * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 * stores the result at `dst'. The XOR is performed using FPU registers,
 * so make sure FPU state is saved when running this in the kernel.
 */
ENTRY_NP(aes_xor_intel)
	movdqu	(%rdi), %xmm0
	movdqu	(%rsi), %xmm1
	pxor	%xmm1, %xmm0
	movdqu	%xmm0, (%rsi)
	ret
	SET_SIZE(aes_xor_intel)
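/*
 * C equivalents of the two helpers above (a sketch; the real routines
 * do the same work through one or two XMM registers):
 *
 *	void aes_copy_intel(const uint8_t *src, uint8_t *dst) {
 *		memcpy(dst, src, 16);
 *	}
 *	void aes_xor_intel(const uint8_t *src, uint8_t *dst) {
 *		for (int i = 0; i < 16; i++)
 *			dst[i] ^= src[i];
 *	}
 */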
/*
 * void aes_xor_intel8(const uint8_t *src, uint8_t *dst);
 *
 * XORs eight pairs of consecutive unaligned 128-bit blocks from `src' and
 * `dst' and stores the results at `dst'. The XOR is performed using FPU
 * registers, so make sure FPU state is saved when running this in the kernel.
 */
ENTRY_NP(aes_xor_intel8)
	movdqu	0x00(%rdi), %xmm0
	movdqu	0x00(%rsi), %xmm1
	movdqu	0x10(%rdi), %xmm2
	movdqu	0x10(%rsi), %xmm3
	movdqu	0x20(%rdi), %xmm4
	movdqu	0x20(%rsi), %xmm5
	movdqu	0x30(%rdi), %xmm6
	movdqu	0x30(%rsi), %xmm7
	movdqu	0x40(%rdi), %xmm8
	movdqu	0x40(%rsi), %xmm9
	movdqu	0x50(%rdi), %xmm10
	movdqu	0x50(%rsi), %xmm11
	movdqu	0x60(%rdi), %xmm12
	movdqu	0x60(%rsi), %xmm13
	movdqu	0x70(%rdi), %xmm14
	movdqu	0x70(%rsi), %xmm15
	pxor	%xmm1, %xmm0
	pxor	%xmm3, %xmm2
	pxor	%xmm5, %xmm4
	pxor	%xmm7, %xmm6
	pxor	%xmm9, %xmm8
	pxor	%xmm11, %xmm10
	pxor	%xmm13, %xmm12
	pxor	%xmm15, %xmm14
	movdqu	%xmm0, 0x00(%rsi)
	movdqu	%xmm2, 0x10(%rsi)
	movdqu	%xmm4, 0x20(%rsi)
	movdqu	%xmm6, 0x30(%rsi)
	movdqu	%xmm8, 0x40(%rsi)
	movdqu	%xmm10, 0x50(%rsi)
	movdqu	%xmm12, 0x60(%rsi)
	movdqu	%xmm14, 0x70(%rsi)
	ret
	SET_SIZE(aes_xor_intel8)
/*
 * rijndael_key_setup_enc_intel()
 * Expand the cipher key into the encryption key schedule.
 *
 * For kernel code, caller is responsible for ensuring kpreempt_disable()
 * has been called.  This is because %xmm registers are not saved/restored.
 * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
 * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
 * on the stack.
 *
 * OpenSolaris interface:
 * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
 *	uint64_t keyBits);
 * Return value is 0 on error, number of rounds on success.
 *
 * Original Intel OpenSSL interface:
 * int intel_AES_set_encrypt_key(const unsigned char *userKey,
 *	const int bits, AES_KEY *key);
 * Return value is non-zero on error, 0 on success.
 */

#ifdef OPENSSL_INTERFACE
#define	rijndael_key_setup_enc_intel	intel_AES_set_encrypt_key
#define	rijndael_key_setup_dec_intel	intel_AES_set_decrypt_key

#define	USERCIPHERKEY		rdi	/* P1, 64 bits */
#define	KEYSIZE32		esi	/* P2, 32 bits */
#define	KEYSIZE64		rsi	/* P2, 64 bits */
#define	AESKEY			rdx	/* P3, 64 bits */

#else	/* OpenSolaris Interface */
#define	AESKEY			rdi	/* P1, 64 bits */
#define	USERCIPHERKEY		rsi	/* P2, 64 bits */
#define	KEYSIZE32		edx	/* P3, 32 bits */
#define	KEYSIZE64		rdx	/* P3, 64 bits */
#endif	/* OPENSSL_INTERFACE */

#define	ROUNDS32		KEYSIZE32	/* temp */
#define	ROUNDS64		KEYSIZE64	/* temp */
#define	ENDAESKEY		USERCIPHERKEY	/* temp */


ENTRY_NP(rijndael_key_setup_enc_intel)
	CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(%r10)

	/ NULL pointer sanity check
	test	%USERCIPHERKEY, %USERCIPHERKEY
	jz	.Lenc_key_invalid_param
	test	%AESKEY, %AESKEY
	jz	.Lenc_key_invalid_param

	movups	(%USERCIPHERKEY), %xmm0	/ user key (first 16 bytes)
	movaps	%xmm0, (%AESKEY)
	lea	0x10(%AESKEY), %rcx	/ key addr
	pxor	%xmm4, %xmm4		/ xmm4 is assumed 0 in _key_expansion_x

	cmp	$256, %KEYSIZE32
	jnz	.Lenc_key192

	/ AES 256: 14 rounds in encryption key schedule
#ifdef OPENSSL_INTERFACE
	mov	$14, %ROUNDS32
	movl	%ROUNDS32, 240(%AESKEY)		/ key.rounds = 14
#endif	/* OPENSSL_INTERFACE */

	movups	0x10(%USERCIPHERKEY), %xmm2	/ other user key (2nd 16 bytes)
	movaps	%xmm2, (%rcx)
	add	$0x10, %rcx

	aeskeygenassist $0x1, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_256a
	aeskeygenassist $0x1, %xmm0, %xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x2, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_256a
	aeskeygenassist $0x2, %xmm0, %xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x4, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_256a
	aeskeygenassist $0x4, %xmm0, %xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x8, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_256a
	aeskeygenassist $0x8, %xmm0, %xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x10, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_256a
	aeskeygenassist $0x10, %xmm0, %xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x20, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_256a
	aeskeygenassist $0x20, %xmm0, %xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x40, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_256a

	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef OPENSSL_INTERFACE
	xor	%rax, %rax			/ return 0 (OK)
#else	/* OpenSolaris Interface */
	mov	$14, %rax			/ return # rounds = 14
#endif
	ret

.align 4
.Lenc_key192:
	cmp	$192, %KEYSIZE32
	jnz	.Lenc_key128

	/ AES 192: 12 rounds in encryption key schedule
#ifdef OPENSSL_INTERFACE
	mov	$12, %ROUNDS32
	movl	%ROUNDS32, 240(%AESKEY)	/ key.rounds = 12
#endif	/* OPENSSL_INTERFACE */

	movq	0x10(%USERCIPHERKEY), %xmm2	/ other user key
	aeskeygenassist $0x1, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_192a
	aeskeygenassist $0x2, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_192b
	aeskeygenassist $0x4, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_192a
	aeskeygenassist $0x8, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_192b
	aeskeygenassist $0x10, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_192a
	aeskeygenassist $0x20, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_192b
	aeskeygenassist $0x40, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_192a
	aeskeygenassist $0x80, %xmm2, %xmm1	/ expand the key
	call	_key_expansion_192b

	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef OPENSSL_INTERFACE
	xor	%rax, %rax			/ return 0 (OK)
#else	/* OpenSolaris Interface */
	mov	$12, %rax			/ return # rounds = 12
#endif
	ret

.align 4
.Lenc_key128:
	cmp	$128, %KEYSIZE32
	jnz	.Lenc_key_invalid_key_bits

	/ AES 128: 10 rounds in encryption key schedule
#ifdef OPENSSL_INTERFACE
	mov	$10, %ROUNDS32
	movl	%ROUNDS32, 240(%AESKEY)	/ key.rounds = 10
#endif	/* OPENSSL_INTERFACE */

	aeskeygenassist $0x1, %xmm0, %xmm1	/ expand the key
	call	_key_expansion_128
	aeskeygenassist $0x2, %xmm0, %xmm1	/ expand the key
	call	_key_expansion_128
	aeskeygenassist $0x4, %xmm0, %xmm1	/ expand the key
	call	_key_expansion_128
	aeskeygenassist $0x8, %xmm0, %xmm1	/ expand the key
	call	_key_expansion_128
	aeskeygenassist $0x10, %xmm0, %xmm1	/ expand the key
	call	_key_expansion_128
	aeskeygenassist $0x20, %xmm0, %xmm1	/ expand the key
	call	_key_expansion_128
	aeskeygenassist $0x40, %xmm0, %xmm1	/ expand the key
	call	_key_expansion_128
	aeskeygenassist $0x80, %xmm0, %xmm1	/ expand the key
	call	_key_expansion_128
	aeskeygenassist $0x1b, %xmm0, %xmm1	/ expand the key
	call	_key_expansion_128
	aeskeygenassist $0x36, %xmm0, %xmm1	/ expand the key
	call	_key_expansion_128

	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef OPENSSL_INTERFACE
	xor	%rax, %rax			/ return 0 (OK)
#else	/* OpenSolaris Interface */
	mov	$10, %rax			/ return # rounds = 10
#endif
	ret

.Lenc_key_invalid_param:
#ifdef OPENSSL_INTERFACE
	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
	mov	$-1, %rax	/ user key or AES key pointer is NULL
	ret
#else
	/* FALLTHROUGH */
#endif	/* OPENSSL_INTERFACE */

.Lenc_key_invalid_key_bits:
	SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
#ifdef OPENSSL_INTERFACE
	mov	$-2, %rax	/ keysize is invalid
#else	/* OpenSolaris Interface */
	xor	%rax, %rax	/ a key pointer is NULL or invalid keysize
#endif	/* OPENSSL_INTERFACE */

	ret
	SET_SIZE(rijndael_key_setup_enc_intel)
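/*
 * Return-value handling sketch (illustrative, not part of the original
 * sources): under the OpenSolaris interface a zero return means failure
 * and a non-zero return doubles as the round count passed to the
 * encrypt/decrypt entry points; under OPENSSL_INTERFACE zero means
 * success. The error code below is a hypothetical placeholder.
 *
 *	int nr = rijndael_key_setup_dec_intel(rk, key, 192);
 *	if (nr == 0)
 *		return (-1);		// hypothetical error path
 *	aes_decrypt_intel(rk, nr, ct, pt);
 */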
/*
 * rijndael_key_setup_dec_intel()
 * Expand the cipher key into the decryption key schedule.
 *
 * For kernel code, caller is responsible for ensuring kpreempt_disable()
 * has been called.  This is because %xmm registers are not saved/restored.
 * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
 * on entry.  Otherwise, if TS is not set, save and restore %xmm registers
 * on the stack.
 *
 * OpenSolaris interface:
 * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
 *	uint64_t keyBits);
 * Return value is 0 on error, number of rounds on success.
 * P1->P2, P2->P3, P3->P1
 *
 * Original Intel OpenSSL interface:
 * int intel_AES_set_decrypt_key(const unsigned char *userKey,
 *	const int bits, AES_KEY *key);
 * Return value is non-zero on error, 0 on success.
 */
ENTRY_NP(rijndael_key_setup_dec_intel)
	/ Generate round keys used for encryption
	call	rijndael_key_setup_enc_intel
	test	%rax, %rax
#ifdef OPENSSL_INTERFACE
	jnz	.Ldec_key_exit	/ Failed if returned non-0
#else	/* OpenSolaris Interface */
	jz	.Ldec_key_exit	/ Failed if returned 0
#endif	/* OPENSSL_INTERFACE */

	CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)

	/*
	 * Convert round keys used for encryption
	 * to a form usable for decryption
	 */
#ifndef OPENSSL_INTERFACE	/* OpenSolaris Interface */
	mov	%rax, %ROUNDS64		/ set # rounds (10, 12, or 14)
					/ (already set for OpenSSL)
#endif

	lea	0x10(%AESKEY), %rcx	/ key addr
	shl	$4, %ROUNDS32
	add	%AESKEY, %ROUNDS64
	mov	%ROUNDS64, %ENDAESKEY

.align 4
.Ldec_key_reorder_loop:
	movaps	(%AESKEY), %xmm0
	movaps	(%ROUNDS64), %xmm1
	movaps	%xmm0, (%ROUNDS64)
	movaps	%xmm1, (%AESKEY)
	lea	0x10(%AESKEY), %AESKEY
	lea	-0x10(%ROUNDS64), %ROUNDS64
	cmp	%AESKEY, %ROUNDS64
	ja	.Ldec_key_reorder_loop

.align 4
.Ldec_key_inv_loop:
	movaps	(%rcx), %xmm0
	/ Convert an encryption round key to a form usable for decryption
	/ with the "AES Inverse Mix Columns" instruction
	aesimc	%xmm0, %xmm1
	movaps	%xmm1, (%rcx)
	lea	0x10(%rcx), %rcx
	cmp	%ENDAESKEY, %rcx
	jnz	.Ldec_key_inv_loop

	SET_TS_OR_POP_XMM0_XMM1(%r10)

.Ldec_key_exit:
	/ OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
	/ OpenSSL: rax = 0 for OK, or non-zero for error
	ret
	SET_SIZE(rijndael_key_setup_dec_intel)


#ifdef OPENSSL_INTERFACE
#define	aes_encrypt_intel	intel_AES_encrypt
#define	aes_decrypt_intel	intel_AES_decrypt

#define	INP			rdi	/* P1, 64 bits */
#define	OUTP			rsi	/* P2, 64 bits */
#define	KEYP			rdx	/* P3, 64 bits */

/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */
#define	NROUNDS32		ecx	/* temporary, 32 bits */
#define	NROUNDS			cl	/* temporary, 8 bits */

#else	/* OpenSolaris Interface */
#define	KEYP			rdi	/* P1, 64 bits */
#define	NROUNDS			esi	/* P2, 32 bits */
#define	INP			rdx	/* P3, 64 bits */
#define	OUTP			rcx	/* P4, 64 bits */
#define	LENGTH			r8	/* P5, 64 bits */
#endif	/* OPENSSL_INTERFACE */

#define	KEY			xmm0	/* temporary, 128 bits */
#define	STATE0			xmm8	/* temporary, 128 bits */
#define	STATE1			xmm9	/* temporary, 128 bits */
#define	STATE2			xmm10	/* temporary, 128 bits */
#define	STATE3			xmm11	/* temporary, 128 bits */
#define	STATE4			xmm12	/* temporary, 128 bits */
#define	STATE5			xmm13	/* temporary, 128 bits */
#define	STATE6			xmm14	/* temporary, 128 bits */
#define	STATE7			xmm15	/* temporary, 128 bits */

/*
 * Runs the first two rounds of AES256 on a state register. `op' should be
 * aesenc or aesdec.
 */
#define	AES256_ROUNDS(op, statereg) \
	movaps	-0x60(%KEYP), %KEY; \
	op	%KEY, %statereg; \
	movaps	-0x50(%KEYP), %KEY; \
	op	%KEY, %statereg

/*
 * Runs the first two rounds of AES192, or the 3rd & 4th round of AES256 on
 * a state register. `op' should be aesenc or aesdec.
 */
#define	AES192_ROUNDS(op, statereg) \
	movaps	-0x40(%KEYP), %KEY; \
	op	%KEY, %statereg; \
	movaps	-0x30(%KEYP), %KEY; \
	op	%KEY, %statereg
/*
 * Runs the full 10 rounds of AES128, or the last 10 rounds of AES192/AES256
 * on a state register. `op' should be aesenc or aesdec and `lastop' should
 * be aesenclast or aesdeclast.
 */
#define	AES128_ROUNDS(op, lastop, statereg) \
	movaps	-0x20(%KEYP), %KEY; \
	op	%KEY, %statereg; \
	movaps	-0x10(%KEYP), %KEY; \
	op	%KEY, %statereg; \
	movaps	(%KEYP), %KEY; \
	op	%KEY, %statereg; \
	movaps	0x10(%KEYP), %KEY; \
	op	%KEY, %statereg; \
	movaps	0x20(%KEYP), %KEY; \
	op	%KEY, %statereg; \
	movaps	0x30(%KEYP), %KEY; \
	op	%KEY, %statereg; \
	movaps	0x40(%KEYP), %KEY; \
	op	%KEY, %statereg; \
	movaps	0x50(%KEYP), %KEY; \
	op	%KEY, %statereg; \
	movaps	0x60(%KEYP), %KEY; \
	op	%KEY, %statereg; \
	movaps	0x70(%KEYP), %KEY; \
	lastop	%KEY, %statereg

/*
 * Macros to run AES encryption rounds. Input must be prefilled in a state
 * register - output will be left there as well.
 * To run AES256, invoke all of these macros in sequence. To run AES192,
 * invoke only the -192 and -128 variants. To run AES128, invoke only the
 * -128 variant.
 */
#define	AES256_ENC_ROUNDS(statereg) \
	AES256_ROUNDS(aesenc, statereg)
#define	AES192_ENC_ROUNDS(statereg) \
	AES192_ROUNDS(aesenc, statereg)
#define	AES128_ENC_ROUNDS(statereg) \
	AES128_ROUNDS(aesenc, aesenclast, statereg)

/* Same as the AES*_ENC_ROUNDS macros, but for decryption. */
#define	AES256_DEC_ROUNDS(statereg) \
	AES256_ROUNDS(aesdec, statereg)
#define	AES192_DEC_ROUNDS(statereg) \
	AES192_ROUNDS(aesdec, statereg)
#define	AES128_DEC_ROUNDS(statereg) \
	AES128_ROUNDS(aesdec, aesdeclast, statereg)


/*
 * aes_encrypt_intel()
 * Encrypt a single block (in and out can overlap).
 *
 * For kernel code, caller is responsible for bracketing this call with
 * disabling kernel thread preemption and calling aes_accel_save/restore().
 *
 * Temporary register usage:
 *	%xmm0	Key
 *	%xmm8	State
 *
 * Original OpenSolaris Interface:
 * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
 *	const uint32_t pt[4], uint32_t ct[4])
 *
 * Original Intel OpenSSL Interface:
 * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
 *	const AES_KEY *key)
 */
ENTRY_NP(aes_encrypt_intel)
	movups	(%INP), %STATE0		/ input
	movaps	(%KEYP), %KEY		/ key

#ifdef OPENSSL_INTERFACE
	mov	240(%KEYP), %NROUNDS32	/ round count
#else	/* OpenSolaris Interface */
	/* Round count is already present as P2 in %rsi/%esi */
#endif	/* OPENSSL_INTERFACE */

	pxor	%KEY, %STATE0		/ round 0
	lea	0x30(%KEYP), %KEYP
	cmp	$12, %NROUNDS
	jb	.Lenc128
	lea	0x20(%KEYP), %KEYP
	je	.Lenc192

	/ AES 256
	lea	0x20(%KEYP), %KEYP
	AES256_ENC_ROUNDS(STATE0)

.align 4
.Lenc192:
	/ AES 192 and 256
	AES192_ENC_ROUNDS(STATE0)

.align 4
.Lenc128:
	/ AES 128, 192, and 256
	AES128_ENC_ROUNDS(STATE0)
	movups	%STATE0, (%OUTP)	/ output

	ret
	SET_SIZE(aes_encrypt_intel)
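/*
 * Round-key addressing note (added commentary): the "lea 0x30(%KEYP)"
 * above, plus one extra 0x20 step for AES192 and two for AES256, parks
 * %KEYP exactly 0x70 bytes before the last round key. With the schedule
 * base at B (round 0 key at B + 0x00), the shared AES128_ROUNDS tail can
 * always read its ten keys at the fixed offsets -0x20 through 0x70:
 *
 *	AES128: %KEYP = B + 0x30, rounds 1-10 at B + 0x10 .. B + 0xa0
 *	AES192: %KEYP = B + 0x50, rounds 1-12 at B + 0x10 .. B + 0xc0
 *	AES256: %KEYP = B + 0x70, rounds 1-14 at B + 0x10 .. B + 0xe0
 */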
/*
 * aes_decrypt_intel()
 * Decrypt a single block (in and out can overlap).
 *
 * For kernel code, caller is responsible for bracketing this call with
 * disabling kernel thread preemption and calling aes_accel_save/restore().
 *
 * Temporary register usage:
 *	%xmm0	Key
 *	%xmm8	State
 *
 * Original OpenSolaris Interface:
 * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
 *	const uint32_t ct[4], uint32_t pt[4])
 *
 * Original Intel OpenSSL Interface:
 * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
 *	const AES_KEY *key);
 */
ENTRY_NP(aes_decrypt_intel)
	movups	(%INP), %STATE0		/ input
	movaps	(%KEYP), %KEY		/ key

#ifdef OPENSSL_INTERFACE
	mov	240(%KEYP), %NROUNDS32	/ round count
#else	/* OpenSolaris Interface */
	/* Round count is already present as P2 in %rsi/%esi */
#endif	/* OPENSSL_INTERFACE */

	pxor	%KEY, %STATE0		/ round 0
	lea	0x30(%KEYP), %KEYP
	cmp	$12, %NROUNDS
	jb	.Ldec128
	lea	0x20(%KEYP), %KEYP
	je	.Ldec192

	/ AES 256
	lea	0x20(%KEYP), %KEYP
	AES256_DEC_ROUNDS(STATE0)

.align 4
.Ldec192:
	/ AES 192 and 256
	AES192_DEC_ROUNDS(STATE0)

.align 4
.Ldec128:
	/ AES 128, 192, and 256
	AES128_DEC_ROUNDS(STATE0)
	movups	%STATE0, (%OUTP)	/ output

	ret
	SET_SIZE(aes_decrypt_intel)

/* Does a pipelined load of eight input blocks into our AES state registers. */
#define	AES_LOAD_INPUT_8BLOCKS \
	movups	0x00(%INP), %STATE0; \
	movups	0x10(%INP), %STATE1; \
	movups	0x20(%INP), %STATE2; \
	movups	0x30(%INP), %STATE3; \
	movups	0x40(%INP), %STATE4; \
	movups	0x50(%INP), %STATE5; \
	movups	0x60(%INP), %STATE6; \
	movups	0x70(%INP), %STATE7

/* Does a pipelined store of eight AES state registers to the output. */
#define	AES_STORE_OUTPUT_8BLOCKS \
	movups	%STATE0, 0x00(%OUTP); \
	movups	%STATE1, 0x10(%OUTP); \
	movups	%STATE2, 0x20(%OUTP); \
	movups	%STATE3, 0x30(%OUTP); \
	movups	%STATE4, 0x40(%OUTP); \
	movups	%STATE5, 0x50(%OUTP); \
	movups	%STATE6, 0x60(%OUTP); \
	movups	%STATE7, 0x70(%OUTP)

/* Performs a pipelined AES instruction with the key on all state registers. */
#define	AES_KEY_STATE_OP_8BLOCKS(op) \
	op	%KEY, %STATE0; \
	op	%KEY, %STATE1; \
	op	%KEY, %STATE2; \
	op	%KEY, %STATE3; \
	op	%KEY, %STATE4; \
	op	%KEY, %STATE5; \
	op	%KEY, %STATE6; \
	op	%KEY, %STATE7

/* XOR all AES state regs with key to initiate encryption/decryption. */
#define	AES_XOR_STATE_8BLOCKS \
	AES_KEY_STATE_OP_8BLOCKS(pxor)

/*
 * Loads a round key from the key schedule offset `off' into the KEY
 * register and performs `op' using the KEY on all 8 STATE registers.
 */
#define	AES_RND_8BLOCKS(op, off) \
	movaps	off(%KEYP), %KEY; \
	AES_KEY_STATE_OP_8BLOCKS(op)
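/*
 * Why eight blocks (added commentary): aesenc has a multi-cycle latency
 * but high issue throughput, so interleaving independent blocks hides
 * the latency. In rough C terms, AES_RND_8BLOCKS turns each round into
 *
 *	for (i = 0; i < 8; i++)		// independent ops, can overlap
 *		state[i] = aesenc(state[i], round_key);
 *
 * whereas a single-block loop would stall waiting for each result.
 */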
/*
 * void aes_encrypt_intel8(const uint32_t roundkeys[], int numrounds,
 *	const void *plaintext, void *ciphertext)
 *
 * Same as aes_encrypt_intel, but performs the encryption operation on
 * 8 independent blocks in sequence, exploiting instruction pipelining.
 * This function doesn't support the OpenSSL interface; it's only meant
 * for kernel use.
 */
ENTRY_NP(aes_encrypt_intel8)
	AES_LOAD_INPUT_8BLOCKS		/ load input
	movaps	(%KEYP), %KEY		/ key
	AES_XOR_STATE_8BLOCKS		/ round 0

	lea	0x30(%KEYP), %KEYP	/ point to key schedule
	cmp	$12, %NROUNDS		/ determine AES variant
	jb	.Lenc8_128
	lea	0x20(%KEYP), %KEYP	/ AES192 has larger key schedule
	je	.Lenc8_192

	lea	0x20(%KEYP), %KEYP	/ AES256 has even larger key schedule
	AES_RND_8BLOCKS(aesenc, -0x60)	/ AES256 R.1
	AES_RND_8BLOCKS(aesenc, -0x50)	/ AES256 R.2

.align 4
.Lenc8_192:
	AES_RND_8BLOCKS(aesenc, -0x40)	/ AES192 R.1; AES256 R.3
	AES_RND_8BLOCKS(aesenc, -0x30)	/ AES192 R.2; AES256 R.4

.align 4
.Lenc8_128:
	AES_RND_8BLOCKS(aesenc, -0x20)	/ AES128 R.1; AES192 R.3; AES256 R.5
	AES_RND_8BLOCKS(aesenc, -0x10)	/ AES128 R.2; AES192 R.4; AES256 R.6
	AES_RND_8BLOCKS(aesenc, 0x00)	/ AES128 R.3; AES192 R.5; AES256 R.7
	AES_RND_8BLOCKS(aesenc, 0x10)	/ AES128 R.4; AES192 R.6; AES256 R.8
	AES_RND_8BLOCKS(aesenc, 0x20)	/ AES128 R.5; AES192 R.7; AES256 R.9
	AES_RND_8BLOCKS(aesenc, 0x30)	/ AES128 R.6; AES192 R.8; AES256 R.10
	AES_RND_8BLOCKS(aesenc, 0x40)	/ AES128 R.7; AES192 R.9; AES256 R.11
	AES_RND_8BLOCKS(aesenc, 0x50)	/ AES128 R.8; AES192 R.10; AES256 R.12
	AES_RND_8BLOCKS(aesenc, 0x60)	/ AES128 R.9; AES192 R.11; AES256 R.13
	AES_RND_8BLOCKS(aesenclast, 0x70) / AES128 R.10; AES192 R.12; AES256 R.14

	AES_STORE_OUTPUT_8BLOCKS	/ store output
	ret
	SET_SIZE(aes_encrypt_intel8)
/*
 * void aes_decrypt_intel8(const uint32_t roundkeys[], int numrounds,
 *	const void *ciphertext, void *plaintext)
 *
 * Same as aes_decrypt_intel, but performs the decryption operation on
 * 8 independent blocks in sequence, exploiting instruction pipelining.
 * This function doesn't support the OpenSSL interface; it's only meant
 * for kernel use.
 */
ENTRY_NP(aes_decrypt_intel8)
	AES_LOAD_INPUT_8BLOCKS		/ load input
	movaps	(%KEYP), %KEY		/ key
	AES_XOR_STATE_8BLOCKS		/ round 0

	lea	0x30(%KEYP), %KEYP	/ point to key schedule
	cmp	$12, %NROUNDS		/ determine AES variant
	jb	.Ldec8_128
	lea	0x20(%KEYP), %KEYP	/ AES192 has larger key schedule
	je	.Ldec8_192

	lea	0x20(%KEYP), %KEYP	/ AES256 has even larger key schedule
	AES_RND_8BLOCKS(aesdec, -0x60)	/ AES256 R.1
	AES_RND_8BLOCKS(aesdec, -0x50)	/ AES256 R.2

.align 4
.Ldec8_192:
	AES_RND_8BLOCKS(aesdec, -0x40)	/ AES192 R.1; AES256 R.3
	AES_RND_8BLOCKS(aesdec, -0x30)	/ AES192 R.2; AES256 R.4

.align 4
.Ldec8_128:
	AES_RND_8BLOCKS(aesdec, -0x20)	/ AES128 R.1; AES192 R.3; AES256 R.5
	AES_RND_8BLOCKS(aesdec, -0x10)	/ AES128 R.2; AES192 R.4; AES256 R.6
	AES_RND_8BLOCKS(aesdec, 0x00)	/ AES128 R.3; AES192 R.5; AES256 R.7
	AES_RND_8BLOCKS(aesdec, 0x10)	/ AES128 R.4; AES192 R.6; AES256 R.8
	AES_RND_8BLOCKS(aesdec, 0x20)	/ AES128 R.5; AES192 R.7; AES256 R.9
	AES_RND_8BLOCKS(aesdec, 0x30)	/ AES128 R.6; AES192 R.8; AES256 R.10
	AES_RND_8BLOCKS(aesdec, 0x40)	/ AES128 R.7; AES192 R.9; AES256 R.11
	AES_RND_8BLOCKS(aesdec, 0x50)	/ AES128 R.8; AES192 R.10; AES256 R.12
	AES_RND_8BLOCKS(aesdec, 0x60)	/ AES128 R.9; AES192 R.11; AES256 R.13
	AES_RND_8BLOCKS(aesdeclast, 0x70) / AES128 R.10; AES192 R.12; AES256 R.14

	AES_STORE_OUTPUT_8BLOCKS	/ store output
	ret
	SET_SIZE(aes_decrypt_intel8)


/*
 * This macro encapsulates the entire AES encryption algorithm for a
 * single block, which is prefilled in statereg and which will be
 * replaced by the encrypted output. The KEYP register must already
 * point to the AES128 key schedule ("lea 0x30(%KEYP), %KEYP" from the
 * encryption function call) so that consecutive invocations of this
 * macro are supported (KEYP is restored after each invocation).
 */
#define	AES_ENC(statereg, label_128, label_192, label_out) \
	cmp	$12, %NROUNDS; \
	jb	label_128; \
	je	label_192; \
	/* AES 256 only */ \
	lea	0x40(%KEYP), %KEYP; \
	AES256_ENC_ROUNDS(statereg); \
	AES192_ENC_ROUNDS(statereg); \
	AES128_ENC_ROUNDS(statereg); \
	lea	-0x40(%KEYP), %KEYP; \
	jmp	label_out; \
.align 4; \
label_192: \
	lea	0x20(%KEYP), %KEYP; \
	/* AES 192 only */ \
	AES192_ENC_ROUNDS(statereg); \
	AES128_ENC_ROUNDS(statereg); \
	lea	-0x20(%KEYP), %KEYP; \
	jmp	label_out; \
.align 4; \
label_128: \
	/* AES 128 only */ \
	AES128_ENC_ROUNDS(statereg); \
.align 4; \
label_out:


/*
 * void aes_encrypt_cbc_intel8(const uint32_t roundkeys[], int numrounds,
 *	const void *plaintext, void *ciphertext, const void *IV)
 *
 * Encrypts 8 consecutive AES blocks in the CBC mode. Input and output
 * may overlap. This provides a modest performance boost over invoking
 * the encryption and XOR in separate functions because we can avoid
 * copying the ciphertext block to and from memory between encryption
 * and XOR calls.
 */
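/*
 * CBC chaining model (a sketch of what the function below computes):
 *
 *	c[0] = AES(key, p[0] ^ iv);
 *	for (i = 1; i < 8; i++)
 *		c[i] = AES(key, p[i] ^ c[i - 1]);
 *
 * The dependency of block i on ciphertext block i-1 makes CBC encryption
 * inherently serial; the win here is keeping c[i-1] in a register rather
 * than bouncing it through memory between calls.
 */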
#define	CBC_IV			r8	/* input - IV blk pointer */
#define	CBC_IV_XMM		xmm1	/* tmp IV location for alignment */

ENTRY_NP(aes_encrypt_cbc_intel8)
	AES_LOAD_INPUT_8BLOCKS		/ load input
	movaps	(%KEYP), %KEY		/ key
	AES_XOR_STATE_8BLOCKS		/ round 0

	lea	0x30(%KEYP), %KEYP	/ point to key schedule
	movdqu	(%CBC_IV), %CBC_IV_XMM	/ load IV from unaligned memory
	pxor	%CBC_IV_XMM, %STATE0	/ XOR IV with input block and encrypt
	AES_ENC(STATE0, .Lenc_cbc_0_128, .Lenc_cbc_0_192, .Lenc_cbc_0_out)
	pxor	%STATE0, %STATE1
	AES_ENC(STATE1, .Lenc_cbc_1_128, .Lenc_cbc_1_192, .Lenc_cbc_1_out)
	pxor	%STATE1, %STATE2
	AES_ENC(STATE2, .Lenc_cbc_2_128, .Lenc_cbc_2_192, .Lenc_cbc_2_out)
	pxor	%STATE2, %STATE3
	AES_ENC(STATE3, .Lenc_cbc_3_128, .Lenc_cbc_3_192, .Lenc_cbc_3_out)
	pxor	%STATE3, %STATE4
	AES_ENC(STATE4, .Lenc_cbc_4_128, .Lenc_cbc_4_192, .Lenc_cbc_4_out)
	pxor	%STATE4, %STATE5
	AES_ENC(STATE5, .Lenc_cbc_5_128, .Lenc_cbc_5_192, .Lenc_cbc_5_out)
	pxor	%STATE5, %STATE6
	AES_ENC(STATE6, .Lenc_cbc_6_128, .Lenc_cbc_6_192, .Lenc_cbc_6_out)
	pxor	%STATE6, %STATE7
	AES_ENC(STATE7, .Lenc_cbc_7_128, .Lenc_cbc_7_192, .Lenc_cbc_7_out)

	AES_STORE_OUTPUT_8BLOCKS	/ store output
	ret
	SET_SIZE(aes_encrypt_cbc_intel8)

/*
 * Prefills register state with counters suitable for the CTR encryption
 * mode. The counter is assumed to consist of two portions:
 * - A lower monotonically increasing 64-bit counter. If the caller wants
 *   a smaller counter, they are responsible for checking that it doesn't
 *   overflow between encryption calls.
 * - An upper static "nonce" portion, in big endian, preloaded into the
 *   lower portion of an XMM register.
 * This macro adds `ctridx' to the lower_LE counter, swaps it to big
 * endian and by way of a temporary general-purpose register loads the
 * lower and upper counter portions into a target XMM result register,
 * which can then be handed off to the encryption process.
 */
#define	PREP_CTR_BLOCKS(lower_LE, upper_BE_xmm, ctridx, tmpreg, resreg) \
	lea	ctridx(%lower_LE), %tmpreg; \
	bswap	%tmpreg; \
	movq	%tmpreg, %resreg; \
	movlhps	%upper_BE_xmm, %resreg; \
	pshufd	$0b01001110, %resreg, %resreg

#define	CTR_UPPER_BE		r8	/* input - counter upper 64 bits (BE) */
#define	CTR_UPPER_BE_XMM	xmm1	/* tmp for upper counter bits */
#define	CTR_LOWER_LE		r9	/* input - counter lower 64 bits (LE) */
#define	CTR_TMP0		rax	/* tmp for lower 64 bit add & bswap */
#define	CTR_TMP1		rbx	/* tmp for lower 64 bit add & bswap */
#define	CTR_TMP2		r10	/* tmp for lower 64 bit add & bswap */
#define	CTR_TMP3		r11	/* tmp for lower 64 bit add & bswap */
#define	CTR_TMP4		r12	/* tmp for lower 64 bit add & bswap */
#define	CTR_TMP5		r13	/* tmp for lower 64 bit add & bswap */
#define	CTR_TMP6		r14	/* tmp for lower 64 bit add & bswap */
#define	CTR_TMP7		r15	/* tmp for lower 64 bit add & bswap */
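/*
 * Counter block layout sketch (added commentary): for block i,
 * PREP_CTR_BLOCKS builds the 16-byte big-endian counter block
 *
 *	uint64_t blk[2];
 *	blk[0] = counter_upper_BE;		  // nonce, already BE
 *	blk[1] = htobe64(counter_lower_LE + i);	  // i = 0 .. 7
 *
 * i.e. the nonce occupies bytes 0-7 and the byte-swapped lower counter
 * bytes 8-15 of the block handed to the AES rounds.
 */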
/*
 * These are used in case CTR encryption input is unaligned before XORing.
 * Must not overlap with any STATE[0-7] register.
 */
#define	TMP_INPUT0		xmm0
#define	TMP_INPUT1		xmm1
#define	TMP_INPUT2		xmm2
#define	TMP_INPUT3		xmm3
#define	TMP_INPUT4		xmm4
#define	TMP_INPUT5		xmm5
#define	TMP_INPUT6		xmm6
#define	TMP_INPUT7		xmm7

/*
 * void aes_ctr_intel8(const uint32_t roundkeys[], int numrounds,
 *	const void *input, void *output, uint64_t counter_upper_BE,
 *	uint64_t counter_lower_LE)
 *
 * Runs AES on 8 consecutive blocks in counter mode (encryption and
 * decryption in counter mode are the same).
 */
ENTRY_NP(aes_ctr_intel8)
	/* save caller's regs */
	pushq	%rbp
	movq	%rsp, %rbp
	subq	$0x38, %rsp
	/ CTR_TMP0 is rax, no need to save
	movq	%CTR_TMP1, -0x38(%rbp)
	movq	%CTR_TMP2, -0x30(%rbp)
	movq	%CTR_TMP3, -0x28(%rbp)
	movq	%CTR_TMP4, -0x20(%rbp)
	movq	%CTR_TMP5, -0x18(%rbp)
	movq	%CTR_TMP6, -0x10(%rbp)
	movq	%CTR_TMP7, -0x08(%rbp)

	/*
	 * CTR step 1: prepare big-endian formatted 128-bit counter values,
	 * placing the result in the AES-NI input state registers.
	 */
	movq	%CTR_UPPER_BE, %CTR_UPPER_BE_XMM
	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 0, CTR_TMP0, STATE0)
	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 1, CTR_TMP1, STATE1)
	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 2, CTR_TMP2, STATE2)
	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 3, CTR_TMP3, STATE3)
	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 4, CTR_TMP4, STATE4)
	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 5, CTR_TMP5, STATE5)
	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 6, CTR_TMP6, STATE6)
	PREP_CTR_BLOCKS(CTR_LOWER_LE, CTR_UPPER_BE_XMM, 7, CTR_TMP7, STATE7)

	/*
	 * CTR step 2: Encrypt the counters.
	 */
	movaps	(%KEYP), %KEY		/ key
	AES_XOR_STATE_8BLOCKS		/ round 0

	/* Determine the AES variant we're going to compute */
	lea	0x30(%KEYP), %KEYP	/ point to key schedule
	cmp	$12, %NROUNDS		/ determine AES variant
	jb	.Lctr8_128
	lea	0x20(%KEYP), %KEYP	/ AES192 has larger key schedule
	je	.Lctr8_192

	/* AES 256 */
	lea	0x20(%KEYP), %KEYP	/ AES256 has even larger key schedule
	AES_RND_8BLOCKS(aesenc, -0x60)	/ AES256 R.1
	AES_RND_8BLOCKS(aesenc, -0x50)	/ AES256 R.2

.align 4
.Lctr8_192:
	/* AES 192 and 256 */
	AES_RND_8BLOCKS(aesenc, -0x40)	/ AES192 R.1; AES256 R.3
	AES_RND_8BLOCKS(aesenc, -0x30)	/ AES192 R.2; AES256 R.4

.align 4
.Lctr8_128:
	/* AES 128, 192, and 256 */
	AES_RND_8BLOCKS(aesenc, -0x20)	/ AES128 R.1; AES192 R.3; AES256 R.5
	AES_RND_8BLOCKS(aesenc, -0x10)	/ AES128 R.2; AES192 R.4; AES256 R.6
	AES_RND_8BLOCKS(aesenc, 0x00)	/ AES128 R.3; AES192 R.5; AES256 R.7
	AES_RND_8BLOCKS(aesenc, 0x10)	/ AES128 R.4; AES192 R.6; AES256 R.8
	AES_RND_8BLOCKS(aesenc, 0x20)	/ AES128 R.5; AES192 R.7; AES256 R.9
	AES_RND_8BLOCKS(aesenc, 0x30)	/ AES128 R.6; AES192 R.8; AES256 R.10
	AES_RND_8BLOCKS(aesenc, 0x40)	/ AES128 R.7; AES192 R.9; AES256 R.11
	AES_RND_8BLOCKS(aesenc, 0x50)	/ AES128 R.8; AES192 R.10; AES256 R.12
	AES_RND_8BLOCKS(aesenc, 0x60)	/ AES128 R.9; AES192 R.11; AES256 R.13
	AES_RND_8BLOCKS(aesenclast, 0x70) / AES128 R.10; AES192 R.12; AES256 R.14
	/*
	 * CTR step 3: XOR input data blocks with encrypted counters to
	 * produce the result.
	 */
	mov	%INP, %rax		/ pxor requires alignment, so check
	andq	$0xf, %rax
	jnz	.Lctr_input_unaligned
	pxor	0x00(%INP), %STATE0
	pxor	0x10(%INP), %STATE1
	pxor	0x20(%INP), %STATE2
	pxor	0x30(%INP), %STATE3
	pxor	0x40(%INP), %STATE4
	pxor	0x50(%INP), %STATE5
	pxor	0x60(%INP), %STATE6
	pxor	0x70(%INP), %STATE7
	jmp	.Lctr_out

.align 4
.Lctr_input_unaligned:
	movdqu	0x00(%INP), %TMP_INPUT0
	movdqu	0x10(%INP), %TMP_INPUT1
	movdqu	0x20(%INP), %TMP_INPUT2
	movdqu	0x30(%INP), %TMP_INPUT3
	movdqu	0x40(%INP), %TMP_INPUT4
	movdqu	0x50(%INP), %TMP_INPUT5
	movdqu	0x60(%INP), %TMP_INPUT6
	movdqu	0x70(%INP), %TMP_INPUT7
	pxor	%TMP_INPUT0, %STATE0
	pxor	%TMP_INPUT1, %STATE1
	pxor	%TMP_INPUT2, %STATE2
	pxor	%TMP_INPUT3, %STATE3
	pxor	%TMP_INPUT4, %STATE4
	pxor	%TMP_INPUT5, %STATE5
	pxor	%TMP_INPUT6, %STATE6
	pxor	%TMP_INPUT7, %STATE7

.align 4
.Lctr_out:
	/*
	 * CTR step 4: Write out processed blocks to memory.
	 */
	movdqu	%STATE0, 0x00(%OUTP)
	movdqu	%STATE1, 0x10(%OUTP)
	movdqu	%STATE2, 0x20(%OUTP)
	movdqu	%STATE3, 0x30(%OUTP)
	movdqu	%STATE4, 0x40(%OUTP)
	movdqu	%STATE5, 0x50(%OUTP)
	movdqu	%STATE6, 0x60(%OUTP)
	movdqu	%STATE7, 0x70(%OUTP)

	/* restore caller's regs */
	/ CTR_TMP0 is rax, no need to restore
	movq	-0x38(%rbp), %CTR_TMP1
	movq	-0x30(%rbp), %CTR_TMP2
	movq	-0x28(%rbp), %CTR_TMP3
	movq	-0x20(%rbp), %CTR_TMP4
	movq	-0x18(%rbp), %CTR_TMP5
	movq	-0x10(%rbp), %CTR_TMP6
	movq	-0x08(%rbp), %CTR_TMP7
	leave
	ret
	SET_SIZE(aes_ctr_intel8)

#endif	/* lint || __lint */