/*
 * Copyright (c) 2013, CRYPTOGAMS by <appro@openssl.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  * Redistributions of source code must retain copyright notices,
 *    this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 *  * Neither the name of the CRYPTOGAMS nor the names of its
 *    copyright holder and contributors may be used to endorse or
 *    promote products derived from this software without specific
 *    prior written permission.
 *
 * ALTERNATIVELY, provided that this notice is retained in full, this
 * product may be distributed under the terms of the GNU General Public
 * License (GPL), in which case the provisions of the GPL apply INSTEAD OF
 * those given above.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright 2015 by Saso Kiselkov on Illumos port sections.
 */
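
/*
 * GHASH, the authentication component of AES-GCM, implemented with the
 * x86 carry-less multiplication instruction (PCLMULQDQ). Derived from
 * the CRYPTOGAMS ghash-x86_64 module (see copyright notice above).
 */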

#if defined(lint) || defined(__lint)

#include <sys/types.h>

/*ARGSUSED*/
void
gcm_ghash_clmul(uint64_t ghash[2], const uint8_t Htable[256],
    const uint8_t *inp, size_t len)
{
}

/*ARGSUSED*/
void
gcm_init_clmul(const uint64_t hash_init[2], uint8_t Htable[256])
{
}

#else	/* lint */

#include <sys/asm_linkage.h>
#include <sys/controlregs.h>
#ifdef _KERNEL
#include <sys/machprivregs.h>
#endif

	.text
	.align	XMM_ALIGN
.Lbyte_swap16_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.L0x1c2_polynomial:
	.byte	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xc2
.L7_mask:
	.long	7, 0, 7, 0

#define	Xi	xmm0	/* hash value */
#define	Xhi	xmm1	/* hash value high order 64 bits */
#define	Hkey	xmm2	/* hash key */
#define	T1	xmm3	/* temp1 */
#define	T2	xmm4	/* temp2 */
#define	T3	xmm5	/* temp3 */
#define	Xb0	xmm6	/* cipher block #0 */
#define	Xb1	xmm7	/* cipher block #1 */
#define	Xb2	xmm8	/* cipher block #2 */
#define	Xb3	xmm9	/* cipher block #3 */
#define	Xb4	xmm10	/* cipher block #4 */
#define	Xb5	xmm11	/* cipher block #5 */
#define	Xb6	xmm12	/* cipher block #6 */
#define	Xb7	xmm13	/* cipher block #7 */

#define	clmul64x64_T2(tmpreg) \
	movdqa	%Xi, %Xhi; \
	pshufd	$0b01001110, %Xi, %T1; \
	pxor	%Xi, %T1; \
	\
	pclmulqdq	$0x00, %Hkey, %Xi; \
	pclmulqdq	$0x11, %Hkey, %Xhi; \
	pclmulqdq	$0x00, %tmpreg, %T1; \
	pxor	%Xi, %T1; \
	pxor	%Xhi, %T1; \
	\
	movdqa	%T1, %T2; \
	psrldq	$8, %T1; \
	pslldq	$8, %T2; \
	pxor	%T1, %Xhi; \
	pxor	%T2, %Xi

#define	reduction_alg9 \
	/* 1st phase */ \
	movdqa	%Xi, %T2; \
	movdqa	%Xi, %T1; \
	psllq	$5, %Xi; \
	pxor	%Xi, %T1; \
	psllq	$1, %Xi; \
	pxor	%T1, %Xi; \
	psllq	$57, %Xi; \
	movdqa	%Xi, %T1; \
	pslldq	$8, %Xi; \
	psrldq	$8, %T1; \
	pxor	%T2, %Xi; \
	pxor	%T1, %Xhi; \
	/* 2nd phase */ \
	movdqa	%Xi, %T2; \
	psrlq	$1, %Xi; \
	pxor	%T2, %Xhi; \
	pxor	%Xi, %T2; \
	psrlq	$5, %Xi; \
	pxor	%T2, %Xi; \
	psrlq	$1, %Xi; \
	pxor	%Xhi, %Xi

#define	Xip	rdi
#define	Htbl	rsi
#define	inp	rdx
#define	len	rcx

#define	Xln	xmm6
#define	Xmn	xmm7
#define	Xhn	xmm8
#define	Hkey2	xmm9
#define	HK	xmm10
#define	Xl	xmm11
#define	Xm	xmm12
#define	Xh	xmm13
#define	Hkey3	xmm14
#define	Hkey4	xmm15
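
/*
 * The clmul64x64_T2 and reduction_alg9 macros above implement one GHASH
 * step. clmul64x64_T2 forms the 256-bit carry-less product of %Xi and
 * %Hkey using the Karatsuba trick, i.e. three PCLMULQDQ multiplies
 * instead of four. In sketch form ("*" denoting carry-less
 * multiplication, "^" denoting XOR):
 *
 *	lo  = Xi.lo * H.lo				(pclmulqdq $0x00)
 *	hi  = Xi.hi * H.hi				(pclmulqdq $0x11)
 *	mid = ((Xi.lo ^ Xi.hi) * (H.lo ^ H.hi)) ^ lo ^ hi
 *	Xhi:Xi = hi:lo ^ (mid << 64)
 *
 * The tmpreg argument must hold H.lo^H.hi in its low qword, which
 * gcm_init_clmul precomputes into Htable (the Karatsuba "salt" below).
 * reduction_alg9 then folds the 256-bit product in Xhi:Xi back to 128
 * bits modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1 (in GHASH's
 * bit-reflected representation) in two phases of shifts and XORs; the
 * name presumably refers to Algorithm 9 of Intel's carry-less
 * multiplication white paper. gcm_ghash_clmul additionally aggregates
 * up to four input blocks per reduction using the precomputed powers
 * H^1..H^4, paying one reduction for every four multiplies.
 */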

/*
 * void gcm_ghash_clmul(uint64_t ghash[2], const uint8_t Htable[256],
 *	const uint8_t *inp, size_t len)
 */
ENTRY_NP(gcm_ghash_clmul)
	movdqa	.Lbyte_swap16_mask(%rip), %T3
	mov	$0xA040608020C0E000, %rax	/ ((7..0) * 0xE0) & 0xff

	movdqu	(%Xip), %Xi
	movdqu	(%Htbl), %Hkey
	movdqu	0x20(%Htbl), %HK
	pshufb	%T3, %Xi

	sub	$0x10, %len
	jz	.Lodd_tail

	movdqu	0x10(%Htbl), %Hkey2

	cmp	$0x30, %len
	jb	.Lskip4x

	sub	$0x30, %len
	movdqu	0x30(%Htbl), %Hkey3
	movdqu	0x40(%Htbl), %Hkey4

	/* Xi+4 = [(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P */
	movdqu	0x30(%inp), %Xln
	movdqu	0x20(%inp), %Xl
	pshufb	%T3, %Xln
	pshufb	%T3, %Xl
	movdqa	%Xln, %Xhn
	pshufd	$0b01001110, %Xln, %Xmn
	pxor	%Xln, %Xmn
	pclmulqdq	$0x00, %Hkey, %Xln
	pclmulqdq	$0x11, %Hkey, %Xhn
	pclmulqdq	$0x00, %HK, %Xmn

	movdqa	%Xl, %Xh
	pshufd	$0b01001110, %Xl, %Xm
	pxor	%Xl, %Xm
	pclmulqdq	$0x00, %Hkey2, %Xl
	pclmulqdq	$0x11, %Hkey2, %Xh
	xorps	%Xl, %Xln
	pclmulqdq	$0x10, %HK, %Xm
	xorps	%Xh, %Xhn
	movups	0x50(%Htbl), %HK
	xorps	%Xm, %Xmn

	movdqu	0x10(%inp), %Xl
	movdqu	0x00(%inp), %T1
	pshufb	%T3, %Xl
	pshufb	%T3, %T1
	movdqa	%Xl, %Xh
	pshufd	$0b01001110, %Xl, %Xm
	pxor	%T1, %Xi
	pxor	%Xl, %Xm
	pclmulqdq	$0x00, %Hkey3, %Xl
	movdqa	%Xi, %Xhi
	pshufd	$0b01001110, %Xi, %T1
	pxor	%Xi, %T1
	pclmulqdq	$0x11, %Hkey3, %Xh
	xorps	%Xl, %Xln
	pclmulqdq	$0x00, %HK, %Xm
	xorps	%Xh, %Xhn

	lea	0x40(%inp), %inp
	sub	$0x40, %len
	jc	.Ltail4x

	jmp	.Lmod4_loop

	.align	32
.Lmod4_loop:
	pclmulqdq	$0x00, %Hkey4, %Xi
	xorps	%Xm, %Xmn
	movdqu	0x30(%inp), %Xl
	pshufb	%T3, %Xl
	pclmulqdq	$0x11, %Hkey4, %Xhi
	xorps	%Xln, %Xi
	movdqu	0x20(%inp), %Xln
	movdqa	%Xl, %Xh
	pshufd	$0b01001110, %Xl, %Xm
	pclmulqdq	$0x10, %HK, %T1
	xorps	%Xhn, %Xhi
	pxor	%Xl, %Xm
	pshufb	%T3, %Xln
	movups	0x20(%Htbl), %HK
	pclmulqdq	$0x00, %Hkey, %Xl
	xorps	%Xmn, %T1
	movdqa	%Xln, %Xhn
	pshufd	$0b01001110, %Xln, %Xmn

	pxor	%Xi, %T1	/ aggregated Karatsuba post-processing
	pxor	%Xln, %Xmn
	pxor	%Xhi, %T1
	movdqa	%T1, %T2
	pslldq	$8, %T1
	pclmulqdq	$0x11, %Hkey, %Xh
	psrldq	$8, %T2
	pxor	%T1, %Xi
	movdqa	.L7_mask(%rip), %T1
	pxor	%T2, %Xhi
	movq	%rax, %T2

	pand	%Xi, %T1	/ 1st phase
	pshufb	%T1, %T2
	pclmulqdq	$0x00, %HK, %Xm
	pxor	%Xi, %T2
	psllq	$57, %T2
	movdqa	%T2, %T1
	pslldq	$8, %T2
	pclmulqdq	$0x00, %Hkey2, %Xln
	psrldq	$8, %T1
	pxor	%T2, %Xi
	pxor	%T1, %Xhi
	movdqu	0(%inp), %T1

	movdqa	%Xi, %T2	/ 2nd phase
	psrlq	$1, %Xi
	pclmulqdq	$0x11, %Hkey2, %Xhn
	xorps	%Xl, %Xln
	movdqu	0x10(%inp), %Xl
	pshufb	%T3, %Xl
	pclmulqdq	$0x10, %HK, %Xmn
	xorps	%Xh, %Xhn
	movups	0x50(%Htbl), %HK
	pshufb	%T3, %T1
	pxor	%T2, %Xhi
	pxor	%Xi, %T2
	psrlq	$5, %Xi

	movdqa	%Xl, %Xh
	pxor	%Xm, %Xmn
	pshufd	$0b01001110, %Xl, %Xm
	pxor	%Xl, %Xm
	pclmulqdq	$0x00, %Hkey3, %Xl
	pxor	%T2, %Xi
	pxor	%T1, %Xhi
	psrlq	$1, %Xi
	pclmulqdq	$0x11, %Hkey3, %Xh
	xorps	%Xl, %Xln
	pxor	%Xhi, %Xi

	pclmulqdq	$0x00, %HK, %Xm
	xorps	%Xh, %Xhn

	movdqa	%Xi, %Xhi
	pshufd	$0b01001110, %Xi, %T1
	pxor	%Xi, %T1

	lea	0x40(%inp), %inp
	sub	$0x40, %len
	jnc	.Lmod4_loop
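
/*
 * Fewer than four whole blocks remain. Finish the H^4..H^1 products
 * started above, merge them with the aggregated Karatsuba
 * post-processing, and pay a single reduction for the whole group.
 */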
.Ltail4x:
	pclmulqdq	$0x00, %Hkey4, %Xi
	xorps	%Xm, %Xmn
	pclmulqdq	$0x11, %Hkey4, %Xhi
	xorps	%Xln, %Xi
	pclmulqdq	$0x10, %HK, %T1
	xorps	%Xhn, %Xhi
	pxor	%Xi, %Xhi	/ aggregated Karatsuba post-processing
	pxor	%Xmn, %T1

	pxor	%Xhi, %T1
	pxor	%Xi, %Xhi

	movdqa	%T1, %T2
	psrldq	$8, %T1
	pslldq	$8, %T2
	pxor	%T1, %Xhi
	pxor	%T2, %Xi

	reduction_alg9

	add	$0x40, %len
	jz	.Ldone
	movdqu	0x20(%Htbl), %HK
	sub	$0x10, %len
	jz	.Lodd_tail
.Lskip4x:

	/*
	 * Xi+2 = [H*(Ii+1 + Xi+1)] mod P =
	 *	[(H*Ii+1) + (H*Xi+1)] mod P =
	 *	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	 */
	movdqu	(%inp), %T1	/ Ii
	movdqu	16(%inp), %Xln	/ Ii+1
	pshufb	%T3, %T1
	pshufb	%T3, %Xln
	pxor	%T1, %Xi	/ Ii+Xi

	movdqa	%Xln, %Xhn
	pshufd	$0b01001110, %Xln, %T1
	pxor	%Xln, %T1
	pclmulqdq	$0x00, %Hkey, %Xln
	pclmulqdq	$0x11, %Hkey, %Xhn
	pclmulqdq	$0x00, %HK, %T1

	lea	32(%inp), %inp	/ i+=2
	sub	$0x20, %len
	jbe	.Leven_tail
	jmp	.Lmod_loop

	.align	32
.Lmod_loop:
	movdqa	%Xi, %Xhi
	pshufd	$0b01001110, %Xi, %T2
	pxor	%Xi, %T2

	pclmulqdq	$0x00, %Hkey2, %Xi
	pclmulqdq	$0x11, %Hkey2, %Xhi
	pclmulqdq	$0x10, %HK, %T2

	pxor	%Xln, %Xi	/ (H*Ii+1) + H^2*(Ii+Xi)
	pxor	%Xhn, %Xhi
	movdqu	(%inp), %Xhn	/ Ii
	pshufb	%T3, %Xhn
	movdqu	16(%inp), %Xln	/ Ii+1

	pxor	%Xi, %T1	/ aggregated Karatsuba post-proc
	pxor	%Xhi, %T1
	pxor	%Xhn, %Xhi	/ "Ii+Xi", consume early
	pxor	%T1, %T2
	pshufb	%T3, %Xln
	movdqa	%T2, %T1
	psrldq	$8, %T1
	pslldq	$8, %T2
	pxor	%T1, %Xhi
	pxor	%T2, %Xi

	movdqa	%Xln, %Xhn

	movdqa	%Xi, %T2	/ 1st phase
	movdqa	%Xi, %T1
	psllq	$5, %Xi
	pclmulqdq	$0x00, %Hkey, %Xln
	pxor	%Xi, %T1
	psllq	$1, %Xi
	pxor	%T1, %Xi
	psllq	$57, %Xi
	movdqa	%Xi, %T1
	pslldq	$8, %Xi
	psrldq	$8, %T1
	pxor	%T2, %Xi
	pxor	%T1, %Xhi
	pshufd	$0b01001110, %Xhn, %T1
	pxor	%Xhn, %T1

	pclmulqdq	$0x11, %Hkey, %Xhn
	movdqa	%Xi, %T2	/ 2nd phase
	psrlq	$1, %Xi
	pxor	%T2, %Xhi
	pxor	%Xi, %T2
	psrlq	$5, %Xi
	pxor	%T2, %Xi
	psrlq	$1, %Xi
	pclmulqdq	$0x00, %HK, %T1
	pxor	%Xhi, %Xi

	lea	32(%inp), %inp
	sub	$0x20, %len
	ja	.Lmod_loop

.Leven_tail:
	movdqa	%Xi, %Xhi
	pshufd	$0b01001110, %Xi, %T2
	pxor	%Xi, %T2

	pclmulqdq	$0x00, %Hkey2, %Xi
	pclmulqdq	$0x11, %Hkey2, %Xhi
	pclmulqdq	$0x10, %HK, %T2

	pxor	%Xln, %Xi	/* (H*Ii+1) + H^2*(Ii+Xi) */
	pxor	%Xhn, %Xhi
	pxor	%Xi, %T1
	pxor	%Xhi, %T1
	pxor	%T1, %T2
	movdqa	%T2, %T1
	psrldq	$8, %T1
	pslldq	$8, %T2
	pxor	%T1, %Xhi
	pxor	%T2, %Xi

	reduction_alg9

	test	%len, %len
	jnz	.Ldone

.Lodd_tail:
	movdqu	(%inp), %T1	/ Ii
	pshufb	%T3, %T1
	pxor	%T1, %Xi	/ Ii+Xi

	clmul64x64_T2(HK)	/ H*(Ii+Xi)
	reduction_alg9

.Ldone:
	pshufb	%T3, %Xi
	movdqu	%Xi, (%Xip)

	ret
SET_SIZE(gcm_ghash_clmul)

/*
 * void gcm_init_clmul(const uint64_t hash_init[2], uint8_t Htable[256])
 */
ENTRY_NP(gcm_init_clmul)
	movdqu	(%Xip), %Hkey
	pshufd	$0b01001110, %Hkey, %Hkey	/ dword swap

	/ <<1 twist
	pshufd	$0b11111111, %Hkey, %T2	/ broadcast uppermost dword
	movdqa	%Hkey, %T1
	psllq	$1, %Hkey
	pxor	%T3, %T3
	psrlq	$63, %T1
	pcmpgtd	%T2, %T3	/ broadcast carry bit
	pslldq	$8, %T1
	por	%T1, %Hkey	/ H<<=1

	/ magic reduction
	pand	.L0x1c2_polynomial(%rip), %T3
	pxor	%T3, %Hkey	/ if(carry) H^=0x1c2_polynomial

	/ calculate H^2
	pshufd	$0b01001110, %Hkey, %HK
	movdqa	%Hkey, %Xi
	pxor	%Hkey, %HK

	clmul64x64_T2(HK)
	reduction_alg9
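
	/*
	 * %Xi now holds H^2. H and H^2 are stored below together with
	 * their Karatsuba "salt" (the lo and hi halves XORed together),
	 * in the layout gcm_ghash_clmul expects: H at 0x00(Htbl), H^2
	 * at 0x10, salt at 0x20, then H^3 at 0x30, H^4 at 0x40, and
	 * their salt at 0x50.
	 */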
	pshufd	$0b01001110, %Hkey, %T1
	pshufd	$0b01001110, %Xi, %T2
	pxor	%Hkey, %T1	/ Karatsuba pre-processing
	movdqu	%Hkey, 0x00(%Htbl)	/ save H
	pxor	%Xi, %T2	/ Karatsuba pre-processing
	movdqu	%Xi, 0x10(%Htbl)	/ save H^2
	palignr	$8, %T1, %T2	/ low part is H.lo^H.hi...
	movdqu	%T2, 0x20(%Htbl)	/ save Karatsuba "salt"

	clmul64x64_T2(HK)	/ H^3
	reduction_alg9

	movdqa	%Xi, %T3

	clmul64x64_T2(HK)	/ H^4
	reduction_alg9

	pshufd	$0b01001110, %T3, %T1
	pshufd	$0b01001110, %Xi, %T2
	pxor	%T3, %T1	/ Karatsuba pre-processing
	movdqu	%T3, 0x30(%Htbl)	/ save H^3
	pxor	%Xi, %T2	/ Karatsuba pre-processing
	movdqu	%Xi, 0x40(%Htbl)	/ save H^4
	palignr	$8, %T1, %T2	/ low part is H^3.lo^H^3.hi...
	movdqu	%T2, 0x50(%Htbl)	/ save Karatsuba "salt"

	ret
SET_SIZE(gcm_init_clmul)

#endif	/* lint || __lint */