/*
 * Copyright (c) 2013, CRYPTOGAMS by <appro@openssl.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * *	Redistributions of source code must retain copyright notices,
 *	this list of conditions and the following disclaimer.
 *
 * *	Redistributions in binary form must reproduce the above
 *	copyright notice, this list of conditions and the following
 *	disclaimer in the documentation and/or other materials
 *	provided with the distribution.
 *
 * *	Neither the name of the CRYPTOGAMS nor the names of its
 *	copyright holder and contributors may be used to endorse or
 *	promote products derived from this software without specific
 *	prior written permission.
 *
 * ALTERNATIVELY, provided that this notice is retained in full, this
 * product may be distributed under the terms of the GNU General Public
 * License (GPL), in which case the provisions of the GPL apply INSTEAD OF
 * those given above.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright 2015 by Saso Kiselkov on Illumos port sections.
 */

#if defined(lint) || defined(__lint)

#include <sys/types.h>

/*ARGSUSED*/
void
gcm_ghash_clmul(uint64_t ghash[2], const uint8_t Htable[256],
    const uint8_t *inp, size_t len)
{
}

/*ARGSUSED*/
void
gcm_init_clmul(const uint64_t hash_init[2], uint8_t Htable[256])
{
}

#else	/* lint */

#include <sys/asm_linkage.h>
#include <sys/controlregs.h>
#ifdef _KERNEL
#include <sys/machprivregs.h>
#endif
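
/*
 * GHASH acceleration using the PCLMULQDQ (carry-less multiply)
 * instruction, derived from the CRYPTOGAMS ghash-x86_64 code.
 * gcm_init_clmul() precomputes powers of the hash key (H, H^2, H^3,
 * H^4) together with Karatsuba pre-processed key halves, and
 * gcm_ghash_clmul() folds input blocks into the running hash value,
 * processing four blocks per iteration when enough data is available.
 */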
  67 
.text
.align	XMM_ALIGN
/* pshufb mask to byte-reverse a 128-bit value */
.Lbyte_swap16_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/* reduction constant for the bit-reflected GHASH polynomial */
.L0x1c2_polynomial:
	.byte	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xc2
/* mask to extract the low three bits of each qword */
.L7_mask:
	.long	7, 0, 7, 0

#define	Xi	xmm0	/* hash value */
#define	Xhi	xmm1	/* hash value high order 64 bits */
#define	Hkey	xmm2	/* hash key */
#define	T1	xmm3	/* temp1 */
#define	T2	xmm4	/* temp2 */
#define	T3	xmm5	/* temp3 */
#define	Xb0	xmm6	/* cipher block #0 */
#define	Xb1	xmm7	/* cipher block #1 */
#define	Xb2	xmm8	/* cipher block #2 */
#define	Xb3	xmm9	/* cipher block #3 */
#define	Xb4	xmm10	/* cipher block #4 */
#define	Xb5	xmm11	/* cipher block #5 */
#define	Xb6	xmm12	/* cipher block #6 */
#define	Xb7	xmm13	/* cipher block #7 */

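/*
 * clmul64x64_T2: Karatsuba 128 x 128 -> 256 bit carry-less multiply
 * of Xi by Hkey, built from three 64 x 64 PCLMULQDQs.  It computes
 * lo = Xi.lo * H.lo, hi = Xi.hi * H.hi and
 * mid = (Xi.lo ^ Xi.hi) * (H.lo ^ H.hi), then folds the middle term
 * into both halves, leaving the product in Xhi:Xi.  tmpreg must hold
 * H.lo ^ H.hi in its low 64 bits.
 */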
#define	clmul64x64_T2(tmpreg)				\
	movdqa		%Xi, %Xhi;			\
	pshufd		$0b01001110, %Xi, %T1;		\
	pxor		%Xi, %T1;			\
							\
	pclmulqdq	$0x00, %Hkey, %Xi;		\
	pclmulqdq	$0x11, %Hkey, %Xhi;		\
	pclmulqdq	$0x00, %tmpreg, %T1;		\
	pxor		%Xi, %T1;			\
	pxor		%Xhi, %T1;			\
							\
	movdqa		%T1, %T2;			\
	psrldq		$8, %T1;			\
	pslldq		$8, %T2;			\
	pxor		%T1, %Xhi;			\
	pxor		%T2, %Xi

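/*
 * reduction_alg9: fast reduction of the 256-bit product in Xhi:Xi
 * modulo the bit-reflected GHASH polynomial x^128 + x^7 + x^2 + x + 1.
 * The first phase multiplies the low half by x^63 + x^62 + x^57 (the
 * shifts by 5, 1 and 57 combined); the second phase folds the result
 * back with right shifts by 1, 5 and 1, leaving the reduced 128-bit
 * hash value in Xi.
 */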
#define	reduction_alg9					\
	/* 1st phase */					\
	movdqa		%Xi, %T2;			\
	movdqa		%Xi, %T1;			\
	psllq		$5, %Xi;			\
	pxor		%Xi, %T1;			\
	psllq		$1, %Xi;			\
	pxor		%T1, %Xi;			\
	psllq		$57, %Xi;			\
	movdqa		%Xi, %T1;			\
	pslldq		$8, %Xi;			\
	psrldq		$8, %T1;			\
	pxor		%T2, %Xi;			\
	pxor		%T1, %Xhi;			\
	/* 2nd phase */					\
	movdqa		%Xi, %T2;			\
	psrlq		$1, %Xi;			\
	pxor		%T2, %Xhi;			\
	pxor		%Xi, %T2;			\
	psrlq		$5, %Xi;			\
	pxor		%T2, %Xi;			\
	psrlq		$1, %Xi;			\
	pxor		%Xhi, %Xi

/* Function arguments (System V AMD64 ABI) */
#define	Xip	rdi	/* 1st arg: hash value / hash init vector */
#define	Htbl	rsi	/* 2nd arg: precomputed hash key table */
#define	inp	rdx	/* 3rd arg: input data */
#define	len	rcx	/* 4th arg: input length in bytes */

#define	Xln	xmm6	/* low product term (next blocks) */
#define	Xmn	xmm7	/* middle (Karatsuba) product term (next blocks) */
#define	Xhn	xmm8	/* high product term (next blocks) */
#define	Hkey2	xmm9	/* H^2 */
#define	HK	xmm10	/* Karatsuba pre-processed key halves */
#define	Xl	xmm11	/* low product term */
#define	Xm	xmm12	/* middle (Karatsuba) product term */
#define	Xh	xmm13	/* high product term */
#define	Hkey3	xmm14	/* H^3 */
#define	Hkey4	xmm15	/* H^4 */

/*
 * void gcm_ghash_clmul(uint64_t ghash[2], const uint8_t Htable[256],
 *	const uint8_t *inp, size_t len)
 *
 * Folds `len' bytes of input at `inp' (len is assumed to be a multiple
 * of the 16-byte GHASH block size) into the running hash value at
 * ghash, using the key powers precomputed by gcm_init_clmul().
 */
ENTRY_NP(gcm_ghash_clmul)
	movdqa		.Lbyte_swap16_mask(%rip), %T3
	mov		$0xA040608020C0E000, %rax	/ ((7..0) · 0xE0) & 0xff, carry-less

	movdqu		(%Xip), %Xi
	movdqu		(%Htbl), %Hkey
	movdqu		0x20(%Htbl), %HK
	pshufb		%T3, %Xi

	sub		$0x10, %len
	jz		.Lodd_tail

	movdqu		0x10(%Htbl), %Hkey2

	cmp		$0x30, %len
	jb		.Lskip4x

	sub		$0x30, %len
	movdqu		0x30(%Htbl), %Hkey3
	movdqu		0x40(%Htbl), %Hkey4

	/* Xi+4 = [(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P */
	movdqu		0x30(%inp), %Xln
	movdqu		0x20(%inp), %Xl
	pshufb		%T3, %Xln
	pshufb		%T3, %Xl
	movdqa		%Xln, %Xhn
	pshufd		$0b01001110, %Xln, %Xmn
	pxor		%Xln, %Xmn
	pclmulqdq	$0x00, %Hkey, %Xln
	pclmulqdq	$0x11, %Hkey, %Xhn
	pclmulqdq	$0x00, %HK, %Xmn

	movdqa		%Xl, %Xh
	pshufd		$0b01001110, %Xl, %Xm
	pxor		%Xl, %Xm
	pclmulqdq	$0x00, %Hkey2, %Xl
	pclmulqdq	$0x11, %Hkey2, %Xh
	xorps		%Xl, %Xln
	pclmulqdq	$0x10, %HK, %Xm
	xorps		%Xh, %Xhn
	movups		0x50(%Htbl), %HK
	xorps		%Xm, %Xmn

	movdqu		0x10(%inp), %Xl
	movdqu		0x00(%inp), %T1
	pshufb		%T3, %Xl
	pshufb		%T3, %T1
	movdqa		%Xl, %Xh
	pshufd		$0b01001110, %Xl, %Xm
	pxor		%T1, %Xi
	pxor		%Xl, %Xm
	pclmulqdq	$0x00, %Hkey3, %Xl
	movdqa		%Xi, %Xhi
	pshufd		$0b01001110, %Xi, %T1
	pxor		%Xi, %T1
	pclmulqdq	$0x11, %Hkey3, %Xh
	xorps		%Xl, %Xln
	pclmulqdq	$0x00, %HK, %Xm
	xorps		%Xh, %Xhn

	lea		0x40(%inp), %inp
	sub		$0x40, %len
	jc		.Ltail4x

	jmp		.Lmod4_loop

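	/*
	 * Main 4x loop: each iteration multiplies four input blocks by
	 * H^4..H^1 and folds them into the hash, interleaving the two
	 * reduction phases of the previous result with the PCLMULQDQs
	 * for the next four blocks to keep the multiplier busy.
	 */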
.align	32
.Lmod4_loop:
	pclmulqdq	$0x00, %Hkey4, %Xi
	xorps		%Xm, %Xmn
	movdqu		0x30(%inp), %Xl
	pshufb		%T3, %Xl
	pclmulqdq	$0x11, %Hkey4, %Xhi
	xorps		%Xln, %Xi
	movdqu		0x20(%inp), %Xln
	movdqa		%Xl, %Xh
	pshufd		$0b01001110, %Xl, %Xm
	pclmulqdq	$0x10, %HK, %T1
	xorps		%Xhn, %Xhi
	pxor		%Xl, %Xm
	pshufb		%T3, %Xln
	movups		0x20(%Htbl), %HK
	pclmulqdq	$0x00, %Hkey, %Xl
	xorps		%Xmn, %T1
	movdqa		%Xln, %Xhn
	pshufd		$0b01001110, %Xln, %Xmn

	pxor		%Xi, %T1	/ aggregated Karatsuba post-processing
	pxor		%Xln, %Xmn
	pxor		%Xhi, %T1
	movdqa		%T1, %T2
	pslldq		$8, %T1
	pclmulqdq	$0x11, %Hkey, %Xh
	psrldq		$8, %T2
	pxor		%T1, %Xi
	movdqa		.L7_mask(%rip), %T1
	pxor		%T2, %Xhi
	movq		%rax, %T2

	pand		%Xi, %T1	/ 1st phase
	pshufb		%T1, %T2
	pclmulqdq	$0x00, %HK, %Xm
	pxor		%Xi, %T2
	psllq		$57, %T2
	movdqa		%T2, %T1
	pslldq		$8, %T2
	pclmulqdq	$0x00, %Hkey2, %Xln
	psrldq		$8, %T1
	pxor		%T2, %Xi
	pxor		%T1, %Xhi
	movdqu		0(%inp), %T1

	movdqa		%Xi, %T2	/ 2nd phase
	psrlq		$1, %Xi
	pclmulqdq	$0x11, %Hkey2, %Xhn
	xorps		%Xl, %Xln
	movdqu		0x10(%inp), %Xl
	pshufb		%T3, %Xl
	pclmulqdq	$0x10, %HK, %Xmn
	xorps		%Xh, %Xhn
	movups		0x50(%Htbl), %HK
	pshufb		%T3, %T1
	pxor		%T2, %Xhi
	pxor		%Xi, %T2
	psrlq		$5, %Xi

	movdqa		%Xl, %Xh
	pxor		%Xm, %Xmn
	pshufd		$0b01001110, %Xl, %Xm
	pxor		%Xl, %Xm
	pclmulqdq	$0x00, %Hkey3, %Xl
	pxor		%T2, %Xi
	pxor		%T1, %Xhi
	psrlq		$1, %Xi
	pclmulqdq	$0x11, %Hkey3, %Xh
	xorps		%Xl, %Xln
	pxor		%Xhi, %Xi

	pclmulqdq	$0x00, %HK, %Xm
	xorps		%Xh, %Xhn

	movdqa		%Xi, %Xhi
	pshufd		$0b01001110, %Xi, %T1
	pxor		%Xi, %T1

	lea		0x40(%inp), %inp
	sub		$0x40, %len
	jnc		.Lmod4_loop

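	/*
	 * Tail of the 4x path: finish the multiply-accumulate for the
	 * last four blocks and run the final reduction.
	 */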
.Ltail4x:
	pclmulqdq	$0x00, %Hkey4, %Xi
	xorps		%Xm, %Xmn
	pclmulqdq	$0x11, %Hkey4, %Xhi
	xorps		%Xln, %Xi
	pclmulqdq	$0x10, %HK, %T1
	xorps		%Xhn, %Xhi
	pxor		%Xi, %Xhi	/ aggregated Karatsuba post-processing
	pxor		%Xmn, %T1

	pxor		%Xhi, %T1
	pxor		%Xi, %Xhi

	movdqa		%T1, %T2
	psrldq		$8, %T1
	pslldq		$8, %T2
	pxor		%T1, %Xhi
	pxor		%T2, %Xi

	reduction_alg9

	add		$0x40, %len
	jz		.Ldone
	movdqu		0x20(%Htbl), %HK
	sub		$0x10, %len
	jz		.Lodd_tail
.Lskip4x:

	/*
	 * Xi+2 = [H*(Ii+1 + Xi+1)] mod P =
	 *	[(H*Ii+1) + (H*Xi+1)] mod P =
	 *	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	 */
	movdqu		(%inp), %T1		/ Ii
	movdqu		16(%inp), %Xln		/ Ii+1
	pshufb		%T3, %T1
	pshufb		%T3, %Xln
	pxor		%T1, %Xi		/ Ii+Xi

	movdqa		%Xln, %Xhn
	pshufd		$0b01001110, %Xln, %T1
	pxor		%Xln, %T1
	pclmulqdq	$0x00, %Hkey, %Xln
	pclmulqdq	$0x11, %Hkey, %Xhn
	pclmulqdq	$0x00, %HK, %T1

	lea		32(%inp), %inp		/ i += 2
	sub		$0x20, %len
	jbe		.Leven_tail
	jmp		.Lmod_loop

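	/*
	 * 2x loop: each iteration folds two blocks, multiplying the
	 * accumulated hash by H^2 and the newer input block by H.
	 */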
.align	32
.Lmod_loop:
	movdqa		%Xi, %Xhi
	pshufd		$0b01001110, %Xi, %T2
	pxor		%Xi, %T2

	pclmulqdq	$0x00, %Hkey2, %Xi
	pclmulqdq	$0x11, %Hkey2, %Xhi
	pclmulqdq	$0x10, %HK, %T2

	pxor		%Xln, %Xi		/ (H*Ii+1) + H^2*(Ii+Xi)
	pxor		%Xhn, %Xhi
	movdqu		(%inp), %Xhn		/ Ii
	pshufb		%T3, %Xhn
	movdqu		16(%inp), %Xln		/ Ii+1

	pxor		%Xi, %T1		/ aggregated Karatsuba post-proc
	pxor		%Xhi, %T1
	pxor		%Xhn, %Xhi		/ "Ii+Xi", consume early
	pxor		%T1, %T2
	pshufb		%T3, %Xln
	movdqa		%T2, %T1
	psrldq		$8, %T1
	pslldq		$8, %T2
	pxor		%T1, %Xhi
	pxor		%T2, %Xi

	movdqa		%Xln, %Xhn

	movdqa		%Xi, %T2		/ 1st phase
	movdqa		%Xi, %T1
	psllq		$5, %Xi
	pclmulqdq	$0x00, %Hkey, %Xln
	pxor		%Xi, %T1
	psllq		$1, %Xi
	pxor		%T1, %Xi
	psllq		$57, %Xi
	movdqa		%Xi, %T1
	pslldq		$8, %Xi
	psrldq		$8, %T1
	pxor		%T2, %Xi
	pxor		%T1, %Xhi
	pshufd		$0b01001110, %Xhn, %T1
	pxor		%Xhn, %T1

	pclmulqdq	$0x11, %Hkey, %Xhn
	movdqa		%Xi, %T2		/ 2nd phase
	psrlq		$1, %Xi
	pxor		%T2, %Xhi
	pxor		%Xi, %T2
	psrlq		$5, %Xi
	pxor		%T2, %Xi
	psrlq		$1, %Xi
	pclmulqdq	$0x00, %HK, %T1
	pxor		%Xhi, %Xi

	lea		32(%inp), %inp
	sub		$0x20, %len
	ja		.Lmod_loop

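	/*
	 * Even tail: two blocks remain; combine them as in the loop
	 * above, then reduce.
	 */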
.Leven_tail:
	movdqa		%Xi, %Xhi
	pshufd		$0b01001110, %Xi, %T2
	pxor		%Xi, %T2

	pclmulqdq	$0x00, %Hkey2, %Xi
	pclmulqdq	$0x11, %Hkey2, %Xhi
	pclmulqdq	$0x10, %HK, %T2

	pxor		%Xln, %Xi		/* (H*Ii+1) + H^2*(Ii+Xi) */
	pxor		%Xhn, %Xhi
	pxor		%Xi, %T1
	pxor		%Xhi, %T1
	pxor		%T1, %T2
	movdqa		%T2, %T1
	psrldq		$8, %T1
	pslldq		$8, %T2
	pxor		%T1, %Xhi
	pxor		%T2, %Xi

	reduction_alg9

	test		%len, %len
	jnz		.Ldone

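	/*
	 * Odd tail: a single block remains; Xi = [H*(Ii+Xi)] mod P.
	 */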
.Lodd_tail:
	movdqu		(%inp), %T1		/ Ii
	pshufb		%T3, %T1
	pxor		%T1, %Xi		/ Ii+Xi

	clmul64x64_T2(HK)			/ H*(Ii+Xi)
	reduction_alg9

.Ldone:
	pshufb		%T3, %Xi
	movdqu		%Xi, (%Xip)

	ret
	SET_SIZE(gcm_ghash_clmul)

/*
 * void gcm_init_clmul(const uint64_t hash_init[2], uint8_t Htable[256])
 *
 * Derives the hash key H from hash_init and precomputes the key powers
 * and Karatsuba constants used by gcm_ghash_clmul().
 */
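/*
 * Htable layout (16 bytes per entry): 0x00 H, 0x10 H^2, 0x20 Karatsuba
 * "salt" for H/H^2, 0x30 H^3, 0x40 H^4, 0x50 salt for H^3/H^4.
 */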
ENTRY_NP(gcm_init_clmul)
	movdqu		(%Xip), %Hkey
	pshufd		$0b01001110, %Hkey, %Hkey	/ dword swap

	/ <<1 twist
	pshufd		$0b11111111, %Hkey, %T2	/ broadcast uppermost dword
	movdqa		%Hkey, %T1
	psllq		$1, %Hkey
	pxor		%T3, %T3
	psrlq		$63, %T1
	pcmpgtd		%T2, %T3		/ broadcast carry bit
	pslldq		$8, %T1
	por		%T1, %Hkey		/ H<<=1

	/ magic reduction
	pand		.L0x1c2_polynomial(%rip), %T3
	pxor		%T3, %Hkey		/ if(carry) H^=0x1c2_polynomial

	/ calculate H^2
	pshufd		$0b01001110, %Hkey, %HK
	movdqa		%Hkey, %Xi
	pxor		%Hkey, %HK

	clmul64x64_T2(HK)
	reduction_alg9

	pshufd		$0b01001110, %Hkey, %T1
	pshufd		$0b01001110, %Xi, %T2
	pxor		%Hkey, %T1		/ Karatsuba pre-processing
	movdqu		%Hkey, 0x00(%Htbl)	/ save H
	pxor		%Xi, %T2		/ Karatsuba pre-processing
	movdqu		%Xi, 0x10(%Htbl)	/ save H^2
	palignr		$8, %T1, %T2		/ low part is H.lo^H.hi...
	movdqu		%T2, 0x20(%Htbl)	/ save Karatsuba "salt"

	clmul64x64_T2(HK)			/ H^3
	reduction_alg9

	movdqa		%Xi, %T3

	clmul64x64_T2(HK)			/ H^4
	reduction_alg9

	pshufd		$0b01001110, %T3, %T1
	pshufd		$0b01001110, %Xi, %T2
	pxor		%T3, %T1		/ Karatsuba pre-processing
	movdqu		%T3, 0x30(%Htbl)	/ save H^3
	pxor		%Xi, %T2		/ Karatsuba pre-processing
	movdqu		%Xi, 0x40(%Htbl)	/ save H^4
	palignr		$8, %T1, %T2		/ low part is H^3.lo^H^3.hi...
	movdqu		%T2, 0x50(%Htbl)	/ save Karatsuba "salt"

	ret
	SET_SIZE(gcm_init_clmul)

#endif	/* lint || __lint */