4896 Performance improvements for KCF AES modes
--- old/usr/src/common/crypto/modes/amd64/gcm_intel.s
+++ new/usr/src/common/crypto/modes/amd64/gcm_intel.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2009 Intel Corporation
24 24 * All Rights Reserved.
25 25 */
26 26 /*
27 27 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
28 28 * Use is subject to license terms.
29 29 */
30 +/*
31 + * Copyright 2015 by Saso Kiselkov. All rights reserved.
32 + */
30 33
31 34 /*
32 35 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
33 36 * instructions. This file contains an accelerated
34 37 * Galois Field Multiplication implementation.
35 38 *
36 39 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
37 40 * carry-less multiplication. More information about PCLMULQDQ can be
38 41 * found at:
39 42 * http://software.intel.com/en-us/articles/
40 43 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
41 44 *
42 45 */
43 46
44 47 /*
45 48 * ====================================================================
46 49 * OpenSolaris OS modifications
47 50 *
48 51 * This source originates as file galois_hash_asm.c from
49 52 * Intel Corporation dated September 21, 2009.
50 53 *
51 54 * This OpenSolaris version has these major changes from the original source:
52 55 *
53 56 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
54 57 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
55 58 * definition for lint.
56 59 *
57 60 * 2. Formatted code, added comments, and added #includes and #defines.
58 61 *
59 62 * 3. If bit CR0.TS is set, clear it after kpreempt_disable() is called
60 63 *    and set it again before kpreempt_enable() is called.
61 64 *    If the TS bit is not set, save and restore the %xmm registers at the
62 65 *    beginning and end of function calls (%xmm* registers are not saved
63 66 *    and restored during kernel thread preemption).
64 67 *
65 68 * 4. Removed code to perform hashing. This is already done with C macro
66 69 * GHASH in gcm.c. For better performance, this removed code should be
67 70 * reintegrated in the future to replace the C GHASH macro.
68 71 *
69 72 * 5. Added code to byte swap 16-byte input and output.
70 73 *
71 74 * 6. Folded in comments from the original C source with embedded assembly
72 75 * (SB_w_shift_xor.c)
73 76 *
74 77 * 7. Renamed function and reordered parameters to match OpenSolaris:
75 78 * Intel interface:
76 79 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
77 80 * unsigned char *d, int length)
78 81 * OpenSolaris OS interface:
79 82 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
80 83 * ====================================================================
81 84 */
82 85
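Modification note 4 above points out that the hashing loop itself stays in the C GHASH macro in gcm.c and that only the Galois-field multiplication is implemented here. As a rough sketch only (not the actual gcm.c macro; the helper name ghash_step is made up for illustration), one GHASH step amounts to XORing the next 16-byte block into the running hash and then multiplying by the hash subkey H:

```c
#include <sys/types.h>

/*
 * Illustrative sketch only -- not the gcm.c GHASH macro. One GHASH step:
 * ghash <- (ghash ^ block) * H over GF(2^128), where the multiplication
 * is the operation gcm_mul_pclmulqdq() below implements.
 */
extern void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);

static void
ghash_step(uint64_t ghash[2], const uint64_t block[2], uint64_t H[2])
{
	ghash[0] ^= block[0];		/* fold the next block into the hash */
	ghash[1] ^= block[1];
	/* in-place output is fine: the asm loads both inputs before storing */
	gcm_mul_pclmulqdq(ghash, H, ghash);
}
```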
83 86
84 87 #if defined(lint) || defined(__lint)
85 88
86 89 #include <sys/types.h>
87 90
88 91 /* ARGSUSED */
89 92 void
90 93 gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
91 94 }
92 95
96 +#ifdef _KERNEL
97 +/*ARGSUSED*/
98 +void
99 +gcm_accel_save(void *savestate)
100 +{
101 +}
102 +
103 +/*ARGSUSED*/
104 +void
105 +gcm_accel_restore(void *savestate)
106 +{
107 +}
108 +#endif /* _KERNEL */
109 +
93 110 #else /* lint */
94 111
95 112 #include <sys/asm_linkage.h>
96 113 #include <sys/controlregs.h>
97 114 #ifdef _KERNEL
98 115 #include <sys/machprivregs.h>
99 116 #endif
100 117
101 118 #ifdef _KERNEL
102 119 /*
103 120 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is,
104 121 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
105 122 * uses it to pass P2 to syscall.
106 123 * This also occurs with the STTS macro, but we don't care if
107 124 * P2 (%rsi) is modified just before function exit.
108 125 * The CLTS and STTS macros push and pop P1 (%rdi) already.
109 126 */
110 127 #ifdef __xpv
111 128 #define PROTECTED_CLTS \
112 129 push %rsi; \
113 130 CLTS; \
114 131 pop %rsi
115 132 #else
116 133 #define PROTECTED_CLTS \
117 134 CLTS
118 135 #endif /* __xpv */
119 -
120 - /*
121 - * If CR0_TS is not set, align stack (with push %rbp) and push
122 - * %xmm0 - %xmm10 on stack, otherwise clear CR0_TS
123 - */
124 -#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
125 - push %rbp; \
126 - mov %rsp, %rbp; \
127 - movq %cr0, tmpreg; \
128 - testq $CR0_TS, tmpreg; \
129 - jnz 1f; \
130 - and $-XMM_ALIGN, %rsp; \
131 - sub $[XMM_SIZE * 11], %rsp; \
132 - movaps %xmm0, 160(%rsp); \
133 - movaps %xmm1, 144(%rsp); \
134 - movaps %xmm2, 128(%rsp); \
135 - movaps %xmm3, 112(%rsp); \
136 - movaps %xmm4, 96(%rsp); \
137 - movaps %xmm5, 80(%rsp); \
138 - movaps %xmm6, 64(%rsp); \
139 - movaps %xmm7, 48(%rsp); \
140 - movaps %xmm8, 32(%rsp); \
141 - movaps %xmm9, 16(%rsp); \
142 - movaps %xmm10, (%rsp); \
143 - jmp 2f; \
144 -1: \
145 - PROTECTED_CLTS; \
146 -2:
147 -
148 -
149 - /*
150 - * If CR0_TS was not set above, pop %xmm0 - %xmm10 off stack,
151 - * otherwise set CR0_TS.
152 - */
153 -#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
154 - testq $CR0_TS, tmpreg; \
155 - jnz 1f; \
156 - movaps (%rsp), %xmm10; \
157 - movaps 16(%rsp), %xmm9; \
158 - movaps 32(%rsp), %xmm8; \
159 - movaps 48(%rsp), %xmm7; \
160 - movaps 64(%rsp), %xmm6; \
161 - movaps 80(%rsp), %xmm5; \
162 - movaps 96(%rsp), %xmm4; \
163 - movaps 112(%rsp), %xmm3; \
164 - movaps 128(%rsp), %xmm2; \
165 - movaps 144(%rsp), %xmm1; \
166 - movaps 160(%rsp), %xmm0; \
167 - jmp 2f; \
168 -1: \
169 - STTS(tmpreg); \
170 -2: \
171 - mov %rbp, %rsp; \
172 - pop %rbp
173 -
174 -
175 -#else
176 -#define PROTECTED_CLTS
177 -#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
178 -#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
179 136 #endif /* _KERNEL */
180 137
181 -/*
182 - * Use this mask to byte-swap a 16-byte integer with the pshufb instruction
183 - */
184 -
185 -// static uint8_t byte_swap16_mask[] = {
186 -// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
187 138 .text
188 139 .align XMM_ALIGN
140 +/*
141 + * Use this mask to byte-swap a 16-byte integer with the pshufb instruction:
142 + * static uint8_t byte_swap16_mask[] = {
143 + * 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
144 + */
189 145 .Lbyte_swap16_mask:
190 146 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
191 147
148 +#ifdef _KERNEL
149 +/*
150 + * void gcm_accel_save(void *savestate)
151 + *
152 + * Saves the XMM0-XMM15 registers and CR0 to a temporary location pointed
153 + * to by the first argument and clears TS in CR0. This must be invoked before
154 + * executing accelerated GCM computations inside the kernel (and kernel
155 + * thread preemption must be disabled as well). The memory region to which
156 + * all state is saved must be at least 16x 128-bit + 64-bit long and must
157 + * be 128-bit aligned.
158 + */
159 +ENTRY_NP(gcm_accel_save)
160 + movq %cr0, %rax
161 + movq %rax, 0x100(%rdi)
162 + testq $CR0_TS, %rax
163 + jnz 1f
164 + /* FPU is in use, save registers */
165 + movaps %xmm0, 0x00(%rdi)
166 + movaps %xmm1, 0x10(%rdi)
167 + movaps %xmm2, 0x20(%rdi)
168 + movaps %xmm3, 0x30(%rdi)
169 + movaps %xmm4, 0x40(%rdi)
170 + movaps %xmm5, 0x50(%rdi)
171 + movaps %xmm6, 0x60(%rdi)
172 + movaps %xmm7, 0x70(%rdi)
173 + movaps %xmm8, 0x80(%rdi)
174 + movaps %xmm9, 0x90(%rdi)
175 + movaps %xmm10, 0xa0(%rdi)
176 + movaps %xmm11, 0xb0(%rdi)
177 + movaps %xmm12, 0xc0(%rdi)
178 + movaps %xmm13, 0xd0(%rdi)
179 + movaps %xmm14, 0xe0(%rdi)
180 + movaps %xmm15, 0xf0(%rdi)
181 + ret
182 +1:
183 + PROTECTED_CLTS
184 + ret
185 + SET_SIZE(gcm_accel_save)
192 186
187 +/*
188 + * void gcm_accel_restore(void *savestate)
189 + *
190 + * Restores the saved XMM and CR0.TS state from gcm_accel_save().
191 + */
192 +ENTRY_NP(gcm_accel_restore)
193 + movq 0x100(%rdi), %rax
194 + testq $CR0_TS, %rax
195 + jnz 1f
196 + movaps 0x00(%rdi), %xmm0
197 + movaps 0x10(%rdi), %xmm1
198 + movaps 0x20(%rdi), %xmm2
199 + movaps 0x30(%rdi), %xmm3
200 + movaps 0x40(%rdi), %xmm4
201 + movaps 0x50(%rdi), %xmm5
202 + movaps 0x60(%rdi), %xmm6
203 + movaps 0x70(%rdi), %xmm7
204 + movaps 0x80(%rdi), %xmm8
205 + movaps 0x90(%rdi), %xmm9
206 + movaps 0xa0(%rdi), %xmm10
207 + movaps 0xb0(%rdi), %xmm11
208 + movaps 0xc0(%rdi), %xmm12
209 + movaps 0xd0(%rdi), %xmm13
210 + movaps 0xe0(%rdi), %xmm14
211 + movaps 0xf0(%rdi), %xmm15
212 + ret
213 +1:
214 + STTS(%rax)
215 + ret
216 + SET_SIZE(gcm_accel_restore)
193 217
218 +#endif /* _KERNEL */
219 +
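As the comments above (and Note2 below) describe, kernel callers must disable preemption and bracket accelerated GCM work with gcm_accel_save()/gcm_accel_restore(). A hypothetical caller-side sketch, not part of this patch (the wrapper name, the header choice, and the 264-byte save area are illustrative assumptions), might look like:

```c
#ifdef _KERNEL
#include <sys/types.h>
#include <sys/disp.h>		/* kpreempt_disable()/kpreempt_enable() */

extern void gcm_accel_save(void *savestate);
extern void gcm_accel_restore(void *savestate);
extern void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);

/* Illustrative wrapper: 16 XMM registers (16 bytes each) plus saved %cr0. */
static void
gcm_mul_fpu_safe(uint64_t *x, uint64_t *y, uint64_t *res)
{
	/* 16 * 16 + 8 = 264 bytes, 16-byte aligned as the comment requires */
	uint8_t fpu_save[16 * 16 + 8] __attribute__((aligned(16)));

	kpreempt_disable();		/* no thread switch while FPU is live */
	gcm_accel_save(fpu_save);	/* save %cr0 and, if the FPU was in
					   use, %xmm0-%xmm15; else clear TS */
	gcm_mul_pclmulqdq(x, y, res);
	gcm_accel_restore(fpu_save);	/* restore %xmm state or set TS back */
	kpreempt_enable();
}
#endif	/* _KERNEL */
```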
194 220 /*
195 221 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
196 222 *
197 223 * Perform a carry-less multiplication (combining partial products with
198 224 * XOR instead of addition) on P1 and P2 and place the result in P3.
199 225 *
200 226 * Byte swap the input and the output.
201 227 *
202 - * Note: x_in, y, and res all point to a block of 20-byte numbers
228 + * Note: x_in, y, and res all point to a block of 16-byte numbers
203 229 * (an array of two 64-bit integers).
204 230 *
205 - * Note2: For kernel code, caller is responsible for ensuring
206 - * kpreempt_disable() has been called. This is because %xmm registers are
207 - * not saved/restored. Clear and set the CR0.TS bit on entry and exit,
208 - * respectively, if TS is set on entry. Otherwise, if TS is not set,
209 - * save and restore %xmm registers on the stack.
231 + * Note2: For kernel code, caller is responsible for bracketing this call with
232 + * disabling kernel thread preemption and calling gcm_accel_save/restore().
210 233 *
211 234 * Note3: Original Intel definition:
212 235 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
213 236 * unsigned char *d, int length)
214 237 *
215 238 * Note4: Register/parameter mapping:
216 239 * Intel:
217 240 * Parameter 1: %rcx (copied to %xmm0) hk or x_in
218 241 * Parameter 2: %rdx (copied to %xmm1) s or y
219 242 * Parameter 3: %rdi (result) d or res
220 243 * OpenSolaris:
221 244 * Parameter 1: %rdi (copied to %xmm0) x_in
222 245 * Parameter 2: %rsi (copied to %xmm1) y
223 246 * Parameter 3: %rdx (result) res
224 247 */
225 248
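Before the assembly body, a plain-C model may help clarify what each pclmulqdq below computes: a 64x64 -> 128-bit multiplication in which partial products are combined with XOR rather than added. The routine builds the full 128x128-bit product from four such products (a0*b0, a0*b1, a1*b0 and a1*b1, per its own comments); the sketch below mirrors that schoolbook decomposition (helper names are made up for illustration, not part of this file):

```c
#include <sys/types.h>

/*
 * Bit-by-bit model of PCLMULQDQ: carry-less multiply of two 64-bit
 * operands, producing a 128-bit result in (*hi, *lo). Partial products
 * are folded in with XOR, so no carries propagate between bit positions.
 */
static void
clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t h = 0, l = 0;
	int i;

	for (i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			l ^= a << i;
			if (i != 0)
				h ^= a >> (64 - i);
		}
	}
	*hi = h;
	*lo = l;
}

/*
 * 128x128-bit carry-less multiply by schoolbook decomposition, mirroring
 * the four pclmulqdq instructions below: the middle terms a0*b1 and a1*b0
 * are XORed together and split across the 256-bit result r[3]:r[0].
 */
static void
clmul128(const uint64_t a[2], const uint64_t b[2], uint64_t r[4])
{
	uint64_t p00h, p00l, p01h, p01l, p10h, p10l, p11h, p11l;

	clmul64(a[0], b[0], &p00h, &p00l);	/* a0*b0 */
	clmul64(a[0], b[1], &p01h, &p01l);	/* a0*b1 */
	clmul64(a[1], b[0], &p10h, &p10l);	/* a1*b0 */
	clmul64(a[1], b[1], &p11h, &p11l);	/* a1*b1 */

	r[0] = p00l;
	r[1] = p00h ^ p01l ^ p10l;	/* low half of the middle terms */
	r[2] = p11l ^ p01h ^ p10h;	/* high half of the middle terms */
	r[3] = p11h;
}
```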
226 249 ENTRY_NP(gcm_mul_pclmulqdq)
227 - CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)
228 -
229 250 //
230 251 // Copy Parameters
231 252 //
232 253 movdqu (%rdi), %xmm0 // P1
233 254 movdqu (%rsi), %xmm1 // P2
234 255
235 256 //
236 257 // Byte swap 16-byte input
237 258 //
238 259 lea .Lbyte_swap16_mask(%rip), %rax
239 260 movaps (%rax), %xmm10
240 261 pshufb %xmm10, %xmm0
241 262 pshufb %xmm10, %xmm1
242 263
243 264
244 265 //
245 266 // Multiply with the hash key
246 267 //
247 268 movdqu %xmm0, %xmm3
248 269 pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0
249 270
250 271 movdqu %xmm0, %xmm4
251 272 pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1
252 273
253 274 movdqu %xmm0, %xmm5
254 275 pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0
255 276 movdqu %xmm0, %xmm6
256 277 pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1
257 278
258 279 pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0
259 280
260 281 movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5
261 282 psrldq $8, %xmm4 // shift xmm4 right by 64 bits
262 283 pslldq $8, %xmm5 // shift xmm5 left by 64 bits
263 284 pxor %xmm5, %xmm3
264 285 pxor %xmm4, %xmm6 // Register pair <xmm6:xmm3> holds the result
265 286 // of the carry-less multiplication of
266 287 // xmm0 by xmm1.
267 288
268 289 // We shift the result of the multiplication by one bit position
269 290 // to the left to account for the fact that the bits are reversed.
270 291 movdqu %xmm3, %xmm7
271 292 movdqu %xmm6, %xmm8
272 293 pslld $1, %xmm3
273 294 pslld $1, %xmm6
274 295 psrld $31, %xmm7
275 296 psrld $31, %xmm8
276 297 movdqu %xmm7, %xmm9
277 298 pslldq $4, %xmm8
278 299 pslldq $4, %xmm7
279 300 psrldq $12, %xmm9
280 301 por %xmm7, %xmm3
281 302 por %xmm8, %xmm6
282 303 por %xmm9, %xmm6
283 304
284 305 //
285 306 // First phase of the reduction
286 307 //
287 308 // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
288 309 // independently.
289 310 movdqu %xmm3, %xmm7
290 311 movdqu %xmm3, %xmm8
291 312 movdqu %xmm3, %xmm9
292 313 pslld $31, %xmm7 // packed right shift shifting << 31
293 314 pslld $30, %xmm8 // packed right shift shifting << 30
294 315 pslld $25, %xmm9 // packed right shift shifting << 25
295 316 pxor %xmm8, %xmm7 // xor the shifted versions
296 317 pxor %xmm9, %xmm7
297 318 movdqu %xmm7, %xmm8
298 319 pslldq $12, %xmm7
299 320 psrldq $4, %xmm8
300 321 pxor %xmm7, %xmm3 // first phase of the reduction complete
301 322
302 323 //
303 324 // Second phase of the reduction
304 325 //
305 326 // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
306 327 // shift operations.
307 328 movdqu %xmm3, %xmm2
308 329 movdqu %xmm3, %xmm4 // packed left shifting >> 1
309 330 movdqu %xmm3, %xmm5
310 331 psrld $1, %xmm2
311 332 psrld $2, %xmm4 // packed left shifting >> 2
312 333 psrld $7, %xmm5 // packed left shifting >> 7
313 334 pxor %xmm4, %xmm2 // xor the shifted versions
314 335 pxor %xmm5, %xmm2
315 336 pxor %xmm8, %xmm2
316 337 pxor %xmm2, %xmm3
317 338 pxor %xmm3, %xmm6 // the result is in xmm6
318 339
319 340 //
320 341 // Byte swap 16-byte result
321 342 //
322 343 pshufb %xmm10, %xmm6 // %xmm10 has the swap mask
323 344
324 345 //
325 346 // Store the result
326 347 //
327 348 movdqu %xmm6, (%rdx) // P3
328 349
329 350
330 351 //
331 352 // Cleanup and Return
332 353 //
333 - SET_TS_OR_POP_XMM_REGISTERS(%r10)
334 354 ret
335 355 SET_SIZE(gcm_mul_pclmulqdq)
336 356
337 357 #endif /* lint || __lint */
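The two "phase of the reduction" blocks in the function above fold the 256-bit carry-less product back into 128 bits modulo the GHASH polynomial g(x) = x^128 + x^7 + x^2 + x + 1; the hand-tuned shift constants also account for GCM's bit-reflected representation. Ignoring that reflection, a naive bit-at-a-time model of the same reduction (illustrative only; the function name is made up) is:

```c
#include <sys/types.h>

/*
 * Naive model of reducing a 256-bit carry-less product r[3]:r[2]:r[1]:r[0]
 * (little-endian 64-bit words) modulo g(x) = x^128 + x^7 + x^2 + x + 1.
 * Each set bit at position i >= 128 is cancelled by XORing in g(x)*x^(i-128).
 * The pclmulqdq code above achieves the same effect with a fixed two-phase
 * shift-and-XOR sequence, operating in the bit-reflected domain.
 */
static void
gf128_reduce(uint64_t r[4])
{
	static const int g_bits[] = { 128, 7, 2, 1, 0 };	/* bits of g(x) */
	int i, j;

	for (i = 255; i >= 128; i--) {
		if ((r[i / 64] >> (i % 64)) & 1) {
			for (j = 0; j < 5; j++) {
				int k = g_bits[j] + (i - 128);
				r[k / 64] ^= 1ULL << (k % 64);
			}
		}
	}
	/* the reduced result is now in r[1]:r[0]; r[3] and r[2] are zero */
}
```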