4896 Performance improvements for KCF AES modes

          --- old/usr/src/common/crypto/modes/amd64/gcm_intel.s
          +++ new/usr/src/common/crypto/modes/amd64/gcm_intel.s
[ 19 lines elided ]
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2009 Intel Corporation
  24   24   * All Rights Reserved.
  25   25   */
  26   26  /*
  27   27   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  28   28   * Use is subject to license terms.
  29   29   */
       30 +/*
       31 + * Copyright 2015 by Saso Kiselkov. All rights reserved.
       32 + */
  30   33  
  31   34  /*
  32   35   * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
  33   36   * instructions.  This file contains an accelerated
  34   37   * Galois Field Multiplication implementation.
  35   38   *
  36   39   * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
  37   40   * carry-less multiplication. More information about PCLMULQDQ can be
  38   41   * found at:
  39   42   * http://software.intel.com/en-us/articles/
[ 43 lines elided ]
  83   86  
  84   87  #if defined(lint) || defined(__lint)
  85   88  
  86   89  #include <sys/types.h>
  87   90  
  88   91  /* ARGSUSED */
  89   92  void
  90   93  gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
  91   94  }
  92   95  
       96 +#ifdef  _KERNEL
       97 +/*ARGSUSED*/
       98 +void
        99 +gcm_accel_save(void *savestate)
      100 +{
      101 +}
      102 +
      103 +/*ARGSUSED*/
      104 +void
      105 +gcm_accel_restore(void *savestate)
      106 +{
      107 +}
      108 +#endif  /* _KERNEL */
      109 +
  93  110  #else   /* lint */
  94  111  
  95  112  #include <sys/asm_linkage.h>
  96  113  #include <sys/controlregs.h>
  97  114  #ifdef _KERNEL
  98  115  #include <sys/machprivregs.h>
  99  116  #endif
 100  117  
 101  118  #ifdef _KERNEL
 102  119          /*
[ 6 lines elided ]
 109  126           */
 110  127  #ifdef __xpv
 111  128  #define PROTECTED_CLTS \
 112  129          push    %rsi; \
 113  130          CLTS; \
 114  131          pop     %rsi
 115  132  #else
 116  133  #define PROTECTED_CLTS \
 117  134          CLTS
 118  135  #endif  /* __xpv */
 119      -
 120      -        /*
 121      -         * If CR0_TS is not set, align stack (with push %rbp) and push
 122      -         * %xmm0 - %xmm10 on stack, otherwise clear CR0_TS
 123      -         */
 124      -#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
 125      -        push    %rbp; \
 126      -        mov     %rsp, %rbp; \
 127      -        movq    %cr0, tmpreg; \
 128      -        testq   $CR0_TS, tmpreg; \
 129      -        jnz     1f; \
 130      -        and     $-XMM_ALIGN, %rsp; \
 131      -        sub     $[XMM_SIZE * 11], %rsp; \
 132      -        movaps  %xmm0, 160(%rsp); \
 133      -        movaps  %xmm1, 144(%rsp); \
 134      -        movaps  %xmm2, 128(%rsp); \
 135      -        movaps  %xmm3, 112(%rsp); \
 136      -        movaps  %xmm4, 96(%rsp); \
 137      -        movaps  %xmm5, 80(%rsp); \
 138      -        movaps  %xmm6, 64(%rsp); \
 139      -        movaps  %xmm7, 48(%rsp); \
 140      -        movaps  %xmm8, 32(%rsp); \
 141      -        movaps  %xmm9, 16(%rsp); \
 142      -        movaps  %xmm10, (%rsp); \
 143      -        jmp     2f; \
 144      -1: \
 145      -        PROTECTED_CLTS; \
 146      -2:
 147      -
 148      -
 149      -        /*
 150      -         * If CR0_TS was not set above, pop %xmm0 - %xmm10 off stack,
 151      -         * otherwise set CR0_TS.
 152      -         */
 153      -#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
 154      -        testq   $CR0_TS, tmpreg; \
 155      -        jnz     1f; \
 156      -        movaps  (%rsp), %xmm10; \
 157      -        movaps  16(%rsp), %xmm9; \
 158      -        movaps  32(%rsp), %xmm8; \
 159      -        movaps  48(%rsp), %xmm7; \
 160      -        movaps  64(%rsp), %xmm6; \
 161      -        movaps  80(%rsp), %xmm5; \
 162      -        movaps  96(%rsp), %xmm4; \
 163      -        movaps  112(%rsp), %xmm3; \
 164      -        movaps  128(%rsp), %xmm2; \
 165      -        movaps  144(%rsp), %xmm1; \
 166      -        movaps  160(%rsp), %xmm0; \
 167      -        jmp     2f; \
 168      -1: \
 169      -        STTS(tmpreg); \
 170      -2: \
 171      -        mov     %rbp, %rsp; \
 172      -        pop     %rbp
 173      -
 174      -
 175      -#else
 176      -#define PROTECTED_CLTS
 177      -#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
 178      -#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
 179  136  #endif  /* _KERNEL */
 180  137  
 181      -/*
 182      - * Use this mask to byte-swap a 16-byte integer with the pshufb instruction
 183      - */
 184      -
 185      -// static uint8_t byte_swap16_mask[] = {
 186      -//       15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
 187  138  .text
 188  139  .align XMM_ALIGN
      140 +/*
      141 + * Use this mask to byte-swap a 16-byte integer with the pshufb instruction:
      142 + * static uint8_t byte_swap16_mask[] = {
      143 + *      15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
      144 + */
 189  145  .Lbyte_swap16_mask:
 190  146          .byte   15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 191  147  
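
For context, the mask above reverses the byte order of a 128-bit value when used with pshufb. A minimal user-space C sketch of the same operation, using the SSSE3 _mm_shuffle_epi8 intrinsic (not part of this change; compile with -mssse3):

        #include <stdint.h>
        #include <stdio.h>
        #include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 == pshufb */

        /* Same table as .Lbyte_swap16_mask: output byte i takes input byte 15-i. */
        static const uint8_t byte_swap16_mask[16] = {
                15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
        };

        int
        main(void)
        {
                uint8_t in[16], out[16];
                int i;

                for (i = 0; i < 16; i++)
                        in[i] = (uint8_t)i;     /* 00 01 ... 0f */

                __m128i x = _mm_loadu_si128((const __m128i *)in);
                __m128i m = _mm_loadu_si128((const __m128i *)byte_swap16_mask);
                _mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(x, m));

                for (i = 0; i < 16; i++)
                        printf("%02x ", out[i]); /* prints 0f 0e ... 01 00 */
                printf("\n");
                return (0);
        }
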
      148 +#ifdef  _KERNEL
      149 +/*
      151 + * void gcm_accel_save(void *savestate)
      151 + *
      152 + * Saves the XMM0-XMM15 registers and CR0 to a temporary location pointed
      153 + * to by the first argument and clears TS in CR0. This must be invoked before
      154 + * executing accelerated GCM computations inside the kernel (and kernel
      155 + * thread preemption must be disabled as well). The memory region to which
      156 + * all state is saved must be at least 16x 128-bit + 64-bit long and must
      157 + * be 128-bit aligned.
      158 + */
      159 +ENTRY_NP(gcm_accel_save)
      160 +        movq    %cr0, %rax
      161 +        movq    %rax, 0x100(%rdi)
      162 +        testq   $CR0_TS, %rax
      163 +        jnz     1f
      164 +        /* FPU is in use, save registers */
      165 +        movaps  %xmm0, 0x00(%rdi)
      166 +        movaps  %xmm1, 0x10(%rdi)
      167 +        movaps  %xmm2, 0x20(%rdi)
      168 +        movaps  %xmm3, 0x30(%rdi)
      169 +        movaps  %xmm4, 0x40(%rdi)
      170 +        movaps  %xmm5, 0x50(%rdi)
      171 +        movaps  %xmm6, 0x60(%rdi)
      172 +        movaps  %xmm7, 0x70(%rdi)
      173 +        movaps  %xmm8, 0x80(%rdi)
      174 +        movaps  %xmm9, 0x90(%rdi)
      175 +        movaps  %xmm10, 0xa0(%rdi)
      176 +        movaps  %xmm11, 0xb0(%rdi)
      177 +        movaps  %xmm12, 0xc0(%rdi)
      178 +        movaps  %xmm13, 0xd0(%rdi)
      179 +        movaps  %xmm14, 0xe0(%rdi)
      180 +        movaps  %xmm15, 0xf0(%rdi)
      181 +        ret
      182 +1:
      183 +        PROTECTED_CLTS
      184 +        ret
      185 +        SET_SIZE(gcm_accel_save)
 192  186  
      187 +/*
      188 + * void gcm_accel_restore(void *savestate)
      189 + *
      190 + * Restores the XMM and CR0.TS state saved by gcm_accel_save().
      191 + */
      192 +ENTRY_NP(gcm_accel_restore)
      193 +        movq    0x100(%rdi), %rax
      194 +        testq   $CR0_TS, %rax
      195 +        jnz     1f
      196 +        movaps  0x00(%rdi), %xmm0
      197 +        movaps  0x10(%rdi), %xmm1
      198 +        movaps  0x20(%rdi), %xmm2
      199 +        movaps  0x30(%rdi), %xmm3
      200 +        movaps  0x40(%rdi), %xmm4
      201 +        movaps  0x50(%rdi), %xmm5
      202 +        movaps  0x60(%rdi), %xmm6
      203 +        movaps  0x70(%rdi), %xmm7
      204 +        movaps  0x80(%rdi), %xmm8
      205 +        movaps  0x90(%rdi), %xmm9
      206 +        movaps  0xa0(%rdi), %xmm10
      207 +        movaps  0xb0(%rdi), %xmm11
      208 +        movaps  0xc0(%rdi), %xmm12
      209 +        movaps  0xd0(%rdi), %xmm13
      210 +        movaps  0xe0(%rdi), %xmm14
      211 +        movaps  0xf0(%rdi), %xmm15
      212 +        ret
      213 +1:
      214 +        STTS(%rax)
      215 +        ret
      216 +        SET_SIZE(gcm_accel_restore)
 193  217  
      218 +#endif  /* _KERNEL */
      219 +
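
To make the contract above concrete, here is a hedged C sketch of the save-area layout these routines expect and of the required call bracketing in kernel context. The struct and the wrapper function are hypothetical illustrations, not part of this change; kpreempt_disable()/kpreempt_enable() are the standard illumos preemption-control calls:

        #include <sys/types.h>
        #include <sys/disp.h>   /* assumed home of kpreempt_disable/enable */

        /*
         * Hypothetical view of the save area: XMM0-XMM15 at offsets
         * 0x00-0xf0, saved %cr0 at 0x100; must be 128-bit aligned.
         */
        typedef struct gcm_accel_state {
                uint8_t         gas_xmm[16][16];    /* 0x000: XMM0-XMM15 */
                uint64_t        gas_cr0;            /* 0x100: saved %cr0 */
        } gcm_accel_state_t;

        extern void gcm_accel_save(void *);
        extern void gcm_accel_restore(void *);
        extern void gcm_mul_pclmulqdq(uint64_t *, uint64_t *, uint64_t *);

        /* Hypothetical caller showing the required bracketing. */
        static void
        gcm_mul_bracketed(uint64_t *x_in, uint64_t *y, uint64_t *res)
        {
                gcm_accel_state_t state __attribute__((aligned(16)));

                kpreempt_disable();        /* keep FPU state on this CPU */
                gcm_accel_save(&state);    /* save XMM regs or clear CR0.TS */
                gcm_mul_pclmulqdq(x_in, y, res);
                gcm_accel_restore(&state); /* undo whatever save did */
                kpreempt_enable();
        }
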
 194  220  /*
 195  221   * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 196  222   *
 197  223   * Perform a carry-less multiplication (that is, use XOR instead of the
 198  224   * multiply operator) on P1 and P2 and place the result in P3.
 199  225   *
 200  226   * Byte swap the input and the output.
 201  227   *
 202      - * Note: x_in, y, and res all point to a block of 20-byte numbers
      228 + * Note: x_in, y, and res all point to a block of 16-byte numbers
 203  229   * (an array of two 64-bit integers).
 204  230   *
 205      - * Note2: For kernel code, caller is responsible for ensuring
 206      - * kpreempt_disable() has been called.  This is because %xmm registers are
 207      - * not saved/restored.  Clear and set the CR0.TS bit on entry and exit,
 208      - * respectively, if TS is set on entry.  Otherwise, if TS is not set,
 209      - * save and restore %xmm registers on the stack.
       231 + * Note2: For kernel code, the caller is responsible for disabling kernel
       232 + * thread preemption and bracketing this call with gcm_accel_save/restore().
 210  233   *
 211  234   * Note3: Original Intel definition:
 212  235   * void galois_hash_asm(unsigned char *hk, unsigned char *s,
 213  236   *      unsigned char *d, int length)
 214  237   *
 215  238   * Note4: Register/parameter mapping:
 216  239   * Intel:
 217  240   *      Parameter 1: %rcx (copied to %xmm0)     hk or x_in
 218  241   *      Parameter 2: %rdx (copied to %xmm1)     s or y
 219  242   *      Parameter 3: %rdi (result)              d or res
 220  243   * OpenSolaris:
 221  244   *      Parameter 1: %rdi (copied to %xmm0)     x_in
 222  245   *      Parameter 2: %rsi (copied to %xmm1)     y
 223  246   *      Parameter 3: %rdx (result)              res
 224  247   */
 225  248  
 226  249  ENTRY_NP(gcm_mul_pclmulqdq)
 227      -        CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)
 228      -
 229  250          //
 230  251          // Copy Parameters
 231  252          //
 232  253          movdqu  (%rdi), %xmm0   // P1
 233  254          movdqu  (%rsi), %xmm1   // P2
 234  255  
 235  256          //
 236  257          // Byte swap 16-byte input
 237  258          //
 238  259          lea     .Lbyte_swap16_mask(%rip), %rax
[ 84 lines elided ]
 323  344  
 324  345          //
 325  346          // Store the result
 326  347          //
 327  348          movdqu  %xmm6, (%rdx)   // P3
 328  349  
 329  350  
 330  351          //
 331  352          // Cleanup and Return
 332  353          //
 333      -        SET_TS_OR_POP_XMM_REGISTERS(%r10)
 334  354          ret
 335  355          SET_SIZE(gcm_mul_pclmulqdq)
 336  356  
 337  357  #endif  /* lint || __lint */
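
For readers unfamiliar with carry-less multiplication, a portable user-space illustration (assumed, not from this change) of what a single PCLMULQDQ lane computes: a schoolbook 64x64 -> 128 bit shift-and-add multiply with the additions replaced by XOR, i.e. multiplication of polynomials over GF(2):

        #include <stdint.h>
        #include <stdio.h>

        /* 64x64 -> 128 bit carry-less multiply: adds replaced by XOR. */
        static void
        clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
        {
                uint64_t h = 0, l = 0;
                int i;

                for (i = 0; i < 64; i++) {
                        if ((b >> i) & 1) {
                                l ^= a << i;
                                if (i != 0)
                                        h ^= a >> (64 - i);
                        }
                }
                *hi = h;
                *lo = l;
        }

        int
        main(void)
        {
                uint64_t hi, lo;

                /* (x + 1)^2 = x^2 + 1 over GF(2): 3 clmul 3 = 5, not 9 */
                clmul64(3, 3, &hi, &lo);
                printf("hi=%016llx lo=%016llx\n",
                    (unsigned long long)hi, (unsigned long long)lo);
                return (0);
        }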
    