1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009 Intel Corporation
  24  * All Rights Reserved.
  25  */
  26 /*
  27  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  28  * Use is subject to license terms.
  29  */
  30 /*
  31  * Copyright 2015 by Saso Kiselkov. All rights reserved.
  32  */
  33 
  34 /*
  35  * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
  36  * instructions.  This file contains an accelerated
  37  * Galois Field Multiplication implementation.
  38  *
  39  * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
  40  * carry-less multiplication. More information about PCLMULQDQ can be
  41  * found at:
  42  * http://software.intel.com/en-us/articles/
  43  * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
  44  *
  45  */
  46 
  47 /*
  48  * ====================================================================
  49  * OpenSolaris OS modifications
  50  *
  51  * This source originates as file galois_hash_asm.c from
  52  * Intel Corporation dated September 21, 2009.
  53  *
  54  * This OpenSolaris version has these major changes from the original source:
  55  *
  56  * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
  57  * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
  58  * definition for lint.
  59  *
  60  * 2. Formatted code, added comments, and added #includes and #defines.
  61  *
 * 3. If bit CR0.TS is set, clear the TS bit after calling
 * kpreempt_disable() and set it again before calling kpreempt_enable().
 * If the TS bit is not set, save and restore the %xmm registers at the
 * beginning and end of function calls (%xmm* registers are not saved and
 * restored during kernel thread preemption).
  67  *
  68  * 4. Removed code to perform hashing.  This is already done with C macro
  69  * GHASH in gcm.c.  For better performance, this removed code should be
  70  * reintegrated in the future to replace the C GHASH macro.
  71  *
  72  * 5. Added code to byte swap 16-byte input and output.
  73  *
  74  * 6. Folded in comments from the original C source with embedded assembly
  75  * (SB_w_shift_xor.c)
  76  *
  77  * 7. Renamed function and reordered parameters to match OpenSolaris:
  78  * Intel interface:
  79  *      void galois_hash_asm(unsigned char *hk, unsigned char *s,
  80  *              unsigned char *d, int length)
  81  * OpenSolaris OS interface:
  82  *      void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
  83  * ====================================================================
  84  */
  85 
  86 
  87 #if defined(lint) || defined(__lint)
  88 
  89 #include <sys/types.h>
  90 
/*
 * lint(1B) stub for the accelerated assembly implementation below;
 * never executed — present only so the lint build sees the symbol.
 */
/* ARGSUSED */
void
gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
}
  95 
  96 #ifdef  _KERNEL
  97 /*ARGSUSED*/
  98 void
  99 gcm_intel_save(void *savestate)
 100 {
 101 }
 102 
 103 /*ARGSUSED*/
 104 void
 105 gcm_accel_restore(void *savestate)
 106 {
 107 }
 108 #endif  /* _KERNEL */
 109 
 110 #else   /* lint */
 111 
 112 #include <sys/asm_linkage.h>
 113 #include <sys/controlregs.h>
 114 #ifdef _KERNEL
 115 #include <sys/machprivregs.h>
 116 #endif
 117 
#ifdef _KERNEL
        /*
         * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv.  That is,
         * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
         * uses it to pass P2 to syscall.
         * This also occurs with the STTS macro, but we don't care if
         * P2 (%rsi) is modified just before function exit.
         * The CLTS and STTS macros push and pop P1 (%rdi) already.
         */
#ifdef __xpv
/* Xen paravirt: CLTS is a hypercall, so preserve %rsi around it */
#define PROTECTED_CLTS \
        push    %rsi; \
        CLTS; \
        pop     %rsi
#else
/* Bare metal: CLTS is a single privileged instruction; no save needed */
#define PROTECTED_CLTS \
        CLTS
#endif  /* __xpv */
#endif  /* _KERNEL */
 137 
.text
.align XMM_ALIGN
/*
 * Use this mask to byte-swap a 16-byte integer with the pshufb instruction:
 * static uint8_t byte_swap16_mask[] = {
 *      15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
 *
 * Loaded RIP-relative into %xmm10 by gcm_mul_pclmulqdq below; pshufb with
 * this mask reverses all 16 bytes of an XMM register.
 */
.Lbyte_swap16_mask:
        .byte   15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 147 
 148 #ifdef  _KERNEL
/*
 * void gcm_accel_save(void *savestate)
 *
 * Saves the XMM0--XMM15 registers and CR0 to a temporary location pointed
 * to in the first argument and clears TS in CR0. This must be invoked before
 * executing accelerated GCM computations inside the kernel (and kernel
 * thread preemption must be disabled as well). The memory region to which
 * all state is saved must be at least 16x 128-bit + 64-bit long and must
 * be 128-bit aligned.
 */
ENTRY_NP(gcm_accel_save)
        movq    %cr0, %rax
        movq    %rax, 0x100(%rdi)       /* save CR0 after the 16 XMM slots */
        testq   $CR0_TS, %rax
        jnz     1f
        /* FPU is in use, save registers */
        movaps  %xmm0, 0x00(%rdi)
        movaps  %xmm1, 0x10(%rdi)
        movaps  %xmm2, 0x20(%rdi)
        movaps  %xmm3, 0x30(%rdi)
        movaps  %xmm4, 0x40(%rdi)
        movaps  %xmm5, 0x50(%rdi)
        movaps  %xmm6, 0x60(%rdi)
        movaps  %xmm7, 0x70(%rdi)
        movaps  %xmm8, 0x80(%rdi)
        movaps  %xmm9, 0x90(%rdi)
        movaps  %xmm10, 0xa0(%rdi)
        movaps  %xmm11, 0xb0(%rdi)
        movaps  %xmm12, 0xc0(%rdi)
        movaps  %xmm13, 0xd0(%rdi)
        movaps  %xmm14, 0xe0(%rdi)
        movaps  %xmm15, 0xf0(%rdi)
        ret
1:
        /* CR0.TS was set: no live FPU state to save, just clear TS */
        PROTECTED_CLTS
        ret
        SET_SIZE(gcm_accel_save)
 186 
/*
 * void gcm_accel_restore(void *savestate)
 *
 * Restores the saved XMM and CR0.TS state from gcm_accel_save.
 */
ENTRY_NP(gcm_accel_restore)
        movq    0x100(%rdi), %rax       /* saved CR0 value */
        testq   $CR0_TS, %rax
        jnz     1f
        movaps  0x00(%rdi), %xmm0
        movaps  0x10(%rdi), %xmm1
        movaps  0x20(%rdi), %xmm2
        movaps  0x30(%rdi), %xmm3
        movaps  0x40(%rdi), %xmm4
        movaps  0x50(%rdi), %xmm5
        movaps  0x60(%rdi), %xmm6
        movaps  0x70(%rdi), %xmm7
        movaps  0x80(%rdi), %xmm8
        movaps  0x90(%rdi), %xmm9
        movaps  0xa0(%rdi), %xmm10
        movaps  0xb0(%rdi), %xmm11
        movaps  0xc0(%rdi), %xmm12
        movaps  0xd0(%rdi), %xmm13
        movaps  0xe0(%rdi), %xmm14
        movaps  0xf0(%rdi), %xmm15
        ret
1:
        /* TS was set on entry to gcm_accel_save: set it again */
        STTS(%rax)
        ret
        SET_SIZE(gcm_accel_restore)
 217 
 218 #endif  /* _KERNEL */
 219 
/*
 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 *
 * Perform a carry-less multiplication (that is, use XOR instead of the
 * multiply operator) on P1 and P2 and place the result in P3.
 *
 * Byte swap the input and the output.
 *
 * Note: x_in, y, and res all point to a block of 16-byte numbers
 * (an array of two 64-bit integers).
 *
 * Note2: For kernel code, caller is responsible for bracketing this call with
 * disabling kernel thread preemption and calling gcm_accel_save/restore().
 *
 * Note3: Original Intel definition:
 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
 *      unsigned char *d, int length)
 *
 * Note4: Register/parameter mapping:
 * Intel:
 *      Parameter 1: %rcx (copied to %xmm0)     hk or x_in
 *      Parameter 2: %rdx (copied to %xmm1)     s or y
 *      Parameter 3: %rdi (result)              d or res
 * OpenSolaris:
 *      Parameter 1: %rdi (copied to %xmm0)     x_in
 *      Parameter 2: %rsi (copied to %xmm1)     y
 *      Parameter 3: %rdx (result)              res
 */

ENTRY_NP(gcm_mul_pclmulqdq)
        //
        // Copy Parameters
        //
        movdqu  (%rdi), %xmm0   // P1
        movdqu  (%rsi), %xmm1   // P2

        //
        // Byte swap 16-byte input
        //
        lea     .Lbyte_swap16_mask(%rip), %rax
        movaps  (%rax), %xmm10
        pshufb  %xmm10, %xmm0
        pshufb  %xmm10, %xmm1


        //
        // Multiply with the hash key.  The imm8 of pclmulqdq selects which
        // 64-bit halves of the two sources are multiplied (a = %xmm0 copy,
        // b = %xmm1).
        //
        movdqu  %xmm0, %xmm3
        pclmulqdq $0, %xmm1, %xmm3      // xmm3 holds a0*b0

        movdqu  %xmm0, %xmm4
        pclmulqdq $16, %xmm1, %xmm4     // xmm4 holds a0*b1

        movdqu  %xmm0, %xmm5
        pclmulqdq $1, %xmm1, %xmm5      // xmm5 holds a1*b0
        movdqu  %xmm0, %xmm6
        pclmulqdq $17, %xmm1, %xmm6     // xmm6 holds a1*b1

        pxor    %xmm5, %xmm4    // xmm4 holds a0*b1 + a1*b0

        movdqu  %xmm4, %xmm5    // move the contents of xmm4 to xmm5
        psrldq  $8, %xmm4       // shift xmm4 by 64 bits to the right
        pslldq  $8, %xmm5       // shift xmm5 by 64 bits to the left
        pxor    %xmm5, %xmm3
        pxor    %xmm4, %xmm6    // Register pair <xmm6:xmm3> holds the result
                                // of the carry-less multiplication of
                                // xmm0 by xmm1.

        // We shift the result of the multiplication by one bit position
        // to the left to cope with the fact that the bits are reversed.
        movdqu  %xmm3, %xmm7
        movdqu  %xmm6, %xmm8
        pslld   $1, %xmm3
        pslld   $1, %xmm6
        psrld   $31, %xmm7      // capture the carry out of each 32-bit lane
        psrld   $31, %xmm8
        movdqu  %xmm7, %xmm9
        pslldq  $4, %xmm8
        pslldq  $4, %xmm7
        psrldq  $12, %xmm9      // carry from the low half into the high half
        por     %xmm7, %xmm3
        por     %xmm8, %xmm6
        por     %xmm9, %xmm6

        //
        // First phase of the reduction
        //
        // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
        // independently.
        movdqu  %xmm3, %xmm7
        movdqu  %xmm3, %xmm8
        movdqu  %xmm3, %xmm9
        pslld   $31, %xmm7      // packed left shift << 31
        pslld   $30, %xmm8      // packed left shift << 30
        pslld   $25, %xmm9      // packed left shift << 25
        pxor    %xmm8, %xmm7    // xor the shifted versions
        pxor    %xmm9, %xmm7
        movdqu  %xmm7, %xmm8
        pslldq  $12, %xmm7
        psrldq  $4, %xmm8
        pxor    %xmm7, %xmm3    // first phase of the reduction complete

        //
        // Second phase of the reduction
        //
        // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
        // shift operations.
        movdqu  %xmm3, %xmm2
        movdqu  %xmm3, %xmm4
        movdqu  %xmm3, %xmm5
        psrld   $1, %xmm2       // packed right shift >> 1
        psrld   $2, %xmm4       // packed right shift >> 2
        psrld   $7, %xmm5       // packed right shift >> 7
        pxor    %xmm4, %xmm2    // xor the shifted versions
        pxor    %xmm5, %xmm2
        pxor    %xmm8, %xmm2
        pxor    %xmm2, %xmm3
        pxor    %xmm3, %xmm6    // the result is in xmm6

        //
        // Byte swap 16-byte result
        //
        pshufb  %xmm10, %xmm6   // %xmm10 has the swap mask

        //
        // Store the result
        //
        movdqu  %xmm6, (%rdx)   // P3


        //
        // Cleanup and Return
        //
        ret
        SET_SIZE(gcm_mul_pclmulqdq)
 356 
 357 #endif  /* lint || __lint */