/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009 Intel Corporation
 * All Rights Reserved.
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright 2015 by Saso Kiselkov. All rights reserved.
 */

/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions.  This file contains an accelerated
 * Galois Field Multiplication implementation.
 *
 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
 * carry-less multiplication.  More information about PCLMULQDQ can be
 * found at:
 * http://software.intel.com/en-us/articles/
 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
 */

/*
 * ====================================================================
 * OpenSolaris OS modifications
 *
 * This source originates as file galois_hash_asm.c from
 * Intel Corporation dated September 21, 2009.
 *
 * This OpenSolaris version has these major changes from the original source:
 *
 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
 * definition for lint.
 *
 * 2. Formatted code, added comments, and added #includes and #defines.
 *
 * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
 * calling kpreempt_disable() and kpreempt_enable().
 * If the TS bit is not set, save and restore %xmm registers at the beginning
 * and end of function calls (%xmm* registers are not saved and restored
 * during kernel thread preemption).
 *
 * 4. Removed code to perform hashing.  This is already done with the C macro
 * GHASH in gcm.c.  For better performance, this removed code should be
 * reintegrated in the future to replace the C GHASH macro.
 *
 * 5. Added code to byte swap 16-byte input and output.
 *
 * 6. Folded in comments from the original C source with embedded assembly
 * (SB_w_shift_xor.c).
 *
 * 7. Renamed function and reordered parameters to match OpenSolaris:
 * Intel interface:
 *	void galois_hash_asm(unsigned char *hk, unsigned char *s,
 *	    unsigned char *d, int length)
 * OpenSolaris OS interface:
 *	void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 * ====================================================================
 */
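
/*
 * For reference, the primitive that PCLMULQDQ implements is a carry-less
 * (XOR-based) multiply of two 64-bit operands into a 128-bit product.
 * A minimal C sketch of that primitive, for illustration only (clmul64 is
 * a hypothetical name, not part of this file's interface):
 *
 *	static void
 *	clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
 *	{
 *		uint64_t h = 0, l = 0;
 *
 *		for (int i = 0; i < 64; i++) {
 *			if ((b >> i) & 1) {
 *				l ^= a << i;		// bits x^i..x^63
 *				if (i != 0)
 *					h ^= a >> (64 - i); // bits x^64 up
 *			}
 *		}
 *		*hi = h;	// coefficients of x^64..x^126
 *		*lo = l;	// coefficients of x^0..x^63
 *	}
 */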

#if defined(lint) || defined(__lint)

#include <sys/types.h>

/* ARGSUSED */
void
gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
}

#ifdef _KERNEL
/*ARGSUSED*/
void
gcm_accel_save(void *savestate)
{
}

/*ARGSUSED*/
void
gcm_accel_restore(void *savestate)
{
}
#endif	/* _KERNEL */

#else	/* lint */

#include <sys/asm_linkage.h>
#include <sys/controlregs.h>
#ifdef _KERNEL
#include <sys/machprivregs.h>
#endif

#ifdef _KERNEL
/*
 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv.  That is,
 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
 * uses it to pass P2 to syscall.
 * This also occurs with the STTS macro, but we don't care if
 * P2 (%rsi) is modified just before function exit.
 * The CLTS and STTS macros push and pop P1 (%rdi) already.
 */
#ifdef __xpv
#define	PROTECTED_CLTS \
	push	%rsi; \
	CLTS; \
	pop	%rsi
#else
#define	PROTECTED_CLTS \
	CLTS
#endif	/* __xpv */
#endif	/* _KERNEL */

	.text
	.align XMM_ALIGN
/*
 * Use this mask to byte-swap a 16-byte integer with the pshufb instruction:
 * static uint8_t byte_swap16_mask[] = {
 *	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
 */
.Lbyte_swap16_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

#ifdef _KERNEL
/*
 * void gcm_accel_save(void *savestate)
 *
 * Saves the XMM0--XMM15 registers and CR0 to a temporary location pointed
 * to in the first argument and clears TS in CR0.  This must be invoked before
 * executing accelerated GCM computations inside the kernel (and kernel
 * thread preemption must be disabled as well).  The memory region to which
 * all state is saved must be at least 16x 128-bit + 64-bit long and must
 * be 128-bit aligned.
 */
ENTRY_NP(gcm_accel_save)
	movq	%cr0, %rax
	movq	%rax, 0x100(%rdi)
	testq	$CR0_TS, %rax
	jnz	1f
	/* FPU is in use, save registers */
	movaps	%xmm0, 0x00(%rdi)
	movaps	%xmm1, 0x10(%rdi)
	movaps	%xmm2, 0x20(%rdi)
	movaps	%xmm3, 0x30(%rdi)
	movaps	%xmm4, 0x40(%rdi)
	movaps	%xmm5, 0x50(%rdi)
	movaps	%xmm6, 0x60(%rdi)
	movaps	%xmm7, 0x70(%rdi)
	movaps	%xmm8, 0x80(%rdi)
	movaps	%xmm9, 0x90(%rdi)
	movaps	%xmm10, 0xa0(%rdi)
	movaps	%xmm11, 0xb0(%rdi)
	movaps	%xmm12, 0xc0(%rdi)
	movaps	%xmm13, 0xd0(%rdi)
	movaps	%xmm14, 0xe0(%rdi)
	movaps	%xmm15, 0xf0(%rdi)
	ret
1:
	PROTECTED_CLTS
	ret
	SET_SIZE(gcm_accel_save)
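
/*
 * A kernel caller is expected to bracket accelerated GCM work roughly as
 * follows (an illustrative sketch only; "save" is a hypothetical buffer of
 * at least 16x 128-bit + 64-bit, 128-bit aligned, and the real callers
 * live in the C GCM code):
 *
 *	kpreempt_disable();
 *	gcm_accel_save(save);
 *	gcm_mul_pclmulqdq(x_in, y, res);
 *	gcm_accel_restore(save);
 *	kpreempt_enable();
 */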

/*
 * void gcm_accel_restore(void *savestate)
 *
 * Restores the saved XMM and CR0.TS state from gcm_accel_save.
 */
ENTRY_NP(gcm_accel_restore)
	movq	0x100(%rdi), %rax
	testq	$CR0_TS, %rax
	jnz	1f
	movaps	0x00(%rdi), %xmm0
	movaps	0x10(%rdi), %xmm1
	movaps	0x20(%rdi), %xmm2
	movaps	0x30(%rdi), %xmm3
	movaps	0x40(%rdi), %xmm4
	movaps	0x50(%rdi), %xmm5
	movaps	0x60(%rdi), %xmm6
	movaps	0x70(%rdi), %xmm7
	movaps	0x80(%rdi), %xmm8
	movaps	0x90(%rdi), %xmm9
	movaps	0xa0(%rdi), %xmm10
	movaps	0xb0(%rdi), %xmm11
	movaps	0xc0(%rdi), %xmm12
	movaps	0xd0(%rdi), %xmm13
	movaps	0xe0(%rdi), %xmm14
	movaps	0xf0(%rdi), %xmm15
	ret
1:
	STTS(%rax)
	ret
	SET_SIZE(gcm_accel_restore)

#endif	/* _KERNEL */

/*
 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 *
 * Perform a carry-less multiplication (that is, use XOR instead of the
 * multiply operator) on P1 and P2 and place the result in P3.
 *
 * Byte swap the input and the output.
 *
 * Note: x_in, y, and res all point to a block of 16-byte numbers
 * (an array of two 64-bit integers).
 *
 * Note2: For kernel code, the caller is responsible for bracketing this
 * call with disabling kernel thread preemption and calling
 * gcm_accel_save/restore().
 *
 * Note3: Original Intel definition:
 *	void galois_hash_asm(unsigned char *hk, unsigned char *s,
 *	    unsigned char *d, int length)
 *
 * Note4: Register/parameter mapping:
 *	Intel:
 *		Parameter 1: %rcx (copied to %xmm0)	hk or x_in
 *		Parameter 2: %rdx (copied to %xmm1)	s or y
 *		Parameter 3: %rdi (result)		d or res
 *	OpenSolaris:
 *		Parameter 1: %rdi (copied to %xmm0)	x_in
 *		Parameter 2: %rsi (copied to %xmm1)	y
 *		Parameter 3: %rdx (result)		res
 */

ENTRY_NP(gcm_mul_pclmulqdq)
	//
	// Copy Parameters
	//
	movdqu	(%rdi), %xmm0	// P1
	movdqu	(%rsi), %xmm1	// P2

	//
	// Byte swap 16-byte input
	//
	lea	.Lbyte_swap16_mask(%rip), %rax
	movaps	(%rax), %xmm10
	pshufb	%xmm10, %xmm0
	pshufb	%xmm10, %xmm1

	//
	// Multiply with the hash key
	//
	movdqu	%xmm0, %xmm3
	pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0

	movdqu	%xmm0, %xmm4
	pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1

	movdqu	%xmm0, %xmm5
	pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0
	movdqu	%xmm0, %xmm6
	pclmulqdq $17, %xmm1, %xmm6	// xmm6 holds a1*b1

	pxor	%xmm5, %xmm4	// xmm4 holds a0*b1 + a1*b0

	movdqu	%xmm4, %xmm5	// move the contents of xmm4 to xmm5
	psrldq	$8, %xmm4	// shift xmm4 right by 64 bits
	pslldq	$8, %xmm5	// shift xmm5 left by 64 bits
	pxor	%xmm5, %xmm3
	pxor	%xmm4, %xmm6	// Register pair <xmm6:xmm3> holds the result
				// of the carry-less multiplication of
				// xmm0 by xmm1.

	// We shift the result of the multiplication by one bit position
	// to the left to compensate for the fact that the bits are reversed.
	movdqu	%xmm3, %xmm7
	movdqu	%xmm6, %xmm8
	pslld	$1, %xmm3
	pslld	$1, %xmm6
	psrld	$31, %xmm7
	psrld	$31, %xmm8
	movdqu	%xmm7, %xmm9
	pslldq	$4, %xmm8
	pslldq	$4, %xmm7
	psrldq	$12, %xmm9
	por	%xmm7, %xmm3
	por	%xmm8, %xmm6
	por	%xmm9, %xmm6
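
	//
	// What follows reduces the 256-bit product <xmm6:xmm3> modulo the
	// GHASH polynomial g(x) = x^128 + x^7 + x^2 + x + 1.  Because the
	// bits are in reflected order, the x^1, x^2 and x^7 terms of g(x)
	// appear below as left shifts by 31, 30 and 25 (first phase) and
	// right shifts by 1, 2 and 7 (second phase), applied per 32-bit
	// lane and recombined with the pslldq/psrldq byte shifts.
	//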

	//
	// First phase of the reduction
	//
	// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
	// independently.
	movdqu	%xmm3, %xmm7
	movdqu	%xmm3, %xmm8
	movdqu	%xmm3, %xmm9
	pslld	$31, %xmm7	// packed left shift << 31
	pslld	$30, %xmm8	// packed left shift << 30
	pslld	$25, %xmm9	// packed left shift << 25
	pxor	%xmm8, %xmm7	// xor the shifted versions
	pxor	%xmm9, %xmm7
	movdqu	%xmm7, %xmm8
	pslldq	$12, %xmm7
	psrldq	$4, %xmm8
	pxor	%xmm7, %xmm3	// first phase of the reduction complete

	//
	// Second phase of the reduction
	//
	// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
	// shift operations.
	movdqu	%xmm3, %xmm2
	movdqu	%xmm3, %xmm4
	movdqu	%xmm3, %xmm5
	psrld	$1, %xmm2	// packed right shift >> 1
	psrld	$2, %xmm4	// packed right shift >> 2
	psrld	$7, %xmm5	// packed right shift >> 7
	pxor	%xmm4, %xmm2	// xor the shifted versions
	pxor	%xmm5, %xmm2
	pxor	%xmm8, %xmm2
	pxor	%xmm2, %xmm3
	pxor	%xmm3, %xmm6	// the result is in xmm6

	//
	// Byte swap 16-byte result
	//
	pshufb	%xmm10, %xmm6	// %xmm10 has the swap mask

	//
	// Store the result
	//
	movdqu	%xmm6, (%rdx)	// P3

	//
	// Cleanup and Return
	//
	ret
	SET_SIZE(gcm_mul_pclmulqdq)

#endif	/* lint || __lint */
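
/*
 * For reference, the operation implemented above is the GF(2^128)
 * multiplication of NIST SP 800-38D.  A C sketch for illustration only
 * (gf128_mul_ref is a hypothetical name; x, y and res hold each 128-bit
 * value as two big-endian 64-bit halves, matching the byte swaps above):
 *
 *	static void
 *	gf128_mul_ref(const uint64_t x[2], const uint64_t y[2],
 *	    uint64_t res[2])
 *	{
 *		const uint64_t R = 0xe100000000000000ULL; // x^7+x^2+x+1
 *		uint64_t z0 = 0, z1 = 0, v0 = y[0], v1 = y[1];
 *
 *		for (int i = 0; i < 128; i++) {
 *			// test bit i of x, most-significant bit first
 *			if ((x[i / 64] >> (63 - (i % 64))) & 1) {
 *				z0 ^= v0;
 *				z1 ^= v1;
 *			}
 *			// V *= x (a right shift in GCM's reflected order)
 *			uint64_t lsb = v1 & 1;
 *			v1 = (v1 >> 1) | (v0 << 63);
 *			v0 >>= 1;
 *			if (lsb)
 *				v0 ^= R;
 *		}
 *		res[0] = z0;
 *		res[1] = z1;
 *	}
 */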