/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/asm_linkage.h>
#include <sys/regset.h>
#include <sys/privregs.h>

#include "assym.h"

/*
 * Do block operations using Streaming SIMD extensions
 */

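/*
 * ASSERT_KPREEMPT_DISABLED(t, r32, msg) is a DEBUG-only sanity check: it
 * loads curthread->t_preempt and panics with 'msg' if preemption is not
 * disabled, since these routines manipulate %cr0 and the %xmm registers
 * outside of the normal FPU context handling.
 */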
#if defined(DEBUG)
#define ASSERT_KPREEMPT_DISABLED(t, r32, msg)   \
        movq    %gs:CPU_THREAD, t;              \
        movsbl  T_PREEMPT(t), r32;              \
        testl   r32, r32;                       \
        jne     5f;                             \
        pushq   %rbp;                           \
        movq    %rsp, %rbp;                     \
        leaq    msg(%rip), %rdi;                \
        xorl    %eax, %eax;                     \
        call    panic;                          \
5:
#else   /* DEBUG */
#define ASSERT_KPREEMPT_DISABLED(t, r32, msg)
#endif  /* DEBUG */

#define BLOCKSHIFT      6
#define BLOCKSIZE       64      /* (1 << BLOCKSHIFT) */
#define BLOCKMASK       63      /* (BLOCKSIZE - 1) */

#if (1 << BLOCKSHIFT) != BLOCKSIZE || BLOCKMASK != (BLOCKSIZE - 1)
#error  "mucked up constants"
#endif

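/*
 * SAVE_XMM0/RSTOR_XMM0 preserve the caller's %xmm0 in the save area
 * addressed by 'r'; SAVE_XMM_PROLOG and RSTOR_XMM_EPILOG (defined
 * elsewhere) set up and tear down that area.
 */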
#define SAVE_XMM0(r)                            \
        SAVE_XMM_PROLOG(r, 1);                  \
        movdqa  %xmm0, (r)

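/*
 * The zero loop clears one 64-byte block per iteration using non-temporal
 * (cache-bypassing) movntdq stores from a zeroed %xmm0.  The mfence in
 * ZERO_LOOP_FINI_XMM ensures the weakly-ordered stores are globally
 * visible before we return.
 */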
#define ZERO_LOOP_INIT_XMM(dst)                 \
        pxor    %xmm0, %xmm0

#define ZERO_LOOP_BODY_XMM(dst, cnt)            \
        movntdq %xmm0, (dst);                   \
        movntdq %xmm0, 0x10(dst);               \
        movntdq %xmm0, 0x20(dst);               \
        movntdq %xmm0, 0x30(dst);               \
        addq    $BLOCKSIZE, dst;                \
        subq    $1, cnt

#define ZERO_LOOP_FINI_XMM(dst)                 \
        mfence

#define RSTOR_XMM0(r)                           \
        movdqa  0x0(r), %xmm0;                  \
        RSTOR_XMM_EPILOG(r, 1)

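        /*
         * hwblkclr() zeroes a buffer with non-temporal stores.  Requests
         * that are not BLOCKSIZE-aligned, are shorter than BLOCKSIZE, or
         * are not a multiple of BLOCKSIZE in length are handed off to
         * bzero().  Callers are expected to bracket the call with
         * kpreempt_disable()/kpreempt_enable(), as the DEBUG assertion
         * below checks.
         */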
        /*
         * %rdi         dst
         * %rsi         size
         * %rax         saved %cr0 (#if DEBUG then %eax is t->t_preempt)
         * %r8          pointer to %xmm register save area
         */
        ENTRY(hwblkclr)
        pushq   %rbp
        movq    %rsp, %rbp
        testl   $BLOCKMASK, %edi        /* address must be BLOCKSIZE aligned */
        jne     .dobzero
        cmpq    $BLOCKSIZE, %rsi        /* size must be at least BLOCKSIZE */
        jl      .dobzero
        testq   $BLOCKMASK, %rsi        /* .. and be a multiple of BLOCKSIZE */
        jne     .dobzero
        shrq    $BLOCKSHIFT, %rsi

        ASSERT_KPREEMPT_DISABLED(%r11, %eax, .not_disabled)
        movq    %cr0, %rax
        clts
        testl   $CR0_TS, %eax
        jnz     1f

        SAVE_XMM0(%r8)
1:      ZERO_LOOP_INIT_XMM(%rdi)
9:      ZERO_LOOP_BODY_XMM(%rdi, %rsi)
        jnz     9b
        ZERO_LOOP_FINI_XMM(%rdi)

        testl   $CR0_TS, %eax
        jnz     2f
        RSTOR_XMM0(%r8)
2:      movq    %rax, %cr0
        leave
        ret
.dobzero:
        leave
        jmp     bzero
        SET_SIZE(hwblkclr)


#define PREFETCH_START(src)                     \
        prefetchnta     0x0(src);               \
        prefetchnta     0x40(src)

#define SAVE_XMMS(r)                            \
        SAVE_XMM_PROLOG(r, 8);                  \
        movdqa  %xmm0, (r);                     \
        movdqa  %xmm1, 0x10(r);                 \
        movdqa  %xmm2, 0x20(r);                 \
        movdqa  %xmm3, 0x30(r);                 \
        movdqa  %xmm4, 0x40(r);                 \
        movdqa  %xmm5, 0x50(r);                 \
        movdqa  %xmm6, 0x60(r);                 \
        movdqa  %xmm7, 0x70(r)

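/*
 * The copy loop is software-pipelined: COPY_LOOP_INIT_XMM fills
 * %xmm0-%xmm7 with the first 128 bytes of the source, COPY_LOOP_BODY_XMM
 * streams the previously loaded 128 bytes to the destination with
 * non-temporal stores while loading the next 128 bytes (prefetching
 * further ahead with prefetchnta), and COPY_LOOP_FINI_XMM writes out the
 * final 128 bytes.
 */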
#define COPY_LOOP_INIT_XMM(src)                 \
        prefetchnta     0x80(src);              \
        prefetchnta     0xc0(src);              \
        movdqa  0x0(src), %xmm0;                \
        movdqa  0x10(src), %xmm1;               \
        movdqa  0x20(src), %xmm2;               \
        movdqa  0x30(src), %xmm3;               \
        movdqa  0x40(src), %xmm4;               \
        movdqa  0x50(src), %xmm5;               \
        movdqa  0x60(src), %xmm6;               \
        movdqa  0x70(src), %xmm7;               \
        addq    $0x80, src

#define COPY_LOOP_BODY_XMM(src, dst, cnt)       \
        prefetchnta     0x80(src);              \
        prefetchnta     0xc0(src);              \
        prefetchnta     0x100(src);             \
        prefetchnta     0x140(src);             \
        movntdq %xmm0, (dst);                   \
        movntdq %xmm1, 0x10(dst);               \
        movntdq %xmm2, 0x20(dst);               \
        movntdq %xmm3, 0x30(dst);               \
        movdqa  0x0(src), %xmm0;                \
        movdqa  0x10(src), %xmm1;               \
        movntdq %xmm4, 0x40(dst);               \
        movntdq %xmm5, 0x50(dst);               \
        movdqa  0x20(src), %xmm2;               \
        movdqa  0x30(src), %xmm3;               \
        movntdq %xmm6, 0x60(dst);               \
        movntdq %xmm7, 0x70(dst);               \
        movdqa  0x40(src), %xmm4;               \
        movdqa  0x50(src), %xmm5;               \
        addq    $0x80, dst;                     \
        movdqa  0x60(src), %xmm6;               \
        movdqa  0x70(src), %xmm7;               \
        addq    $0x80, src;                     \
        subl    $1, cnt

#define COPY_LOOP_FINI_XMM(dst)                 \
        movntdq %xmm0, 0x0(dst);                \
        movntdq %xmm1, 0x10(dst);               \
        movntdq %xmm2, 0x20(dst);               \
        movntdq %xmm3, 0x30(dst);               \
        movntdq %xmm4, 0x40(dst);               \
        movntdq %xmm5, 0x50(dst);               \
        movntdq %xmm6, 0x60(dst);               \
        movntdq %xmm7, 0x70(dst)

#define RSTOR_XMMS(r)                           \
        movdqa  0x0(r), %xmm0;                  \
        movdqa  0x10(r), %xmm1;                 \
        movdqa  0x20(r), %xmm2;                 \
        movdqa  0x30(r), %xmm3;                 \
        movdqa  0x40(r), %xmm4;                 \
        movdqa  0x50(r), %xmm5;                 \
        movdqa  0x60(r), %xmm6;                 \
        movdqa  0x70(r), %xmm7;                 \
        RSTOR_XMM_EPILOG(r, 8)

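        /*
         * hwblkpagecopy() copies one PAGESIZE (4096-byte) page from src
         * to dst with non-temporal stores; both buffers must be at least
         * 16-byte aligned for the movdqa/movntdq accesses.  The caller
         * must have kernel preemption disabled.
         */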
        /*
         * %rdi         src
         * %rsi         dst
         * %rdx         #if DEBUG then curthread
         * %ecx         loop count
         * %rax         saved %cr0 (#if DEBUG then %eax is t->t_preempt)
         * %r8          pointer to %xmm register save area
         */
        ENTRY(hwblkpagecopy)
        pushq   %rbp
        movq    %rsp, %rbp
        PREFETCH_START(%rdi)
        /*
         * PAGESIZE is 4096 and each loop iteration moves 128 bytes, but
         * the initial load (COPY_LOOP_INIT_XMM) and final store
         * (COPY_LOOP_FINI_XMM) together account for one chunk, so the
         * loop only runs 31 times.
         */
        movl    $_CONST(32 - 1), %ecx
        ASSERT_KPREEMPT_DISABLED(%rdx, %eax, .not_disabled)
        movq    %cr0, %rax
        clts
        testl   $CR0_TS, %eax
        jnz     3f
        SAVE_XMMS(%r8)
3:      COPY_LOOP_INIT_XMM(%rdi)
4:      COPY_LOOP_BODY_XMM(%rdi, %rsi, %ecx)
        jnz     4b
        COPY_LOOP_FINI_XMM(%rsi)
        testl   $CR0_TS, %eax
        jnz     5f
        RSTOR_XMMS(%r8)
5:      movq    %rax, %cr0
        mfence
        leave
        ret
        SET_SIZE(hwblkpagecopy)

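        /*
         * %rdi         dst
         * %rsi         size
         *
         * Zero the buffer with 64-bit non-temporal movnti stores, without
         * touching any %xmm state.  The loop structure requires 'size' to
         * be a nonzero multiple of 32 bytes.
         */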
        ENTRY(block_zero_no_xmm)
        pushq   %rbp
        movq    %rsp, %rbp
        xorl    %eax, %eax
        addq    %rsi, %rdi
        negq    %rsi
1:
        movnti  %rax, (%rdi, %rsi)
        movnti  %rax, 8(%rdi, %rsi)
        movnti  %rax, 16(%rdi, %rsi)
        movnti  %rax, 24(%rdi, %rsi)
        addq    $32, %rsi
        jnz     1b
        mfence
        leave
        ret
        SET_SIZE(block_zero_no_xmm)

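        /*
         * %rdi         dst
         * %rsi         src
         *
         * Copy one MMU_STD_PAGESIZE page 32 bytes at a time with movnti
         * stores, without touching any %xmm state.
         */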
        ENTRY(page_copy_no_xmm)
        movq    $MMU_STD_PAGESIZE, %rcx
        addq    %rcx, %rdi
        addq    %rcx, %rsi
        negq    %rcx
1:
        movq    (%rsi, %rcx), %rax
        movnti  %rax, (%rdi, %rcx)
        movq    8(%rsi, %rcx), %rax
        movnti  %rax, 8(%rdi, %rcx)
        movq    16(%rsi, %rcx), %rax
        movnti  %rax, 16(%rdi, %rcx)
        movq    24(%rsi, %rcx), %rax
        movnti  %rax, 24(%rdi, %rcx)
        addq    $32, %rcx
        jnz     1b
        mfence
        ret
        SET_SIZE(page_copy_no_xmm)

#if defined(DEBUG)
        .text
.not_disabled:
        .string "sseblk: preemption not disabled!"
#endif