/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I%     %E% SMI"

#include <sys/asm_linkage.h>
#include <sys/regset.h>
#include <sys/privregs.h>

#if defined(__lint)
#include <sys/types.h>
#include <sys/archsystm.h>
#else
#include "assym.h"
#endif

/*
 * Do block operations using Streaming SIMD extensions
 */

#if defined(DEBUG)
#if defined(__amd64)
#define ASSERT_KPREEMPT_DISABLED(t, r32, msg)   \
        movq    %gs:CPU_THREAD, t;              \
        movsbl  T_PREEMPT(t), r32;              \
        testl   r32, r32;                       \
        jne     5f;                             \
        pushq   %rbp;                           \
        movq    %rsp, %rbp;                     \
        leaq    msg(%rip), %rdi;                \
        xorl    %eax, %eax;                     \
        call    panic;                          \
5:
#elif defined(__i386)
#define ASSERT_KPREEMPT_DISABLED(t, r32, msg)   \
        movl    %gs:CPU_THREAD, t;              \
        movsbl  T_PREEMPT(t), r32;              \
        testl   r32, r32;                       \
        jne     5f;                             \
        pushl   %ebp;                           \
        movl    %esp, %ebp;                     \
        pushl   $msg;                           \
        call    panic;                          \
5:
#endif  /* __i386 */
#else   /* DEBUG */
#define ASSERT_KPREEMPT_DISABLED(t, r32, msg)
#endif  /* DEBUG */

#define BLOCKSHIFT      6
#define BLOCKSIZE       64      /* (1 << BLOCKSHIFT) */
#define BLOCKMASK       63      /* (BLOCKSIZE - 1) */

#if (1 << BLOCKSHIFT) != BLOCKSIZE || BLOCKMASK != (BLOCKSIZE - 1)
#error  "mucked up constants"
#endif
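
/*
 * For reference, the checks the SSE path of hwblkclr() performs before
 * using streaming stores amount to the following C predicate (an
 * illustrative sketch only; the helper name is hypothetical and nothing
 * in the build uses it):
 *
 *      #include <sys/types.h>
 *
 *      static int
 *      hwblkclr_can_use_sse(const void *addr, size_t size)
 *      {
 *              return (((uintptr_t)addr & BLOCKMASK) == 0 &&
 *                  size >= BLOCKSIZE &&
 *                  (size & BLOCKMASK) == 0);
 *      }
 *
 * Anything that fails this test is simply handed off to bzero().
 */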

#if defined(__lint)

/*ARGSUSED*/
void
hwblkclr(void *addr, size_t size)
{}

#else   /* __lint */

#if defined(__amd64)
#define ADD     addq
#define SUB     subq
#else
#define ADD     addl
#define SUB     subl
#endif

#define SAVE_XMM0(r)                            \
        SAVE_XMM_PROLOG(r, 1);                  \
        movdqa  %xmm0, (r)

#define ZERO_LOOP_INIT_XMM(dst)                 \
        pxor    %xmm0, %xmm0

#define ZERO_LOOP_BODY_XMM(dst, cnt)            \
        movntdq %xmm0, (dst);                   \
        movntdq %xmm0, 0x10(dst);               \
        movntdq %xmm0, 0x20(dst);               \
        movntdq %xmm0, 0x30(dst);               \
        ADD     $BLOCKSIZE, dst;                \
        SUB     $1, cnt

#define ZERO_LOOP_FINI_XMM(dst)                 \
        mfence

#define RSTOR_XMM0(r)                           \
        movdqa  0x0(r), %xmm0;                  \
        RSTOR_XMM_EPILOG(r, 1)

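/*
 * The ZERO_LOOP_* macros above implement, in effect, the following C
 * (an illustrative sketch only, assuming SSE2 intrinsics; the helper is
 * hypothetical and is not part of the kernel; BLOCKSIZE is the 64-byte
 * block defined above):
 *
 *      #include <emmintrin.h>
 *      #include <stddef.h>
 *
 *      static void
 *      sse_zero_blocks(void *addr, size_t size)
 *      {
 *              __m128i zero = _mm_setzero_si128();
 *              char *p = addr;
 *              char *end = p + size;
 *
 *              for (; p < end; p += BLOCKSIZE) {
 *                      _mm_stream_si128((__m128i *)(p + 0x00), zero);
 *                      _mm_stream_si128((__m128i *)(p + 0x10), zero);
 *                      _mm_stream_si128((__m128i *)(p + 0x20), zero);
 *                      _mm_stream_si128((__m128i *)(p + 0x30), zero);
 *              }
 *              _mm_mfence();
 *      }
 *
 * The non-temporal (movntdq) stores bypass the caches; the final mfence
 * orders them before the routine returns.
 */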
#if defined(__amd64)

        /*
         * %rdi         dst
         * %rsi         size
         * %rax         saved %cr0 (#if DEBUG then %eax is t->t_preempt)
         * %r8          pointer to %xmm register save area
         */
        ENTRY(hwblkclr)
        pushq   %rbp
        movq    %rsp, %rbp
        testl   $BLOCKMASK, %edi        /* address must be BLOCKSIZE aligned */
        jne     .dobzero
        cmpq    $BLOCKSIZE, %rsi        /* size must be at least BLOCKSIZE */
        jl      .dobzero
        testq   $BLOCKMASK, %rsi        /* .. and be a multiple of BLOCKSIZE */
        jne     .dobzero
        shrq    $BLOCKSHIFT, %rsi

        ASSERT_KPREEMPT_DISABLED(%r11, %eax, .not_disabled)
        movq    %cr0, %rax
        clts
        testl   $CR0_TS, %eax
        jnz     1f

        SAVE_XMM0(%r8)
1:      ZERO_LOOP_INIT_XMM(%rdi)
9:      ZERO_LOOP_BODY_XMM(%rdi, %rsi)
        jnz     9b
        ZERO_LOOP_FINI_XMM(%rdi)

        testl   $CR0_TS, %eax
        jnz     2f
        RSTOR_XMM0(%r8)
2:      movq    %rax, %cr0
        leave
        ret
.dobzero:
        leave
        jmp     bzero
        SET_SIZE(hwblkclr)

#elif defined(__i386)

        /*
         * %eax         dst
         * %ecx         size in bytes, loop count
         * %ebx         saved %cr0 (#if DEBUG then t->t_preempt)
         * %edi         pointer to %xmm register save area
         */
        ENTRY(hwblkclr)
        movl    4(%esp), %eax
        movl    8(%esp), %ecx
        testl   $BLOCKMASK, %eax        /* address must be BLOCKSIZE aligned */
        jne     .dobzero
        cmpl    $BLOCKSIZE, %ecx        /* size must be at least BLOCKSIZE */
        jl      .dobzero
        testl   $BLOCKMASK, %ecx        /* .. and be a multiple of BLOCKSIZE */
        jne     .dobzero
        shrl    $BLOCKSHIFT, %ecx
        movl    0xc(%esp), %edx
        pushl   %ebx

        pushl   %esi
        ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
        popl    %esi
        movl    %cr0, %ebx
        clts
        testl   $CR0_TS, %ebx
        jnz     1f

        pushl   %edi
        SAVE_XMM0(%edi)
1:      ZERO_LOOP_INIT_XMM(%eax)
9:      ZERO_LOOP_BODY_XMM(%eax, %ecx)
        jnz     9b
        ZERO_LOOP_FINI_XMM(%eax)

        testl   $CR0_TS, %ebx
        jnz     2f
        RSTOR_XMM0(%edi)
        popl    %edi
2:      movl    %ebx, %cr0
        popl    %ebx
        ret
.dobzero:
        jmp     bzero
        SET_SIZE(hwblkclr)

#endif  /* __i386 */
#endif  /* __lint */
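
/*
 * Usage sketch (illustrative only): hwblkclr() borrows the %xmm state, so
 * callers must run with kernel preemption disabled, and the DEBUG build
 * panics if they do not.  Unaligned or short requests fall through to
 * bzero().  A caller might look roughly like this (hypothetical code, not
 * part of this file):
 *
 *      extern void hwblkclr(void *, size_t);
 *
 *      kpreempt_disable();
 *      hwblkclr(buf, len);
 *      kpreempt_enable();
 */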


#if defined(__lint)

/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{}

#else   /* __lint */

#define PREFETCH_START(src)                     \
        prefetchnta     0x0(src);               \
        prefetchnta     0x40(src)

#define SAVE_XMMS(r)                            \
        SAVE_XMM_PROLOG(r, 8);                  \
        movdqa  %xmm0, (r);                     \
        movdqa  %xmm1, 0x10(r);                 \
        movdqa  %xmm2, 0x20(r);                 \
        movdqa  %xmm3, 0x30(r);                 \
        movdqa  %xmm4, 0x40(r);                 \
        movdqa  %xmm5, 0x50(r);                 \
        movdqa  %xmm6, 0x60(r);                 \
        movdqa  %xmm7, 0x70(r)

#define COPY_LOOP_INIT_XMM(src)                 \
        prefetchnta     0x80(src);              \
        prefetchnta     0xc0(src);              \
        movdqa  0x0(src), %xmm0;                \
        movdqa  0x10(src), %xmm1;               \
        movdqa  0x20(src), %xmm2;               \
        movdqa  0x30(src), %xmm3;               \
        movdqa  0x40(src), %xmm4;               \
        movdqa  0x50(src), %xmm5;               \
        movdqa  0x60(src), %xmm6;               \
        movdqa  0x70(src), %xmm7;               \
        ADD     $0x80, src

#define COPY_LOOP_BODY_XMM(src, dst, cnt)       \
        prefetchnta     0x80(src);              \
        prefetchnta     0xc0(src);              \
        prefetchnta     0x100(src);             \
        prefetchnta     0x140(src);             \
        movntdq %xmm0, (dst);                   \
        movntdq %xmm1, 0x10(dst);               \
        movntdq %xmm2, 0x20(dst);               \
        movntdq %xmm3, 0x30(dst);               \
        movdqa  0x0(src), %xmm0;                \
        movdqa  0x10(src), %xmm1;               \
        movntdq %xmm4, 0x40(dst);               \
        movntdq %xmm5, 0x50(dst);               \
        movdqa  0x20(src), %xmm2;               \
        movdqa  0x30(src), %xmm3;               \
        movntdq %xmm6, 0x60(dst);               \
        movntdq %xmm7, 0x70(dst);               \
        movdqa  0x40(src), %xmm4;               \
        movdqa  0x50(src), %xmm5;               \
        ADD     $0x80, dst;                     \
        movdqa  0x60(src), %xmm6;               \
        movdqa  0x70(src), %xmm7;               \
        ADD     $0x80, src;                     \
        subl    $1, cnt

#define COPY_LOOP_FINI_XMM(dst)                 \
        movntdq %xmm0, 0x0(dst);                \
        movntdq %xmm1, 0x10(dst);               \
        movntdq %xmm2, 0x20(dst);               \
        movntdq %xmm3, 0x30(dst);               \
        movntdq %xmm4, 0x40(dst);               \
        movntdq %xmm5, 0x50(dst);               \
        movntdq %xmm6, 0x60(dst);               \
        movntdq %xmm7, 0x70(dst)

#define RSTOR_XMMS(r)                           \
        movdqa  0x0(r), %xmm0;                  \
        movdqa  0x10(r), %xmm1;                 \
        movdqa  0x20(r), %xmm2;                 \
        movdqa  0x30(r), %xmm3;                 \
        movdqa  0x40(r), %xmm4;                 \
        movdqa  0x50(r), %xmm5;                 \
        movdqa  0x60(r), %xmm6;                 \
        movdqa  0x70(r), %xmm7;                 \
        RSTOR_XMM_EPILOG(r, 8)

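/*
 * The COPY_LOOP_* macros above software-pipeline the page copy: each pass
 * prefetches ahead with prefetchnta, streams out the eight %xmm registers
 * loaded on the previous pass with movntdq, and loads the next 128 bytes.
 * Ignoring that pipelining, the effect is roughly the following C (an
 * illustrative sketch only, assuming SSE2 intrinsics; the helper is
 * hypothetical):
 *
 *      #include <emmintrin.h>
 *      #include <xmmintrin.h>
 *
 *      static void
 *      sse_copy_page(const void *src, void *dst)
 *      {
 *              const char *s = src;
 *              char *d = dst;
 *              int i, j;
 *
 *              for (i = 0; i < 4096 / 0x80; i++) {
 *                      _mm_prefetch(s + 0x80, _MM_HINT_NTA);
 *                      _mm_prefetch(s + 0xc0, _MM_HINT_NTA);
 *                      for (j = 0; j < 0x80; j += 0x10) {
 *                              __m128i v =
 *                                  _mm_load_si128((const __m128i *)(s + j));
 *                              _mm_stream_si128((__m128i *)(d + j), v);
 *                      }
 *                      s += 0x80;
 *                      d += 0x80;
 *              }
 *              _mm_mfence();
 *      }
 */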
#if defined(__amd64)

        /*
         * %rdi         src
         * %rsi         dst
         * %rdx         #if DEBUG then curthread
         * %ecx         loop count
         * %rax         saved %cr0 (#if DEBUG then %eax is t->t_preempt)
         * %r8          pointer to %xmm register save area
         */
        ENTRY(hwblkpagecopy)
        pushq   %rbp
        movq    %rsp, %rbp
        PREFETCH_START(%rdi)
        /*
         * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
         * load and final store save us one loop count
         */
        movl    $_CONST(32 - 1), %ecx
        ASSERT_KPREEMPT_DISABLED(%rdx, %eax, .not_disabled)
        movq    %cr0, %rax
        clts
        testl   $CR0_TS, %eax
        jnz     3f
        SAVE_XMMS(%r8)
3:      COPY_LOOP_INIT_XMM(%rdi)
4:      COPY_LOOP_BODY_XMM(%rdi, %rsi, %ecx)
        jnz     4b
        COPY_LOOP_FINI_XMM(%rsi)
        testl   $CR0_TS, %eax
        jnz     5f
        RSTOR_XMMS(%r8)
5:      movq    %rax, %cr0
        mfence
        leave
        ret
        SET_SIZE(hwblkpagecopy)

#elif defined(__i386)

        /*
         * %eax         src
         * %edx         dst
         * %ecx         loop count
         * %ebx         saved %cr0 (#if DEBUG then t->t_preempt)
         * %edi         pointer to %xmm register save area
         * %esi         #if DEBUG temporary thread pointer
         */
        ENTRY(hwblkpagecopy)
        movl    4(%esp), %eax
        movl    8(%esp), %edx
        PREFETCH_START(%eax)
        pushl   %ebx
        /*
         * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
         * load and final store save us one loop count
         */
        movl    $_CONST(32 - 1), %ecx
        pushl   %esi
        ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
        popl    %esi
        movl    %cr0, %ebx
        clts
        testl   $CR0_TS, %ebx
        jnz     3f
        pushl   %edi
        SAVE_XMMS(%edi)
3:      COPY_LOOP_INIT_XMM(%eax)
4:      COPY_LOOP_BODY_XMM(%eax, %edx, %ecx)
        jnz     4b
        COPY_LOOP_FINI_XMM(%edx)
        testl   $CR0_TS, %ebx
        jnz     5f
        RSTOR_XMMS(%edi)
        popl    %edi
5:      movl    %ebx, %cr0
        popl    %ebx
        mfence
        ret
        SET_SIZE(hwblkpagecopy)

#endif  /* __i386 */
#endif  /* __lint */
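
/*
 * As with hwblkclr(), callers of hwblkpagecopy() must run with kernel
 * preemption disabled.  The routine copies exactly one 4096-byte page,
 * and both src and dst must be at least 16-byte aligned (movdqa/movntdq).
 * A hypothetical caller (illustrative only):
 *
 *      extern void hwblkpagecopy(const void *, void *);
 *
 *      kpreempt_disable();
 *      hwblkpagecopy(src_page, dst_page);
 *      kpreempt_enable();
 */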

#if defined(__lint)

/*
 * Version of hwblkclr which doesn't use XMM registers.
 * Note that it requires aligned dst and len.
 *
 * XXPV This needs to be performance tuned at some point.
 *      Is 4 the best number of iterations to unroll?
 */
/*ARGSUSED*/
void
block_zero_no_xmm(void *dst, int len)
{}

#else   /* __lint */

#if defined(__amd64)

        ENTRY(block_zero_no_xmm)
        pushq   %rbp
        movq    %rsp, %rbp
        xorl    %eax, %eax
        addq    %rsi, %rdi
        negq    %rsi
1:
        movnti  %rax, (%rdi, %rsi)
        movnti  %rax, 8(%rdi, %rsi)
        movnti  %rax, 16(%rdi, %rsi)
        movnti  %rax, 24(%rdi, %rsi)
        addq    $32, %rsi
        jnz     1b
        mfence
        leave
        ret
        SET_SIZE(block_zero_no_xmm)

#elif defined(__i386)

        ENTRY(block_zero_no_xmm)
        pushl   %ebp
        movl    %esp, %ebp
        xorl    %eax, %eax
        movl    8(%ebp), %edx
        movl    12(%ebp), %ecx
        addl    %ecx, %edx
        negl    %ecx
1:
        movnti  %eax, (%edx, %ecx)
        movnti  %eax, 4(%edx, %ecx)
        movnti  %eax, 8(%edx, %ecx)
        movnti  %eax, 12(%edx, %ecx)
        addl    $16, %ecx
        jnz     1b
        mfence
        leave
        ret
        SET_SIZE(block_zero_no_xmm)

#endif  /* __i386 */
#endif  /* __lint */
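
/*
 * The no-XMM variant above zeroes with non-temporal integer stores (movnti)
 * so that no FPU/XMM state needs saving.  Roughly equivalent C (an
 * illustrative sketch only, assuming the SSE2 _mm_stream_si32 intrinsic,
 * which compiles to movnti from a general-purpose register; the helper
 * name is hypothetical):
 *
 *      #include <emmintrin.h>
 *
 *      static void
 *      sketch_block_zero_no_xmm(void *dst, int len)
 *      {
 *              int *p = dst;
 *              int i;
 *
 *              for (i = 0; i < len / 4; i += 4) {
 *                      _mm_stream_si32(&p[i + 0], 0);
 *                      _mm_stream_si32(&p[i + 1], 0);
 *                      _mm_stream_si32(&p[i + 2], 0);
 *                      _mm_stream_si32(&p[i + 3], 0);
 *              }
 *              _mm_mfence();
 *      }
 */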


#if defined(__lint)

/*
 * Version of page copy which doesn't use XMM registers.
 *
 * XXPV This needs to be performance tuned at some point.
 *      Is 4 the right number of iterations to unroll?
 *      Is the load/store order optimal? Should it use prefetch?
 */
/*ARGSUSED*/
void
page_copy_no_xmm(void *dst, void *src)
{}

#else   /* __lint */

#if defined(__amd64)

        ENTRY(page_copy_no_xmm)
        movq    $MMU_STD_PAGESIZE, %rcx
        addq    %rcx, %rdi
        addq    %rcx, %rsi
        negq    %rcx
1:
        movq    (%rsi, %rcx), %rax
        movnti  %rax, (%rdi, %rcx)
        movq    8(%rsi, %rcx), %rax
        movnti  %rax, 8(%rdi, %rcx)
        movq    16(%rsi, %rcx), %rax
        movnti  %rax, 16(%rdi, %rcx)
        movq    24(%rsi, %rcx), %rax
        movnti  %rax, 24(%rdi, %rcx)
        addq    $32, %rcx
        jnz     1b
        mfence
        ret
        SET_SIZE(page_copy_no_xmm)

#elif defined(__i386)

        ENTRY(page_copy_no_xmm)
        pushl   %esi
        movl    $MMU_STD_PAGESIZE, %ecx
        movl    8(%esp), %edx
        movl    12(%esp), %esi
        addl    %ecx, %edx
        addl    %ecx, %esi
        negl    %ecx
1:
        movl    (%esi, %ecx), %eax
        movnti  %eax, (%edx, %ecx)
        movl    4(%esi, %ecx), %eax
        movnti  %eax, 4(%edx, %ecx)
        movl    8(%esi, %ecx), %eax
        movnti  %eax, 8(%edx, %ecx)
        movl    12(%esi, %ecx), %eax
        movnti  %eax, 12(%edx, %ecx)
        addl    $16, %ecx
        jnz     1b
        mfence
        popl    %esi
        ret
        SET_SIZE(page_copy_no_xmm)

#endif  /* __i386 */
#endif  /* __lint */
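
/*
 * Roughly equivalent C for the no-XMM page copy: ordinary loads through
 * a general-purpose register paired with non-temporal stores, matching
 * the movq/movl + movnti pairs above (an illustrative sketch only;
 * hypothetical helper, assuming _mm_stream_si32 and a 4096-byte page):
 *
 *      #include <emmintrin.h>
 *
 *      static void
 *      sketch_page_copy_no_xmm(void *dst, void *src)
 *      {
 *              int *d = dst;
 *              const int *s = src;
 *              int i;
 *
 *              for (i = 0; i < 4096 / (int)sizeof (int); i++)
 *                      _mm_stream_si32(&d[i], s[i]);
 *              _mm_mfence();
 *      }
 */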

#if defined(DEBUG) && !defined(__lint)
        .text
.not_disabled:
        .string "sseblk: preemption not disabled!"
#endif