/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/asm_linkage.h>
#include <sys/regset.h>
#include <sys/privregs.h>

#if defined(__lint)
#include <sys/types.h>
#include <sys/archsystm.h>
#else
#include "assym.h"
#endif

/*
 * Do block operations using Streaming SIMD extensions
 */

#if defined(DEBUG)
#if defined(__amd64)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movq	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jne	5f;				\
	pushq	%rbp;				\
	movq	%rsp, %rbp;			\
	leaq	msg(%rip), %rdi;		\
	xorl	%eax, %eax;			\
	call	panic;				\
5:
#elif defined(__i386)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movl	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jne	5f;				\
	pushl	%ebp;				\
	movl	%esp, %ebp;			\
	pushl	$msg;				\
	call	panic;				\
5:
#endif	/* __i386 */
#else	/* DEBUG */
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)
#endif	/* DEBUG */
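
/*
 * In rough C terms (illustrative sketch only, not generated code), the
 * DEBUG assertion above amounts to:
 *
 *	if (curthread->t_preempt == 0)
 *		panic(msg);
 *
 * i.e. the caller must have disabled kernel preemption (which raises
 * t_preempt) before entering these routines.
 */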

#define	BLOCKSHIFT	6
#define	BLOCKSIZE	64	/* (1 << BLOCKSHIFT) */
#define	BLOCKMASK	63	/* (BLOCKSIZE - 1) */

#if (1 << BLOCKSHIFT) != BLOCKSIZE || BLOCKMASK != (BLOCKSIZE - 1)
#error	"mucked up constants"
#endif
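
/*
 * hwblkclr() below only takes the SSE path when the destination is
 * BLOCKSIZE-aligned and the length is a non-zero multiple of BLOCKSIZE;
 * anything else falls back to bzero().  A hypothetical C sketch of that
 * gate (variable names are illustrative, not from this file):
 *
 *	if (((uintptr_t)addr & BLOCKMASK) != 0 ||
 *	    size < BLOCKSIZE || (size & BLOCKMASK) != 0) {
 *		bzero(addr, size);
 *		return;
 *	}
 *	nblocks = size >> BLOCKSHIFT;	// 64-byte blocks to clear
 */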

#if defined(__lint)

/*ARGSUSED*/
void
hwblkclr(void *addr, size_t size)
{}

#else	/* __lint */

#if defined(__amd64)
#define	ADD	addq
#define	SUB	subq
#else
#define	ADD	addl
#define	SUB	subl
#endif

#define	SAVE_XMM0(r)				\
	SAVE_XMM_PROLOG(r, 1);			\
	movdqa	%xmm0, (r)

#define	ZERO_LOOP_INIT_XMM(dst)			\
	pxor	%xmm0, %xmm0

#define	ZERO_LOOP_BODY_XMM(dst, cnt)		\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm0, 0x10(dst);		\
	movntdq	%xmm0, 0x20(dst);		\
	movntdq	%xmm0, 0x30(dst);		\
	ADD	$BLOCKSIZE, dst;		\
	SUB	$1, cnt

#define	ZERO_LOOP_FINI_XMM(dst)			\
	mfence

#define	RSTOR_XMM0(r)				\
	movdqa	0x0(r), %xmm0;			\
	RSTOR_XMM_EPILOG(r, 1)

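/*
 * For reference, the ZERO_LOOP_* macros above implement roughly the
 * following (a hypothetical C/SSE2-intrinsics sketch, not part of the
 * build; assumes <emmintrin.h>, a 64-byte-aligned dst and a length that
 * is a multiple of BLOCKSIZE):
 *
 *	__m128i zero = _mm_setzero_si128();		// pxor
 *	for (size_t i = 0; i < nblocks; i++) {
 *		_mm_stream_si128((__m128i *)dst + 0, zero);	// movntdq
 *		_mm_stream_si128((__m128i *)dst + 1, zero);
 *		_mm_stream_si128((__m128i *)dst + 2, zero);
 *		_mm_stream_si128((__m128i *)dst + 3, zero);
 *		dst = (char *)dst + BLOCKSIZE;
 *	}
 *	_mm_mfence();				// order the NT stores
 *
 * The stores are non-temporal so a large clear does not displace useful
 * data from the caches.
 */
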
#if defined(__amd64)

	/*
	 * %rdi		dst
	 * %rsi		size
	 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
	 * %r8		pointer to %xmm register save area
	 */
	ENTRY(hwblkclr)
	pushq	%rbp
	movq	%rsp, %rbp
	testl	$BLOCKMASK, %edi	/* address must be BLOCKSIZE aligned */
	jne	.dobzero
	cmpq	$BLOCKSIZE, %rsi	/* size must be at least BLOCKSIZE */
	jl	.dobzero
	testq	$BLOCKMASK, %rsi	/* .. and be a multiple of BLOCKSIZE */
	jne	.dobzero
	shrq	$BLOCKSHIFT, %rsi

	ASSERT_KPREEMPT_DISABLED(%r11, %eax, .not_disabled)
	movq	%cr0, %rax
	clts
	testl	$CR0_TS, %eax
	jnz	1f

	SAVE_XMM0(%r8)
1:	ZERO_LOOP_INIT_XMM(%rdi)
9:	ZERO_LOOP_BODY_XMM(%rdi, %rsi)
	jnz	9b
	ZERO_LOOP_FINI_XMM(%rdi)

	testl	$CR0_TS, %eax
	jnz	2f
	RSTOR_XMM0(%r8)
2:	movq	%rax, %cr0
	leave
	ret
.dobzero:
	leave
	jmp	bzero
	SET_SIZE(hwblkclr)

#elif defined(__i386)

	/*
	 * %eax		dst
	 * %ecx		size in bytes, loop count
	 * %ebx		saved %cr0 (#if DEBUG then t->t_preempt)
	 * %edi		pointer to %xmm register save area
	 */
	ENTRY(hwblkclr)
	movl	4(%esp), %eax
	movl	8(%esp), %ecx
	testl	$BLOCKMASK, %eax	/* address must be BLOCKSIZE aligned */
	jne	.dobzero
	cmpl	$BLOCKSIZE, %ecx	/* size must be at least BLOCKSIZE */
	jl	.dobzero
	testl	$BLOCKMASK, %ecx	/* .. and be a multiple of BLOCKSIZE */
	jne	.dobzero
	shrl	$BLOCKSHIFT, %ecx
	movl	0xc(%esp), %edx
	pushl	%ebx

	pushl	%esi
	ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
	popl	%esi
	movl	%cr0, %ebx
	clts
	testl	$CR0_TS, %ebx
	jnz	1f

	pushl	%edi
	SAVE_XMM0(%edi)
1:	ZERO_LOOP_INIT_XMM(%eax)
9:	ZERO_LOOP_BODY_XMM(%eax, %ecx)
	jnz	9b
	ZERO_LOOP_FINI_XMM(%eax)

	testl	$CR0_TS, %ebx
	jnz	2f
	RSTOR_XMM0(%edi)
	popl	%edi
2:	movl	%ebx, %cr0
	popl	%ebx
	ret
.dobzero:
	jmp	bzero
	SET_SIZE(hwblkclr)

#endif	/* __i386 */
#endif	/* __lint */

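/*
 * Caller note (illustrative only, not a new interface): hwblkclr() and
 * hwblkpagecopy() both assert under DEBUG that kernel preemption is
 * disabled, since they borrow %xmm state around a clts/%cr0 restore.
 * A hypothetical caller would look like:
 *
 *	kpreempt_disable();
 *	hwblkclr(buf, len);	// buf 64-byte aligned, len a multiple of 64
 *	kpreempt_enable();
 */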

#if defined(__lint)

/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{}

#else	/* __lint */

#define	PREFETCH_START(src)			\
	prefetchnta	0x0(src);		\
	prefetchnta	0x40(src)

#define	SAVE_XMMS(r)				\
	SAVE_XMM_PROLOG(r, 8);			\
	movdqa	%xmm0, (r);			\
	movdqa	%xmm1, 0x10(r);			\
	movdqa	%xmm2, 0x20(r);			\
	movdqa	%xmm3, 0x30(r);			\
	movdqa	%xmm4, 0x40(r);			\
	movdqa	%xmm5, 0x50(r);			\
	movdqa	%xmm6, 0x60(r);			\
	movdqa	%xmm7, 0x70(r)

#define	COPY_LOOP_INIT_XMM(src)			\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	movdqa	0x0(src), %xmm0;		\
	movdqa	0x10(src), %xmm1;		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	ADD	$0x80, src

#define	COPY_LOOP_BODY_XMM(src, dst, cnt)	\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	prefetchnta	0x100(src);		\
	prefetchnta	0x140(src);		\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movdqa	0x0(src), %xmm0;		\
	movdqa	0x10(src), %xmm1;		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst);		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	ADD	$0x80, dst;			\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	ADD	$0x80, src;			\
	subl	$1, cnt

#define	COPY_LOOP_FINI_XMM(dst)			\
	movntdq	%xmm0, 0x0(dst);		\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst)

#define	RSTOR_XMMS(r)				\
	movdqa	0x0(r), %xmm0;			\
	movdqa	0x10(r), %xmm1;			\
	movdqa	0x20(r), %xmm2;			\
	movdqa	0x30(r), %xmm3;			\
	movdqa	0x40(r), %xmm4;			\
	movdqa	0x50(r), %xmm5;			\
	movdqa	0x60(r), %xmm6;			\
	movdqa	0x70(r), %xmm7;			\
	RSTOR_XMM_EPILOG(r, 8)

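/*
 * The copy loop above is a software-pipelined 128-byte-per-iteration
 * copy: prefetchnta pulls source lines ahead of use, movdqa loads eight
 * aligned 16-byte chunks into %xmm0-%xmm7, and movntdq streams them to
 * the destination without polluting the cache.  A hypothetical
 * C/SSE2-intrinsics sketch of one (un-pipelined) iteration, assuming
 * <emmintrin.h> and 16-byte-aligned src/dst:
 *
 *	_mm_prefetch((char *)src + 0x80, _MM_HINT_NTA);
 *	_mm_prefetch((char *)src + 0xc0, _MM_HINT_NTA);
 *	for (int i = 0; i < 8; i++) {
 *		__m128i v = _mm_load_si128((__m128i *)src + i);	// movdqa
 *		_mm_stream_si128((__m128i *)dst + i, v);	// movntdq
 *	}
 *	src = (char *)src + 0x80;
 *	dst = (char *)dst + 0x80;
 */
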
#if defined(__amd64)

	/*
	 * %rdi		src
	 * %rsi		dst
	 * %rdx		#if DEBUG then curthread
	 * %ecx		loop count
	 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
	 * %r8		pointer to %xmm register save area
	 */
	ENTRY(hwblkpagecopy)
	pushq	%rbp
	movq	%rsp, %rbp
	PREFETCH_START(%rdi)
	/*
	 * PAGESIZE is 4096 and each loop iteration moves 128 bytes, so
	 * 32 iterations would cover a page; the initial load and final
	 * store save us one loop count.
	 */
	movl	$_CONST(32 - 1), %ecx
	ASSERT_KPREEMPT_DISABLED(%rdx, %eax, .not_disabled)
	movq	%cr0, %rax
	clts
	testl	$CR0_TS, %eax
	jnz	3f
	SAVE_XMMS(%r8)
3:	COPY_LOOP_INIT_XMM(%rdi)
4:	COPY_LOOP_BODY_XMM(%rdi, %rsi, %ecx)
	jnz	4b
	COPY_LOOP_FINI_XMM(%rsi)
	testl	$CR0_TS, %eax
	jnz	5f
	RSTOR_XMMS(%r8)
5:	movq	%rax, %cr0
	mfence
	leave
	ret
	SET_SIZE(hwblkpagecopy)

#elif defined(__i386)

	/*
	 * %eax		src
	 * %edx		dst
	 * %ecx		loop count
	 * %ebx		saved %cr0 (#if DEBUG then t->t_preempt)
	 * %edi		pointer to %xmm register save area
	 * %esi		#if DEBUG temporary thread pointer
	 */
	ENTRY(hwblkpagecopy)
	movl	4(%esp), %eax
	movl	8(%esp), %edx
	PREFETCH_START(%eax)
	pushl	%ebx
	/*
	 * PAGESIZE is 4096 and each loop iteration moves 128 bytes, so
	 * 32 iterations would cover a page; the initial load and final
	 * store save us one loop count.
	 */
	movl	$_CONST(32 - 1), %ecx
	pushl	%esi
	ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
	popl	%esi
	movl	%cr0, %ebx
	clts
	testl	$CR0_TS, %ebx
	jnz	3f
	pushl	%edi
	SAVE_XMMS(%edi)
3:	COPY_LOOP_INIT_XMM(%eax)
4:	COPY_LOOP_BODY_XMM(%eax, %edx, %ecx)
	jnz	4b
	COPY_LOOP_FINI_XMM(%edx)
	testl	$CR0_TS, %ebx
	jnz	5f
	RSTOR_XMMS(%edi)
	popl	%edi
5:	movl	%ebx, %cr0
	popl	%ebx
	mfence
	ret
	SET_SIZE(hwblkpagecopy)

#endif	/* __i386 */
#endif	/* __lint */

#if defined(__lint)

/*
 * Version of hwblkclr which doesn't use XMM registers.
 * Note that it requires aligned dst and len.
 *
 * XXPV This needs to be performance tuned at some point.
 *	Is 4 the best number of iterations to unroll?
 */
/*ARGSUSED*/
void
block_zero_no_xmm(void *dst, int len)
{}
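
/*
 * For reference, the amd64 implementation below is roughly equivalent to
 * this hypothetical C/SSE2-intrinsics sketch (illustrative only; assumes
 * <emmintrin.h>, dst 8-byte aligned and len a non-zero multiple of 32):
 *
 *	long long *p = dst;
 *	for (int i = 0; i < len / 8; i += 4) {
 *		_mm_stream_si64(&p[i + 0], 0);	// movnti
 *		_mm_stream_si64(&p[i + 1], 0);
 *		_mm_stream_si64(&p[i + 2], 0);
 *		_mm_stream_si64(&p[i + 3], 0);
 *	}
 *	_mm_mfence();
 */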

#else	/* __lint */

#if defined(__amd64)

	ENTRY(block_zero_no_xmm)
	pushq	%rbp
	movq	%rsp, %rbp
	xorl	%eax, %eax
	addq	%rsi, %rdi
	negq	%rsi
1:
	movnti	%rax, (%rdi, %rsi)
	movnti	%rax, 8(%rdi, %rsi)
	movnti	%rax, 16(%rdi, %rsi)
	movnti	%rax, 24(%rdi, %rsi)
	addq	$32, %rsi
	jnz	1b
	mfence
	leave
	ret
	SET_SIZE(block_zero_no_xmm)

#elif defined(__i386)

	ENTRY(block_zero_no_xmm)
	pushl	%ebp
	movl	%esp, %ebp
	xorl	%eax, %eax
	movl	8(%ebp), %edx
	movl	12(%ebp), %ecx
	addl	%ecx, %edx
	negl	%ecx
1:
	movnti	%eax, (%edx, %ecx)
	movnti	%eax, 4(%edx, %ecx)
	movnti	%eax, 8(%edx, %ecx)
	movnti	%eax, 12(%edx, %ecx)
	addl	$16, %ecx
	jnz	1b
	mfence
	leave
	ret
	SET_SIZE(block_zero_no_xmm)

#endif	/* __i386 */
#endif	/* __lint */


#if defined(__lint)

/*
 * Version of page copy which doesn't use XMM registers.
 *
 * XXPV This needs to be performance tuned at some point.
 *	Is 4 the right number of iterations to unroll?
 *	Is the load/store order optimal? Should it use prefetch?
 */
/*ARGSUSED*/
void
page_copy_no_xmm(void *dst, void *src)
{}
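
/*
 * For reference, the amd64 implementation below is roughly equivalent to
 * this hypothetical C/SSE2-intrinsics sketch (illustrative only; assumes
 * <emmintrin.h> and page-aligned src/dst):
 *
 *	long long *s = src, *d = dst;
 *	for (size_t i = 0; i < MMU_STD_PAGESIZE / 8; i += 4) {
 *		_mm_stream_si64(&d[i + 0], s[i + 0]);	// movq + movnti
 *		_mm_stream_si64(&d[i + 1], s[i + 1]);
 *		_mm_stream_si64(&d[i + 2], s[i + 2]);
 *		_mm_stream_si64(&d[i + 3], s[i + 3]);
 *	}
 *	_mm_mfence();
 */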

#else	/* __lint */

#if defined(__amd64)

	ENTRY(page_copy_no_xmm)
	movq	$MMU_STD_PAGESIZE, %rcx
	addq	%rcx, %rdi
	addq	%rcx, %rsi
	negq	%rcx
1:
	movq	(%rsi, %rcx), %rax
	movnti	%rax, (%rdi, %rcx)
	movq	8(%rsi, %rcx), %rax
	movnti	%rax, 8(%rdi, %rcx)
	movq	16(%rsi, %rcx), %rax
	movnti	%rax, 16(%rdi, %rcx)
	movq	24(%rsi, %rcx), %rax
	movnti	%rax, 24(%rdi, %rcx)
	addq	$32, %rcx
	jnz	1b
	mfence
	ret
	SET_SIZE(page_copy_no_xmm)

#elif defined(__i386)

	ENTRY(page_copy_no_xmm)
	pushl	%esi
	movl	$MMU_STD_PAGESIZE, %ecx
	movl	8(%esp), %edx
	movl	12(%esp), %esi
	addl	%ecx, %edx
	addl	%ecx, %esi
	negl	%ecx
1:
	movl	(%esi, %ecx), %eax
	movnti	%eax, (%edx, %ecx)
	movl	4(%esi, %ecx), %eax
	movnti	%eax, 4(%edx, %ecx)
	movl	8(%esi, %ecx), %eax
	movnti	%eax, 8(%edx, %ecx)
	movl	12(%esi, %ecx), %eax
	movnti	%eax, 12(%edx, %ecx)
	addl	$16, %ecx
	jnz	1b
	mfence
	popl	%esi
	ret
	SET_SIZE(page_copy_no_xmm)

#endif	/* __i386 */
#endif	/* __lint */

#if defined(DEBUG) && !defined(__lint)
	.text
.not_disabled:
	.string	"sseblk: preemption not disabled!"
#endif