 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/asm_linkage.h>
#include <sys/regset.h>
#include <sys/privregs.h>

#if defined(__lint)
#include <sys/types.h>
#include <sys/archsystm.h>
#else
#include "assym.h"
#endif

/*
 * Do block operations using Streaming SIMD extensions
 */
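
/*
 * The routines below (hwblkclr, hwblkpagecopy and the non-XMM fallbacks)
 * use non-temporal stores (movntdq/movnti) so that zeroing or copying a
 * block of memory largely bypasses the caches.  The XMM-based variants
 * expect the caller to have disabled kernel preemption and manage %cr0.TS
 * around their %xmm usage themselves.  A rough usage sketch (the variable
 * names here are placeholders, not part of this file):
 *
 *	kpreempt_disable();
 *	hwblkpagecopy(src_va, dst_va);		// copies one full page
 *	kpreempt_enable();
 */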

#if defined(DEBUG)
#if defined(__amd64)
#define ASSERT_KPREEMPT_DISABLED(t, r32, msg) \
        movq %gs:CPU_THREAD, t; \
        movsbl T_PREEMPT(t), r32; \
        testl r32, r32; \
        jne 5f; \
        pushq %rbp; \
        movq %rsp, %rbp; \
        leaq msg(%rip), %rdi; \
        xorl %eax, %eax; \
        call panic; \
5:
#elif defined(__i386)
#define ASSERT_KPREEMPT_DISABLED(t, r32, msg) \
        movl %gs:CPU_THREAD, t; \
        movsbl T_PREEMPT(t), r32; \
        testl r32, r32; \
        jne 5f; \
        pushl %ebp; \
        movl %esp, %ebp; \
        pushl $msg; \
        call panic; \
5:
#endif /* __i386 */
#else /* DEBUG */
#define ASSERT_KPREEMPT_DISABLED(t, r32, msg)
#endif /* DEBUG */

#define BLOCKSHIFT 6
#define BLOCKSIZE 64 /* (1 << BLOCKSHIFT) */
#define BLOCKMASK 63 /* (BLOCKSIZE - 1) */

#if (1 << BLOCKSHIFT) != BLOCKSIZE || BLOCKMASK != (BLOCKSIZE - 1)
#error "mucked up constants"
#endif
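
/*
 * For illustration: with these values, "block aligned" below means the
 * low BLOCKSHIFT (6) address bits are clear, i.e. (addr & BLOCKMASK) == 0,
 * and a 4096-byte page works out to 4096 >> BLOCKSHIFT = 64 blocks.
 */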

#if defined(__lint)

/*ARGSUSED*/
void
hwblkclr(void *addr, size_t size)
{}

#else /* __lint */

#if defined(__amd64)
#define ADD addq
#define SUB subq
#else
#define ADD addl
#define SUB subl
#endif

#define SAVE_XMM0(r) \
        SAVE_XMM_PROLOG(r, 1); \
        movdqa %xmm0, (r)

#define ZERO_LOOP_INIT_XMM(dst) \
        pxor %xmm0, %xmm0

#define ZERO_LOOP_BODY_XMM(dst, cnt) \
        movntdq %xmm0, (dst); \
        movntdq %xmm0, 0x10(dst); \
        movntdq %xmm0, 0x20(dst); \
        movntdq %xmm0, 0x30(dst); \
        ADD $BLOCKSIZE, dst; \
        SUB $1, cnt

#define ZERO_LOOP_FINI_XMM(dst) \
        mfence

#define RSTOR_XMM0(r) \
        movdqa 0x0(r), %xmm0; \
        RSTOR_XMM_EPILOG(r, 1)

#if defined(__amd64)

/*
 * %rdi		dst
 * %rsi		size
 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
 * %r8		pointer to %xmm register save area
 */
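
/*
 * A minimal C-level sketch of the logic below (illustrative only; the
 * helper names are hypothetical stand-ins for the assembly sequences
 * that follow):
 *
 *	void
 *	hwblkclr(void *addr, size_t size)
 *	{
 *		if (((uintptr_t)addr & BLOCKMASK) != 0 ||
 *		    size < BLOCKSIZE || (size & BLOCKMASK) != 0) {
 *			bzero(addr, size);	// fall back for odd requests
 *			return;
 *		}
 *		// caller must already have preemption disabled
 *		save_cr0_and_clts();		// save %xmm0 only if TS was clear
 *		for (cnt = size >> BLOCKSHIFT; cnt != 0; cnt--, addr += BLOCKSIZE)
 *			zero_block_movntdq(addr);	// 4 x 16-byte non-temporal stores
 *		mfence();
 *		restore_xmm0_and_cr0();
 *	}
 */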
        ENTRY(hwblkclr)
        pushq %rbp
        movq %rsp, %rbp
        testl $BLOCKMASK, %edi /* address must be BLOCKSIZE aligned */
        jne .dobzero
        cmpq $BLOCKSIZE, %rsi /* size must be at least BLOCKSIZE */
        jl .dobzero
        testq $BLOCKMASK, %rsi /* .. and be a multiple of BLOCKSIZE */
        jne .dobzero
        shrq $BLOCKSHIFT, %rsi

        ASSERT_KPREEMPT_DISABLED(%r11, %eax, .not_disabled)
        movq %cr0, %rax
        clts
        testl $CR0_TS, %eax
        jnz 1f

        SAVE_XMM0(%r8)
1:      ZERO_LOOP_INIT_XMM(%rdi)
9:      ZERO_LOOP_BODY_XMM(%rdi, %rsi)
        jnz 9b
        ZERO_LOOP_FINI_XMM(%rdi)

        testl $CR0_TS, %eax
        jnz 2f
        RSTOR_XMM0(%r8)
2:      movq %rax, %cr0
        leave
        ret
.dobzero:
        leave
        jmp bzero
        SET_SIZE(hwblkclr)

#elif defined(__i386)

/*
 * %eax		dst
 * %ecx		size in bytes, loop count
 * %ebx		saved %cr0 (#if DEBUG then t->t_preempt)
 * %edi		pointer to %xmm register save area
 */
        ENTRY(hwblkclr)
        movl 4(%esp), %eax
        movl 8(%esp), %ecx
        testl $BLOCKMASK, %eax /* address must be BLOCKSIZE aligned */
        jne .dobzero
        cmpl $BLOCKSIZE, %ecx /* size must be at least BLOCKSIZE */
        jl .dobzero
        testl $BLOCKMASK, %ecx /* .. and be a multiple of BLOCKSIZE */
        jne .dobzero
        shrl $BLOCKSHIFT, %ecx
        movl 0xc(%esp), %edx
        pushl %ebx

        pushl %esi
        ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
        popl %esi
        movl %cr0, %ebx
        clts
        testl $CR0_TS, %ebx
        jnz 1f

        pushl %edi
        SAVE_XMM0(%edi)
1:      ZERO_LOOP_INIT_XMM(%eax)
9:      ZERO_LOOP_BODY_XMM(%eax, %ecx)
        jnz 9b
        ZERO_LOOP_FINI_XMM(%eax)

        testl $CR0_TS, %ebx
        jnz 2f
        RSTOR_XMM0(%edi)
        popl %edi
2:      movl %ebx, %cr0
        popl %ebx
        ret
.dobzero:
        jmp bzero
        SET_SIZE(hwblkclr)

#endif /* __i386 */
#endif /* __lint */


#if defined(__lint)

/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{}

#else /* __lint */

#define PREFETCH_START(src) \
        prefetchnta 0x0(src); \
        prefetchnta 0x40(src)

#define SAVE_XMMS(r) \
        SAVE_XMM_PROLOG(r, 8); \
        movdqa %xmm0, (r); \
        movdqa %xmm1, 0x10(r); \
        movdqa %xmm2, 0x20(r); \
        movdqa %xmm3, 0x30(r); \
        movdqa %xmm4, 0x40(r); \
        movdqa %xmm5, 0x50(r); \
        movdqa %xmm6, 0x60(r); \
        movdqa %xmm7, 0x70(r)

#define COPY_LOOP_INIT_XMM(src) \
        prefetchnta 0x80(src); \
        prefetchnta 0xc0(src); \
        movdqa 0x0(src), %xmm0; \
        movdqa 0x10(src), %xmm1; \
        movdqa 0x20(src), %xmm2; \
        movdqa 0x30(src), %xmm3; \
        movdqa 0x40(src), %xmm4; \
        movdqa 0x50(src), %xmm5; \
        movdqa 0x60(src), %xmm6; \
        movdqa 0x70(src), %xmm7; \
        ADD $0x80, src

#define COPY_LOOP_BODY_XMM(src, dst, cnt) \
        prefetchnta 0x80(src); \
        prefetchnta 0xc0(src); \
        prefetchnta 0x100(src); \
        prefetchnta 0x140(src); \
        movntdq %xmm0, (dst); \
        movntdq %xmm1, 0x10(dst); \
        movntdq %xmm2, 0x20(dst); \
        movntdq %xmm3, 0x30(dst); \
        movdqa 0x0(src), %xmm0; \
        movdqa 0x10(src), %xmm1; \
        movntdq %xmm4, 0x40(dst); \
        movntdq %xmm5, 0x50(dst); \
        movdqa 0x20(src), %xmm2; \
        movdqa 0x30(src), %xmm3; \
        movntdq %xmm6, 0x60(dst); \
        movntdq %xmm7, 0x70(dst); \
        movdqa 0x40(src), %xmm4; \
        movdqa 0x50(src), %xmm5; \
        ADD $0x80, dst; \
        movdqa 0x60(src), %xmm6; \
        movdqa 0x70(src), %xmm7; \
        ADD $0x80, src; \
        subl $1, cnt

#define COPY_LOOP_FINI_XMM(dst) \
        movntdq %xmm0, 0x0(dst); \
        movntdq %xmm1, 0x10(dst); \
        movntdq %xmm2, 0x20(dst); \
        movntdq %xmm3, 0x30(dst); \
        movntdq %xmm4, 0x40(dst); \
        movntdq %xmm5, 0x50(dst); \
        movntdq %xmm6, 0x60(dst); \
        movntdq %xmm7, 0x70(dst)

#define RSTOR_XMMS(r) \
        movdqa 0x0(r), %xmm0; \
        movdqa 0x10(r), %xmm1; \
        movdqa 0x20(r), %xmm2; \
        movdqa 0x30(r), %xmm3; \
        movdqa 0x40(r), %xmm4; \
        movdqa 0x50(r), %xmm5; \
        movdqa 0x60(r), %xmm6; \
        movdqa 0x70(r), %xmm7; \
        RSTOR_XMM_EPILOG(r, 8)

#if defined(__amd64)

/*
 * %rdi		src
 * %rsi		dst
 * %rdx		#if DEBUG then curthread
 * %ecx		loop count
 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
 * %r8		pointer to %xmm register save area
 */
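
/*
 * A minimal C-level sketch of the software-pipelined copy below
 * (illustrative only; loop_load()/loop_store() are hypothetical stand-ins
 * for the COPY_LOOP_* macro bodies):
 *
 *	void
 *	hwblkpagecopy(const void *src, void *dst)
 *	{
 *		prefetchnta(src);			// PREFETCH_START
 *		save_cr0_and_clts();			// save %xmm0-%xmm7 only if TS was clear
 *		loop_load(src); src += 0x80;		// COPY_LOOP_INIT_XMM primes %xmm0-%xmm7
 *		for (cnt = 32 - 1; cnt != 0; cnt--) {	// 4096/128 = 32, minus the primed block
 *			loop_store(dst);		// movntdq the 128 bytes already loaded
 *			loop_load(src);			// interleaved loads of the next 128 bytes
 *			src += 0x80; dst += 0x80;
 *		}
 *		loop_store(dst);			// COPY_LOOP_FINI_XMM drains the last block
 *		restore_xmms_and_cr0();
 *		mfence();
 *	}
 */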
        ENTRY(hwblkpagecopy)
        pushq %rbp
        movq %rsp, %rbp
        PREFETCH_START(%rdi)
        /*
         * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
         * load and final store save us one loop count
         */
        movl $_CONST(32 - 1), %ecx
        ASSERT_KPREEMPT_DISABLED(%rdx, %eax, .not_disabled)
        movq %cr0, %rax
        clts
        testl $CR0_TS, %eax
        jnz 3f
        SAVE_XMMS(%r8)
3:      COPY_LOOP_INIT_XMM(%rdi)
4:      COPY_LOOP_BODY_XMM(%rdi, %rsi, %ecx)
        jnz 4b
        COPY_LOOP_FINI_XMM(%rsi)
        testl $CR0_TS, %eax
        jnz 5f
        RSTOR_XMMS(%r8)
5:      movq %rax, %cr0
        mfence
        leave
        ret
        SET_SIZE(hwblkpagecopy)

#elif defined(__i386)

/*
 * %eax		src
 * %edx		dst
 * %ecx		loop count
 * %ebx		saved %cr0 (#if DEBUG then t->t_preempt)
 * %edi		pointer to %xmm register save area
 * %esi		#if DEBUG temporary thread pointer
 */
        ENTRY(hwblkpagecopy)
        movl 4(%esp), %eax
        movl 8(%esp), %edx
        PREFETCH_START(%eax)
        pushl %ebx
        /*
         * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
         * load and final store save us one loop count
         */
        movl $_CONST(32 - 1), %ecx
        pushl %esi
        ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
        popl %esi
        movl %cr0, %ebx
        clts
        testl $CR0_TS, %ebx
        jnz 3f
        pushl %edi
        SAVE_XMMS(%edi)
3:      COPY_LOOP_INIT_XMM(%eax)
4:      COPY_LOOP_BODY_XMM(%eax, %edx, %ecx)
        jnz 4b
        COPY_LOOP_FINI_XMM(%edx)
        testl $CR0_TS, %ebx
        jnz 5f
        RSTOR_XMMS(%edi)
        popl %edi
5:      movl %ebx, %cr0
        popl %ebx
        mfence
        ret
        SET_SIZE(hwblkpagecopy)

#endif /* __i386 */
#endif /* __lint */

#if defined(__lint)

/*
 * Version of hwblkclr which doesn't use XMM registers.
 * Note that it requires aligned dst and len.
 *
 * XXPV This needs to be performance tuned at some point.
 *	Is 4 the best number of iterations to unroll?
 */
/*ARGSUSED*/
void
block_zero_no_xmm(void *dst, int len)
{}
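
/*
 * For illustration: the amd64 loop below stores four 8-byte movnti values
 * per iteration and the i386 loop four 4-byte values, so "aligned len"
 * here means a multiple of 32 (amd64) or 16 (i386) bytes; a length that
 * is not such a multiple would never bring the negated count back to
 * zero.  For example, len = 4096 gives 4096 / 32 = 128 iterations on
 * amd64.
 */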

#else /* __lint */

#if defined(__amd64)

        ENTRY(block_zero_no_xmm)
        pushq %rbp
        movq %rsp, %rbp
        xorl %eax, %eax
        addq %rsi, %rdi
        negq %rsi
1:
        movnti %rax, (%rdi, %rsi)
        movnti %rax, 8(%rdi, %rsi)
        movnti %rax, 16(%rdi, %rsi)
        movnti %rax, 24(%rdi, %rsi)
        addq $32, %rsi
        jnz 1b
        mfence
        leave
        ret
        SET_SIZE(block_zero_no_xmm)

#elif defined(__i386)

        ENTRY(block_zero_no_xmm)
        pushl %ebp
        movl %esp, %ebp
        xorl %eax, %eax
        movl 8(%ebp), %edx
        movl 12(%ebp), %ecx
        addl %ecx, %edx
        negl %ecx
1:
        movnti %eax, (%edx, %ecx)
        movnti %eax, 4(%edx, %ecx)
        movnti %eax, 8(%edx, %ecx)
        movnti %eax, 12(%edx, %ecx)
        addl $16, %ecx
        jnz 1b
        mfence
        leave
        ret
        SET_SIZE(block_zero_no_xmm)

#endif /* __i386 */
#endif /* __lint */


#if defined(__lint)

/*
 * Version of page copy which doesn't use XMM registers.
 *
 * XXPV This needs to be performance tuned at some point.
 *	Is 4 the right number of iterations to unroll?
 *	Is the load/store order optimal? Should it use prefetch?
 */
/*ARGSUSED*/
void
page_copy_no_xmm(void *dst, void *src)
{}
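
/*
 * The non-XMM routines use a negative-index idiom: both pointers are
 * advanced past the end of the region and a negated count is stepped up
 * to zero.  A rough C sketch (illustrative only; copy32() stands in for
 * the four movq/movnti pairs per iteration):
 *
 *	dst += MMU_STD_PAGESIZE;
 *	src += MMU_STD_PAGESIZE;
 *	for (off = -(ssize_t)MMU_STD_PAGESIZE; off != 0; off += 32)
 *		copy32(dst + off, src + off);
 *
 * With a 4096-byte page that is 128 iterations on amd64 (32 bytes per
 * iteration) and 256 on i386 (16 bytes per iteration).
 */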

#else /* __lint */

#if defined(__amd64)

        ENTRY(page_copy_no_xmm)
        movq $MMU_STD_PAGESIZE, %rcx
        addq %rcx, %rdi
        addq %rcx, %rsi
        negq %rcx
1:
        movq (%rsi, %rcx), %rax
        movnti %rax, (%rdi, %rcx)
        movq 8(%rsi, %rcx), %rax
        movnti %rax, 8(%rdi, %rcx)
        movq 16(%rsi, %rcx), %rax
        movnti %rax, 16(%rdi, %rcx)
        movq 24(%rsi, %rcx), %rax
        movnti %rax, 24(%rdi, %rcx)
        addq $32, %rcx
        jnz 1b
        mfence
        ret
        SET_SIZE(page_copy_no_xmm)

#elif defined(__i386)

        ENTRY(page_copy_no_xmm)
        pushl %esi
        movl $MMU_STD_PAGESIZE, %ecx
        movl 8(%esp), %edx
        movl 12(%esp), %esi
        addl %ecx, %edx
        addl %ecx, %esi
        negl %ecx
1:
        movl (%esi, %ecx), %eax
        movnti %eax, (%edx, %ecx)
        movl 4(%esi, %ecx), %eax
        movnti %eax, 4(%edx, %ecx)
        movl 8(%esi, %ecx), %eax
        movnti %eax, 8(%edx, %ecx)
        movl 12(%esi, %ecx), %eax
        movnti %eax, 12(%edx, %ecx)
        addl $16, %ecx
        jnz 1b
        mfence
        popl %esi
        ret
        SET_SIZE(page_copy_no_xmm)

#endif /* __i386 */
#endif /* __lint */

#if defined(DEBUG) && !defined(__lint)
        .text
.not_disabled:
        .string "sseblk: preemption not disabled!"
#endif
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/asm_linkage.h>
#include <sys/regset.h>
#include <sys/privregs.h>

#include "assym.h"

/*
 * Do block operations using Streaming SIMD extensions
 */

#if defined(DEBUG)
#define ASSERT_KPREEMPT_DISABLED(t, r32, msg) \
        movq %gs:CPU_THREAD, t; \
        movsbl T_PREEMPT(t), r32; \
        testl r32, r32; \
        jne 5f; \
        pushq %rbp; \
        movq %rsp, %rbp; \
        leaq msg(%rip), %rdi; \
        xorl %eax, %eax; \
        call panic; \
5:
#else /* DEBUG */
#define ASSERT_KPREEMPT_DISABLED(t, r32, msg)
#endif /* DEBUG */

#define BLOCKSHIFT 6
#define BLOCKSIZE 64 /* (1 << BLOCKSHIFT) */
#define BLOCKMASK 63 /* (BLOCKSIZE - 1) */

#if (1 << BLOCKSHIFT) != BLOCKSIZE || BLOCKMASK != (BLOCKSIZE - 1)
#error "mucked up constants"
#endif

#define SAVE_XMM0(r) \
        SAVE_XMM_PROLOG(r, 1); \
        movdqa %xmm0, (r)

#define ZERO_LOOP_INIT_XMM(dst) \
        pxor %xmm0, %xmm0

#define ZERO_LOOP_BODY_XMM(dst, cnt) \
        movntdq %xmm0, (dst); \
        movntdq %xmm0, 0x10(dst); \
        movntdq %xmm0, 0x20(dst); \
        movntdq %xmm0, 0x30(dst); \
        addq $BLOCKSIZE, dst; \
        subq $1, cnt

#define ZERO_LOOP_FINI_XMM(dst) \
        mfence

#define RSTOR_XMM0(r) \
        movdqa 0x0(r), %xmm0; \
        RSTOR_XMM_EPILOG(r, 1)

/*
 * %rdi		dst
 * %rsi		size
 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
 * %r8		pointer to %xmm register save area
 */
        ENTRY(hwblkclr)
        pushq %rbp
        movq %rsp, %rbp
        testl $BLOCKMASK, %edi /* address must be BLOCKSIZE aligned */
        jne .dobzero
        cmpq $BLOCKSIZE, %rsi /* size must be at least BLOCKSIZE */
        jl .dobzero
        testq $BLOCKMASK, %rsi /* .. and be a multiple of BLOCKSIZE */
        jne .dobzero
        shrq $BLOCKSHIFT, %rsi

        ASSERT_KPREEMPT_DISABLED(%r11, %eax, .not_disabled)
        movq %cr0, %rax
        clts
        testl $CR0_TS, %eax
        jnz 1f

        SAVE_XMM0(%r8)
1:      ZERO_LOOP_INIT_XMM(%rdi)
9:      ZERO_LOOP_BODY_XMM(%rdi, %rsi)
        jnz 9b
        ZERO_LOOP_FINI_XMM(%rdi)

        testl $CR0_TS, %eax
        jnz 2f
        RSTOR_XMM0(%r8)
2:      movq %rax, %cr0
        leave
        ret
.dobzero:
        leave
        jmp bzero
        SET_SIZE(hwblkclr)


#define PREFETCH_START(src) \
        prefetchnta 0x0(src); \
        prefetchnta 0x40(src)

#define SAVE_XMMS(r) \
        SAVE_XMM_PROLOG(r, 8); \
        movdqa %xmm0, (r); \
        movdqa %xmm1, 0x10(r); \
        movdqa %xmm2, 0x20(r); \
        movdqa %xmm3, 0x30(r); \
        movdqa %xmm4, 0x40(r); \
        movdqa %xmm5, 0x50(r); \
        movdqa %xmm6, 0x60(r); \
        movdqa %xmm7, 0x70(r)

#define COPY_LOOP_INIT_XMM(src) \
        prefetchnta 0x80(src); \
        prefetchnta 0xc0(src); \
        movdqa 0x0(src), %xmm0; \
        movdqa 0x10(src), %xmm1; \
        movdqa 0x20(src), %xmm2; \
        movdqa 0x30(src), %xmm3; \
        movdqa 0x40(src), %xmm4; \
        movdqa 0x50(src), %xmm5; \
        movdqa 0x60(src), %xmm6; \
        movdqa 0x70(src), %xmm7; \
        addq $0x80, src

#define COPY_LOOP_BODY_XMM(src, dst, cnt) \
        prefetchnta 0x80(src); \
        prefetchnta 0xc0(src); \
        prefetchnta 0x100(src); \
        prefetchnta 0x140(src); \
        movntdq %xmm0, (dst); \
        movntdq %xmm1, 0x10(dst); \
        movntdq %xmm2, 0x20(dst); \
        movntdq %xmm3, 0x30(dst); \
        movdqa 0x0(src), %xmm0; \
        movdqa 0x10(src), %xmm1; \
        movntdq %xmm4, 0x40(dst); \
        movntdq %xmm5, 0x50(dst); \
        movdqa 0x20(src), %xmm2; \
        movdqa 0x30(src), %xmm3; \
        movntdq %xmm6, 0x60(dst); \
        movntdq %xmm7, 0x70(dst); \
        movdqa 0x40(src), %xmm4; \
        movdqa 0x50(src), %xmm5; \
        addq $0x80, dst; \
        movdqa 0x60(src), %xmm6; \
        movdqa 0x70(src), %xmm7; \
        addq $0x80, src; \
        subl $1, cnt

#define COPY_LOOP_FINI_XMM(dst) \
        movntdq %xmm0, 0x0(dst); \
        movntdq %xmm1, 0x10(dst); \
        movntdq %xmm2, 0x20(dst); \
        movntdq %xmm3, 0x30(dst); \
        movntdq %xmm4, 0x40(dst); \
        movntdq %xmm5, 0x50(dst); \
        movntdq %xmm6, 0x60(dst); \
        movntdq %xmm7, 0x70(dst)

#define RSTOR_XMMS(r) \
        movdqa 0x0(r), %xmm0; \
        movdqa 0x10(r), %xmm1; \
        movdqa 0x20(r), %xmm2; \
        movdqa 0x30(r), %xmm3; \
        movdqa 0x40(r), %xmm4; \
        movdqa 0x50(r), %xmm5; \
        movdqa 0x60(r), %xmm6; \
        movdqa 0x70(r), %xmm7; \
        RSTOR_XMM_EPILOG(r, 8)

/*
 * %rdi		src
 * %rsi		dst
 * %rdx		#if DEBUG then curthread
 * %ecx		loop count
 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
 * %r8		pointer to %xmm register save area
 */
        ENTRY(hwblkpagecopy)
        pushq %rbp
        movq %rsp, %rbp
        PREFETCH_START(%rdi)
        /*
         * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
         * load and final store save us one loop count
         */
        movl $_CONST(32 - 1), %ecx
        ASSERT_KPREEMPT_DISABLED(%rdx, %eax, .not_disabled)
        movq %cr0, %rax
        clts
        testl $CR0_TS, %eax
        jnz 3f
        SAVE_XMMS(%r8)
3:      COPY_LOOP_INIT_XMM(%rdi)
4:      COPY_LOOP_BODY_XMM(%rdi, %rsi, %ecx)
        jnz 4b
        COPY_LOOP_FINI_XMM(%rsi)
        testl $CR0_TS, %eax
        jnz 5f
        RSTOR_XMMS(%r8)
5:      movq %rax, %cr0
        mfence
        leave
        ret
        SET_SIZE(hwblkpagecopy)

        ENTRY(block_zero_no_xmm)
        pushq %rbp
        movq %rsp, %rbp
        xorl %eax, %eax
        addq %rsi, %rdi
        negq %rsi
1:
        movnti %rax, (%rdi, %rsi)
        movnti %rax, 8(%rdi, %rsi)
        movnti %rax, 16(%rdi, %rsi)
        movnti %rax, 24(%rdi, %rsi)
        addq $32, %rsi
        jnz 1b
        mfence
        leave
        ret
        SET_SIZE(block_zero_no_xmm)


        ENTRY(page_copy_no_xmm)
        movq $MMU_STD_PAGESIZE, %rcx
        addq %rcx, %rdi
        addq %rcx, %rsi
        negq %rcx
1:
        movq (%rsi, %rcx), %rax
        movnti %rax, (%rdi, %rcx)
        movq 8(%rsi, %rcx), %rax
        movnti %rax, 8(%rdi, %rcx)
        movq 16(%rsi, %rcx), %rax
        movnti %rax, 16(%rdi, %rcx)
        movq 24(%rsi, %rcx), %rax
        movnti %rax, 24(%rdi, %rcx)
        addq $32, %rcx
        jnz 1b
        mfence
        ret
        SET_SIZE(page_copy_no_xmm)

#if defined(DEBUG)
        .text
.not_disabled:
        .string "sseblk: preemption not disabled!"
#endif