de-linting of .s files
--- old/usr/src/uts/sun4v/cpu/niagara_copy.s
+++ new/usr/src/uts/sun4v/cpu/niagara_copy.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25
26 26 #include <sys/param.h>
27 27 #include <sys/errno.h>
28 28 #include <sys/asm_linkage.h>
29 29 #include <sys/vtrace.h>
30 30 #include <sys/machthread.h>
31 31 #include <sys/clock.h>
32 32 #include <sys/asi.h>
33 33 #include <sys/fsr.h>
34 34 #include <sys/privregs.h>
35 35 #include <sys/machasi.h>
36 36 #include <sys/niagaraasi.h>
37 37
38 -#if !defined(lint)
39 38 #include "assym.h"
40 -#endif /* lint */
41 39
42 40
43 41 /*
44 42 * Pseudo-code to aid in understanding the control flow of the
45 43 * bcopy/kcopy routine.
46 44 *
47 45 * ! WARNING : <Register usage convention>
48 46 * ! In kcopy() %o5 holds the previous error handler and a flag
49 47 * ! LOFAULT_SET (low bits). %o5 is null in bcopy().
50 48 * ! %o5 is not available for any other use.
51 49 *
52 50 * On entry:
53 51 * ! Determine whether to use the FP register version or the
54 52 * ! leaf routine version depending on the size of the copy.
55 53 * ! Set up error handling accordingly.
56 54 * ! The transition point depends on FP_COPY
57 55 * ! For both versions %o5 is reserved
58 56 *
59 57 * kcopy():
60 58 * if(length > FP_COPY)
61 59 * go to regular_kcopy
62 60 *
63 61 * ! Setup_leaf_rtn_error_handler
64 62 * %o5 = curthread->t_lofault; ! save existing handler in %o5
65 63 * %o5 |= LOFAULT_SET; ! ORed with LOFAULT_SET flag
66 64 * curthread->t_lofault = .sm_copyerr;
67 65 * goto small_bcopy();
68 66 *
69 67 * regular_kcopy:
70 68 * save_registers()
71 69 * %o5 = curthread->t_lofault; ! save existing handler in %o5
72 70 * %o5 |= LOFAULT_SET; ! ORed with LOFAULT_SET flag
73 71 * curthread->t_lofault = .copyerr;
74 72 * goto do_copy();
75 73 *
76 74 * bcopy():
77 75 * if(length > FP_COPY)
78 76 * go to regular_bcopy
79 77 *
80 78 * ! Setup_leaf_rtn_error_handler
81 79 * %o5 = curthread->t_lofault; ! save existing handler in %o5
82 80 * curthread->t_lofault = .sm_copyerr;
83 81 * goto small_bcopy();
84 82 *
85 83 * regular_bcopy:
86 84 * %o5 = curthread->t_lofault; ! save existing handler in %o5
87 85 * curthread->t_lofault = .copyerr;
88 86 * goto do_copy();
89 87 *
90 88 * small_bcopy:
91 89 * ! handle copies smaller than FP_COPY
92 90 * restore t_lofault handler
93 91 * exit
94 92 *
95 93 * do_copy:
96 94 * ! handle copies larger than FP_COPY
97 95 * save fp_regs
98 96 * blockcopy;
99 97 * restore fp_regs
100 98 * restore t_lofault handler if came from kcopy();
101 99 *
102 100 *
103 101 * In leaf lofault handler:
104 102 * curthread->t_lofault = (%o5 & ~LOFAULT_SET); ! restore old t_lofault
105 103 * return (errno)
106 104 *
107 105 * In lofault handler:
108 106 * curthread->t_lofault = (%o5 & ~LOFAULT_SET); ! restore old t_lofault
109 107 * restore fp_regs
110 108 * return (errno)
111 109 *
112 110 *
113 111 *
114 112 * For all of bcopy/copyin/copyout the copy logic is specialized according
115 113 * to how the src and dst are aligned and how much data needs to be moved.
116 114 * The following comments apply to the N2/RF code (#if !defined(NIAGARA_IMPL))
117 115 *
118 116 * N2/RF Flow :
119 117 *
120 118 * if (count < FP_COPY) { (584 bytes)
121 119 * set small fault handler (no register window save/restore)
122 120 * if count < SHORTCOPY (7 bytes)
123 121 * copy bytes; go to short_exit
124 122 * else
125 123 * determine dst alignment, move minimum bytes/halfwords to
126 124 * get dst aligned on long word boundary
127 125 * if( src is on long word boundary ) {
128 126 * medlong: src/dst aligned on 8 bytes
129 127 * copy with ldx/stx in 4-way unrolled loop;
130 128 * copy final 0-31 bytes; go to short_exit
131 129 * } else { src/dst not aligned on 8 bytes
132 130 * if src is word aligned, ld/st words in 32-byte chunks
133 131 * if src is half word aligned, ld half, ld word, ld half; pack
134 132 * into long word, store long words in 32-byte chunks
135 133 * if src is byte aligned, ld byte,half,word parts; pack into long
136 134 * word, store long words in 32-byte chunks
137 135 * move final 0-31 bytes according to src alignment; go to short_exit
138 136 * short_exit:
139 137 * restore trap handler if needed, retl
140 138 * else { More than FP_COPY bytes
141 139 * set fault handler
142 140 * disable kernel preemption
143 141 * save registers, save FP registers if in use
144 142 * move bytes to align destination register on long word boundary
145 143 * if(src is on long word boundary) { src/dst aligned on 8 bytes
146 144 * align dst on 64 byte boundary; use 8-way test for each of 8 possible
147 145 * src alignments relative to a 64 byte boundary to select the
148 146 * 16-way unrolled loop (128 bytes) to use for
149 147 * block load, fmovd, block-init-store, block-store, fmovd operations
150 148 * then go to remain_stuff.
151 149 * remain_stuff: move remaining bytes. go to long_exit
152 150 * } else {
153 151 * setup alignaddr for faligndata instructions
154 152 * align dst on 64 byte boundary; use 8-way test for each of 8 possible
155 153 * src alignments to nearest long word relative to 64 byte boundary to
156 154 * select the 8-way unrolled loop (64 bytes) to use for
157 155 * block load, falign, fmovd, block-store loop
158 156 * (only use block-init-store when src/dst on 8 byte boundaries.)
159 157 * goto unalign_done.
160 158 * unalign_done:
161 159 * move remaining bytes for unaligned cases. go to long_exit
162 160 * long_exit:
163 161 * restore %gsr, FP regs (either from stack or set to zero),
164 162 * restore trap handler, check for kernel preemption request,
165 163 * handle if needed, ret.
166 164 * }
167 165 *
168 166 * Other platforms include hw_bcopy_limit_[1248] to control the exact
169 167 * point where the FP register code is used. On those platforms, the
170 168 * FP register code did not leave data in L2 cache, potentially affecting
171 169 * performance more than the gain/loss from the algorithm difference.
172 170 * For N2/RF, block store places data in the L2 cache, so use or non-use
173 171 * of the FP registers has no effect on L2 cache behavior.
174 172 * The cost for testing hw_bcopy_limit_* according to different
175 173 * alignments exceeds 50 cycles for all cases, even when hw_bcopy_limits
176 174 * were not used. That cost was judged too high relative to the benefits,
177 175 * so the hw_bcopy_limit option is omitted from this code.
178 176 */
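The t_lofault bookkeeping above reads more easily in C. The following is a hypothetical model only: curthread->t_lofault is flattened to a plain variable, small_copy/fp_copy stand in for the leaf and FP copy bodies, and only the success path is shown (a fault transfers control to the handler, which returns the errno).

	#include <stddef.h>
	#include <stdint.h>

	#define LOFAULT_SET	2
	#define FP_COPY		584

	extern uintptr_t t_lofault;	/* stands in for curthread->t_lofault */
	extern void sm_copyerr(void), copyerr(void);	/* handler labels */
	extern void small_copy(const void *, void *, size_t);	/* hypothetical */
	extern void fp_copy(const void *, void *, size_t);	/* hypothetical */

	int
	kcopy_model(const void *from, void *to, size_t count)
	{
		uintptr_t saved = t_lofault | LOFAULT_SET;	/* %o5 */

		if (count <= FP_COPY) {
			t_lofault = (uintptr_t)sm_copyerr;	/* leaf handler */
			small_copy(from, to, count);		/* may fault */
		} else {
			t_lofault = (uintptr_t)copyerr;		/* FP handler */
			fp_copy(from, to, count);		/* may fault */
		}
		t_lofault = saved & ~(uintptr_t)LOFAULT_SET;	/* restore */
		return (0);
	}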
179 177
180 178 /*
181 179 * Less than or equal to this number of bytes we will always copy byte-for-byte
182 180 */
183 181 #define SMALL_LIMIT 7
184 182
185 183 /*
186 184 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
187 185 * handler was set
188 186 */
189 187 #define LOFAULT_SET 2
190 188
191 189 /*
192 190 * This define is to align data for the unaligned source cases.
193 191 * data1, data2 and data3 are merged into data1 and data2.
194 192 * data3 is preserved for the next merge.
195 193 */
196 194 #define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp) \
197 195 sllx data1, lshift, data1 ;\
198 196 srlx data2, rshift, tmp ;\
199 197 or data1, tmp, data1 ;\
200 198 sllx data2, lshift, data2 ;\
201 199 srlx data3, rshift, tmp ;\
202 200 or data2, tmp, data2
203 201 /*
204 202 * This macro aligns the data; basically it merges
205 203 * data1 and data2 to form a double word.
206 204 */
207 205 #define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp) \
208 206 sllx data1, lshift, data1 ;\
209 207 srlx data2, rshift, tmp ;\
210 208 or data1, tmp, data1
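A hypothetical C model of ALIGN_DATA_EW may help: SPARC is big-endian, so the earlier-addressed doubleword supplies the high-order bytes, lshift is 8 times the byte offset into it, and rshift is 64 minus that.

	#include <stdint.h>

	/*
	 * Merge two consecutive big-endian doublewords into the 8 bytes
	 * starting "off" bytes into the first (off must be 1..7; off == 0
	 * would shift by 64, and the aligned path never comes here).
	 */
	static uint64_t
	align_data_ew(uint64_t data1, uint64_t data2, unsigned off)
	{
		return ((data1 << (8 * off)) | (data2 >> (64 - 8 * off)));
	}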
211 209
212 210 #if !defined(NIAGARA_IMPL)
213 211 /*
214 212 * Flags set in the lower bits of the t_lofault address:
215 213 * FPUSED_FLAG: The FP registers were in use and must be restored
216 214 * LOFAULT_SET: Set for bcopy calls, cleared for kcopy calls
217 215 * COPY_FLAGS: Both of the above
218 216 *
219 217 * Other flags:
220 218 * KPREEMPT_FLAG: kpreempt needs to be called
221 219 */
222 220 #define FPUSED_FLAG 1
223 221 #define LOFAULT_SET 2
224 222 #define COPY_FLAGS (FPUSED_FLAG | LOFAULT_SET)
225 223 #define KPREEMPT_FLAG 4
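The saved t_lofault address is at least word aligned, so its low bits are free; that is what lets a single register (%o5) carry both the old handler and the per-call state. A hypothetical sketch of the packing:

	#include <stdint.h>

	#define FPUSED_FLAG	1
	#define LOFAULT_SET	2
	#define COPY_FLAGS	(FPUSED_FLAG | LOFAULT_SET)

	static uintptr_t
	pack_lofault(uintptr_t old_handler, uintptr_t flags)
	{
		return (old_handler | flags);	/* what the entry paths build in %o5 */
	}

	static uintptr_t
	unpack_lofault(uintptr_t o5)
	{
		return (o5 & ~(uintptr_t)COPY_FLAGS);	/* what the exit paths restore */
	}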
226 224
227 225 #define ALIGN_OFF_1_7 \
228 226 faligndata %d0, %d2, %d48 ;\
229 227 faligndata %d2, %d4, %d50 ;\
230 228 faligndata %d4, %d6, %d52 ;\
231 229 faligndata %d6, %d8, %d54 ;\
232 230 faligndata %d8, %d10, %d56 ;\
233 231 faligndata %d10, %d12, %d58 ;\
234 232 faligndata %d12, %d14, %d60 ;\
235 233 faligndata %d14, %d16, %d62
236 234
237 235 #define ALIGN_OFF_8_15 \
238 236 faligndata %d2, %d4, %d48 ;\
239 237 faligndata %d4, %d6, %d50 ;\
240 238 faligndata %d6, %d8, %d52 ;\
241 239 faligndata %d8, %d10, %d54 ;\
242 240 faligndata %d10, %d12, %d56 ;\
243 241 faligndata %d12, %d14, %d58 ;\
244 242 faligndata %d14, %d16, %d60 ;\
245 243 faligndata %d16, %d18, %d62
246 244
247 245 #define ALIGN_OFF_16_23 \
248 246 faligndata %d4, %d6, %d48 ;\
249 247 faligndata %d6, %d8, %d50 ;\
250 248 faligndata %d8, %d10, %d52 ;\
251 249 faligndata %d10, %d12, %d54 ;\
252 250 faligndata %d12, %d14, %d56 ;\
253 251 faligndata %d14, %d16, %d58 ;\
254 252 faligndata %d16, %d18, %d60 ;\
255 253 faligndata %d18, %d20, %d62
256 254
257 255 #define ALIGN_OFF_24_31 \
258 256 faligndata %d6, %d8, %d48 ;\
259 257 faligndata %d8, %d10, %d50 ;\
260 258 faligndata %d10, %d12, %d52 ;\
261 259 faligndata %d12, %d14, %d54 ;\
262 260 faligndata %d14, %d16, %d56 ;\
263 261 faligndata %d16, %d18, %d58 ;\
264 262 faligndata %d18, %d20, %d60 ;\
265 263 faligndata %d20, %d22, %d62
266 264
267 265 #define ALIGN_OFF_32_39 \
268 266 faligndata %d8, %d10, %d48 ;\
269 267 faligndata %d10, %d12, %d50 ;\
270 268 faligndata %d12, %d14, %d52 ;\
271 269 faligndata %d14, %d16, %d54 ;\
272 270 faligndata %d16, %d18, %d56 ;\
273 271 faligndata %d18, %d20, %d58 ;\
274 272 faligndata %d20, %d22, %d60 ;\
275 273 faligndata %d22, %d24, %d62
276 274
277 275 #define ALIGN_OFF_40_47 \
278 276 faligndata %d10, %d12, %d48 ;\
279 277 faligndata %d12, %d14, %d50 ;\
280 278 faligndata %d14, %d16, %d52 ;\
281 279 faligndata %d16, %d18, %d54 ;\
282 280 faligndata %d18, %d20, %d56 ;\
283 281 faligndata %d20, %d22, %d58 ;\
284 282 faligndata %d22, %d24, %d60 ;\
285 283 faligndata %d24, %d26, %d62
286 284
287 285 #define ALIGN_OFF_48_55 \
288 286 faligndata %d12, %d14, %d48 ;\
289 287 faligndata %d14, %d16, %d50 ;\
290 288 faligndata %d16, %d18, %d52 ;\
291 289 faligndata %d18, %d20, %d54 ;\
292 290 faligndata %d20, %d22, %d56 ;\
293 291 faligndata %d22, %d24, %d58 ;\
294 292 faligndata %d24, %d26, %d60 ;\
295 293 faligndata %d26, %d28, %d62
296 294
297 295 #define ALIGN_OFF_56_63 \
298 296 faligndata %d14, %d16, %d48 ;\
299 297 faligndata %d16, %d18, %d50 ;\
300 298 faligndata %d18, %d20, %d52 ;\
301 299 faligndata %d20, %d22, %d54 ;\
302 300 faligndata %d22, %d24, %d56 ;\
303 301 faligndata %d24, %d26, %d58 ;\
304 302 faligndata %d26, %d28, %d60 ;\
305 303 faligndata %d28, %d30, %d62
306 304
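Each ALIGN_OFF_* variant is the same eight-register faligndata chain, merely started at a different source doubleword. A single faligndata behaves roughly like the hypothetical C model below, where align is the 0-7 byte offset previously latched into %gsr by alignaddr.

	#include <stdint.h>

	/*
	 * Treat (a, b) as 16 consecutive big-endian bytes and extract the
	 * 8 bytes beginning "align" bytes in.
	 */
	static uint64_t
	faligndata_model(uint64_t a, uint64_t b, unsigned align)
	{
		if (align == 0)
			return (a);	/* avoid an undefined 64-bit shift */
		return ((a << (8 * align)) | (b >> (64 - 8 * align)));
	}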
307 305 /*
308 306 * FP_COPY indicates the minimum number of bytes needed
309 307 * to justify using FP/VIS-accelerated memory operations.
310 308 * The FPBLK code assumes a minimum number of bytes are available
311 309 * to be moved on entry. Check that code carefully before
312 310 * reducing FP_COPY below 256.
313 311 */
314 312 #define FP_COPY 584
315 313 #define SHORTCOPY 7
316 314 #define ASI_STBI_P ASI_BLK_INIT_ST_QUAD_LDD_P
317 315 #define ASI_STBI_AIUS ASI_BLK_INIT_QUAD_LDD_AIUS
318 316 #define CACHE_LINE 64
319 317 #define VIS_BLOCKSIZE 64
320 318
321 319 /*
322 320 * Size of stack frame in order to accommodate a 64-byte aligned
323 321 * floating-point register save area and 2 64-bit temp locations.
324 322 * All copy functions use three quadrants of fp registers; to assure a
325 323 * block-aligned three-block buffer in which to save, we must reserve
326 324 * four blocks on the stack.
327 325 *
328 326 * _______________________________________ <-- %fp + STACK_BIAS
329 327 * | We may need to preserve 3 quadrants |
330 328 * | of fp regs, but since we do so with |
331 329 * | BST/BLD we need room in which to |
332 330 * | align to VIS_BLOCKSIZE bytes. So |
333 331 * | this area is 4 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET
334 332 * |-------------------------------------|
335 333 * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET
336 334 * |-------------------------------------|
337 335 * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET
338 336 * ---------------------------------------
339 337 */
340 338 #define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
341 339 #define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 4)
342 340 #define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 3) + 1)
343 341 #define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8)
344 342 #define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8)
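Working the arithmetic: HWCOPYFRAMESIZE is 64 * 4 + 16 = 272 bytes, and the offsets land at 256, 264 and 272, so the %gsr slot sits exactly at the bottom of the reserved area. A hypothetical compile-time check, not part of the real build:

	#define VIS_BLOCKSIZE		64
	#define HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))	/* 272 */
	#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 4)			/* 256 */
	#define SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)		/* 264 */
	#define SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)			/* 272 */

	/* Fails to compile (negative array size) if the layout ever drifts. */
	typedef char frame_size_check[(HWCOPYFRAMESIZE == SAVED_GSR_OFFSET) ? 1 : -1];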
345 343
346 344 /*
347 345 * In FP copies, if we do not have preserved data to restore over
348 346 * the fp regs we used, we must zero those regs to avoid
349 347 * exposing portions of the data to later threads (data security).
350 348 */
351 349 #define FZERO \
352 350 fzero %f0 ;\
353 351 fzero %f2 ;\
354 352 faddd %f0, %f2, %f4 ;\
355 353 fmuld %f0, %f2, %f6 ;\
356 354 faddd %f0, %f2, %f8 ;\
357 355 fmuld %f0, %f2, %f10 ;\
358 356 faddd %f0, %f2, %f12 ;\
359 357 fmuld %f0, %f2, %f14 ;\
360 358 faddd %f0, %f2, %f16 ;\
361 359 fmuld %f0, %f2, %f18 ;\
362 360 faddd %f0, %f2, %f20 ;\
363 361 fmuld %f0, %f2, %f22 ;\
364 362 faddd %f0, %f2, %f24 ;\
365 363 fmuld %f0, %f2, %f26 ;\
366 364 faddd %f0, %f2, %f28 ;\
367 365 fmuld %f0, %f2, %f30 ;\
368 366 faddd %f0, %f2, %f48 ;\
369 367 fmuld %f0, %f2, %f50 ;\
370 368 faddd %f0, %f2, %f52 ;\
371 369 fmuld %f0, %f2, %f54 ;\
372 370 faddd %f0, %f2, %f56 ;\
373 371 fmuld %f0, %f2, %f58 ;\
374 372 faddd %f0, %f2, %f60 ;\
375 373 fmuld %f0, %f2, %f62
376 374
377 -#if !defined(lint)
378 -
379 375 /*
380 376 * Macros to save and restore fp registers to/from the stack.
381 377 * Used to save and restore in-use fp registers when we want to use FP.
382 378 */
383 379 #define BST_FP_TOSTACK(tmp1) \
384 380 /* membar #Sync */ ;\
385 381 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
386 382 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
387 383 stda %f0, [tmp1]ASI_BLK_P ;\
388 384 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
389 385 stda %f16, [tmp1]ASI_BLK_P ;\
390 386 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
391 387 stda %f48, [tmp1]ASI_BLK_P ;\
392 388 membar #Sync
393 389
394 390 #define BLD_FP_FROMSTACK(tmp1) \
395 391 /* membar #Sync - provided at copy completion */ ;\
396 392 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
397 393 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
398 394 ldda [tmp1]ASI_BLK_P, %f0 ;\
399 395 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
400 396 ldda [tmp1]ASI_BLK_P, %f16 ;\
401 397 add tmp1, VIS_BLOCKSIZE, tmp1 ;\
402 398 ldda [tmp1]ASI_BLK_P, %f48 ;\
403 399 membar #Sync
404 -#endif /* NIAGARA_IMPL */
405 400
406 -#endif /* lint */
401 +#endif
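BST_FP_TOSTACK and BLD_FP_FROMSTACK carve a 64-byte-aligned save area out of the slightly oversized stack region by rounding the raw address down. A hypothetical C rendition of that address math, with fp and stack_bias standing in for %fp and STACK_BIAS:

	#include <stdint.h>

	#define VIS_BLOCKSIZE		64
	#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 3) + 1)

	static uintptr_t
	fp_save_area(uintptr_t fp, uintptr_t stack_bias)
	{
		uintptr_t tmp1 = fp + stack_bias - SAVED_FPREGS_ADJUST;

		return (tmp1 & ~(uintptr_t)(VIS_BLOCKSIZE - 1));	/* block align */
	}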
407 402 /*
408 403 * Copy a block of storage, returning an error code if `from' or
409 404 * `to' takes a kernel pagefault which cannot be resolved.
410 405 * Returns errno value on pagefault error, 0 if all ok
411 406 */
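A hypothetical caller-side sketch of that contract; the prototype matches the one this file exports:

	#include <sys/types.h>

	extern int kcopy(const void *, void *, size_t);

	/*
	 * Copy from a possibly-unmapped source: returns 0 on success and an
	 * errno on an unresolvable pagefault, instead of panicking.
	 */
	static int
	fetch_unsafe(const void *src, void *dst, size_t len)
	{
		return (kcopy(src, dst, len));
	}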
412 407
413 -#if defined(lint)
414 -
415 -/* ARGSUSED */
416 -int
417 -kcopy(const void *from, void *to, size_t count)
418 -{ return(0); }
419 -
420 -#else /* lint */
421 -
422 408 .seg ".text"
423 409 .align 4
424 410
425 411 ENTRY(kcopy)
426 412 #if !defined(NIAGARA_IMPL)
427 413 cmp %o2, FP_COPY ! check for small copy/leaf case
428 414 bgt,pt %ncc, .kcopy_more !
429 415 nop
430 416 .kcopy_small: ! setup error handler
431 417 sethi %hi(.sm_copyerr), %o4
432 418 or %o4, %lo(.sm_copyerr), %o4 ! .sm_copyerr is lofault value
433 419 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
434 420 ! Note that we carefully do *not* flag the setting of
435 421 ! t_lofault.
436 422 membar #Sync ! sync error barrier
437 423 b .sm_do_copy ! common code
438 424 stn %o4, [THREAD_REG + T_LOFAULT] ! set t_lofault
439 425
440 426
441 427 .kcopy_more:
442 428 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
443 429 sethi %hi(.copyerr), %l7 ! copyerr is lofault value
444 430 or %l7, %lo(.copyerr), %l7
445 431 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
446 432 ! Note that we carefully do *not* flag the setting of
447 433 ! t_lofault.
448 434 membar #Sync ! sync error barrier
449 435 b .do_copy ! common code
450 436 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
451 437
452 438 /*
453 439 * We got here because of a fault during a small kcopy, or during a
454 440 * small bcopy if a fault handler existed when bcopy was called.
455 441 * No floating point registers are used by the small copies.
456 442 * Small copies are made from a leaf routine.
457 443 * Errno value is in %g1.
458 444 */
459 445 .sm_copyerr:
460 446 ! The kcopy will always set a t_lofault handler. If it fires,
461 447 ! we're expected to just return the error code and not to
462 448 ! invoke any existing error handler. As far as bcopy is concerned,
463 449 ! we only set t_lofault if there was an existing lofault handler.
464 450 ! In that case we're expected to invoke the previously existing
465 451 ! handler after resetting the t_lofault value.
466 452 btst LOFAULT_SET, %o5
467 453 membar #Sync ! sync error barrier
468 454 andn %o5, LOFAULT_SET, %o5 ! clear fault flag
469 455 bnz,pn %ncc, 3f
470 456 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
471 457 retl
472 458 mov %g1, %o0
473 459 3:
474 460 ! We're here via bcopy. There must have been an error handler
475 461 ! in place otherwise we would have died a nasty death already.
476 462 jmp %o5 ! goto real handler
477 463 mov %g0, %o0
478 464 /*
479 465 * end of .sm_copyerr
480 466 */
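In C terms, the decision .sm_copyerr just made looks roughly like the hypothetical sketch below; the errno arrives in %g1 and old_o5 is the value saved at entry.

	#include <stdint.h>

	#define LOFAULT_SET	2

	extern uintptr_t t_lofault;	/* stands in for curthread->t_lofault */

	static int
	sm_copyerr_model(uintptr_t old_o5, int err)
	{
		uintptr_t old = old_o5 & ~(uintptr_t)LOFAULT_SET;

		t_lofault = old;			/* restore old t_lofault */
		if (old_o5 & LOFAULT_SET)		/* bcopy: trampoline to */
			((void (*)(void))old)();	/* pre-existing handler */
		return (err);				/* kcopy: hand back errno */
	}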
481 467
482 468 /*
483 469 * We got here because of a fault during kcopy, or during bcopy if a
484 470 * fault handler existed when bcopy was called.
485 471 * Stack and fp registers need to be restored.
486 472 * Errno value is in %g1.
487 473 */
488 474 .copyerr:
489 475 sethi %hi(.copyerr2), %l1
490 476 or %l1, %lo(.copyerr2), %l1
491 477 membar #Sync ! sync error barrier
492 478 stn %l1, [THREAD_REG + T_LOFAULT] ! set t_lofault
493 479 btst FPUSED_FLAG, %o5
494 480 bz,pt %xcc, 1f
495 481 and %o5, LOFAULT_SET, %l1 ! copy flag to %l1
496 482
497 483 membar #Sync ! sync error barrier
498 484 wr %l5, 0, %gsr
499 485 btst FPRS_FEF, %g5
500 486 bz,pt %icc, 4f
501 487 nop
502 488 ! restore fpregs from stack
503 489 BLD_FP_FROMSTACK(%o2)
504 490 ba,pt %ncc, 2f
505 491 wr %g5, 0, %fprs ! restore fprs
506 492 4:
507 493 FZERO
508 494 wr %g5, 0, %fprs ! restore fprs
509 495 2:
510 496 ldn [THREAD_REG + T_LWP], %o2
511 497 brnz,pt %o2, 1f
512 498 nop
513 499
514 500 ldsb [THREAD_REG + T_PREEMPT], %l0
515 501 deccc %l0
516 502 bnz,pn %ncc, 1f
517 503 stb %l0, [THREAD_REG + T_PREEMPT]
518 504
519 505 ! Check for a kernel preemption request
520 506 ldn [THREAD_REG + T_CPU], %l0
521 507 ldub [%l0 + CPU_KPRUNRUN], %l0
522 508 brnz,a,pt %l0, 1f ! Need to call kpreempt?
523 509 or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
524 510
525 511 ! The kcopy will always set a t_lofault handler. If it fires,
526 512 ! we're expected to just return the error code and not to
527 513 ! invoke any existing error handler. As far as bcopy is concerned,
528 514 ! we only set t_lofault if there was an existing lofault handler.
529 515 ! In that case we're expected to invoke the previously existing
530 516 ! handler after resetting the t_lofault value.
531 517 1:
532 518 andn %o5, COPY_FLAGS, %o5 ! remove flags from lofault address
533 519 membar #Sync ! sync error barrier
534 520 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
535 521
536 522 ! call kpreempt if necessary
537 523 btst KPREEMPT_FLAG, %l1
538 524 bz,pt %icc, 2f
539 525 nop
540 526 call kpreempt
541 527 rdpr %pil, %o0 ! pass %pil
542 528 2:
543 529 btst LOFAULT_SET, %l1
544 530 bnz,pn %ncc, 3f
545 531 nop
546 532 ret
547 533 restore %g1, 0, %o0
548 534 3:
549 535 ! We're here via bcopy. There must have been an error handler
550 536 ! in place otherwise we would have died a nasty death already.
551 537 jmp %o5 ! goto real handler
552 538 restore %g0, 0, %o0 ! dispose of copy window
553 539
554 540 /*
555 541 * We got here because of a fault in .copyerr. We can't safely restore fp
556 542 * state, so we panic.
557 543 */
558 544 fp_panic_msg:
559 545 .asciz "Unable to restore fp state after copy operation"
560 546
561 547 .align 4
562 548 .copyerr2:
563 549 set fp_panic_msg, %o0
564 550 call panic
565 551 nop
566 552 /*
567 553 * end of .copyerr
568 554 */
569 555
570 556 #else /* NIAGARA_IMPL */
571 557 save %sp, -SA(MINFRAME), %sp
572 558 set .copyerr, %l7 ! copyerr is lofault value
573 559 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
574 560 or %o5, LOFAULT_SET, %o5
575 561 membar #Sync ! sync error barrier
576 562 b .do_copy ! common code
577 563 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
578 564
579 565 /*
580 566 * We got here because of a fault during kcopy.
581 567 * Errno value is in %g1.
582 568 */
583 569 .copyerr:
584 570 ! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
585 571 ! into %o5 to indicate it has set the t_lofault handler. Need to clear
586 572 ! LOFAULT_SET flag before restoring the error handler.
587 573 andn %o5, LOFAULT_SET, %o5
588 574 membar #Sync ! sync error barrier
589 575 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
590 576 ret
591 577 restore %g1, 0, %o0
592 578 #endif /* NIAGARA_IMPL */
593 579
594 580 SET_SIZE(kcopy)
595 -#endif /* lint */
596 581
597 582
598 583 /*
599 584 * Copy a block of storage - must not overlap (from + len <= to).
600 585 */
601 -#if defined(lint)
602 586
603 -/* ARGSUSED */
604 -void
605 -bcopy(const void *from, void *to, size_t count)
606 -{}
607 -
608 -#else /* lint */
609 -
610 587 ENTRY(bcopy)
611 588 #if !defined(NIAGARA_IMPL)
612 589 cmp %o2, FP_COPY ! check for small copy/leaf case
613 590 bgt,pt %ncc, .bcopy_more !
614 591 nop
615 592 .bcopy_small: ! setup error handler
616 593 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
617 594 tst %o5
618 595 bz,pt %icc, .sm_do_copy
619 596 sethi %hi(.sm_copyerr), %o4
620 597 or %o4, %lo(.sm_copyerr), %o4 ! .sm_copyerr is lofault value
621 598 membar #Sync ! sync error barrier
622 599 stn %o4, [THREAD_REG + T_LOFAULT] ! set t_lofault
623 600 or %o5, LOFAULT_SET, %o5 ! Error should trampoline
624 601 .sm_do_copy:
625 602 mov %o0, %g1 ! save %o0
626 603 cmp %o2, SHORTCOPY ! make sure there is enough to align
627 604 ble,pt %ncc, .bc_smallest
628 605 andcc %o1, 0x7, %o3 ! is dest long aligned
629 606 bnz,pn %ncc, .bc_align
630 607 andcc %o1, 1, %o3 ! is dest byte aligned
631 608
632 609 ! Destination is long word aligned
633 610 .bc_al_src:
634 611 andcc %o0, 7, %o3
635 612 brnz,pt %o3, .bc_src_dst_unal8
636 613 nop
637 614 /*
638 615 * Special case for handling when src and dest are both long word aligned
639 616 * and total data to move is less than FP_COPY bytes
640 617 * Also handles finish up for large block moves, so may be less than 32 bytes
641 618 */
642 619 .bc_medlong:
643 620 subcc %o2, 31, %o2 ! adjust length to allow cc test
644 621 ble,pt %ncc, .bc_medl31
645 622 nop
646 623 .bc_medl32:
647 624 ldx [%o0], %o4 ! move 32 bytes
648 625 subcc %o2, 32, %o2 ! decrement length count by 32
649 626 stx %o4, [%o1]
650 627 ldx [%o0+8], %o4
651 628 stx %o4, [%o1+8]
652 629 ldx [%o0+16], %o4
653 630 add %o0, 32, %o0 ! increase src ptr by 32
654 631 stx %o4, [%o1+16]
655 632 ldx [%o0-8], %o4
656 633 add %o1, 32, %o1 ! increase dst ptr by 32
657 634 bgu,pt %ncc, .bc_medl32 ! repeat if at least 32 bytes left
658 635 stx %o4, [%o1-8]
659 636 .bc_medl31:
660 637 addcc %o2, 24, %o2 ! adjust count to be off by 7
661 638 ble,pt %ncc, .bc_medl7 ! skip if 7 or fewer bytes left
662 639 nop
663 640 .bc_medl8:
664 641 ldx [%o0], %o4 ! move 8 bytes
665 642 add %o0, 8, %o0 ! increase src ptr by 8
666 643 subcc %o2, 8, %o2 ! decrease count by 8
667 644 add %o1, 8, %o1 ! increase dst ptr by 8
668 645 bgu,pt %ncc, .bc_medl8
669 646 stx %o4, [%o1-8]
670 647 .bc_medl7:
671 648 addcc %o2, 7, %o2 ! finish adjustment of remaining count
672 649 bnz,pt %ncc, .bc_small4 ! do final bytes if not finished
673 650
674 651 .bc_smallx: ! finish up and exit
675 652 tst %o5
676 653 bz,pt %ncc, .bc_sm_done
677 654 andn %o5, COPY_FLAGS, %o5 ! remove flags from lofault address
678 655 membar #Sync ! sync error barrier
679 656 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
680 657 .bc_sm_done:
681 658 retl
682 659 mov %g0, %o0
683 660
684 661 .bc_small4:
685 662 cmp %o2, 4
686 663 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
687 664 nop !
688 665 ld [%o0], %o4 ! move 4 bytes
689 666 add %o0, 4, %o0 ! increase src ptr by 4
690 667 add %o1, 4, %o1 ! increase dst ptr by 4
691 668 subcc %o2, 4, %o2 ! decrease count by 4
692 669 bz,pt %ncc, .bc_smallx
693 670 stw %o4, [%o1-4]
694 671
695 672 .bc_small3x: ! Exactly 1, 2, or 3 bytes remain
696 673 subcc %o2, 1, %o2 ! reduce count for cc test
697 674 ldub [%o0], %o4 ! load one byte
698 675 bz,pt %ncc, .bc_smallx
699 676 stb %o4, [%o1] ! store one byte
700 677 ldub [%o0+1], %o4 ! load second byte
701 678 subcc %o2, 1, %o2
702 679 bz,pt %ncc, .bc_smallx
703 680 stb %o4, [%o1+1] ! store second byte
704 681 ldub [%o0+2], %o4 ! load third byte
705 682 ba .bc_smallx
706 683 stb %o4, [%o1+2] ! store third byte
707 684
708 685 .bc_smallest: ! 7 or fewer bytes remain
709 686 tst %o2
710 687 bz,pt %ncc, .bc_smallx
711 688 cmp %o2, 4
712 689 blt,pt %ncc, .bc_small3x
713 690 nop
714 691 ldub [%o0], %o4 ! read byte
715 692 subcc %o2, 4, %o2 ! reduce count by 4
716 693 stb %o4, [%o1] ! write byte
717 694 ldub [%o0+1], %o4 ! repeat for total of 4 bytes
718 695 add %o0, 4, %o0 ! advance src by 4
719 696 stb %o4, [%o1+1]
720 697 ldub [%o0-2], %o4
721 698 add %o1, 4, %o1 ! advance dst by 4
722 699 stb %o4, [%o1-2]
723 700 ldub [%o0-1], %o4
724 701 bnz,pt %ncc, .bc_small3x
725 702 stb %o4, [%o1-1]
726 703 ba .bc_smallx
727 704 nop
728 705
729 706 /*
730 707 * Align destination to long word boundary
731 708 */
732 709 .bc_align: ! byte align test in prior branch delay
733 710 bnz,pt %ncc, .bc_al_d1
734 711 .bc_al_d1f: ! dest is now half word aligned
735 712 andcc %o1, 2, %o3
736 713 bnz,pt %ncc, .bc_al_d2
737 714 .bc_al_d2f: ! dest is now word aligned
738 715 andcc %o1, 4, %o3 ! is dest longword aligned?
739 716 bz,pt %ncc, .bc_al_src
740 717 nop
741 718 .bc_al_d4: ! dest is word aligned; src is unknown
742 719 ldub [%o0], %o4 ! move a word (src align unknown)
743 720 ldub [%o0+1], %o3
744 721 sll %o4, 24, %o4 ! position
745 722 sll %o3, 16, %o3 ! position
746 723 or %o4, %o3, %o3 ! merge
747 724 ldub [%o0+2], %o4
748 725 sll %o4, 8, %o4 ! position
749 726 or %o4, %o3, %o3 ! merge
750 727 ldub [%o0+3], %o4
751 728 or %o4, %o3, %o4 ! merge
752 729 stw %o4,[%o1] ! store four bytes
753 730 add %o0, 4, %o0 ! adjust src by 4
754 731 add %o1, 4, %o1 ! adjust dest by 4
755 732 sub %o2, 4, %o2 ! adjust count by 4
756 733 andcc %o0, 7, %o3 ! check for src long word alignment
757 734 brz,pt %o3, .bc_medlong
758 735 .bc_src_dst_unal8:
759 736 ! dst is 8-byte aligned, src is not
760 737 ! Size is less than FP_COPY
761 738 ! Following code is to select for alignment
762 739 andcc %o0, 0x3, %o3 ! test word alignment
763 740 bz,pt %ncc, .bc_medword
764 741 nop
765 742 andcc %o0, 0x1, %o3 ! test halfword alignment
766 743 bnz,pt %ncc, .bc_med_byte ! go to byte move if not halfword
767 744 andcc %o0, 0x2, %o3 ! test which byte alignment
768 745 ba .bc_medhalf
769 746 nop
770 747 .bc_al_d1: ! align dest to half word
771 748 ldub [%o0], %o4 ! move a byte
772 749 add %o0, 1, %o0
773 750 stb %o4, [%o1]
774 751 add %o1, 1, %o1
775 752 andcc %o1, 2, %o3
776 753 bz,pt %ncc, .bc_al_d2f
777 754 sub %o2, 1, %o2
778 755 .bc_al_d2: ! align dest to word
779 756 ldub [%o0], %o4 ! move a half-word (src align unknown)
780 757 ldub [%o0+1], %o3
781 758 sll %o4, 8, %o4 ! position
782 759 or %o4, %o3, %o4 ! merge
783 760 sth %o4, [%o1]
784 761 add %o0, 2, %o0
785 762 add %o1, 2, %o1
786 763 andcc %o1, 4, %o3 ! is dest longword aligned?
787 764 bz,pt %ncc, .bc_al_src
788 765 sub %o2, 2, %o2
789 766 ba .bc_al_d4
790 767 nop
791 768 /*
792 769 * Handle all cases where src and dest are aligned on word
793 770 * boundaries. Use unrolled loops for better performance.
794 771 * This option wins over standard large data move when
795 772 * source and destination are in cache for medium
796 773 * to short data moves.
797 774 */
798 775 .bc_medword:
799 776 subcc %o2, 31, %o2 ! adjust length to allow cc test
800 777 ble,pt %ncc, .bc_medw31
801 778 nop
802 779 .bc_medw32:
803 780 ld [%o0], %o4 ! move a block of 32 bytes
804 781 stw %o4, [%o1]
805 782 ld [%o0+4], %o4
806 783 stw %o4, [%o1+4]
807 784 ld [%o0+8], %o4
808 785 stw %o4, [%o1+8]
809 786 ld [%o0+12], %o4
810 787 stw %o4, [%o1+12]
811 788 ld [%o0+16], %o4
812 789 stw %o4, [%o1+16]
813 790 ld [%o0+20], %o4
814 791 subcc %o2, 32, %o2 ! decrement length count
815 792 stw %o4, [%o1+20]
816 793 ld [%o0+24], %o4
817 794 add %o0, 32, %o0 ! increase src ptr by 32
818 795 stw %o4, [%o1+24]
819 796 ld [%o0-4], %o4
820 797 add %o1, 32, %o1 ! increase dst ptr by 32
821 798 bgu,pt %ncc, .bc_medw32 ! repeat if at least 32 bytes left
822 799 stw %o4, [%o1-4]
823 800 .bc_medw31:
824 801 addcc %o2, 24, %o2 ! adjust count to be off by 7
825 802 ble,pt %ncc, .bc_medw7 ! skip if 7 or fewer bytes left
826 803 nop !
827 804 .bc_medw15:
828 805 ld [%o0], %o4 ! move a block of 8 bytes
829 806 subcc %o2, 8, %o2 ! decrement length count
830 807 stw %o4, [%o1]
831 808 add %o0, 8, %o0 ! increase src ptr by 8
832 809 ld [%o0-4], %o4
833 810 add %o1, 8, %o1 ! increase dst ptr by 8
834 811 bgu,pt %ncc, .bc_medw15
835 812 stw %o4, [%o1-4]
836 813 .bc_medw7:
837 814 addcc %o2, 7, %o2 ! finish adjustment of remaining count
838 815 bz,pt %ncc, .bc_smallx ! exit if finished
839 816 cmp %o2, 4
840 817 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
841 818 nop !
842 819 ld [%o0], %o4 ! move 4 bytes
843 820 add %o0, 4, %o0 ! increase src ptr by 4
844 821 add %o1, 4, %o1 ! increase dst ptr by 4
845 822 subcc %o2, 4, %o2 ! decrease count by 4
846 823 bnz .bc_small3x
847 824 stw %o4, [%o1-4]
848 825 ba .bc_smallx
849 826 nop
850 827
851 828 .bc_medhalf:
852 829 subcc %o2, 31, %o2 ! adjust length to allow cc test
853 830 ble,pt %ncc, .bc_medh31
854 831 nop
855 832 .bc_medh32: ! load and store block of 32 bytes
856 833 subcc %o2, 32, %o2 ! decrement length count
857 834
858 835 lduh [%o0], %o4 ! move 32 bytes
859 836 lduw [%o0+2], %o3
860 837 sllx %o4, 48, %o4
861 838 sllx %o3, 16, %o3
862 839 or %o4, %o3, %o3
863 840 lduh [%o0+6], %o4
864 841 or %o4, %o3, %o4
865 842 stx %o4, [%o1]
866 843
867 844 lduh [%o0+8], %o4
868 845 lduw [%o0+10], %o3
869 846 sllx %o4, 48, %o4
870 847 sllx %o3, 16, %o3
871 848 or %o4, %o3, %o3
872 849 lduh [%o0+14], %o4
873 850 or %o4, %o3, %o4
874 851 stx %o4, [%o1+8]
875 852
876 853 lduh [%o0+16], %o4
877 854 lduw [%o0+18], %o3
878 855 sllx %o4, 48, %o4
879 856 sllx %o3, 16, %o3
880 857 or %o4, %o3, %o3
881 858 lduh [%o0+22], %o4
882 859 or %o4, %o3, %o4
883 860 stx %o4, [%o1+16]
884 861
885 862 add %o0, 32, %o0 ! increase src ptr by 32
886 863 add %o1, 32, %o1 ! increase dst ptr by 32
887 864
888 865 lduh [%o0-8], %o4
889 866 lduw [%o0-6], %o3
890 867 sllx %o4, 48, %o4
891 868 sllx %o3, 16, %o3
892 869 or %o4, %o3, %o3
893 870 lduh [%o0-2], %o4
894 871 or %o3, %o4, %o4
895 872 bgu,pt %ncc, .bc_medh32 ! repeat if at least 32 bytes left
896 873 stx %o4, [%o1-8]
897 874
898 875 .bc_medh31:
899 876 addcc %o2, 24, %o2 ! adjust count to be off by 7
900 877 ble,pt %ncc, .bc_medh7 ! skip if 7 or fewer bytes left
901 878 nop !
902 879 .bc_medh15:
903 880 	lduh	[%o0], %o4		! move 8 bytes
904 881 subcc %o2, 8, %o2 ! decrement length count
905 882 lduw [%o0+2], %o3
906 883 sllx %o4, 48, %o4
907 884 sllx %o3, 16, %o3
908 885 or %o4, %o3, %o3
909 886 add %o1, 8, %o1 ! increase dst ptr by 8
910 887 lduh [%o0+6], %o4
911 888 add %o0, 8, %o0 ! increase src ptr by 8
912 889 or %o4, %o3, %o4
913 890 bgu,pt %ncc, .bc_medh15
914 891 stx %o4, [%o1-8]
915 892 .bc_medh7:
916 893 addcc %o2, 7, %o2 ! finish adjustment of remaining count
917 894 bz,pt %ncc, .bc_smallx ! exit if finished
918 895 cmp %o2, 4
919 896 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
920 897 nop !
921 898 lduh [%o0], %o4
922 899 sll %o4, 16, %o4
923 900 lduh [%o0+2], %o3
924 901 or %o3, %o4, %o4
925 902 subcc %o2, 4, %o2
926 903 add %o0, 4, %o0
927 904 add %o1, 4, %o1
928 905 bnz .bc_small3x
929 906 stw %o4, [%o1-4]
930 907 ba .bc_smallx
931 908 nop
932 909
933 910 .align 16
934 911 .bc_med_byte:
935 912 bnz,pt %ncc, .bc_medbh32a ! go to correct byte move
936 913 subcc %o2, 31, %o2 ! adjust length to allow cc test
937 914 ble,pt %ncc, .bc_medb31
938 915 nop
939 916 .bc_medb32: ! Alignment 1 or 5
940 917 subcc %o2, 32, %o2 ! decrement length count
941 918
942 919 ldub [%o0], %o4 ! load and store a block of 32 bytes
943 920 sllx %o4, 56, %o3
944 921 lduh [%o0+1], %o4
945 922 sllx %o4, 40, %o4
946 923 or %o4, %o3, %o3
947 924 lduw [%o0+3], %o4
948 925 sllx %o4, 8, %o4
949 926 or %o4, %o3, %o3
950 927 ldub [%o0+7], %o4
951 928 or %o4, %o3, %o4
952 929 stx %o4, [%o1]
953 930
954 931 ldub [%o0+8], %o4
955 932 sllx %o4, 56, %o3
956 933 lduh [%o0+9], %o4
957 934 sllx %o4, 40, %o4
958 935 or %o4, %o3, %o3
959 936 lduw [%o0+11], %o4
960 937 sllx %o4, 8, %o4
961 938 or %o4, %o3, %o3
962 939 ldub [%o0+15], %o4
963 940 or %o4, %o3, %o4
964 941 stx %o4, [%o1+8]
965 942
966 943 ldub [%o0+16], %o4
967 944 sllx %o4, 56, %o3
968 945 lduh [%o0+17], %o4
969 946 sllx %o4, 40, %o4
970 947 or %o4, %o3, %o3
971 948 lduw [%o0+19], %o4
972 949 sllx %o4, 8, %o4
973 950 or %o4, %o3, %o3
974 951 ldub [%o0+23], %o4
975 952 or %o4, %o3, %o4
976 953 stx %o4, [%o1+16]
977 954
978 955 add %o0, 32, %o0 ! increase src ptr by 32
979 956 add %o1, 32, %o1 ! increase dst ptr by 32
980 957
981 958 ldub [%o0-8], %o4
982 959 sllx %o4, 56, %o3
983 960 lduh [%o0-7], %o4
984 961 sllx %o4, 40, %o4
985 962 or %o4, %o3, %o3
986 963 lduw [%o0-5], %o4
987 964 sllx %o4, 8, %o4
988 965 or %o4, %o3, %o3
989 966 ldub [%o0-1], %o4
990 967 or %o4, %o3, %o4
991 968 bgu,pt %ncc, .bc_medb32 ! repeat if at least 32 bytes left
992 969 stx %o4, [%o1-8]
993 970
994 971 .bc_medb31: ! 31 or fewer bytes remaining
995 972 addcc %o2, 24, %o2 ! adjust count to be off by 7
996 973 ble,pt %ncc, .bc_medb7 ! skip if 7 or fewer bytes left
997 974 nop !
998 975 .bc_medb15:
999 976
1000 977 ldub [%o0], %o4 ! load and store a block of 8 bytes
1001 978 subcc %o2, 8, %o2 ! decrement length count
1002 979 sllx %o4, 56, %o3
1003 980 lduh [%o0+1], %o4
1004 981 sllx %o4, 40, %o4
1005 982 or %o4, %o3, %o3
1006 983 lduw [%o0+3], %o4
1007 984 	add	%o1, 8, %o1		! increase dst ptr by 8
1008 985 sllx %o4, 8, %o4
1009 986 or %o4, %o3, %o3
1010 987 ldub [%o0+7], %o4
1011 988 	add	%o0, 8, %o0		! increase src ptr by 8
1012 989 or %o4, %o3, %o4
1013 990 bgu,pt %ncc, .bc_medb15
1014 991 stx %o4, [%o1-8]
1015 992 .bc_medb7:
1016 993 addcc %o2, 7, %o2 ! finish adjustment of remaining count
1017 994 bz,pt %ncc, .bc_smallx ! exit if finished
1018 995 cmp %o2, 4
1019 996 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left
1020 997 nop !
1021 998 ldub [%o0], %o4 ! move 4 bytes
1022 999 sll %o4, 24, %o3
1023 1000 lduh [%o0+1], %o4
1024 1001 sll %o4, 8, %o4
1025 1002 or %o4, %o3, %o3
1026 1003 ldub [%o0+3], %o4
1027 1004 or %o4, %o3, %o4
1028 1005 subcc %o2, 4, %o2
1029 1006 add %o0, 4, %o0
1030 1007 add %o1, 4, %o1
1031 1008 bnz .bc_small3x
1032 1009 stw %o4, [%o1-4]
1033 1010 ba .bc_smallx
1034 1011 nop
1035 1012
1036 1013 .align 16
1037 1014 .bc_medbh32a: ! Alignment 3 or 7
1038 1015 ble,pt %ncc, .bc_medbh31
1039 1016 nop
1040 1017 .bc_medbh32: ! Alignment 3 or 7
1041 1018 subcc %o2, 32, %o2 ! decrement length count
1042 1019
1043 1020 ldub [%o0], %o4 ! load and store a block of 32 bytes
1044 1021 sllx %o4, 56, %o3
1045 1022 lduw [%o0+1], %o4
1046 1023 sllx %o4, 24, %o4
1047 1024 or %o4, %o3, %o3
1048 1025 lduh [%o0+5], %o4
1049 1026 sllx %o4, 8, %o4
1050 1027 or %o4, %o3, %o3
1051 1028 ldub [%o0+7], %o4
1052 1029 or %o4, %o3, %o4
1053 1030 stx %o4, [%o1]
1054 1031
1055 1032 ldub [%o0+8], %o4
1056 1033 sllx %o4, 56, %o3
1057 1034 lduw [%o0+9], %o4
1058 1035 sllx %o4, 24, %o4
1059 1036 or %o4, %o3, %o3
1060 1037 lduh [%o0+13], %o4
1061 1038 sllx %o4, 8, %o4
1062 1039 or %o4, %o3, %o3
1063 1040 ldub [%o0+15], %o4
1064 1041 or %o4, %o3, %o4
1065 1042 stx %o4, [%o1+8]
1066 1043
1067 1044 ldub [%o0+16], %o4
1068 1045 sllx %o4, 56, %o3
1069 1046 lduw [%o0+17], %o4
1070 1047 sllx %o4, 24, %o4
1071 1048 or %o4, %o3, %o3
1072 1049 lduh [%o0+21], %o4
1073 1050 sllx %o4, 8, %o4
1074 1051 or %o4, %o3, %o3
1075 1052 ldub [%o0+23], %o4
1076 1053 or %o4, %o3, %o4
1077 1054 stx %o4, [%o1+16]
1078 1055
1079 1056 add %o0, 32, %o0 ! increase src ptr by 32
1080 1057 add %o1, 32, %o1 ! increase dst ptr by 32
1081 1058
1082 1059 ldub [%o0-8], %o4
1083 1060 sllx %o4, 56, %o3
1084 1061 lduw [%o0-7], %o4
1085 1062 sllx %o4, 24, %o4
1086 1063 or %o4, %o3, %o3
1087 1064 lduh [%o0-3], %o4
1088 1065 sllx %o4, 8, %o4
1089 1066 or %o4, %o3, %o3
1090 1067 ldub [%o0-1], %o4
1091 1068 or %o4, %o3, %o4
1092 1069 bgu,pt %ncc, .bc_medbh32 ! repeat if at least 32 bytes left
1093 1070 stx %o4, [%o1-8]
1094 1071
1095 1072 .bc_medbh31:
1096 1073 addcc %o2, 24, %o2 ! adjust count to be off by 7
1097 1074 ble,pt %ncc, .bc_medb7 ! skip if 7 or fewer bytes left
1098 1075 nop !
1099 1076 .bc_medbh15:
1100 1077 ldub [%o0], %o4 ! load and store a block of 8 bytes
1101 1078 sllx %o4, 56, %o3
1102 1079 lduw [%o0+1], %o4
1103 1080 sllx %o4, 24, %o4
1104 1081 or %o4, %o3, %o3
1105 1082 lduh [%o0+5], %o4
1106 1083 sllx %o4, 8, %o4
1107 1084 or %o4, %o3, %o3
1108 1085 ldub [%o0+7], %o4
1109 1086 or %o4, %o3, %o4
1110 1087 stx %o4, [%o1]
1111 1088 subcc %o2, 8, %o2 ! decrement length count
1112 1089 add %o1, 8, %o1 ! increase dst ptr by 8
1113 1090 add %o0, 8, %o0 ! increase src ptr by 8
1114 1091 bgu,pt %ncc, .bc_medbh15
1115 1092 stx %o4, [%o1-8]
1116 1093 ba .bc_medb7
1117 1094 nop
1118 1095
1119 1096 SET_SIZE(bcopy)
1120 1097 /*
1121 1098 * The _more entry points are not intended to be used directly by
1122 1099 * any caller from outside this file. They are provided to allow
1123 1100 * profiling and dtrace of the portions of the copy code that use
1124 1101 * the floating point registers.
1125 1102 */
1126 1103 ENTRY(bcopy_more)
1127 1104 .bcopy_more:
1128 1105 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1129 1106 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler
1130 1107 brz,pt %o5, .do_copy
1131 1108 nop
1132 1109 sethi %hi(.copyerr), %l7 ! copyerr is lofault value
1133 1110 or %l7, %lo(.copyerr), %l7
1134 1111 membar #Sync ! sync error barrier
1135 1112 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault
1136 1113 ! We've already captured whether t_lofault was zero on entry.
1137 1114 ! We need to mark ourselves as being from bcopy since both
1138 1115 ! kcopy and bcopy use the same code path. If LOFAULT_SET is
1139 1116 ! set and the saved lofault was zero, we won't reset lofault on
1140 1117 ! returning.
1141 1118 or %o5, LOFAULT_SET, %o5
1142 1119 .do_copy:
1143 1120 ldn [THREAD_REG + T_LWP], %o3
1144 1121 brnz,pt %o3, 1f
1145 1122 nop
1146 1123 /*
1147 1124 * kpreempt_disable();
1148 1125 */
1149 1126 ldsb [THREAD_REG +T_PREEMPT], %o3
1150 1127 inc %o3
1151 1128 stb %o3, [THREAD_REG + T_PREEMPT]
1152 1129 1:
1153 1130 /*
1154 1131 * The following code is for large copies. We know there are at
1155 1132 * least FP_COPY bytes available. FP regs are used, so
1156 1133 * we save registers and fp regs before starting
1157 1134 */
1158 1135 rd %fprs, %g5 ! check for unused fp
1159 1136 or %o5,FPUSED_FLAG,%o5
1160 1137 ! if fprs.fef == 0, set it.
1161 1138 ! Setting it when already set costs more than checking
1162 1139 andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0
1163 1140 bz,pt %ncc, .bc_fp_unused
1164 1141 prefetch [%i0 + (1 * CACHE_LINE)], #one_read
1165 1142 BST_FP_TOSTACK(%o3)
1166 1143 ba .bc_fp_ready
1167 1144 .bc_fp_unused:
1168 1145 andcc %i1, 1, %o3 ! is dest byte aligned
1169 1146 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
1170 1147 .bc_fp_ready:
1171 1148 rd %gsr, %l5 ! save %gsr value
1172 1149 bnz,pt %ncc, .bc_big_d1
1173 1150 .bc_big_d1f: ! dest is now half word aligned
1174 1151 andcc %i1, 2, %o3
1175 1152 bnz,pt %ncc, .bc_big_d2
1176 1153 .bc_big_d2f: ! dest is now word aligned
1177 1154 andcc %i1, 4, %o3
1178 1155 bnz,pt %ncc, .bc_big_d4
1179 1156 .bc_big_d4f: ! dest is now long word aligned
1180 1157 andcc %i0, 7, %o3 ! is src long word aligned
1181 1158 brnz,pt %o3, .bc_big_unal8
1182 1159 prefetch [%i0 + (2 * CACHE_LINE)], #one_read
1183 1160
1184 1161 ! Src and dst are long word aligned
1185 1162 ! align dst to 64 byte boundary
1186 1163 andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
1187 1164 brz,pn %o3, .bc_al_to_64
1188 1165 nop
1189 1166 sub %o3, 64, %o3 ! %o3 has negative bytes to move
1190 1167 add %i2, %o3, %i2 ! adjust remaining count
1191 1168 andcc %o3, 8, %o4 ! odd long words to move?
1192 1169 brz,pt %o4, .bc_al_to_16
1193 1170 nop
1194 1171 add %o3, 8, %o3
1195 1172 ldx [%i0], %o4
1196 1173 add %i0, 8, %i0 ! increment src ptr
1197 1174 add %i1, 8, %i1 ! increment dst ptr
1198 1175 stx %o4, [%i1-8]
1199 1176 ! Dest is aligned on 16 bytes, src 8 byte aligned
1200 1177 .bc_al_to_16:
1201 1178 andcc %o3, 0x30, %o4 ! pair of long words to move?
1202 1179 brz,pt %o4, .bc_al_to_64
1203 1180 nop
1204 1181 .bc_al_mv_16:
1205 1182 add %o3, 16, %o3
1206 1183 ldx [%i0], %o4
1207 1184 stx %o4, [%i1]
1208 1185 ldx [%i0+8], %o4
1209 1186 add %i0, 16, %i0 ! increment src ptr
1210 1187 stx %o4, [%i1+8]
1211 1188 andcc %o3, 48, %o4
1212 1189 brnz,pt %o4, .bc_al_mv_16
1213 1190 add %i1, 16, %i1 ! increment dst ptr
1214 1191 ! Dest is aligned on 64 bytes, src 8 byte aligned
1215 1192 .bc_al_to_64:
1216 1193 ! Determine source alignment
1217 1194 ! to correct 8 byte offset
1218 1195 andcc %i0, 32, %o3
1219 1196 brnz,pn %o3, .bc_aln_1
1220 1197 andcc %i0, 16, %o3
1221 1198 brnz,pn %o3, .bc_aln_01
1222 1199 andcc %i0, 8, %o3
1223 1200 brz,pn %o3, .bc_aln_000
1224 1201 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1225 1202 ba .bc_aln_001
1226 1203 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1227 1204
1228 1205 .bc_aln_01:
1229 1206 brnz,pn %o3, .bc_aln_011
1230 1207 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1231 1208 ba .bc_aln_010
1232 1209 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1233 1210 .bc_aln_1:
1234 1211 andcc %i0, 16, %o3
1235 1212 brnz,pn %o3, .bc_aln_11
1236 1213 andcc %i0, 8, %o3
1237 1214 brnz,pn %o3, .bc_aln_101
1238 1215 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1239 1216 ba .bc_aln_100
1240 1217 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1241 1218 .bc_aln_11:
1242 1219 brz,pn %o3, .bc_aln_110
1243 1220 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1244 1221
1245 1222 .bc_aln_111:
1246 1223 ! Alignment off by 8 bytes
1247 1224 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1248 1225 ldd [%i0], %d0
1249 1226 add %i0, 8, %i0
1250 1227 sub %i2, 8, %i2
1251 1228 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1252 1229 and %i2, 0x7f, %i2 ! residue bytes in %i2
1253 1230 sub %i1, %i0, %i1
1254 1231 .bc_aln_111_loop:
1255 1232 ldda [%i0]ASI_BLK_P,%d16 ! block load
1256 1233 subcc %o3, 64, %o3
1257 1234 fmovd %d16, %d2
1258 1235 fmovd %d18, %d4
1259 1236 fmovd %d20, %d6
1260 1237 fmovd %d22, %d8
1261 1238 fmovd %d24, %d10
1262 1239 fmovd %d26, %d12
1263 1240 fmovd %d28, %d14
1264 1241 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1265 1242 stda %d0,[%i0+%i1]ASI_BLK_P
1266 1243 add %i0, 64, %i0
1267 1244 fmovd %d30, %d0
1268 1245 bgt,pt %ncc, .bc_aln_111_loop
1269 1246 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1270 1247 add %i1, %i0, %i1
1271 1248
1272 1249 std %d0, [%i1]
1273 1250 ba .bc_remain_stuff
1274 1251 add %i1, 8, %i1
1275 1252 ! END OF aln_111
1276 1253
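All eight aligned block loops share one induction-variable trick: %i1 is rewritten as dst - src before the loop, so the single add that advances %i0 advances both streams and the store target is simply [%i0 + %i1]. A hypothetical C sketch:

	#include <stddef.h>

	extern void copy64(const char *, char *);	/* stands in for the
							 * block load/store pair */
	static void
	block_loop(const char *src, char *dst, size_t bytes)
	{
		ptrdiff_t delta = dst - src;		/* sub %i1, %i0, %i1 */

		for (; bytes >= 64; bytes -= 64, src += 64)
			copy64(src, (char *)(src + delta));	/* [%i0 + %i1] */
	}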
1277 1254 .bc_aln_110:
1278 1255 ! Alignment off by 16 bytes
1279 1256 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1280 1257 ldd [%i0], %d0
1281 1258 ldd [%i0+8], %d2
1282 1259 add %i0, 16, %i0
1283 1260 sub %i2, 16, %i2
1284 1261 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1285 1262 and %i2, 0x7f, %i2 ! residue bytes in %i2
1286 1263 sub %i1, %i0, %i1
1287 1264 .bc_aln_110_loop:
1288 1265 ldda [%i0]ASI_BLK_P,%d16 ! block load
1289 1266 subcc %o3, 64, %o3
1290 1267 fmovd %d16, %d4
1291 1268 fmovd %d18, %d6
1292 1269 fmovd %d20, %d8
1293 1270 fmovd %d22, %d10
1294 1271 fmovd %d24, %d12
1295 1272 fmovd %d26, %d14
1296 1273 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1297 1274 stda %d0,[%i0+%i1]ASI_BLK_P
1298 1275 add %i0, 64, %i0
1299 1276 fmovd %d28, %d0
1300 1277 fmovd %d30, %d2
1301 1278 bgt,pt %ncc, .bc_aln_110_loop
1302 1279 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1303 1280 add %i1, %i0, %i1
1304 1281
1305 1282 std %d0, [%i1]
1306 1283 std %d2, [%i1+8]
1307 1284 ba .bc_remain_stuff
1308 1285 add %i1, 16, %i1
1309 1286 ! END OF aln_110
1310 1287
1311 1288 .bc_aln_101:
1312 1289 ! Alignment off by 24 bytes
1313 1290 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1314 1291 ldd [%i0], %d0
1315 1292 ldd [%i0+8], %d2
1316 1293 ldd [%i0+16], %d4
1317 1294 add %i0, 24, %i0
1318 1295 sub %i2, 24, %i2
1319 1296 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1320 1297 and %i2, 0x7f, %i2 ! residue bytes in %i2
1321 1298 sub %i1, %i0, %i1
1322 1299 .bc_aln_101_loop:
1323 1300 ldda [%i0]ASI_BLK_P,%d16 ! block load
1324 1301 subcc %o3, 64, %o3
1325 1302 fmovd %d16, %d6
1326 1303 fmovd %d18, %d8
1327 1304 fmovd %d20, %d10
1328 1305 fmovd %d22, %d12
1329 1306 fmovd %d24, %d14
1330 1307 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1331 1308 stda %d0,[%i0+%i1]ASI_BLK_P
1332 1309 add %i0, 64, %i0
1333 1310 fmovd %d26, %d0
1334 1311 fmovd %d28, %d2
1335 1312 fmovd %d30, %d4
1336 1313 bgt,pt %ncc, .bc_aln_101_loop
1337 1314 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1338 1315 add %i1, %i0, %i1
1339 1316
1340 1317 std %d0, [%i1]
1341 1318 std %d2, [%i1+8]
1342 1319 std %d4, [%i1+16]
1343 1320 ba .bc_remain_stuff
1344 1321 add %i1, 24, %i1
1345 1322 ! END OF aln_101
1346 1323
1347 1324 .bc_aln_100:
1348 1325 ! Alignment off by 32 bytes
1349 1326 ldd [%i0], %d0
1350 1327 ldd [%i0+8], %d2
1351 1328 ldd [%i0+16],%d4
1352 1329 ldd [%i0+24],%d6
1353 1330 add %i0, 32, %i0
1354 1331 sub %i2, 32, %i2
1355 1332 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1356 1333 and %i2, 0x7f, %i2 ! residue bytes in %i2
1357 1334 sub %i1, %i0, %i1
1358 1335 .bc_aln_100_loop:
1359 1336 ldda [%i0]ASI_BLK_P,%d16 ! block load
1360 1337 subcc %o3, 64, %o3
1361 1338 fmovd %d16, %d8
1362 1339 fmovd %d18, %d10
1363 1340 fmovd %d20, %d12
1364 1341 fmovd %d22, %d14
1365 1342 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1366 1343 stda %d0,[%i0+%i1]ASI_BLK_P
1367 1344 add %i0, 64, %i0
1368 1345 fmovd %d24, %d0
1369 1346 fmovd %d26, %d2
1370 1347 fmovd %d28, %d4
1371 1348 fmovd %d30, %d6
1372 1349 bgt,pt %ncc, .bc_aln_100_loop
1373 1350 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1374 1351 add %i1, %i0, %i1
1375 1352
1376 1353 std %d0, [%i1]
1377 1354 std %d2, [%i1+8]
1378 1355 std %d4, [%i1+16]
1379 1356 std %d6, [%i1+24]
1380 1357 ba .bc_remain_stuff
1381 1358 add %i1, 32, %i1
1382 1359 ! END OF aln_100
1383 1360
1384 1361 .bc_aln_011:
1385 1362 ! Alignment off by 40 bytes
1386 1363 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1387 1364 ldd [%i0], %d0
1388 1365 ldd [%i0+8], %d2
1389 1366 ldd [%i0+16], %d4
1390 1367 ldd [%i0+24], %d6
1391 1368 ldd [%i0+32], %d8
1392 1369 add %i0, 40, %i0
1393 1370 sub %i2, 40, %i2
1394 1371 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1395 1372 and %i2, 0x7f, %i2 ! residue bytes in %i2
1396 1373 sub %i1, %i0, %i1
1397 1374 .bc_aln_011_loop:
1398 1375 ldda [%i0]ASI_BLK_P,%d16 ! block load
1399 1376 subcc %o3, 64, %o3
1400 1377 fmovd %d16, %d10
1401 1378 fmovd %d18, %d12
1402 1379 fmovd %d20, %d14
1403 1380 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1404 1381 stda %d0,[%i0+%i1]ASI_BLK_P
1405 1382 add %i0, 64, %i0
1406 1383 fmovd %d22, %d0
1407 1384 fmovd %d24, %d2
1408 1385 fmovd %d26, %d4
1409 1386 fmovd %d28, %d6
1410 1387 fmovd %d30, %d8
1411 1388 bgt,pt %ncc, .bc_aln_011_loop
1412 1389 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1413 1390 add %i1, %i0, %i1
1414 1391
1415 1392 std %d0, [%i1]
1416 1393 std %d2, [%i1+8]
1417 1394 std %d4, [%i1+16]
1418 1395 std %d6, [%i1+24]
1419 1396 std %d8, [%i1+32]
1420 1397 ba .bc_remain_stuff
1421 1398 add %i1, 40, %i1
1422 1399 ! END OF aln_011
1423 1400
1424 1401 .bc_aln_010:
1425 1402 ! Alignment off by 48 bytes
1426 1403 ldd [%i0], %d0
1427 1404 ldd [%i0+8], %d2
1428 1405 ldd [%i0+16], %d4
1429 1406 ldd [%i0+24], %d6
1430 1407 ldd [%i0+32], %d8
1431 1408 ldd [%i0+40], %d10
1432 1409 add %i0, 48, %i0
1433 1410 sub %i2, 48, %i2
1434 1411 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1435 1412 and %i2, 0x7f, %i2 ! residue bytes in %i2
1436 1413 sub %i1, %i0, %i1
1437 1414 .bc_aln_010_loop:
1438 1415 ldda [%i0]ASI_BLK_P,%d16 ! block load
1439 1416 subcc %o3, 64, %o3
1440 1417 fmovd %d16, %d12
1441 1418 fmovd %d18, %d14
1442 1419 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1443 1420 stda %d0,[%i0+%i1]ASI_BLK_P
1444 1421 add %i0, 64, %i0
1445 1422 fmovd %d20, %d0
1446 1423 fmovd %d22, %d2
1447 1424 fmovd %d24, %d4
1448 1425 fmovd %d26, %d6
1449 1426 fmovd %d28, %d8
1450 1427 fmovd %d30, %d10
1451 1428 bgt,pt %ncc, .bc_aln_010_loop
1452 1429 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1453 1430 add %i1, %i0, %i1
1454 1431
1455 1432 std %d0, [%i1]
1456 1433 std %d2, [%i1+8]
1457 1434 std %d4, [%i1+16]
1458 1435 std %d6, [%i1+24]
1459 1436 std %d8, [%i1+32]
1460 1437 std %d10, [%i1+40]
1461 1438 ba .bc_remain_stuff
1462 1439 add %i1, 48, %i1
1463 1440 ! END OF aln_010
1464 1441
1465 1442 .bc_aln_001:
1466 1443 ! Alignment off by 56 bytes
1467 1444 ldd [%i0], %d0
1468 1445 ldd [%i0+8], %d2
1469 1446 ldd [%i0+16], %d4
1470 1447 ldd [%i0+24], %d6
1471 1448 ldd [%i0+32], %d8
1472 1449 ldd [%i0+40], %d10
1473 1450 ldd [%i0+48], %d12
1474 1451 add %i0, 56, %i0
1475 1452 sub %i2, 56, %i2
1476 1453 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1477 1454 and %i2, 0x7f, %i2 ! residue bytes in %i2
1478 1455 sub %i1, %i0, %i1
1479 1456 .bc_aln_001_loop:
1480 1457 ldda [%i0]ASI_BLK_P,%d16 ! block load
1481 1458 subcc %o3, 64, %o3
1482 1459 fmovd %d16, %d14
1483 1460 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1484 1461 stda %d0,[%i0+%i1]ASI_BLK_P
1485 1462 add %i0, 64, %i0
1486 1463 fmovd %d18, %d0
1487 1464 fmovd %d20, %d2
1488 1465 fmovd %d22, %d4
1489 1466 fmovd %d24, %d6
1490 1467 fmovd %d26, %d8
1491 1468 fmovd %d28, %d10
1492 1469 fmovd %d30, %d12
1493 1470 bgt,pt %ncc, .bc_aln_001_loop
1494 1471 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1495 1472 add %i1, %i0, %i1
1496 1473
1497 1474 std %d0, [%i1]
1498 1475 std %d2, [%i1+8]
1499 1476 std %d4, [%i1+16]
1500 1477 std %d6, [%i1+24]
1501 1478 std %d8, [%i1+32]
1502 1479 std %d10, [%i1+40]
1503 1480 std %d12, [%i1+48]
1504 1481 ba .bc_remain_stuff
1505 1482 add %i1, 56, %i1
1506 1483 ! END OF aln_001
1507 1484
1508 1485 .bc_aln_000:
1509 1486 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1510 1487 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
1511 1488 and %i2, 0x7f, %i2 ! residue bytes in %i2
1512 1489 sub %i1, %i0, %i1
1513 1490 .bc_aln_000_loop:
1514 1491 ldda [%i0]ASI_BLK_P,%d0
1515 1492 subcc %o3, 64, %o3
1516 1493 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1517 1494 stda %d0,[%i0+%i1]ASI_BLK_P
1518 1495 add %i0, 64, %i0
1519 1496 bgt,pt %ncc, .bc_aln_000_loop
1520 1497 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1521 1498 add %i1, %i0, %i1
1522 1499
1523 1500 ! END OF aln_000
1524 1501
1525 1502 .bc_remain_stuff:
1526 1503 subcc %i2, 31, %i2 ! adjust length to allow cc test
1527 1504 ble,pt %ncc, .bc_aln_31
1528 1505 nop
1529 1506 .bc_aln_32:
1530 1507 ldx [%i0], %o4 ! move 32 bytes
1531 1508 subcc %i2, 32, %i2 ! decrement length count by 32
1532 1509 stx %o4, [%i1]
1533 1510 ldx [%i0+8], %o4
1534 1511 stx %o4, [%i1+8]
1535 1512 ldx [%i0+16], %o4
1536 1513 add %i0, 32, %i0 ! increase src ptr by 32
1537 1514 stx %o4, [%i1+16]
1538 1515 ldx [%i0-8], %o4
1539 1516 add %i1, 32, %i1 ! increase dst ptr by 32
1540 1517 bgu,pt %ncc, .bc_aln_32 ! repeat if at least 32 bytes left
1541 1518 stx %o4, [%i1-8]
1542 1519 .bc_aln_31:
1543 1520 addcc %i2, 24, %i2 ! adjust count to be off by 7
1544 1521 ble,pt %ncc, .bc_aln_7 ! skip if 7 or fewer bytes left
1545 1522 nop !
1546 1523 .bc_aln_15:
1547 1524 ldx [%i0], %o4 ! move 8 bytes
1548 1525 add %i0, 8, %i0 ! increase src ptr by 8
1549 1526 subcc %i2, 8, %i2 ! decrease count by 8
1550 1527 add %i1, 8, %i1 ! increase dst ptr by 8
1551 1528 bgu,pt %ncc, .bc_aln_15
1552 1529 stx %o4, [%i1-8] !
1553 1530 .bc_aln_7:
1554 1531 addcc %i2, 7, %i2 ! finish adjustment of remaining count
1555 1532 bz,pt %ncc, .bc_exit ! exit if finished
1556 1533 cmp %i2, 4
1557 1534 blt,pt %ncc, .bc_unaln3x ! skip if less than 4 bytes left
1558 1535 nop !
1559 1536 ld [%i0], %o4 ! move 4 bytes
1560 1537 add %i0, 4, %i0 ! increase src ptr by 4
1561 1538 add %i1, 4, %i1 ! increase dst ptr by 4
1562 1539 subcc %i2, 4, %i2 ! decrease count by 4
1563 1540 bnz .bc_unaln3x
1564 1541 stw %o4, [%i1-4]
1565 1542 ba .bc_exit
1566 1543 nop
1567 1544
1568 1545 ! destination alignment code
1569 1546 .bc_big_d1:
1570 1547 ldub [%i0], %o4 ! move a byte
1571 1548 add %i0, 1, %i0
1572 1549 stb %o4, [%i1]
1573 1550 add %i1, 1, %i1
1574 1551 andcc %i1, 2, %o3
1575 1552 bz,pt %ncc, .bc_big_d2f
1576 1553 sub %i2, 1, %i2
1577 1554 .bc_big_d2:
1578 1555 ldub [%i0], %o4 ! move a half-word (src align unknown)
1579 1556 ldub [%i0+1], %o3
1580 1557 add %i0, 2, %i0
1581 1558 sll %o4, 8, %o4 ! position
1582 1559 or %o4, %o3, %o4 ! merge
1583 1560 sth %o4, [%i1]
1584 1561 add %i1, 2, %i1
1585 1562 andcc %i1, 4, %o3
1586 1563 bz,pt %ncc, .bc_big_d4f
1587 1564 sub %i2, 2, %i2
1588 1565 .bc_big_d4:
1589 1566 ldub [%i0], %o4 ! move a word (src align unknown)
1590 1567 ldub [%i0+1], %o3
1591 1568 sll %o4, 24, %o4 ! position
1592 1569 sll %o3, 16, %o3 ! position
1593 1570 or %o4, %o3, %o3 ! merge
1594 1571 ldub [%i0+2], %o4
1595 1572 sll %o4, 8, %o4 ! position
1596 1573 or %o4, %o3, %o3 ! merge
1597 1574 ldub [%i0+3], %o4
1598 1575 or %o4, %o3, %o4 ! merge
1599 1576 stw %o4,[%i1] ! store four bytes
1600 1577 add %i0, 4, %i0 ! adjust src by 4
1601 1578 add %i1, 4, %i1 ! adjust dest by 4
1602 1579 ba .bc_big_d4f
1603 1580 sub %i2, 4, %i2 ! adjust count by 4
1604 1581
1605 1582
1606 1583 ! Dst is on 8 byte boundary; src is not;
1607 1584 .bc_big_unal8:
1608 1585 andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned?
1609 1586 bz %ncc, .bc_unalnsrc
1610 1587 sub %o3, 64, %o3 ! %o3 will be multiple of 8
1611 1588 neg %o3 ! bytes until dest is 64 byte aligned
1612 1589 sub %i2, %o3, %i2 ! update cnt with bytes to be moved
1613 1590 ! Move bytes according to source alignment
1614 1591 andcc %i0, 0x1, %o4
1615 1592 bnz %ncc, .bc_unalnbyte ! check for byte alignment
1616 1593 nop
1617 1594 andcc %i0, 2, %o4 ! check for half word alignment
1618 1595 bnz %ncc, .bc_unalnhalf
1619 1596 nop
1620 1597 ! Src is word aligned, move bytes until dest 64 byte aligned
1621 1598 .bc_unalnword:
1622 1599 ld [%i0], %o4 ! load 4 bytes
1623 1600 stw %o4, [%i1] ! and store 4 bytes
1624 1601 ld [%i0+4], %o4 ! load 4 bytes
1625 1602 add %i0, 8, %i0 ! increase src ptr by 8
1626 1603 stw %o4, [%i1+4] ! and store 4 bytes
1627 1604 subcc %o3, 8, %o3 ! decrease count by 8
1628 1605 bnz %ncc, .bc_unalnword
1629 1606 add %i1, 8, %i1 ! increase dst ptr by 8
1630 1607 ba .bc_unalnsrc
1631 1608 nop
1632 1609
1633 1610 ! Src is half-word aligned, move bytes until dest 64 byte aligned
1634 1611 .bc_unalnhalf:
1635 1612 lduh [%i0], %o4 ! load 2 bytes
1636 1613 sllx %o4, 32, %i3 ! shift left
1637 1614 lduw [%i0+2], %o4
1638 1615 or %o4, %i3, %i3
1639 1616 sllx %i3, 16, %i3
1640 1617 lduh [%i0+6], %o4
1641 1618 or %o4, %i3, %i3
1642 1619 stx %i3, [%i1]
1643 1620 add %i0, 8, %i0
1644 1621 subcc %o3, 8, %o3
1645 1622 bnz %ncc, .bc_unalnhalf
1646 1623 add %i1, 8, %i1
1647 1624 ba .bc_unalnsrc
1648 1625 nop
1649 1626
1650 1627 ! Src is Byte aligned, move bytes until dest 64 byte aligned
1651 1628 .bc_unalnbyte:
1652 1629 sub %i1, %i0, %i1 ! share pointer advance
1653 1630 .bc_unalnbyte_loop:
1654 1631 ldub [%i0], %o4
1655 1632 sllx %o4, 56, %i3
1656 1633 lduh [%i0+1], %o4
1657 1634 sllx %o4, 40, %o4
1658 1635 or %o4, %i3, %i3
1659 1636 lduh [%i0+3], %o4
1660 1637 sllx %o4, 24, %o4
1661 1638 or %o4, %i3, %i3
1662 1639 lduh [%i0+5], %o4
1663 1640 sllx %o4, 8, %o4
1664 1641 or %o4, %i3, %i3
1665 1642 ldub [%i0+7], %o4
1666 1643 or %o4, %i3, %i3
1667 1644 stx %i3, [%i1+%i0]
1668 1645 subcc %o3, 8, %o3
1669 1646 bnz %ncc, .bc_unalnbyte_loop
1670 1647 add %i0, 8, %i0
1671 1648 add %i1,%i0, %i1 ! restore pointer
1672 1649
1673 1650 	! Destination is now block (64 byte) aligned; src is not 8 byte aligned
1674 1651 .bc_unalnsrc:
1675 1652 andn %i2, 0x3f, %i3 ! %i3 is multiple of block size
1676 1653 and %i2, 0x3f, %i2 ! residue bytes in %i2
1677 1654 	add	%i2, 64, %i2		! Ensure we don't load beyond
1678 1655 sub %i3, 64, %i3 ! end of source buffer
1679 1656
1680 1657 andn %i0, 0x3f, %o4 ! %o4 has block aligned src address
1681 1658 prefetch [%o4 + (3 * CACHE_LINE)], #one_read
1682 1659 alignaddr %i0, %g0, %g0 ! generate %gsr
1683 1660 add %i0, %i3, %i0 ! advance %i0 to after blocks
1684 1661 !
1685 1662 ! Determine source alignment to correct 8 byte offset
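	! The tested bits (0x20, 0x10, 0x08) give the doubleword offset n
	! of the src address within its 64-byte block, selecting one of
	! the eight cases .bc_unaln_000 - .bc_unaln_111 below; case n
	! preloads the trailing 8 - n doublewords of the first block so
	! each faligndata pass has a full window of source data.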
1686 1663 andcc %i0, 0x20, %o3
1687 1664 brnz,pn %o3, .bc_unaln_1
1688 1665 andcc %i0, 0x10, %o3
1689 1666 brnz,pn %o3, .bc_unaln_01
1690 1667 andcc %i0, 0x08, %o3
1691 1668 brz,a %o3, .bc_unaln_000
1692 1669 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1693 1670 ba .bc_unaln_001
1694 1671 nop
1695 1672 .bc_unaln_01:
1696 1673 brnz,a %o3, .bc_unaln_011
1697 1674 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1698 1675 ba .bc_unaln_010
1699 1676 nop
1700 1677 .bc_unaln_1:
1701 1678 brnz,pn %o3, .bc_unaln_11
1702 1679 andcc %i0, 0x08, %o3
1703 1680 brnz,a %o3, .bc_unaln_101
1704 1681 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1705 1682 ba .bc_unaln_100
1706 1683 nop
1707 1684 .bc_unaln_11:
1708 1685 brz,pn %o3, .bc_unaln_110
1709 1686 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1710 1687
1711 1688 .bc_unaln_111:
1712 1689 ldd [%o4+56], %d14
1713 1690 .bc_unaln_111_loop:
1714 1691 add %o4, 64, %o4
1715 1692 ldda [%o4]ASI_BLK_P, %d16
1716 1693 faligndata %d14, %d16, %d48
1717 1694 faligndata %d16, %d18, %d50
1718 1695 faligndata %d18, %d20, %d52
1719 1696 faligndata %d20, %d22, %d54
1720 1697 faligndata %d22, %d24, %d56
1721 1698 faligndata %d24, %d26, %d58
1722 1699 faligndata %d26, %d28, %d60
1723 1700 faligndata %d28, %d30, %d62
1724 1701 fmovd %d30, %d14
1725 1702 stda %d48, [%i1]ASI_BLK_P
1726 1703 subcc %i3, 64, %i3
1727 1704 add %i1, 64, %i1
1728 1705 bgu,pt %ncc, .bc_unaln_111_loop
1729 1706 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1730 1707 ba .bc_unaln_done
1731 1708 nop
1732 1709
1733 1710 .bc_unaln_110:
1734 1711 ldd [%o4+48], %d12
1735 1712 ldd [%o4+56], %d14
1736 1713 .bc_unaln_110_loop:
1737 1714 add %o4, 64, %o4
1738 1715 ldda [%o4]ASI_BLK_P, %d16
1739 1716 faligndata %d12, %d14, %d48
1740 1717 faligndata %d14, %d16, %d50
1741 1718 faligndata %d16, %d18, %d52
1742 1719 faligndata %d18, %d20, %d54
1743 1720 faligndata %d20, %d22, %d56
1744 1721 faligndata %d22, %d24, %d58
1745 1722 faligndata %d24, %d26, %d60
1746 1723 faligndata %d26, %d28, %d62
1747 1724 fmovd %d28, %d12
1748 1725 fmovd %d30, %d14
1749 1726 stda %d48, [%i1]ASI_BLK_P
1750 1727 subcc %i3, 64, %i3
1751 1728 add %i1, 64, %i1
1752 1729 bgu,pt %ncc, .bc_unaln_110_loop
1753 1730 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1754 1731 ba .bc_unaln_done
1755 1732 nop
1756 1733
1757 1734 .bc_unaln_101:
1758 1735 ldd [%o4+40], %d10
1759 1736 ldd [%o4+48], %d12
1760 1737 ldd [%o4+56], %d14
1761 1738 .bc_unaln_101_loop:
1762 1739 add %o4, 64, %o4
1763 1740 ldda [%o4]ASI_BLK_P, %d16
1764 1741 faligndata %d10, %d12, %d48
1765 1742 faligndata %d12, %d14, %d50
1766 1743 faligndata %d14, %d16, %d52
1767 1744 faligndata %d16, %d18, %d54
1768 1745 faligndata %d18, %d20, %d56
1769 1746 faligndata %d20, %d22, %d58
1770 1747 faligndata %d22, %d24, %d60
1771 1748 faligndata %d24, %d26, %d62
1772 1749 fmovd %d26, %d10
1773 1750 fmovd %d28, %d12
1774 1751 fmovd %d30, %d14
1775 1752 stda %d48, [%i1]ASI_BLK_P
1776 1753 subcc %i3, 64, %i3
1777 1754 add %i1, 64, %i1
1778 1755 bgu,pt %ncc, .bc_unaln_101_loop
1779 1756 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1780 1757 ba .bc_unaln_done
1781 1758 nop
1782 1759
1783 1760 .bc_unaln_100:
1784 1761 ldd [%o4+32], %d8
1785 1762 ldd [%o4+40], %d10
1786 1763 ldd [%o4+48], %d12
1787 1764 ldd [%o4+56], %d14
1788 1765 .bc_unaln_100_loop:
1789 1766 add %o4, 64, %o4
1790 1767 ldda [%o4]ASI_BLK_P, %d16
1791 1768 faligndata %d8, %d10, %d48
1792 1769 faligndata %d10, %d12, %d50
1793 1770 faligndata %d12, %d14, %d52
1794 1771 faligndata %d14, %d16, %d54
1795 1772 faligndata %d16, %d18, %d56
1796 1773 faligndata %d18, %d20, %d58
1797 1774 faligndata %d20, %d22, %d60
1798 1775 faligndata %d22, %d24, %d62
1799 1776 fmovd %d24, %d8
1800 1777 fmovd %d26, %d10
1801 1778 fmovd %d28, %d12
1802 1779 fmovd %d30, %d14
1803 1780 stda %d48, [%i1]ASI_BLK_P
1804 1781 subcc %i3, 64, %i3
1805 1782 add %i1, 64, %i1
1806 1783 bgu,pt %ncc, .bc_unaln_100_loop
1807 1784 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1808 1785 ba .bc_unaln_done
1809 1786 nop
1810 1787
1811 1788 .bc_unaln_011:
1812 1789 ldd [%o4+24], %d6
1813 1790 ldd [%o4+32], %d8
1814 1791 ldd [%o4+40], %d10
1815 1792 ldd [%o4+48], %d12
1816 1793 ldd [%o4+56], %d14
1817 1794 .bc_unaln_011_loop:
1818 1795 add %o4, 64, %o4
1819 1796 ldda [%o4]ASI_BLK_P, %d16
1820 1797 faligndata %d6, %d8, %d48
1821 1798 faligndata %d8, %d10, %d50
1822 1799 faligndata %d10, %d12, %d52
1823 1800 faligndata %d12, %d14, %d54
1824 1801 faligndata %d14, %d16, %d56
1825 1802 faligndata %d16, %d18, %d58
1826 1803 faligndata %d18, %d20, %d60
1827 1804 faligndata %d20, %d22, %d62
1828 1805 fmovd %d22, %d6
1829 1806 fmovd %d24, %d8
1830 1807 fmovd %d26, %d10
1831 1808 fmovd %d28, %d12
1832 1809 fmovd %d30, %d14
1833 1810 stda %d48, [%i1]ASI_BLK_P
1834 1811 subcc %i3, 64, %i3
1835 1812 add %i1, 64, %i1
1836 1813 bgu,pt %ncc, .bc_unaln_011_loop
1837 1814 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1838 1815 ba .bc_unaln_done
1839 1816 nop
1840 1817
1841 1818 .bc_unaln_010:
1842 1819 ldd [%o4+16], %d4
1843 1820 ldd [%o4+24], %d6
1844 1821 ldd [%o4+32], %d8
1845 1822 ldd [%o4+40], %d10
1846 1823 ldd [%o4+48], %d12
1847 1824 ldd [%o4+56], %d14
1848 1825 .bc_unaln_010_loop:
1849 1826 add %o4, 64, %o4
1850 1827 ldda [%o4]ASI_BLK_P, %d16
1851 1828 faligndata %d4, %d6, %d48
1852 1829 faligndata %d6, %d8, %d50
1853 1830 faligndata %d8, %d10, %d52
1854 1831 faligndata %d10, %d12, %d54
1855 1832 faligndata %d12, %d14, %d56
1856 1833 faligndata %d14, %d16, %d58
1857 1834 faligndata %d16, %d18, %d60
1858 1835 faligndata %d18, %d20, %d62
1859 1836 fmovd %d20, %d4
1860 1837 fmovd %d22, %d6
1861 1838 fmovd %d24, %d8
1862 1839 fmovd %d26, %d10
1863 1840 fmovd %d28, %d12
1864 1841 fmovd %d30, %d14
1865 1842 stda %d48, [%i1]ASI_BLK_P
1866 1843 subcc %i3, 64, %i3
1867 1844 add %i1, 64, %i1
1868 1845 bgu,pt %ncc, .bc_unaln_010_loop
1869 1846 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1870 1847 ba .bc_unaln_done
1871 1848 nop
1872 1849
1873 1850 .bc_unaln_001:
1874 1851 ldd [%o4+8], %d2
1875 1852 ldd [%o4+16], %d4
1876 1853 ldd [%o4+24], %d6
1877 1854 ldd [%o4+32], %d8
1878 1855 ldd [%o4+40], %d10
1879 1856 ldd [%o4+48], %d12
1880 1857 ldd [%o4+56], %d14
1881 1858 .bc_unaln_001_loop:
1882 1859 add %o4, 64, %o4
1883 1860 ldda [%o4]ASI_BLK_P, %d16
1884 1861 faligndata %d2, %d4, %d48
1885 1862 faligndata %d4, %d6, %d50
1886 1863 faligndata %d6, %d8, %d52
1887 1864 faligndata %d8, %d10, %d54
1888 1865 faligndata %d10, %d12, %d56
1889 1866 faligndata %d12, %d14, %d58
1890 1867 faligndata %d14, %d16, %d60
1891 1868 faligndata %d16, %d18, %d62
1892 1869 fmovd %d18, %d2
1893 1870 fmovd %d20, %d4
1894 1871 fmovd %d22, %d6
1895 1872 fmovd %d24, %d8
1896 1873 fmovd %d26, %d10
1897 1874 fmovd %d28, %d12
1898 1875 fmovd %d30, %d14
1899 1876 stda %d48, [%i1]ASI_BLK_P
1900 1877 subcc %i3, 64, %i3
1901 1878 add %i1, 64, %i1
1902 1879 bgu,pt %ncc, .bc_unaln_001_loop
1903 1880 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1904 1881 ba .bc_unaln_done
1905 1882 nop
1906 1883
1907 1884 .bc_unaln_000:
1908 1885 ldda [%o4]ASI_BLK_P, %d0
1909 1886 .bc_unaln_000_loop:
1910 1887 add %o4, 64, %o4
1911 1888 ldda [%o4]ASI_BLK_P, %d16
1912 1889 faligndata %d0, %d2, %d48
1913 1890 faligndata %d2, %d4, %d50
1914 1891 faligndata %d4, %d6, %d52
1915 1892 faligndata %d6, %d8, %d54
1916 1893 faligndata %d8, %d10, %d56
1917 1894 faligndata %d10, %d12, %d58
1918 1895 faligndata %d12, %d14, %d60
1919 1896 faligndata %d14, %d16, %d62
1920 1897 fmovd %d16, %d0
1921 1898 fmovd %d18, %d2
1922 1899 fmovd %d20, %d4
1923 1900 fmovd %d22, %d6
1924 1901 fmovd %d24, %d8
1925 1902 fmovd %d26, %d10
1926 1903 fmovd %d28, %d12
1927 1904 fmovd %d30, %d14
1928 1905 stda %d48, [%i1]ASI_BLK_P
1929 1906 subcc %i3, 64, %i3
1930 1907 add %i1, 64, %i1
1931 1908 bgu,pt %ncc, .bc_unaln_000_loop
1932 1909 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1933 1910
1934 1911 .bc_unaln_done:
1935 1912 ! Handle trailing bytes, 64 to 127
1936 1913 ! Dest long word aligned, Src not long word aligned
1937 1914 cmp %i2, 15
1938 1915 bleu %ncc, .bc_unaln_short
1939 1916
1940 1917 andn %i2, 0x7, %i3 ! %i3 is multiple of 8
1941 1918 and %i2, 0x7, %i2 ! residue bytes in %i2
1942 1919 add %i2, 8, %i2
1943 1920 	sub	%i3, 8, %i3	! ensure we don't load past end of src
1944 1921 andn %i0, 0x7, %o4 ! %o4 has long word aligned src address
1945 1922 add %i0, %i3, %i0 ! advance %i0 to after multiple of 8
1946 1923 ldd [%o4], %d0 ! fetch partial word
1947 1924 .bc_unaln_by8:
1948 1925 ldd [%o4+8], %d2
1949 1926 add %o4, 8, %o4
1950 1927 faligndata %d0, %d2, %d16
1951 1928 subcc %i3, 8, %i3
1952 1929 std %d16, [%i1]
1953 1930 fmovd %d2, %d0
1954 1931 bgu,pt %ncc, .bc_unaln_by8
1955 1932 add %i1, 8, %i1
1956 1933
1957 1934 .bc_unaln_short:
1958 1935 cmp %i2, 8
1959 1936 blt,pt %ncc, .bc_unalnfin
1960 1937 nop
1961 1938 ldub [%i0], %o4
1962 1939 sll %o4, 24, %o3
1963 1940 ldub [%i0+1], %o4
1964 1941 sll %o4, 16, %o4
1965 1942 or %o4, %o3, %o3
1966 1943 ldub [%i0+2], %o4
1967 1944 sll %o4, 8, %o4
1968 1945 or %o4, %o3, %o3
1969 1946 ldub [%i0+3], %o4
1970 1947 or %o4, %o3, %o3
1971 1948 stw %o3, [%i1]
1972 1949 ldub [%i0+4], %o4
1973 1950 sll %o4, 24, %o3
1974 1951 ldub [%i0+5], %o4
1975 1952 sll %o4, 16, %o4
1976 1953 or %o4, %o3, %o3
1977 1954 ldub [%i0+6], %o4
1978 1955 sll %o4, 8, %o4
1979 1956 or %o4, %o3, %o3
1980 1957 ldub [%i0+7], %o4
1981 1958 or %o4, %o3, %o3
1982 1959 stw %o3, [%i1+4]
1983 1960 add %i0, 8, %i0
1984 1961 add %i1, 8, %i1
1985 1962 sub %i2, 8, %i2
1986 1963 .bc_unalnfin:
1987 1964 cmp %i2, 4
1988 1965 blt,pt %ncc, .bc_unalnz
1989 1966 tst %i2
1990 1967 ldub [%i0], %o3 ! read byte
1991 1968 subcc %i2, 4, %i2 ! reduce count by 4
1992 1969 sll %o3, 24, %o3 ! position
1993 1970 ldub [%i0+1], %o4
1994 1971 sll %o4, 16, %o4 ! position
1995 1972 or %o4, %o3, %o3 ! merge
1996 1973 ldub [%i0+2], %o4
1997 1974 sll %o4, 8, %o4 ! position
1998 1975 or %o4, %o3, %o3 ! merge
1999 1976 add %i1, 4, %i1 ! advance dst by 4
2000 1977 ldub [%i0+3], %o4
2001 1978 add %i0, 4, %i0 ! advance src by 4
2002 1979 or %o4, %o3, %o4 ! merge
2003 1980 bnz,pt %ncc, .bc_unaln3x
2004 1981 stw %o4, [%i1-4]
2005 1982 ba .bc_exit
2006 1983 nop
2007 1984 .bc_unalnz:
2008 1985 bz,pt %ncc, .bc_exit
2009 1986 .bc_unaln3x: ! Exactly 1, 2, or 3 bytes remain
2010 1987 subcc %i2, 1, %i2 ! reduce count for cc test
2011 1988 ldub [%i0], %o4 ! load one byte
2012 1989 bz,pt %ncc, .bc_exit
2013 1990 stb %o4, [%i1] ! store one byte
2014 1991 ldub [%i0+1], %o4 ! load second byte
2015 1992 subcc %i2, 1, %i2
2016 1993 bz,pt %ncc, .bc_exit
2017 1994 stb %o4, [%i1+1] ! store second byte
2018 1995 ldub [%i0+2], %o4 ! load third byte
2019 1996 stb %o4, [%i1+2] ! store third byte
2020 1997 .bc_exit:
2021 1998 wr %l5, %g0, %gsr ! restore %gsr
2022 1999 brnz %g5, .bc_fp_restore
2023 2000 and %o5, COPY_FLAGS, %l1 ! save flags in %l1
2024 2001 FZERO
2025 2002 wr %g5, %g0, %fprs
2026 2003 ba,pt %ncc, .bc_ex2
2027 2004 nop
2028 2005 .bc_fp_restore:
2029 2006 BLD_FP_FROMSTACK(%o4)
2030 2007 .bc_ex2:
2031 2008 ldn [THREAD_REG + T_LWP], %o2
2032 2009 brnz,pt %o2, 1f
2033 2010 nop
2034 2011
2035 2012 ldsb [THREAD_REG + T_PREEMPT], %l0
2036 2013 deccc %l0
2037 2014 bnz,pn %ncc, 1f
2038 2015 stb %l0, [THREAD_REG + T_PREEMPT]
2039 2016
2040 2017 ! Check for a kernel preemption request
2041 2018 ldn [THREAD_REG + T_CPU], %l0
2042 2019 ldub [%l0 + CPU_KPRUNRUN], %l0
2043 2020 brnz,a,pt %l0, 1f ! Need to call kpreempt?
2044 2021 or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
2045 2022 1:
2046 2023 btst LOFAULT_SET, %l1
2047 2024 bz,pn %icc, 3f
2048 2025 andncc %o5, COPY_FLAGS, %o5
2049 2026 ! Here via bcopy. Check to see if the handler was NULL.
2050 2027 ! If so, just return quietly. Otherwise, reset the
2051 2028 ! handler and return.
2052 2029 bz,pn %ncc, 2f
2053 2030 nop
2054 2031 membar #Sync
2055 2032 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2056 2033 2:
2057 2034 btst KPREEMPT_FLAG, %l1
2058 2035 bz,pt %icc, 3f
2059 2036 nop
2060 2037 call kpreempt
2061 2038 rdpr %pil, %o0 ! pass %pil
2062 2039 3:
2063 2040 ret
2064 2041 restore %g0, 0, %o0
2065 2042
2066 2043 SET_SIZE(bcopy_more)
2067 2044
2068 2045
2069 2046 #else /* NIAGARA_IMPL */
2070 2047 save %sp, -SA(MINFRAME), %sp
2071 2048 clr %o5 ! flag LOFAULT_SET is not set for bcopy
2072 2049 .do_copy:
2073 2050 cmp %i2, 12 ! for small counts
2074 2051 blu %ncc, .bytecp ! just copy bytes
2075 2052 .empty
2076 2053
2077 2054 cmp %i2, 128 ! for less than 128 bytes
2078 2055 blu,pn %ncc, .bcb_punt ! no block st/quad ld
2079 2056 nop
2080 2057
2081 2058 set use_hw_bcopy, %o2
2082 2059 ld [%o2], %o2
2083 2060 brz,pn %o2, .bcb_punt
2084 2061 nop
2085 2062
2086 2063 subcc %i1, %i0, %i3
2087 2064 bneg,a,pn %ncc, 1f
2088 2065 neg %i3
2089 2066 1:
2090 2067 /*
2091 2068 * Compare against 256 since we should be checking block addresses
2092 2069 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
2093 2070 * src = dest + (64 * 3) + 63.
2094 2071 */
2095 2072 cmp %i3, 256
2096 2073 blu,pn %ncc, .bcb_punt
2097 2074 nop
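	! In C terms the distance test above is (a sketch only; src and
	! dst are the %i0/%i1 arguments):
	!
	!	uintptr_t d = (dst > src) ? dst - src : src - dst;
	!	if (d < 256)
	!		goto bcb_punt;	no room for safe block moves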
2098 2075
2099 2076 /*
2100 2077  * Copies that reach here have at least 2 blocks of data to copy.
2101 2078 */
2102 2079 .do_blockcopy:
2103 2080 ! Swap src/dst since the code below is memcpy code
2104 2081 ! and memcpy/bcopy have different calling sequences
2105 2082 mov %i1, %i5
2106 2083 mov %i0, %i1
2107 2084 mov %i5, %i0
2108 2085
2109 2086 ! Block (64 bytes) align the destination.
2110 2087 	andcc	%i0, 0x3f, %i3		! is dst aligned on a 64 byte boundary
2111 2088 	bz	%xcc, .chksrc		! dst is already block aligned
2112 2089 	sub	%i3, 0x40, %i3
2113 2090 	neg	%i3			! bytes till dst is 64 byte aligned
2114 2091 sub %i2, %i3, %i2 ! update i2 with new count
2115 2092
2116 2093 ! Based on source and destination alignment do
2117 2094 ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
2118 2095
2119 2096 ! Is dst & src 8B aligned
2120 2097 or %i0, %i1, %o2
2121 2098 andcc %o2, 0x7, %g0
2122 2099 bz %ncc, .alewdcp
2123 2100 nop
2124 2101
2125 2102 ! Is dst & src 4B aligned
2126 2103 andcc %o2, 0x3, %g0
2127 2104 bz %ncc, .alwdcp
2128 2105 nop
2129 2106
2130 2107 ! Is dst & src 2B aligned
2131 2108 andcc %o2, 0x1, %g0
2132 2109 bz %ncc, .alhlfwdcp
2133 2110 nop
2134 2111
2135 2112 ! 1B aligned
2136 2113 1: ldub [%i1], %o2
2137 2114 stb %o2, [%i0]
2138 2115 inc %i1
2139 2116 deccc %i3
2140 2117 bgu,pt %ncc, 1b
2141 2118 inc %i0
2142 2119
2143 2120 ba .chksrc
2144 2121 nop
2145 2122
2146 2123 ! dst & src 4B aligned
2147 2124 .alwdcp:
2148 2125 ld [%i1], %o2
2149 2126 st %o2, [%i0]
2150 2127 add %i1, 0x4, %i1
2151 2128 subcc %i3, 0x4, %i3
2152 2129 bgu,pt %ncc, .alwdcp
2153 2130 add %i0, 0x4, %i0
2154 2131
2155 2132 ba .chksrc
2156 2133 nop
2157 2134
2158 2135 ! dst & src 2B aligned
2159 2136 .alhlfwdcp:
2160 2137 lduh [%i1], %o2
2161 2138 stuh %o2, [%i0]
2162 2139 add %i1, 0x2, %i1
2163 2140 subcc %i3, 0x2, %i3
2164 2141 bgu,pt %ncc, .alhlfwdcp
2165 2142 add %i0, 0x2, %i0
2166 2143
2167 2144 ba .chksrc
2168 2145 nop
2169 2146
2170 2147 ! dst & src 8B aligned
2171 2148 .alewdcp:
2172 2149 ldx [%i1], %o2
2173 2150 stx %o2, [%i0]
2174 2151 add %i1, 0x8, %i1
2175 2152 subcc %i3, 0x8, %i3
2176 2153 bgu,pt %ncc, .alewdcp
2177 2154 add %i0, 0x8, %i0
2178 2155
2179 2156 	! Now destination is block (64 byte) aligned
2180 2157 .chksrc:
2181 2158 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
2182 2159 sub %i2, %i3, %i2 ! Residue bytes in %i2
2183 2160
2184 2161 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2185 2162
2186 2163 andcc %i1, 0xf, %o2 ! is src quadword aligned
2187 2164 bz,pn %xcc, .blkcpy ! src offset in %o2
2188 2165 nop
2189 2166 cmp %o2, 0x8
2190 2167 bg .cpy_upper_double
2191 2168 nop
2192 2169 bl .cpy_lower_double
2193 2170 nop
2194 2171
2195 2172 	! Falls through when source offset is equal to 8, i.e.
2196 2173 ! source is double word aligned.
2197 2174 ! In this case no shift/merge of data is required
2198 2175 sub %i1, %o2, %i1 ! align the src at 16 bytes.
2199 2176 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
2200 2177 prefetch [%l0+0x0], #one_read
2201 2178 ldda [%i1+0x0]%asi, %l2
2202 2179 loop0:
2203 2180 ldda [%i1+0x10]%asi, %l4
2204 2181 prefetch [%l0+0x40], #one_read
2205 2182
2206 2183 stxa %l3, [%i0+0x0]%asi
2207 2184 stxa %l4, [%i0+0x8]%asi
2208 2185
2209 2186 ldda [%i1+0x20]%asi, %l2
2210 2187 stxa %l5, [%i0+0x10]%asi
2211 2188 stxa %l2, [%i0+0x18]%asi
2212 2189
2213 2190 ldda [%i1+0x30]%asi, %l4
2214 2191 stxa %l3, [%i0+0x20]%asi
2215 2192 stxa %l4, [%i0+0x28]%asi
2216 2193
2217 2194 ldda [%i1+0x40]%asi, %l2
2218 2195 stxa %l5, [%i0+0x30]%asi
2219 2196 stxa %l2, [%i0+0x38]%asi
2220 2197
2221 2198 add %l0, 0x40, %l0
2222 2199 add %i1, 0x40, %i1
2223 2200 subcc %i3, 0x40, %i3
2224 2201 bgu,pt %xcc, loop0
2225 2202 add %i0, 0x40, %i0
2226 2203 ba .blkdone
2227 2204 add %i1, %o2, %i1 ! increment the source by src offset
2228 2205 ! the src offset was stored in %o2
2229 2206
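	! ALIGN_DATA in C terms (a sketch; lshift/rshift are the %o0/%o1
	! values computed below, and the macro works in place using a
	! temporary register):
	!
	!	out0 = (a << lshift) | (b >> rshift);
	!	out1 = (b << lshift) | (c >> rshift);
	!
	! so each 16-byte quad load yields two realigned 8-byte stores.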
2230 2207 .cpy_lower_double:
2231 2208 sub %i1, %o2, %i1 ! align the src at 16 bytes.
2232 2209 sll %o2, 3, %o0 ! %o0 left shift
2233 2210 mov 0x40, %o1
2234 2211 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
2235 2212 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
2236 2213 prefetch [%l0+0x0], #one_read
2237 2214 ldda [%i1+0x0]%asi, %l2 ! partial data in %l2 and %l3 has
2238 2215 ! complete data
2239 2216 loop1:
2240 2217 ldda [%i1+0x10]%asi, %l4 ! %l4 has partial data for this read.
2241 2218 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
2242 2219 ! into %l2 and %l3
2243 2220 prefetch [%l0+0x40], #one_read
2244 2221 stxa %l2, [%i0+0x0]%asi
2245 2222 stxa %l3, [%i0+0x8]%asi
2246 2223
2247 2224 ldda [%i1+0x20]%asi, %l2
2248 2225 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
2249 2226 stxa %l4, [%i0+0x10]%asi ! %l4 from previous read
2250 2227 stxa %l5, [%i0+0x18]%asi ! into %l4 and %l5
2251 2228
2252 2229 ! Repeat the same for next 32 bytes.
2253 2230
2254 2231 ldda [%i1+0x30]%asi, %l4
2255 2232 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
2256 2233 stxa %l2, [%i0+0x20]%asi
2257 2234 stxa %l3, [%i0+0x28]%asi
2258 2235
2259 2236 ldda [%i1+0x40]%asi, %l2
2260 2237 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
2261 2238 stxa %l4, [%i0+0x30]%asi
2262 2239 stxa %l5, [%i0+0x38]%asi
2263 2240
2264 2241 add %l0, 0x40, %l0
2265 2242 add %i1, 0x40, %i1
2266 2243 subcc %i3, 0x40, %i3
2267 2244 bgu,pt %xcc, loop1
2268 2245 add %i0, 0x40, %i0
2269 2246 ba .blkdone
2270 2247 add %i1, %o2, %i1 ! increment the source by src offset
2271 2248 ! the src offset was stored in %o2
2272 2249
2273 2250 .cpy_upper_double:
2274 2251 sub %i1, %o2, %i1 ! align the src at 16 bytes.
2275 2252 mov 0x8, %o0
2276 2253 sub %o2, %o0, %o0
2277 2254 sll %o0, 3, %o0 ! %o0 left shift
2278 2255 mov 0x40, %o1
2279 2256 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
2280 2257 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
2281 2258 prefetch [%l0+0x0], #one_read
2282 2259 ldda [%i1+0x0]%asi, %l2 ! partial data in %l3 for this read and
2283 2260 ! no data in %l2
2284 2261 loop2:
2285 2262 ldda [%i1+0x10]%asi, %l4 ! %l4 has complete data and %l5 has
2286 2263 ! partial
2287 2264 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
2288 2265 ! into %l3 and %l4
2289 2266 prefetch [%l0+0x40], #one_read
2290 2267 stxa %l3, [%i0+0x0]%asi
2291 2268 stxa %l4, [%i0+0x8]%asi
2292 2269
2293 2270 ldda [%i1+0x20]%asi, %l2
2294 2271 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
2295 2272 stxa %l5, [%i0+0x10]%asi ! %l5 from previous read
2296 2273 stxa %l2, [%i0+0x18]%asi ! into %l5 and %l2
2297 2274
2298 2275 ! Repeat the same for next 32 bytes.
2299 2276
2300 2277 ldda [%i1+0x30]%asi, %l4
2301 2278 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
2302 2279 stxa %l3, [%i0+0x20]%asi
2303 2280 stxa %l4, [%i0+0x28]%asi
2304 2281
2305 2282 ldda [%i1+0x40]%asi, %l2
2306 2283 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
2307 2284 stxa %l5, [%i0+0x30]%asi
2308 2285 stxa %l2, [%i0+0x38]%asi
2309 2286
2310 2287 add %l0, 0x40, %l0
2311 2288 add %i1, 0x40, %i1
2312 2289 subcc %i3, 0x40, %i3
2313 2290 bgu,pt %xcc, loop2
2314 2291 add %i0, 0x40, %i0
2315 2292 ba .blkdone
2316 2293 add %i1, %o2, %i1 ! increment the source by src offset
2317 2294 ! the src offset was stored in %o2
2318 2295
2319 2296
2320 2297 ! Both Source and Destination are block aligned.
2321 2298 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2322 2299 .blkcpy:
2323 2300 prefetch [%i1+0x0], #one_read
2324 2301 1:
2325 2302 ldda [%i1+0x0]%asi, %l0
2326 2303 ldda [%i1+0x10]%asi, %l2
2327 2304 prefetch [%i1+0x40], #one_read
2328 2305
2329 2306 stxa %l0, [%i0+0x0]%asi
2330 2307 ldda [%i1+0x20]%asi, %l4
2331 2308 ldda [%i1+0x30]%asi, %l6
2332 2309
2333 2310 stxa %l1, [%i0+0x8]%asi
2334 2311 stxa %l2, [%i0+0x10]%asi
2335 2312 stxa %l3, [%i0+0x18]%asi
2336 2313 stxa %l4, [%i0+0x20]%asi
2337 2314 stxa %l5, [%i0+0x28]%asi
2338 2315 stxa %l6, [%i0+0x30]%asi
2339 2316 stxa %l7, [%i0+0x38]%asi
2340 2317
2341 2318 add %i1, 0x40, %i1
2342 2319 subcc %i3, 0x40, %i3
2343 2320 bgu,pt %xcc, 1b
2344 2321 add %i0, 0x40, %i0
2345 2322
2346 2323 .blkdone:
2347 2324 membar #Sync
2348 2325
2349 2326 brz,pt %i2, .blkexit
2350 2327 nop
2351 2328
2352 2329 ! Handle trailing bytes
2353 2330 cmp %i2, 0x8
2354 2331 blu,pt %ncc, .residue
2355 2332 nop
2356 2333
2357 2334 ! Can we do some 8B ops
2358 2335 or %i1, %i0, %o2
2359 2336 andcc %o2, 0x7, %g0
2360 2337 bnz %ncc, .last4
2361 2338 nop
2362 2339
2363 2340 ! Do 8byte ops as long as possible
2364 2341 .last8:
2365 2342 ldx [%i1], %o2
2366 2343 stx %o2, [%i0]
2367 2344 add %i1, 0x8, %i1
2368 2345 sub %i2, 0x8, %i2
2369 2346 cmp %i2, 0x8
2370 2347 bgu,pt %ncc, .last8
2371 2348 add %i0, 0x8, %i0
2372 2349
2373 2350 brz,pt %i2, .blkexit
2374 2351 nop
2375 2352
2376 2353 ba .residue
2377 2354 nop
2378 2355
2379 2356 .last4:
2380 2357 ! Can we do 4B ops
2381 2358 andcc %o2, 0x3, %g0
2382 2359 bnz %ncc, .last2
2383 2360 nop
2384 2361 1:
2385 2362 ld [%i1], %o2
2386 2363 st %o2, [%i0]
2387 2364 add %i1, 0x4, %i1
2388 2365 sub %i2, 0x4, %i2
2389 2366 cmp %i2, 0x4
2390 2367 bgu,pt %ncc, 1b
2391 2368 add %i0, 0x4, %i0
2392 2369
2393 2370 brz,pt %i2, .blkexit
2394 2371 nop
2395 2372
2396 2373 ba .residue
2397 2374 nop
2398 2375
2399 2376 .last2:
2400 2377 ! Can we do 2B ops
2401 2378 andcc %o2, 0x1, %g0
2402 2379 bnz %ncc, .residue
2403 2380 nop
2404 2381
2405 2382 1:
2406 2383 lduh [%i1], %o2
2407 2384 stuh %o2, [%i0]
2408 2385 add %i1, 0x2, %i1
2409 2386 sub %i2, 0x2, %i2
2410 2387 cmp %i2, 0x2
2411 2388 bgu,pt %ncc, 1b
2412 2389 add %i0, 0x2, %i0
2413 2390
2414 2391 brz,pt %i2, .blkexit
2415 2392 nop
2416 2393
2417 2394 .residue:
2418 2395 ldub [%i1], %o2
2419 2396 stb %o2, [%i0]
2420 2397 inc %i1
2421 2398 deccc %i2
2422 2399 bgu,pt %ncc, .residue
2423 2400 inc %i0
2424 2401
2425 2402 .blkexit:
2426 2403
2427 2404 membar #Sync ! sync error barrier
2428 2405 	! Restore t_lofault handler, if we came here from kcopy().
2429 2406 tst %o5
2430 2407 bz %ncc, 1f
2431 2408 andn %o5, LOFAULT_SET, %o5
2432 2409 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2433 2410 1:
2434 2411 ret
2435 2412 restore %g0, 0, %o0
2436 2413
2437 2414
2438 2415 .bcb_punt:
2439 2416 !
2440 2417 ! use aligned transfers where possible
2441 2418 !
2442 2419 xor %i0, %i1, %o4 ! xor from and to address
2443 2420 btst 7, %o4 ! if lower three bits zero
2444 2421 bz .aldoubcp ! can align on double boundary
2445 2422 	.empty				! assembler complains about label
2446 2423
2447 2424 xor %i0, %i1, %o4 ! xor from and to address
2448 2425 btst 3, %o4 ! if lower two bits zero
2449 2426 bz .alwordcp ! can align on word boundary
2450 2427 btst 3, %i0 ! delay slot, from address unaligned?
2451 2428 !
2452 2429 ! use aligned reads and writes where possible
2453 2430 ! this differs from wordcp in that it copes
2454 2431 	! with odd alignment between source and destination
2455 2432 ! using word reads and writes with the proper shifts
2456 2433 ! in between to align transfers to and from memory
2457 2434 ! i0 - src address, i1 - dest address, i2 - count
2458 2435 	! i3, i4 - tmps used for generating complete word
2459 2436 ! i5 (word to write)
2460 2437 ! l0 size in bits of upper part of source word (US)
2461 2438 ! l1 size in bits of lower part of source word (LS = 32 - US)
2462 2439 ! l2 size in bits of upper part of destination word (UD)
2463 2440 ! l3 size in bits of lower part of destination word (LD = 32 - UD)
2464 2441 ! l4 number of bytes leftover after aligned transfers complete
2465 2442 ! l5 the number 32
2466 2443 !
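	! A C sketch of the merged-word loop below (us/ls mirror the
	! %l0/%l1 shift counts and are assumed nonzero here):
	!
	!	uint32_t hold = leftover << ls;
	!	while (nwords-- > 0) {
	!		uint32_t w = *src++;
	!		*dst++ = hold | (w >> us);
	!		hold = w << ls;
	!	}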
2467 2444 mov 32, %l5 ! load an oft-needed constant
2468 2445 bz .align_dst_only
2469 2446 	btst	3, %i1			! is destination address aligned?
2470 2447 clr %i4 ! clear registers used in either case
2471 2448 bz .align_src_only
2472 2449 clr %l0
2473 2450 !
2474 2451 ! both source and destination addresses are unaligned
2475 2452 !
2476 2453 1: ! align source
2477 2454 ldub [%i0], %i3 ! read a byte from source address
2478 2455 add %i0, 1, %i0 ! increment source address
2479 2456 or %i4, %i3, %i4 ! or in with previous bytes (if any)
2480 2457 btst 3, %i0 ! is source aligned?
2481 2458 add %l0, 8, %l0 ! increment size of upper source (US)
2482 2459 bnz,a 1b
2483 2460 sll %i4, 8, %i4 ! make room for next byte
2484 2461
2485 2462 sub %l5, %l0, %l1 ! generate shift left count (LS)
2486 2463 sll %i4, %l1, %i4 ! prepare to get rest
2487 2464 ld [%i0], %i3 ! read a word
2488 2465 add %i0, 4, %i0 ! increment source address
2489 2466 srl %i3, %l0, %i5 ! upper src bits into lower dst bits
2490 2467 or %i4, %i5, %i5 ! merge
2491 2468 mov 24, %l3 ! align destination
2492 2469 1:
2493 2470 srl %i5, %l3, %i4 ! prepare to write a single byte
2494 2471 stb %i4, [%i1] ! write a byte
2495 2472 add %i1, 1, %i1 ! increment destination address
2496 2473 sub %i2, 1, %i2 ! decrement count
2497 2474 btst 3, %i1 ! is destination aligned?
2498 2475 bnz,a 1b
2499 2476 sub %l3, 8, %l3 ! delay slot, decrement shift count (LD)
2500 2477 sub %l5, %l3, %l2 ! generate shift left count (UD)
2501 2478 sll %i5, %l2, %i5 ! move leftover into upper bytes
2502 2479 cmp %l2, %l0 ! cmp # reqd to fill dst w old src left
2503 2480 bgu %ncc, .more_needed ! need more to fill than we have
2504 2481 nop
2505 2482
2506 2483 sll %i3, %l1, %i3 ! clear upper used byte(s)
2507 2484 srl %i3, %l1, %i3
2508 2485 ! get the odd bytes between alignments
2509 2486 sub %l0, %l2, %l0 ! regenerate shift count
2510 2487 sub %l5, %l0, %l1 ! generate new shift left count (LS)
2511 2488 and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0
2512 2489 andn %i2, 3, %i2 ! # of aligned bytes that can be moved
2513 2490 srl %i3, %l0, %i4
2514 2491 or %i5, %i4, %i5
2515 2492 st %i5, [%i1] ! write a word
2516 2493 subcc %i2, 4, %i2 ! decrement count
2517 2494 bz %ncc, .unalign_out
2518 2495 add %i1, 4, %i1 ! increment destination address
2519 2496
2520 2497 b 2f
2521 2498 sll %i3, %l1, %i5 ! get leftover into upper bits
2522 2499 .more_needed:
2523 2500 sll %i3, %l0, %i3 ! save remaining byte(s)
2524 2501 srl %i3, %l0, %i3
2525 2502 sub %l2, %l0, %l1 ! regenerate shift count
2526 2503 sub %l5, %l1, %l0 ! generate new shift left count
2527 2504 sll %i3, %l1, %i4 ! move to fill empty space
2528 2505 b 3f
2529 2506 or %i5, %i4, %i5 ! merge to complete word
2530 2507 !
2531 2508 ! the source address is aligned and destination is not
2532 2509 !
2533 2510 .align_dst_only:
2534 2511 ld [%i0], %i4 ! read a word
2535 2512 add %i0, 4, %i0 ! increment source address
2536 2513 mov 24, %l0 ! initial shift alignment count
2537 2514 1:
2538 2515 srl %i4, %l0, %i3 ! prepare to write a single byte
2539 2516 stb %i3, [%i1] ! write a byte
2540 2517 add %i1, 1, %i1 ! increment destination address
2541 2518 sub %i2, 1, %i2 ! decrement count
2542 2519 btst 3, %i1 ! is destination aligned?
2543 2520 bnz,a 1b
2544 2521 sub %l0, 8, %l0 ! delay slot, decrement shift count
2545 2522 .xfer:
2546 2523 sub %l5, %l0, %l1 ! generate shift left count
2547 2524 sll %i4, %l1, %i5 ! get leftover
2548 2525 3:
2549 2526 and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0
2550 2527 andn %i2, 3, %i2 ! # of aligned bytes that can be moved
2551 2528 2:
2552 2529 ld [%i0], %i3 ! read a source word
2553 2530 add %i0, 4, %i0 ! increment source address
2554 2531 srl %i3, %l0, %i4 ! upper src bits into lower dst bits
2555 2532 or %i5, %i4, %i5 ! merge with upper dest bits (leftover)
2556 2533 st %i5, [%i1] ! write a destination word
2557 2534 subcc %i2, 4, %i2 ! decrement count
2558 2535 bz %ncc, .unalign_out ! check if done
2559 2536 add %i1, 4, %i1 ! increment destination address
2560 2537 b 2b ! loop
2561 2538 sll %i3, %l1, %i5 ! get leftover
2562 2539 .unalign_out:
2563 2540 tst %l4 ! any bytes leftover?
2564 2541 bz %ncc, .cpdone
2565 2542 .empty ! allow next instruction in delay slot
2566 2543 1:
2567 2544 sub %l0, 8, %l0 ! decrement shift
2568 2545 srl %i3, %l0, %i4 ! upper src byte into lower dst byte
2569 2546 stb %i4, [%i1] ! write a byte
2570 2547 subcc %l4, 1, %l4 ! decrement count
2571 2548 bz %ncc, .cpdone ! done?
2572 2549 add %i1, 1, %i1 ! increment destination
2573 2550 tst %l0 ! any more previously read bytes
2574 2551 bnz %ncc, 1b ! we have leftover bytes
2575 2552 mov %l4, %i2 ! delay slot, mv cnt where dbytecp wants
2576 2553 b .dbytecp ! let dbytecp do the rest
2577 2554 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
2578 2555 !
2579 2556 ! the destination address is aligned and the source is not
2580 2557 !
2581 2558 .align_src_only:
2582 2559 ldub [%i0], %i3 ! read a byte from source address
2583 2560 add %i0, 1, %i0 ! increment source address
2584 2561 or %i4, %i3, %i4 ! or in with previous bytes (if any)
2585 2562 btst 3, %i0 ! is source aligned?
2586 2563 add %l0, 8, %l0 ! increment shift count (US)
2587 2564 bnz,a .align_src_only
2588 2565 sll %i4, 8, %i4 ! make room for next byte
2589 2566 b,a .xfer
2590 2567 !
2591 2568 ! if from address unaligned for double-word moves,
2592 2569 	! move bytes till it is; if count is < 56 it could take
2593 2570 ! longer to align the thing than to do the transfer
2594 2571 ! in word size chunks right away
2595 2572 !
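	! In C terms the dispatch is simply (the threshold mirrors the
	! cmp below):
	!
	!	if (count < 56)
	!		copy by words;
	!	else
	!		align, then copy by doublewords;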
2596 2573 .aldoubcp:
2597 2574 cmp %i2, 56 ! if count < 56, use wordcp, it takes
2598 2575 blu,a %ncc, .alwordcp ! longer to align doubles than words
2599 2576 mov 3, %o0 ! mask for word alignment
2600 2577 call .alignit ! copy bytes until aligned
2601 2578 mov 7, %o0 ! mask for double alignment
2602 2579 !
2603 2580 ! source and destination are now double-word aligned
2604 2581 ! i3 has aligned count returned by alignit
2605 2582 !
2606 2583 and %i2, 7, %i2 ! unaligned leftover count
2607 2584 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
2608 2585 5:
2609 2586 ldx [%i0+%i1], %o4 ! read from address
2610 2587 stx %o4, [%i1] ! write at destination address
2611 2588 subcc %i3, 8, %i3 ! dec count
2612 2589 bgu %ncc, 5b
2613 2590 add %i1, 8, %i1 ! delay slot, inc to address
2614 2591 cmp %i2, 4 ! see if we can copy a word
2615 2592 blu %ncc, .dbytecp ! if 3 or less bytes use bytecp
2616 2593 .empty
2617 2594 !
2618 2595 ! for leftover bytes we fall into wordcp, if needed
2619 2596 !
2620 2597 .wordcp:
2621 2598 and %i2, 3, %i2 ! unaligned leftover count
2622 2599 5:
2623 2600 ld [%i0+%i1], %o4 ! read from address
2624 2601 st %o4, [%i1] ! write at destination address
2625 2602 subcc %i3, 4, %i3 ! dec count
2626 2603 bgu %ncc, 5b
2627 2604 add %i1, 4, %i1 ! delay slot, inc to address
2628 2605 b,a .dbytecp
2629 2606
2630 2607 ! we come here to align copies on word boundaries
2631 2608 .alwordcp:
2632 2609 call .alignit ! go word-align it
2633 2610 mov 3, %o0 ! bits that must be zero to be aligned
2634 2611 b .wordcp
2635 2612 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst
2636 2613
2637 2614 !
2638 2615 ! byte copy, works with any alignment
2639 2616 !
2640 2617 .bytecp:
2641 2618 b .dbytecp
2642 2619 sub %i0, %i1, %i0 ! i0 gets difference of src and dst
2643 2620
2644 2621 !
2645 2622 ! differenced byte copy, works with any alignment
2646 2623 ! assumes dest in %i1 and (source - dest) in %i0
2647 2624 !
2648 2625 1:
2649 2626 stb %o4, [%i1] ! write to address
2650 2627 inc %i1 ! inc to address
2651 2628 .dbytecp:
2652 2629 deccc %i2 ! dec count
2653 2630 bgeu,a %ncc, 1b ! loop till done
2654 2631 ldub [%i0+%i1], %o4 ! read from address
2655 2632 .cpdone:
2656 2633
2657 2634 membar #Sync ! sync error barrier
2658 2635 	! Restore t_lofault handler, if we came here from kcopy().
2659 2636 tst %o5
2660 2637 bz %ncc, 1f
2661 2638 andn %o5, LOFAULT_SET, %o5
2662 2639 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2663 2640 1:
2664 2641 ret
2665 2642 restore %g0, 0, %o0 ! return (0)
2666 2643
2667 2644 /*
2668 2645 * Common code used to align transfers on word and doubleword
2669 2646 * boundaries. Aligns source and destination and returns a count
2670 2647 * of aligned bytes to transfer in %i3
2671 2648 */
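/*
 * A C sketch of .alignit's contract (the real code advances %i0/%i1,
 * decrements %i2, and returns the aligned byte count in %i3):
 *
 *	while ((uintptr_t)src & mask) {
 *		*dst++ = *src++;
 *		cnt--;
 *	}
 *	aligned = cnt & ~mask;
 */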
2672 2649 1:
2673 2650 inc %i0 ! inc from
2674 2651 stb %o4, [%i1] ! write a byte
2675 2652 inc %i1 ! inc to
2676 2653 dec %i2 ! dec count
2677 2654 .alignit:
2678 2655 btst %o0, %i0 ! %o0 is bit mask to check for alignment
2679 2656 bnz,a 1b
2680 2657 ldub [%i0], %o4 ! read next byte
2681 2658
2682 2659 retl
2683 2660 andn %i2, %o0, %i3 ! return size of aligned bytes
2684 2661
2685 2662 SET_SIZE(bcopy)
2686 2663
2687 2664 #endif /* NIAGARA_IMPL */
2688 2665
2689 -#endif /* lint */
2690 -
2691 2666 /*
2692 2667 * Block copy with possibly overlapped operands.
2693 2668 */
2694 2669
2695 -#if defined(lint)
2696 -
2697 -/*ARGSUSED*/
2698 -void
2699 -ovbcopy(const void *from, void *to, size_t count)
2700 -{}
2701 -
2702 -#else /* lint */
2703 -
2704 2670 ENTRY(ovbcopy)
2705 2671 tst %o2 ! check count
2706 2672 bgu,a %ncc, 1f ! nothing to do or bad arguments
2707 2673 subcc %o0, %o1, %o3 ! difference of from and to address
2708 2674
2709 2675 retl ! return
2710 2676 nop
2711 2677 1:
2712 2678 bneg,a %ncc, 2f
2713 2679 neg %o3 ! if < 0, make it positive
2714 2680 2: cmp %o2, %o3 ! cmp size and abs(from - to)
2715 2681 bleu %ncc, bcopy ! if size <= abs(diff): use bcopy,
2716 2682 .empty ! no overlap
2717 2683 cmp %o0, %o1 ! compare from and to addresses
2718 2684 blu %ncc, .ov_bkwd ! if from < to, copy backwards
2719 2685 nop
2720 2686 !
2721 2687 ! Copy forwards.
2722 2688 !
2723 2689 .ov_fwd:
2724 2690 ldub [%o0], %o3 ! read from address
2725 2691 inc %o0 ! inc from address
2726 2692 stb %o3, [%o1] ! write to address
2727 2693 deccc %o2 ! dec count
2728 2694 bgu %ncc, .ov_fwd ! loop till done
2729 2695 inc %o1 ! inc to address
2730 2696
2731 2697 retl ! return
2732 2698 nop
2733 2699 !
2734 2700 ! Copy backwards.
2735 2701 !
2736 2702 .ov_bkwd:
2737 2703 deccc %o2 ! dec count
2738 2704 ldub [%o0 + %o2], %o3 ! get byte at end of src
2739 2705 bgu %ncc, .ov_bkwd ! loop till done
2740 2706 stb %o3, [%o1 + %o2] ! delay slot, store at end of dst
2741 2707
2742 2708 retl ! return
2743 2709 nop
2744 2710 SET_SIZE(ovbcopy)
2745 2711
2746 -#endif /* lint */
2747 -
2748 2712 /*
2749 2713 * hwblkpagecopy()
2750 2714 *
2751 2715 * Copies exactly one page. This routine assumes the caller (ppcopy)
2752 2716 * has already disabled kernel preemption and has checked
2753 2717 * use_hw_bcopy.
2754 2718 */
2755 -#ifdef lint
2756 -/*ARGSUSED*/
2757 -void
2758 -hwblkpagecopy(const void *src, void *dst)
2759 -{ }
2760 -#else /* lint */
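/*
 * Caller-side sketch of the contract described above (ppcopy-style
 * usage, shown for illustration only):
 *
 *	if (use_hw_bcopy) {
 *		kpreempt_disable();
 *		hwblkpagecopy(src, dst);	copies exactly PAGESIZE bytes
 *		kpreempt_enable();
 *	}
 */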
2761 2719 ENTRY(hwblkpagecopy)
2762 2720 save %sp, -SA(MINFRAME), %sp
2763 2721
2764 2722 ! %i0 - source address (arg)
2765 2723 ! %i1 - destination address (arg)
2766 2724 ! %i2 - length of region (not arg)
2767 2725
2768 2726 set PAGESIZE, %i2
2769 2727
2770 2728 /*
2771 2729 	 * Copying exactly one page, and PAGESIZE is a multiple of 0x80.
2772 2730 */
2773 2731 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2774 2732 prefetch [%i0+0x0], #one_read
2775 2733 prefetch [%i0+0x40], #one_read
2776 2734 1:
2777 2735 prefetch [%i0+0x80], #one_read
2778 2736 prefetch [%i0+0xc0], #one_read
2779 2737 ldda [%i0+0x0]%asi, %l0
2780 2738 ldda [%i0+0x10]%asi, %l2
2781 2739 ldda [%i0+0x20]%asi, %l4
2782 2740 ldda [%i0+0x30]%asi, %l6
2783 2741 stxa %l0, [%i1+0x0]%asi
2784 2742 stxa %l1, [%i1+0x8]%asi
2785 2743 stxa %l2, [%i1+0x10]%asi
2786 2744 stxa %l3, [%i1+0x18]%asi
2787 2745 stxa %l4, [%i1+0x20]%asi
2788 2746 stxa %l5, [%i1+0x28]%asi
2789 2747 stxa %l6, [%i1+0x30]%asi
2790 2748 stxa %l7, [%i1+0x38]%asi
2791 2749 ldda [%i0+0x40]%asi, %l0
2792 2750 ldda [%i0+0x50]%asi, %l2
2793 2751 ldda [%i0+0x60]%asi, %l4
2794 2752 ldda [%i0+0x70]%asi, %l6
2795 2753 stxa %l0, [%i1+0x40]%asi
2796 2754 stxa %l1, [%i1+0x48]%asi
2797 2755 stxa %l2, [%i1+0x50]%asi
2798 2756 stxa %l3, [%i1+0x58]%asi
2799 2757 stxa %l4, [%i1+0x60]%asi
2800 2758 stxa %l5, [%i1+0x68]%asi
2801 2759 stxa %l6, [%i1+0x70]%asi
2802 2760 stxa %l7, [%i1+0x78]%asi
2803 2761
2804 2762 add %i0, 0x80, %i0
2805 2763 subcc %i2, 0x80, %i2
2806 2764 bgu,pt %xcc, 1b
2807 2765 add %i1, 0x80, %i1
2808 2766
2809 2767 membar #Sync
2810 2768 ret
2811 2769 restore %g0, 0, %o0
2812 2770 SET_SIZE(hwblkpagecopy)
2813 -#endif /* lint */
2814 2771
2815 2772
2816 2773 /*
2817 2774 * Transfer data to and from user space -
2818 2775  * Note that these routines can cause faults.
2819 2776 * It is assumed that the kernel has nothing at
2820 2777 * less than KERNELBASE in the virtual address space.
2821 2778 *
2822 2779 * Note that copyin(9F) and copyout(9F) are part of the
2823 2780 * DDI/DKI which specifies that they return '-1' on "errors."
2824 2781 *
2825 2782 * Sigh.
2826 2783 *
2827 2784  * So there are two extremely similar routines - xcopyin() and xcopyout()
2828 2785 * which return the errno that we've faithfully computed. This
2829 2786 * allows other callers (e.g. uiomove(9F)) to work correctly.
2830 2787 * Given that these are used pretty heavily, we expand the calling
2831 2788 * sequences inline for all flavours (rather than making wrappers).
2832 2789 *
2833 2790 * There are also stub routines for xcopyout_little and xcopyin_little,
2834 2791 * which currently are intended to handle requests of <= 16 bytes from
2835 2792 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
2836 2793 * is left as an exercise...
2837 2794 */
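/*
 * Return-convention sketch for the two flavours (a usage example, not
 * code from this file):
 *
 *	if (copyout(kaddr, uaddr, len) != 0)
 *		return (EFAULT);	DDI/DKI callers only see failure
 *
 *	error = xcopyout(kaddr, uaddr, len);
 *	if (error != 0)
 *		return (error);		uiomove() wants the real errno
 */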
2838 2795
2839 2796 /*
2840 2797 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
2841 2798 *
2842 2799 * General theory of operation:
2843 2800 *
2844 2801 * None of the copyops routines grab a window until it's decided that
2845 2802 * we need to do a HW block copy operation. This saves a window
2846 2803 * spill/fill when we're called during socket ops. The typical IO
2847 2804 * path won't cause spill/fill traps.
2848 2805 *
2849 2806 * This code uses a set of 4 limits for the maximum size that will
2850 2807 * be copied given a particular input/output address alignment.
2851 2808  * The default limits are:
2852 2809 *
2853 2810 * single byte aligned - 256 (hw_copy_limit_1)
2854 2811 * two byte aligned - 512 (hw_copy_limit_2)
2855 2812 * four byte aligned - 1024 (hw_copy_limit_4)
2856 2813 * eight byte aligned - 1024 (hw_copy_limit_8)
2857 2814 *
2858 2815 * If the value for a particular limit is zero, the copy will be done
2859 2816 * via the copy loops rather than block store/quad load instructions.
2860 2817 *
2861 2818 * Flow:
2862 2819 *
2863 2820 * If count == zero return zero.
2864 2821 *
2865 2822  * Store the previous lofault handler into %g6.
2866 2823 * Place our secondary lofault handler into %g5.
2867 2824 * Place the address of our nowindow fault handler into %o3.
2868 2825 * Place the address of the windowed fault handler into %o4.
2869 2826 * --> We'll use this handler if we end up grabbing a window
2870 2827 * --> before we use block initializing store and quad load ASIs
2871 2828 *
2872 2829 * If count is less than or equal to SMALL_LIMIT (7) we
2873 2830 * always do a byte for byte copy.
2874 2831 *
2875 2832 * If count is > SMALL_LIMIT, we check the alignment of the input
2876 2833  * and output pointers. Based on the detected alignment, we check
2877 2834  * count against the corresponding limit. If count exceeds that
2878 2835  * limit, we copy via block initializing store and quad
2879 2836  * load instructions.
2880 2837 *
2881 2838 * If we don't exceed one of the limits, we store -count in %o3,
2882 2839 * we store the number of chunks (8, 4, 2 or 1 byte) operated
2883 2840 * on in our basic copy loop in %o2. Following this we branch
2884 2841 * to the appropriate copy loop and copy that many chunks.
2885 2842 * Since we've been adding the chunk size to %o3 each time through
2886 2843  * as well as decrementing %o2, we can tell if any data
2887 2844 * is left to be copied by examining %o3. If that is zero, we're
2888 2845 * done and can go home. If not, we figure out what the largest
2889 2846 * chunk size left to be copied is and branch to that copy loop
2890 2847 * unless there's only one byte left. We load that as we're
2891 2848 * branching to code that stores it just before we return.
2892 2849 *
2893 2850 * Fault handlers are invoked if we reference memory that has no
2894 2851 * current mapping. All forms share the same copyio_fault handler.
2895 2852 * This routine handles fixing up the stack and general housecleaning.
2896 2853 * Each copy operation has a simple fault handler that is then called
2897 2854  * to do the work specific to the individual operation. The handler
2898 2855 * for copyOP and xcopyOP are found at the end of individual function.
2899 2856 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
2900 2857 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
2901 2858 */
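/*
 * The limit selection above, sketched in C (only SMALL_LIMIT and the
 * hw_copy_limit_* tunables are real names):
 *
 *	if (count <= SMALL_LIMIT)		SMALL_LIMIT is 7
 *		copy byte for byte;
 *	align = ((uintptr_t)src | (uintptr_t)dst) & 0x7;
 *	if (align == 0)				limit = hw_copy_limit_8;
 *	else if ((align & 0x3) == 0)		limit = hw_copy_limit_4;
 *	else if ((align & 0x1) == 0)		limit = hw_copy_limit_2;
 *	else					limit = hw_copy_limit_1;
 *	if (limit != 0 && count > limit)
 *		copy via block init store and quad load;
 *	else
 *		copy via the unrolled loops;
 */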
2902 2859
2903 2860 /*
2904 2861 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
2905 2862 */
2906 2863
2907 -#if defined(lint)
2908 -
2909 -/*ARGSUSED*/
2910 -int
2911 -copyout(const void *kaddr, void *uaddr, size_t count)
2912 -{ return (0); }
2913 -
2914 -#else /* lint */
2915 -
2916 2864 /*
2917 2865 * We save the arguments in the following registers in case of a fault:
2918 2866 * kaddr - %g2
2919 2867 * uaddr - %g3
2920 2868 * count - %g4
2921 2869 */
2922 2870 #define SAVE_SRC %g2
2923 2871 #define SAVE_DST %g3
2924 2872 #define SAVE_COUNT %g4
2925 2873
2926 2874 #define REAL_LOFAULT %g5
2927 2875 #define SAVED_LOFAULT %g6
2928 2876
2929 2877 /*
2930 2878 * Generic copyio fault handler. This is the first line of defense when a
2931 2879 * fault occurs in (x)copyin/(x)copyout. In order for this to function
2932 2880 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
2933 2881 * This allows us to share common code for all the flavors of the copy
2934 2882 * operations, including the _noerr versions.
2935 2883 *
2936 2884 * Note that this function will restore the original input parameters before
2937 2885 * calling REAL_LOFAULT. So the real handler can vector to the appropriate
2938 2886 * member of the t_copyop structure, if needed.
2939 2887 */
2940 2888 ENTRY(copyio_fault)
2941 2889 #if !defined(NIAGARA_IMPL)
2942 2890 btst FPUSED_FLAG, SAVED_LOFAULT
2943 2891 bz 1f
2944 2892 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2945 2893
2946 2894 wr %l5, 0, %gsr ! restore gsr
2947 2895
2948 2896 btst FPRS_FEF, %g1
2949 2897 bz %icc, 4f
2950 2898 nop
2951 2899
2952 2900 ! restore fpregs from stack
2953 2901 BLD_FP_FROMSTACK(%o2)
2954 2902
2955 2903 ba,pt %ncc, 1f
2956 2904 nop
2957 2905 4:
2958 2906 FZERO ! zero all of the fpregs
2959 2907 wr %g1, %g0, %fprs ! restore fprs
2960 2908 1:
2961 2909 restore
2962 2910 mov SAVE_SRC, %o0
2963 2911 mov SAVE_DST, %o1
2964 2912 jmp REAL_LOFAULT
2965 2913 mov SAVE_COUNT, %o2
2966 2914
2967 2915 #else /* NIAGARA_IMPL */
2968 2916 membar #Sync
2969 2917 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2970 2918 restore
2971 2919 mov SAVE_SRC, %o0
2972 2920 mov SAVE_DST, %o1
2973 2921 jmp REAL_LOFAULT
2974 2922 mov SAVE_COUNT, %o2
2975 2923
2976 2924 #endif /* NIAGARA_IMPL */
2977 2925
2978 2926 SET_SIZE(copyio_fault)
2979 2927
2980 2928 ENTRY(copyio_fault_nowindow)
2981 2929 membar #Sync
2982 2930 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2983 2931
2984 2932 mov SAVE_SRC, %o0
2985 2933 mov SAVE_DST, %o1
2986 2934 jmp REAL_LOFAULT
2987 2935 mov SAVE_COUNT, %o2
2988 2936 SET_SIZE(copyio_fault_nowindow)
2989 2937
2990 2938 ENTRY(copyout)
2991 2939 sethi %hi(.copyout_err), REAL_LOFAULT
2992 2940 or REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
2993 2941
2994 2942 #if !defined(NIAGARA_IMPL)
2995 2943 .do_copyout:
2996 2944 tst %o2 ! check for zero count; quick exit
2997 2945 bz,pt %ncc, .co_smallqx
2998 2946 mov %o0, SAVE_SRC
2999 2947 mov %o1, SAVE_DST
3000 2948 mov %o2, SAVE_COUNT
3001 2949 cmp %o2, FP_COPY ! check for small copy/leaf case
3002 2950 bgt,pt %ncc, .co_copy_more
3003 2951 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
3004 2952 /*
3005 2953 * Small copy out code
3006 2954 *
3007 2955 */
3008 2956 sethi %hi(copyio_fault_nowindow), %o3
3009 2957 or %o3, %lo(copyio_fault_nowindow), %o3
3010 2958 membar #Sync
3011 2959 stn %o3, [THREAD_REG + T_LOFAULT]
3012 2960
3013 2961 mov ASI_USER, %asi
3014 2962 cmp %o2, SHORTCOPY ! make sure there is enough to align
3015 2963 ble,pt %ncc, .co_smallest
3016 2964 andcc %o1, 0x7, %o3 ! is dest long word aligned
3017 2965 bnz,pn %ncc, .co_align
3018 2966 andcc %o1, 1, %o3 ! is dest byte aligned
3019 2967
3020 2968 ! Destination is long word aligned
3021 2969 ! 8 cases for src alignment; load parts, store long words
3022 2970 .co_al_src:
3023 2971 andcc %o0, 7, %o3
3024 2972 brnz,pt %o3, .co_src_dst_unal8
3025 2973 nop
3026 2974 /*
3027 2975 * Special case for handling when src and dest are both long word aligned
3028 2976 * and total data to move is less than FP_COPY bytes
3029 2977  * Also handles the finish-up for large block moves, so the count may be less than 32 bytes
3030 2978 */
3031 2979 .co_medlong:
3032 2980 subcc %o2, 31, %o2 ! adjust length to allow cc test
3033 2981 ble,pt %ncc, .co_medl31
3034 2982 nop
3035 2983 .co_medl32:
3036 2984 ldx [%o0], %o4 ! move 32 bytes
3037 2985 subcc %o2, 32, %o2 ! decrement length count by 32
3038 2986 stxa %o4, [%o1]%asi
3039 2987 ldx [%o0+8], %o4
3040 2988 stxa %o4, [%o1+8]%asi
3041 2989 ldx [%o0+16], %o4
3042 2990 add %o0, 32, %o0 ! increase src ptr by 32
3043 2991 stxa %o4, [%o1+16]%asi
3044 2992 ldx [%o0-8], %o4
3045 2993 add %o1, 32, %o1 ! increase dst ptr by 32
3046 2994 bgu,pt %ncc, .co_medl32 ! repeat if at least 32 bytes left
3047 2995 stxa %o4, [%o1-8]%asi
3048 2996 .co_medl31:
3049 2997 addcc %o2, 24, %o2 ! adjust count to be off by 7
3050 2998 ble,pt %ncc, .co_medl7 ! skip if 7 or fewer bytes left
3051 2999 nop
3052 3000 .co_medl8:
3053 3001 ldx [%o0], %o4 ! move 8 bytes
3054 3002 add %o0, 8, %o0 ! increase src ptr by 8
3055 3003 subcc %o2, 8, %o2 ! decrease count by 8
3056 3004 add %o1, 8, %o1 ! increase dst ptr by 8
3057 3005 bgu,pt %ncc, .co_medl8
3058 3006 stxa %o4, [%o1-8]%asi
3059 3007 .co_medl7:
3060 3008 addcc %o2, 7, %o2 ! finish adjustment of remaining count
3061 3009 bnz,pt %ncc, .co_small4 ! do final bytes if not finished
3062 3010
3063 3011 .co_smallx: ! finish up and exit
3064 3012 membar #Sync
3065 3013 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3066 3014 .co_smallqx:
3067 3015 retl
3068 3016 mov %g0, %o0
3069 3017
3070 3018 .co_small4:
3071 3019 cmp %o2, 4
3072 3020 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
3073 3021 nop !
3074 3022 ld [%o0], %o4 ! move 4 bytes
3075 3023 add %o0, 4, %o0 ! increase src ptr by 4
3076 3024 add %o1, 4, %o1 ! increase dst ptr by 4
3077 3025 subcc %o2, 4, %o2 ! decrease count by 4
3078 3026 bz,pt %ncc, .co_smallx
3079 3027 stwa %o4, [%o1-4]%asi
3080 3028
3081 3029 .co_small3x: ! Exactly 1, 2, or 3 bytes remain
3082 3030 subcc %o2, 1, %o2 ! reduce count for cc test
3083 3031 ldub [%o0], %o4 ! load one byte
3084 3032 bz,pt %ncc, .co_smallx
3085 3033 stba %o4, [%o1]%asi ! store one byte
3086 3034 ldub [%o0+1], %o4 ! load second byte
3087 3035 subcc %o2, 1, %o2
3088 3036 bz,pt %ncc, .co_smallx
3089 3037 stba %o4, [%o1+1]%asi ! store second byte
3090 3038 ldub [%o0+2], %o4 ! load third byte
3091 3039 ba .co_smallx
3092 3040 stba %o4, [%o1+2]%asi ! store third byte
3093 3041
3094 3042 .co_smallest: ! 7 or fewer bytes remain
3095 3043 cmp %o2, 4
3096 3044 blt,pt %ncc, .co_small3x
3097 3045 nop
3098 3046 ldub [%o0], %o4 ! read byte
3099 3047 subcc %o2, 4, %o2 ! reduce count by 4
3100 3048 stba %o4, [%o1]%asi ! write byte
3101 3049 ldub [%o0+1], %o4 ! repeat for total of 4 bytes
3102 3050 add %o0, 4, %o0 ! advance src by 4
3103 3051 stba %o4, [%o1+1]%asi
3104 3052 ldub [%o0-2], %o4
3105 3053 add %o1, 4, %o1 ! advance dst by 4
3106 3054 stba %o4, [%o1-2]%asi
3107 3055 ldub [%o0-1], %o4
3108 3056 bnz,pt %ncc, .co_small3x
3109 3057 stba %o4, [%o1-1]%asi
3110 3058 membar #Sync
3111 3059 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3112 3060 retl
3113 3061 mov %g0, %o0
3114 3062
3115 3063 .co_align: ! byte align test in prior branch delay
3116 3064 bnz,pt %ncc, .co_al_d1
3117 3065 .co_al_d1f: ! dest is now half word aligned
3118 3066 andcc %o1, 2, %o3
3119 3067 bnz,pt %ncc, .co_al_d2
3120 3068 .co_al_d2f: ! dest is now word aligned
3121 3069 andcc %o1, 4, %o3 ! is dest longword aligned?
3122 3070 bz,pt %ncc, .co_al_src
3123 3071 nop
3124 3072 .co_al_d4: ! dest is word aligned; src is unknown
3125 3073 ldub [%o0], %o4 ! move a word (src align unknown)
3126 3074 ldub [%o0+1], %o3
3127 3075 sll %o4, 24, %o4 ! position
3128 3076 sll %o3, 16, %o3 ! position
3129 3077 or %o4, %o3, %o3 ! merge
3130 3078 ldub [%o0+2], %o4
3131 3079 sll %o4, 8, %o4 ! position
3132 3080 or %o4, %o3, %o3 ! merge
3133 3081 ldub [%o0+3], %o4
3134 3082 or %o4, %o3, %o4 ! merge
3135 3083 stwa %o4,[%o1]%asi ! store four bytes
3136 3084 add %o0, 4, %o0 ! adjust src by 4
3137 3085 add %o1, 4, %o1 ! adjust dest by 4
3138 3086 sub %o2, 4, %o2 ! adjust count by 4
3139 3087 andcc %o0, 7, %o3 ! check for src long word alignment
3140 3088 brz,pt %o3, .co_medlong
3141 3089 .co_src_dst_unal8:
3142 3090 ! dst is 8-byte aligned, src is not
3143 3091 ! Size is less than FP_COPY
3144 3092 ! Following code is to select for alignment
3145 3093 andcc %o0, 0x3, %o3 ! test word alignment
3146 3094 bz,pt %ncc, .co_medword
3147 3095 nop
3148 3096 andcc %o0, 0x1, %o3 ! test halfword alignment
3149 3097 bnz,pt %ncc, .co_med_byte ! go to byte move if not halfword
3150 3098 andcc %o0, 0x2, %o3 ! test which byte alignment
3151 3099 ba .co_medhalf
3152 3100 nop
3153 3101 .co_al_d1: ! align dest to half word
3154 3102 ldub [%o0], %o4 ! move a byte
3155 3103 add %o0, 1, %o0
3156 3104 stba %o4, [%o1]%asi
3157 3105 add %o1, 1, %o1
3158 3106 andcc %o1, 2, %o3
3159 3107 bz,pt %ncc, .co_al_d2f
3160 3108 sub %o2, 1, %o2
3161 3109 .co_al_d2: ! align dest to word
3162 3110 ldub [%o0], %o4 ! move a half-word (src align unknown)
3163 3111 ldub [%o0+1], %o3
3164 3112 sll %o4, 8, %o4 ! position
3165 3113 or %o4, %o3, %o4 ! merge
3166 3114 stha %o4, [%o1]%asi
3167 3115 add %o0, 2, %o0
3168 3116 add %o1, 2, %o1
3169 3117 andcc %o1, 4, %o3 ! is dest longword aligned?
3170 3118 bz,pt %ncc, .co_al_src
3171 3119 sub %o2, 2, %o2
3172 3120 ba .co_al_d4
3173 3121 nop
3174 3122 /*
3175 3123 * Handle all cases where src and dest are aligned on word
3176 3124 * boundaries. Use unrolled loops for better performance.
3177 3125 * This option wins over standard large data move when
3178 3126 * source and destination is in cache for medium
3179 3127 * to short data moves.
3180 3128 */
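/*
 * The .co_medw32 loop below in C terms (s and d are word pointers, n
 * the remaining byte count; a sketch only):
 *
 *	while (n >= 32) {
 *		d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
 *		d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
 *		s += 8; d += 8; n -= 32;
 *	}
 */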
3181 3129 .co_medword:
3182 3130 subcc %o2, 31, %o2 ! adjust length to allow cc test
3183 3131 ble,pt %ncc, .co_medw31
3184 3132 nop
3185 3133 .co_medw32:
3186 3134 ld [%o0], %o4 ! move a block of 32 bytes
3187 3135 stwa %o4, [%o1]%asi
3188 3136 ld [%o0+4], %o4
3189 3137 stwa %o4, [%o1+4]%asi
3190 3138 ld [%o0+8], %o4
3191 3139 stwa %o4, [%o1+8]%asi
3192 3140 ld [%o0+12], %o4
3193 3141 stwa %o4, [%o1+12]%asi
3194 3142 ld [%o0+16], %o4
3195 3143 stwa %o4, [%o1+16]%asi
3196 3144 ld [%o0+20], %o4
3197 3145 subcc %o2, 32, %o2 ! decrement length count
3198 3146 stwa %o4, [%o1+20]%asi
3199 3147 ld [%o0+24], %o4
3200 3148 add %o0, 32, %o0 ! increase src ptr by 32
3201 3149 stwa %o4, [%o1+24]%asi
3202 3150 ld [%o0-4], %o4
3203 3151 add %o1, 32, %o1 ! increase dst ptr by 32
3204 3152 bgu,pt %ncc, .co_medw32 ! repeat if at least 32 bytes left
3205 3153 stwa %o4, [%o1-4]%asi
3206 3154 .co_medw31:
3207 3155 addcc %o2, 24, %o2 ! adjust count to be off by 7
3208 3156 ble,pt %ncc, .co_medw7 ! skip if 7 or fewer bytes left
3209 3157 nop !
3210 3158 .co_medw15:
3211 3159 ld [%o0], %o4 ! move a block of 8 bytes
3212 3160 subcc %o2, 8, %o2 ! decrement length count
3213 3161 stwa %o4, [%o1]%asi
3214 3162 add %o0, 8, %o0 ! increase src ptr by 8
3215 3163 ld [%o0-4], %o4
3216 3164 add %o1, 8, %o1 ! increase dst ptr by 8
3217 3165 bgu,pt %ncc, .co_medw15
3218 3166 stwa %o4, [%o1-4]%asi
3219 3167 .co_medw7:
3220 3168 addcc %o2, 7, %o2 ! finish adjustment of remaining count
3221 3169 bz,pt %ncc, .co_smallx ! exit if finished
3222 3170 cmp %o2, 4
3223 3171 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
3224 3172 nop !
3225 3173 ld [%o0], %o4 ! move 4 bytes
3226 3174 add %o0, 4, %o0 ! increase src ptr by 4
3227 3175 add %o1, 4, %o1 ! increase dst ptr by 4
3228 3176 subcc %o2, 4, %o2 ! decrease count by 4
3229 3177 bnz .co_small3x
3230 3178 stwa %o4, [%o1-4]%asi
3231 3179 membar #Sync
3232 3180 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3233 3181 retl
3234 3182 mov %g0, %o0
3235 3183
3236 3184 .co_medhalf:
3237 3185 subcc %o2, 31, %o2 ! adjust length to allow cc test
3238 3186 ble,pt %ncc, .co_medh31
3239 3187 nop
3240 3188 .co_medh32: ! load and store block of 32 bytes
3241 3189
3242 3190 lduh [%o0], %o4 ! move 32 bytes
3243 3191 subcc %o2, 32, %o2 ! decrement length count
3244 3192 lduw [%o0+2], %o3
3245 3193 sllx %o4, 48, %o4
3246 3194 sllx %o3, 16, %o3
3247 3195 or %o4, %o3, %o3
3248 3196 lduh [%o0+6], %o4
3249 3197 or %o4, %o3, %o4
3250 3198 stxa %o4, [%o1]%asi
3251 3199
3252 3200 lduh [%o0+8], %o4
3253 3201 lduw [%o0+10], %o3
3254 3202 sllx %o4, 48, %o4
3255 3203 sllx %o3, 16, %o3
3256 3204 or %o4, %o3, %o3
3257 3205 lduh [%o0+14], %o4
3258 3206 or %o4, %o3, %o4
3259 3207 stxa %o4, [%o1+8]%asi
3260 3208
3261 3209 lduh [%o0+16], %o4
3262 3210 lduw [%o0+18], %o3
3263 3211 sllx %o4, 48, %o4
3264 3212 sllx %o3, 16, %o3
3265 3213 or %o4, %o3, %o3
3266 3214 lduh [%o0+22], %o4
3267 3215 or %o4, %o3, %o4
3268 3216 stxa %o4, [%o1+16]%asi
3269 3217
3270 3218 add %o0, 32, %o0 ! increase src ptr by 32
3271 3219 add %o1, 32, %o1 ! increase dst ptr by 32
3272 3220
3273 3221 lduh [%o0-8], %o4
3274 3222 lduw [%o0-6], %o3
3275 3223 sllx %o4, 48, %o4
3276 3224 sllx %o3, 16, %o3
3277 3225 or %o4, %o3, %o3
3278 3226 lduh [%o0-2], %o4
3279 3227 or %o3, %o4, %o4
3280 3228 bgu,pt %ncc, .co_medh32 ! repeat if at least 32 bytes left
3281 3229 stxa %o4, [%o1-8]%asi
3282 3230
3283 3231 .co_medh31:
3284 3232 addcc %o2, 24, %o2 ! adjust count to be off by 7
3285 3233 ble,pt %ncc, .co_medh7 ! skip if 7 or fewer bytes left
3286 3234 nop !
3287 3235 .co_medh15:
3288 3236 lduh [%o0], %o4 ! move a block of 8 bytes
3289 3237 subcc %o2, 8, %o2 ! decrement length count
3290 3238 lduw [%o0+2], %o3
3291 3239 sllx %o4, 48, %o4
3292 3240 sllx %o3, 16, %o3
3293 3241 or %o4, %o3, %o3
3294 3242 add %o1, 8, %o1 ! increase dst ptr by 8
3295 3243 lduh [%o0+6], %o4
3296 3244 add %o0, 8, %o0 ! increase src ptr by 8
3297 3245 or %o4, %o3, %o4
3298 3246 bgu,pt %ncc, .co_medh15
3299 3247 stxa %o4, [%o1-8]%asi
3300 3248 .co_medh7:
3301 3249 addcc %o2, 7, %o2 ! finish adjustment of remaining count
3302 3250 bz,pt %ncc, .co_smallx ! exit if finished
3303 3251 cmp %o2, 4
3304 3252 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
3305 3253 nop !
3306 3254 lduh [%o0], %o4
3307 3255 sll %o4, 16, %o4
3308 3256 lduh [%o0+2], %o3
3309 3257 or %o3, %o4, %o4
3310 3258 subcc %o2, 4, %o2
3311 3259 add %o0, 4, %o0
3312 3260 add %o1, 4, %o1
3313 3261 bnz .co_small3x
3314 3262 stwa %o4, [%o1-4]%asi
3315 3263 membar #Sync
3316 3264 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3317 3265 retl
3318 3266 mov %g0, %o0
3319 3267
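The 2/4/2-byte load pattern above exists because the source is only halfword aligned (offset 2 or 6 from a word boundary, so src+2 is word aligned): three loads that respect that alignment are shifted and ORed into one doubleword, letting the destination side keep its aligned 8-byte stores. A hedged C equivalent of one merge step (helper name invented; big-endian layout as on SPARC):

    #include <stdint.h>

    /* One .co_medh32 merge: 2 + 4 + 2 bytes from a halfword-aligned
     * source combined into a single 64-bit store value (big-endian). */
    static inline uint64_t
    merge_half_aligned(const uint8_t *src)
    {
        uint64_t hi  = *(const uint16_t *)(src + 0);    /* lduh */
        uint64_t mid = *(const uint32_t *)(src + 2);    /* lduw */
        uint64_t lo  = *(const uint16_t *)(src + 6);    /* lduh */

        return ((hi << 48) | (mid << 16) | lo);         /* sllx/or */
    }
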
3320 3268 .align 16
3321 3269 .co_med_byte:
3322 3270 bnz,pt %ncc, .co_medbh32a ! go to correct byte move
3323 3271 subcc %o2, 31, %o2 ! adjust length to allow cc test
3324 3272 ble,pt %ncc, .co_medb31
3325 3273 nop
3326 3274 .co_medb32: ! Alignment 1 or 5
3327 3275 subcc %o2, 32, %o2 ! decrement length count
3328 3276
3329 3277 ldub [%o0], %o4 ! load and store a block of 32 bytes
3330 3278 sllx %o4, 56, %o3
3331 3279 lduh [%o0+1], %o4
3332 3280 sllx %o4, 40, %o4
3333 3281 or %o4, %o3, %o3
3334 3282 lduw [%o0+3], %o4
3335 3283 sllx %o4, 8, %o4
3336 3284 or %o4, %o3, %o3
3337 3285 ldub [%o0+7], %o4
3338 3286 or %o4, %o3, %o4
3339 3287 stxa %o4, [%o1]%asi
3340 3288
3341 3289 ldub [%o0+8], %o4
3342 3290 sllx %o4, 56, %o3
3343 3291 lduh [%o0+9], %o4
3344 3292 sllx %o4, 40, %o4
3345 3293 or %o4, %o3, %o3
3346 3294 lduw [%o0+11], %o4
3347 3295 sllx %o4, 8, %o4
3348 3296 or %o4, %o3, %o3
3349 3297 ldub [%o0+15], %o4
3350 3298 or %o4, %o3, %o4
3351 3299 stxa %o4, [%o1+8]%asi
3352 3300
3353 3301 ldub [%o0+16], %o4
3354 3302 sllx %o4, 56, %o3
3355 3303 lduh [%o0+17], %o4
3356 3304 sllx %o4, 40, %o4
3357 3305 or %o4, %o3, %o3
3358 3306 lduw [%o0+19], %o4
3359 3307 sllx %o4, 8, %o4
3360 3308 or %o4, %o3, %o3
3361 3309 ldub [%o0+23], %o4
3362 3310 or %o4, %o3, %o4
3363 3311 stxa %o4, [%o1+16]%asi
3364 3312
3365 3313 add %o0, 32, %o0 ! increase src ptr by 32
3366 3314 add %o1, 32, %o1 ! increase dst ptr by 32
3367 3315
3368 3316 ldub [%o0-8], %o4
3369 3317 sllx %o4, 56, %o3
3370 3318 lduh [%o0-7], %o4
3371 3319 sllx %o4, 40, %o4
3372 3320 or %o4, %o3, %o3
3373 3321 lduw [%o0-5], %o4
3374 3322 sllx %o4, 8, %o4
3375 3323 or %o4, %o3, %o3
3376 3324 ldub [%o0-1], %o4
3377 3325 or %o4, %o3, %o4
3378 3326 bgu,pt %ncc, .co_medb32 ! repeat if at least 32 bytes left
3379 3327 stxa %o4, [%o1-8]%asi
3380 3328
3381 3329 .co_medb31: ! 31 or fewer bytes remaining
3382 3330 addcc %o2, 24, %o2 ! adjust count to be off by 7
3383 3331 ble,pt %ncc, .co_medb7 ! skip if 7 or fewer bytes left
3384 3332 nop !
3385 3333 .co_medb15:
3386 3334
3387 3335 ldub [%o0], %o4 ! load and store a block of 8 bytes
3388 3336 subcc %o2, 8, %o2 ! decrement length count
3389 3337 sllx %o4, 56, %o3
3390 3338 lduh [%o0+1], %o4
3391 3339 sllx %o4, 40, %o4
3392 3340 or %o4, %o3, %o3
3393 3341 lduw [%o0+3], %o4
3394 3342 add %o1, 8, %o1 ! increase dst ptr by 8
3395 3343 sllx %o4, 8, %o4
3396 3344 or %o4, %o3, %o3
3397 3345 ldub [%o0+7], %o4
3398 3346 add %o0, 8, %o0 ! increase src ptr by 8
3399 3347 or %o4, %o3, %o4
3400 3348 bgu,pt %ncc, .co_medb15
3401 3349 stxa %o4, [%o1-8]%asi
3402 3350 .co_medb7:
3403 3351 addcc %o2, 7, %o2 ! finish adjustment of remaining count
3404 3352 bz,pt %ncc, .co_smallx ! exit if finished
3405 3353 cmp %o2, 4
3406 3354 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left
3407 3355 nop !
3408 3356 ldub [%o0], %o4 ! move 4 bytes
3409 3357 sll %o4, 24, %o3
3410 3358 lduh [%o0+1], %o4
3411 3359 sll %o4, 8, %o4
3412 3360 or %o4, %o3, %o3
3413 3361 ldub [%o0+3], %o4
3414 3362 or %o4, %o3, %o4
3415 3363 subcc %o2, 4, %o2
3416 3364 add %o0, 4, %o0
3417 3365 add %o1, 4, %o1
3418 3366 bnz .co_small3x
3419 3367 stwa %o4, [%o1-4]%asi
3420 3368 membar #Sync
3421 3369 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3422 3370 retl
3423 3371 mov %g0, %o0
3424 3372
3425 3373 .align 16
3426 3374 .co_medbh32a:
3427 3375 ble,pt %ncc, .co_medbh31
3428 3376 nop
3429 3377 .co_medbh32: ! Alignment 3 or 7
3430 3378 subcc %o2, 32, %o2 ! decrement length count
3431 3379
3432 3380 ldub [%o0], %o4 ! load and store a block of 32 bytes
3433 3381 sllx %o4, 56, %o3
3434 3382 lduw [%o0+1], %o4
3435 3383 sllx %o4, 24, %o4
3436 3384 or %o4, %o3, %o3
3437 3385 lduh [%o0+5], %o4
3438 3386 sllx %o4, 8, %o4
3439 3387 or %o4, %o3, %o3
3440 3388 ldub [%o0+7], %o4
3441 3389 or %o4, %o3, %o4
3442 3390 stxa %o4, [%o1]%asi
3443 3391
3444 3392 ldub [%o0+8], %o4
3445 3393 sllx %o4, 56, %o3
3446 3394 lduw [%o0+9], %o4
3447 3395 sllx %o4, 24, %o4
3448 3396 or %o4, %o3, %o3
3449 3397 lduh [%o0+13], %o4
3450 3398 sllx %o4, 8, %o4
3451 3399 or %o4, %o3, %o3
3452 3400 ldub [%o0+15], %o4
3453 3401 or %o4, %o3, %o4
3454 3402 stxa %o4, [%o1+8]%asi
3455 3403
3456 3404 ldub [%o0+16], %o4
3457 3405 sllx %o4, 56, %o3
3458 3406 lduw [%o0+17], %o4
3459 3407 sllx %o4, 24, %o4
3460 3408 or %o4, %o3, %o3
3461 3409 lduh [%o0+21], %o4
3462 3410 sllx %o4, 8, %o4
3463 3411 or %o4, %o3, %o3
3464 3412 ldub [%o0+23], %o4
3465 3413 or %o4, %o3, %o4
3466 3414 stxa %o4, [%o1+16]%asi
3467 3415
3468 3416 add %o0, 32, %o0 ! increase src ptr by 32
3469 3417 add %o1, 32, %o1 ! increase dst ptr by 32
3470 3418
3471 3419 ldub [%o0-8], %o4
3472 3420 sllx %o4, 56, %o3
3473 3421 lduw [%o0-7], %o4
3474 3422 sllx %o4, 24, %o4
3475 3423 or %o4, %o3, %o3
3476 3424 lduh [%o0-3], %o4
3477 3425 sllx %o4, 8, %o4
3478 3426 or %o4, %o3, %o3
3479 3427 ldub [%o0-1], %o4
3480 3428 or %o4, %o3, %o4
3481 3429 bgu,pt %ncc, .co_medbh32 ! repeat if at least 32 bytes left
3482 3430 stxa %o4, [%o1-8]%asi
3483 3431
3484 3432 .co_medbh31:
3485 3433 addcc %o2, 24, %o2 ! adjust count to be off by 7
3486 3434 ble,pt %ncc, .co_medb7 ! skip if 7 or fewer bytes left
3487 3435 nop !
3488 3436 .co_medbh15:
3489 3437 ldub [%o0], %o4 ! load and store a block of 8 bytes
3490 3438 sllx %o4, 56, %o3
3491 3439 lduw [%o0+1], %o4
3492 3440 sllx %o4, 24, %o4
3493 3441 or %o4, %o3, %o3
3494 3442 lduh [%o0+5], %o4
3495 3443 sllx %o4, 8, %o4
3496 3444 or %o4, %o3, %o3
3497 3445 ldub [%o0+7], %o4
3498 3446 or %o4, %o3, %o4
3499 3447 stxa %o4, [%o1]%asi
3500 3448 subcc %o2, 8, %o2 ! decrement length count
3501 3449 add %o1, 8, %o1 ! increase dst ptr by 8
3502 3450 add %o0, 8, %o0 ! increase src ptr by 8
3503 3451 bgu,pt %ncc, .co_medbh15
3504 3452 stxa %o4, [%o1-8]%asi
3505 3453 ba .co_medb7
3506 3454 nop
3507 3455 /*
3508 3456 * End of small copy (no window) code
3509 3457 */
3510 3458
3511 3459 /*
3512 3460 * Long copy code
3513 3461 */
3514 3462 .co_copy_more:
3515 3463 sethi %hi(copyio_fault), %o3
3516 3464 or %o3, %lo(copyio_fault), %o3
3517 3465 membar #Sync
3518 3466 stn %o3, [THREAD_REG + T_LOFAULT]
3519 3467
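The membar #Sync ahead of the stn is what makes the handler swap safe: any outstanding access must raise its fault against the old t_lofault before copyio_fault is installed. As a rough user-level analogy for the protocol (setjmp stands in for the trap-time control transfer; this is a shape sketch, not kernel code):

    #include <setjmp.h>
    #include <string.h>

    static jmp_buf *t_lofault;          /* per-thread in the kernel */

    int
    protected_copy(void *dst, const void *src, size_t len)
    {
        jmp_buf *saved = t_lofault;     /* SAVED_LOFAULT */
        jmp_buf here;

        if (setjmp(here) != 0) {        /* a fault handler would longjmp here */
            t_lofault = saved;
            return (-1);
        }
        t_lofault = &here;              /* stn %o3, [THREAD_REG + T_LOFAULT] */
        memcpy(dst, src, len);          /* the guarded copy */
        t_lofault = saved;              /* restore on the success path too */
        return (0);
    }
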
3520 3468 /*
3521 3469 * The following code is for large copies. We know there are at
3522 3470 * least FP_COPY bytes available. FP regs are used, so
3523 3471 * we save registers and fp regs before starting
3524 3472 */
3525 3473 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3526 3474 or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3527 3475 rd %fprs, %g1 ! check for unused fp
3528 3476 ! if fprs.fef == 0, set it.
3529 3477 ! Setting it when already set costs more than checking
3530 3478 andcc %g1, FPRS_FEF, %g1 ! test FEF, fprs.du = fprs.dl = 0
3531 3479 bz,pt %ncc, .co_fp_unused
3532 3480 mov ASI_USER, %asi
3533 3481 BST_FP_TOSTACK(%o3)
3534 3482 ba .co_fp_ready
3535 3483 .co_fp_unused:
3536 3484 prefetch [%i0 + (1 * CACHE_LINE)], #one_read
3537 3485 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
3538 3486 .co_fp_ready:
3539 3487 rd %gsr, %l5 ! save %gsr value
3540 3488 andcc %i1, 1, %o3 ! is dest byte aligned
3541 3489 bnz,pt %ncc, .co_big_d1
3542 3490 .co_big_d1f: ! dest is now half word aligned
3543 3491 andcc %i1, 2, %o3
3544 3492 bnz,pt %ncc, .co_big_d2
3545 3493 .co_big_d2f: ! dest is now word aligned
3546 3494 andcc %i1, 4, %o3 ! is dest longword aligned
3547 3495 bnz,pt %ncc, .co_big_d4
3548 3496 .co_big_d4f: ! dest is now long word aligned
3549 3497 andcc %i0, 7, %o3 ! is src long word aligned
3550 3498 brnz,pt %o3, .co_big_unal8
3551 3499 prefetch [%i0 + (2 * CACHE_LINE)], #one_read
3552 3500 ! Src and dst are long word aligned
3553 3501 ! align dst to 64 byte boundary
3554 3502 andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
3555 3503 brz,pn %o3, .co_al_to_64
3556 3504 nop
3557 3505 sub %o3, 64, %o3 ! %o3 has negative bytes to move
3558 3506 add %i2, %o3, %i2 ! adjust remaining count
3559 3507 andcc %o3, 8, %o4 ! odd long words to move?
3560 3508 brz,pt %o4, .co_al_to_16
3561 3509 nop
3562 3510 add %o3, 8, %o3
3563 3511 ldx [%i0], %o4
3564 3512 add %i0, 8, %i0 ! increment src ptr
3565 3513 stxa %o4, [%i1]ASI_USER
3566 3514 add %i1, 8, %i1 ! increment dst ptr
3567 3515 ! Dest is aligned on 16 bytes, src 8 byte aligned
3568 3516 .co_al_to_16:
3569 3517 andcc %o3, 0x30, %o4 ! any 16-byte chunks to move?
3570 3518 brz,pt %o4, .co_al_to_64
3571 3519 nop
3572 3520 .co_al_mv_16:
3573 3521 add %o3, 16, %o3
3574 3522 ldx [%i0], %o4
3575 3523 stxa %o4, [%i1]ASI_USER
3576 3524 add %i0, 16, %i0 ! increment src ptr
3577 3525 ldx [%i0-8], %o4
3578 3526 add %i1, 8, %i1 ! increment dst ptr
3579 3527 stxa %o4, [%i1]ASI_USER
3580 3528 andcc %o3, 0x30, %o4
3581 3529 brnz,pt %o4, .co_al_mv_16
3582 3530 add %i1, 8, %i1 ! increment dst ptr
3583 3531 ! Dest is aligned on 64 bytes, src 8 byte aligned
3584 3532 .co_al_to_64:
3585 3533 ! Determine source alignment
3586 3534 ! to correct 8 byte offset
3587 3535 andcc %i0, 32, %o3
3588 3536 brnz,pn %o3, .co_aln_1
3589 3537 andcc %i0, 16, %o3
3590 3538 brnz,pn %o3, .co_aln_01
3591 3539 andcc %i0, 8, %o3
3592 3540 brz,pn %o3, .co_aln_000
3593 3541 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3594 3542 ba .co_aln_001
3595 3543 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3596 3544 .co_aln_01:
3597 3545 brnz,pn %o3, .co_aln_011
3598 3546 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3599 3547 ba .co_aln_010
3600 3548 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3601 3549 .co_aln_1:
3602 3550 andcc %i0, 16, %o3
3603 3551 brnz,pn %o3, .co_aln_11
3604 3552 andcc %i0, 8, %o3
3605 3553 brnz,pn %o3, .co_aln_101
3606 3554 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3607 3555 ba .co_aln_100
3608 3556 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3609 3557 .co_aln_11:
3610 3558 brz,pn %o3, .co_aln_110
3611 3559 prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3612 3560
3613 3561 .co_aln_111:
3614 3562 ! Alignment off by 8 bytes
3615 3563 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3616 3564 ldd [%i0], %d0
3617 3565 add %i0, 8, %i0
3618 3566 sub %i2, 8, %i2
3619 3567 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3620 3568 and %i2, 0x7f, %i2 ! residue bytes in %i2
3621 3569 sub %i1, %i0, %i1
3622 3570 .co_aln_111_loop:
3623 3571 ldda [%i0]ASI_BLK_P,%d16 ! block load
3624 3572 subcc %o3, 64, %o3
3625 3573 fmovd %d16, %d2
3626 3574 fmovd %d18, %d4
3627 3575 fmovd %d20, %d6
3628 3576 fmovd %d22, %d8
3629 3577 fmovd %d24, %d10
3630 3578 fmovd %d26, %d12
3631 3579 fmovd %d28, %d14
3632 3580 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3633 3581 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3634 3582 add %i0, 64, %i0
3635 3583 fmovd %d30, %d0
3636 3584 bgt,pt %ncc, .co_aln_111_loop
3637 3585 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3638 3586 add %i1, %i0, %i1
3639 3587
3640 3588 stda %d0, [%i1]ASI_USER
3641 3589 ba .co_remain_stuff
3642 3590 add %i1, 8, %i1
3643 3591 ! END OF aln_111
3644 3592
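Each .co_aln_* variant is the same software pipeline at a different phase: the doublewords that spill past the previous 64-byte store are parked in the low FP registers (the fmovd chain), and the rest of the freshly loaded block is rotated in behind them, so every iteration issues one aligned 64-byte store. A C sketch of the off-by-8 case just shown (illustrative only; the VIS block load, the init-store ASI, and the prefetching collapse to memcpy and plain stores):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Software-pipelined block copy for a source 8 bytes past a 64-byte
     * boundary, modeled on .co_aln_111: one doubleword is carried across
     * iterations the way %d0 carries %d30. */
    void
    blkcopy_off8(uint64_t *dst, const uint64_t *src, size_t nblocks)
    {
        uint64_t carry = *src++;        /* ldd [%i0], %d0 */

        while (nblocks-- > 0) {
            uint64_t blk[8];
            memcpy(blk, src, 64);       /* ldda [...]ASI_BLK_P */
            src += 8;

            dst[0] = carry;             /* %d0 */
            for (int i = 1; i < 8; i++) /* %d2..%d14 <- %d16..%d28 */
                dst[i] = blk[i - 1];
            carry = blk[7];             /* fmovd %d30, %d0 */
            dst += 8;
        }
        *dst = carry;                   /* final stda %d0, [%i1] */
    }
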
3645 3593 .co_aln_110:
3646 3594 ! Alignment off by 16 bytes
3647 3595 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3648 3596 ldd [%i0], %d0
3649 3597 ldd [%i0+8], %d2
3650 3598 add %i0, 16, %i0
3651 3599 sub %i2, 16, %i2
3652 3600 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3653 3601 and %i2, 0x7f, %i2 ! residue bytes in %i2
3654 3602 sub %i1, %i0, %i1
3655 3603 .co_aln_110_loop:
3656 3604 ldda [%i0]ASI_BLK_P,%d16 ! block load
3657 3605 subcc %o3, 64, %o3
3658 3606 fmovd %d16, %d4
3659 3607 fmovd %d18, %d6
3660 3608 fmovd %d20, %d8
3661 3609 fmovd %d22, %d10
3662 3610 fmovd %d24, %d12
3663 3611 fmovd %d26, %d14
3664 3612 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3665 3613 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3666 3614 add %i0, 64, %i0
3667 3615 fmovd %d28, %d0
3668 3616 fmovd %d30, %d2
3669 3617 bgt,pt %ncc, .co_aln_110_loop
3670 3618 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3671 3619 add %i1, %i0, %i1
3672 3620
3673 3621 stda %d0, [%i1]%asi
3674 3622 stda %d2, [%i1+8]%asi
3675 3623 ba .co_remain_stuff
3676 3624 add %i1, 16, %i1
3677 3625 ! END OF aln_110
3678 3626
3679 3627 .co_aln_101:
3680 3628 ! Alignment off by 24 bytes
3681 3629 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3682 3630 ldd [%i0], %d0
3683 3631 ldd [%i0+8], %d2
3684 3632 ldd [%i0+16], %d4
3685 3633 add %i0, 24, %i0
3686 3634 sub %i2, 24, %i2
3687 3635 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3688 3636 and %i2, 0x7f, %i2 ! residue bytes in %i2
3689 3637 sub %i1, %i0, %i1
3690 3638 .co_aln_101_loop:
3691 3639 ldda [%i0]ASI_BLK_P,%d16 ! block load
3692 3640 subcc %o3, 64, %o3
3693 3641 fmovd %d16, %d6
3694 3642 fmovd %d18, %d8
3695 3643 fmovd %d20, %d10
3696 3644 fmovd %d22, %d12
3697 3645 fmovd %d24, %d14
3698 3646 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3699 3647 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3700 3648 add %i0, 64, %i0
3701 3649 fmovd %d26, %d0
3702 3650 fmovd %d28, %d2
3703 3651 fmovd %d30, %d4
3704 3652 bgt,pt %ncc, .co_aln_101_loop
3705 3653 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3706 3654 add %i1, %i0, %i1
3707 3655
3708 3656 stda %d0, [%i1]%asi
3709 3657 stda %d2, [%i1+8]%asi
3710 3658 stda %d4, [%i1+16]%asi
3711 3659 ba .co_remain_stuff
3712 3660 add %i1, 24, %i1
3713 3661 ! END OF aln_101
3714 3662
3715 3663 .co_aln_100:
3716 3664 ! Alignment off by 32 bytes
3717 3665 ldd [%i0], %d0
3718 3666 ldd [%i0+8], %d2
3719 3667 ldd [%i0+16],%d4
3720 3668 ldd [%i0+24],%d6
3721 3669 add %i0, 32, %i0
3722 3670 sub %i2, 32, %i2
3723 3671 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3724 3672 and %i2, 0x7f, %i2 ! residue bytes in %i2
3725 3673 sub %i1, %i0, %i1
3726 3674 .co_aln_100_loop:
3727 3675 ldda [%i0]ASI_BLK_P,%d16 ! block load
3728 3676 subcc %o3, 64, %o3
3729 3677 fmovd %d16, %d8
3730 3678 fmovd %d18, %d10
3731 3679 fmovd %d20, %d12
3732 3680 fmovd %d22, %d14
3733 3681 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3734 3682 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3735 3683 add %i0, 64, %i0
3736 3684 fmovd %d24, %d0
3737 3685 fmovd %d26, %d2
3738 3686 fmovd %d28, %d4
3739 3687 fmovd %d30, %d6
3740 3688 bgt,pt %ncc, .co_aln_100_loop
3741 3689 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3742 3690 add %i1, %i0, %i1
3743 3691
3744 3692 stda %d0, [%i1]%asi
3745 3693 stda %d2, [%i1+8]%asi
3746 3694 stda %d4, [%i1+16]%asi
3747 3695 stda %d6, [%i1+24]%asi
3748 3696 ba .co_remain_stuff
3749 3697 add %i1, 32, %i1
3750 3698 ! END OF aln_100
3751 3699
3752 3700 .co_aln_011:
3753 3701 ! Alignment off by 40 bytes
3754 3702 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3755 3703 ldd [%i0], %d0
3756 3704 ldd [%i0+8], %d2
3757 3705 ldd [%i0+16], %d4
3758 3706 ldd [%i0+24], %d6
3759 3707 ldd [%i0+32], %d8
3760 3708 add %i0, 40, %i0
3761 3709 sub %i2, 40, %i2
3762 3710 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3763 3711 and %i2, 0x7f, %i2 ! residue bytes in %i2
3764 3712 sub %i1, %i0, %i1
3765 3713 .co_aln_011_loop:
3766 3714 ldda [%i0]ASI_BLK_P,%d16 ! block load
3767 3715 subcc %o3, 64, %o3
3768 3716 fmovd %d16, %d10
3769 3717 fmovd %d18, %d12
3770 3718 fmovd %d20, %d14
3771 3719 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3772 3720 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3773 3721 add %i0, 64, %i0
3774 3722 fmovd %d22, %d0
3775 3723 fmovd %d24, %d2
3776 3724 fmovd %d26, %d4
3777 3725 fmovd %d28, %d6
3778 3726 fmovd %d30, %d8
3779 3727 bgt,pt %ncc, .co_aln_011_loop
3780 3728 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3781 3729 add %i1, %i0, %i1
3782 3730
3783 3731 stda %d0, [%i1]%asi
3784 3732 stda %d2, [%i1+8]%asi
3785 3733 stda %d4, [%i1+16]%asi
3786 3734 stda %d6, [%i1+24]%asi
3787 3735 stda %d8, [%i1+32]%asi
3788 3736 ba .co_remain_stuff
3789 3737 add %i1, 40, %i1
3790 3738 ! END OF aln_011
3791 3739
3792 3740 .co_aln_010:
3793 3741 ! Alignment off by 48 bytes
3794 3742 ldd [%i0], %d0
3795 3743 ldd [%i0+8], %d2
3796 3744 ldd [%i0+16], %d4
3797 3745 ldd [%i0+24], %d6
3798 3746 ldd [%i0+32], %d8
3799 3747 ldd [%i0+40], %d10
3800 3748 add %i0, 48, %i0
3801 3749 sub %i2, 48, %i2
3802 3750 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3803 3751 and %i2, 0x7f, %i2 ! residue bytes in %i2
3804 3752 sub %i1, %i0, %i1
3805 3753 .co_aln_010_loop:
3806 3754 ldda [%i0]ASI_BLK_P,%d16 ! block load
3807 3755 subcc %o3, 64, %o3
3808 3756 fmovd %d16, %d12
3809 3757 fmovd %d18, %d14
3810 3758 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3811 3759 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3812 3760 add %i0, 64, %i0
3813 3761 fmovd %d20, %d0
3814 3762 fmovd %d22, %d2
3815 3763 fmovd %d24, %d4
3816 3764 fmovd %d26, %d6
3817 3765 fmovd %d28, %d8
3818 3766 fmovd %d30, %d10
3819 3767 bgt,pt %ncc, .co_aln_010_loop
3820 3768 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3821 3769 add %i1, %i0, %i1
3822 3770
3823 3771 stda %d0, [%i1]%asi
3824 3772 stda %d2, [%i1+8]%asi
3825 3773 stda %d4, [%i1+16]%asi
3826 3774 stda %d6, [%i1+24]%asi
3827 3775 stda %d8, [%i1+32]%asi
3828 3776 stda %d10, [%i1+40]%asi
3829 3777 ba .co_remain_stuff
3830 3778 add %i1, 48, %i1
3831 3779 ! END OF aln_010
3832 3780
3833 3781 .co_aln_001:
3834 3782 ! Alignment off by 56 bytes
3835 3783 ldd [%i0], %d0
3836 3784 ldd [%i0+8], %d2
3837 3785 ldd [%i0+16], %d4
3838 3786 ldd [%i0+24], %d6
3839 3787 ldd [%i0+32], %d8
3840 3788 ldd [%i0+40], %d10
3841 3789 ldd [%i0+48], %d12
3842 3790 add %i0, 56, %i0
3843 3791 sub %i2, 56, %i2
3844 3792 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3845 3793 and %i2, 0x7f, %i2 ! residue bytes in %i2
3846 3794 sub %i1, %i0, %i1
3847 3795 .co_aln_001_loop:
3848 3796 ldda [%i0]ASI_BLK_P,%d16 ! block load
3849 3797 subcc %o3, 64, %o3
3850 3798 fmovd %d16, %d14
3851 3799 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3852 3800 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3853 3801 add %i0, 64, %i0
3854 3802 fmovd %d18, %d0
3855 3803 fmovd %d20, %d2
3856 3804 fmovd %d22, %d4
3857 3805 fmovd %d24, %d6
3858 3806 fmovd %d26, %d8
3859 3807 fmovd %d28, %d10
3860 3808 fmovd %d30, %d12
3861 3809 bgt,pt %ncc, .co_aln_001_loop
3862 3810 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3863 3811 add %i1, %i0, %i1
3864 3812
3865 3813 stda %d0, [%i1]%asi
3866 3814 stda %d2, [%i1+8]%asi
3867 3815 stda %d4, [%i1+16]%asi
3868 3816 stda %d6, [%i1+24]%asi
3869 3817 stda %d8, [%i1+32]%asi
3870 3818 stda %d10, [%i1+40]%asi
3871 3819 stda %d12, [%i1+48]%asi
3872 3820 ba .co_remain_stuff
3873 3821 add %i1, 56, %i1
3874 3822 ! END OF aln_001
3875 3823
3876 3824 .co_aln_000:
3877 3825 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3878 3826 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
3879 3827 and %i2, 0x7f, %i2 ! residue bytes in %i2
3880 3828 sub %i1, %i0, %i1
3881 3829 .co_aln_000_loop:
3882 3830 ldda [%i0]ASI_BLK_P,%d0
3883 3831 subcc %o3, 64, %o3
3884 3832 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store
3885 3833 stda %d0,[%i0+%i1]ASI_BLK_AIUS
3886 3834 add %i0, 64, %i0
3887 3835 bgt,pt %ncc, .co_aln_000_loop
3888 3836 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3889 3837 add %i1, %i0, %i1
3890 3838
3891 3839 ! END OF aln_000
3892 3840
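Note the sub %i1, %i0, %i1 before every block loop and the add %i1, %i0, %i1 after it: the destination is carried as a distance rather than a pointer, so the [%i0+%i1] addressing mode advances both streams with a single add per iteration. The same trick in C (a sketch; the uintptr_t arithmetic mirrors the register math):

    #include <stdint.h>
    #include <stddef.h>

    void
    diffcopy(uint64_t *dst, const uint64_t *src, size_t n)
    {
        uintptr_t delta = (uintptr_t)dst - (uintptr_t)src;  /* sub %i1, %i0, %i1 */

        while (n-- > 0) {
            *(uint64_t *)((uintptr_t)src + delta) = *src;   /* st [%i0 + %i1] */
            src++;                      /* only one pointer advances */
        }
        /* the real dst is recovered afterwards: add %i1, %i0, %i1 */
    }
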
3893 3841 .co_remain_stuff:
3894 3842 subcc %i2, 31, %i2 ! adjust length to allow cc test
3895 3843 ble,pt %ncc, .co_aln_31
3896 3844 nop
3897 3845 .co_aln_32:
3898 3846 ldx [%i0], %o4 ! move 32 bytes
3899 3847 subcc %i2, 32, %i2 ! decrement length count by 32
3900 3848 stxa %o4, [%i1]%asi
3901 3849 ldx [%i0+8], %o4
3902 3850 stxa %o4, [%i1+8]%asi
3903 3851 ldx [%i0+16], %o4
3904 3852 add %i0, 32, %i0 ! increase src ptr by 32
3905 3853 stxa %o4, [%i1+16]%asi
3906 3854 ldx [%i0-8], %o4
3907 3855 add %i1, 32, %i1 ! increase dst ptr by 32
3908 3856 bgu,pt %ncc, .co_aln_32 ! repeat if at least 32 bytes left
3909 3857 stxa %o4, [%i1-8]%asi
3910 3858 .co_aln_31:
3911 3859 addcc %i2, 24, %i2 ! adjust count to be off by 7
3912 3860 ble,pt %ncc, .co_aln_7 ! skip if 7 or fewer bytes left
3913 3861 nop !
3914 3862 .co_aln_15:
3915 3863 ldx [%i0], %o4 ! move 8 bytes
3916 3864 add %i0, 8, %i0 ! increase src ptr by 8
3917 3865 subcc %i2, 8, %i2 ! decrease count by 8
3918 3866 add %i1, 8, %i1 ! increase dst ptr by 8
3919 3867 bgu,pt %ncc, .co_aln_15
3920 3868 stxa %o4, [%i1-8]%asi
3921 3869 .co_aln_7:
3922 3870 addcc %i2, 7, %i2 ! finish adjustment of remaining count
3923 3871 bz,pt %ncc, .co_exit ! exit if finished
3924 3872 cmp %i2, 4
3925 3873 blt,pt %ncc, .co_unaln3x ! skip if less than 4 bytes left
3926 3874 nop !
3927 3875 ld [%i0], %o4 ! move 4 bytes
3928 3876 add %i0, 4, %i0 ! increase src ptr by 4
3929 3877 add %i1, 4, %i1 ! increase dst ptr by 4
3930 3878 subcc %i2, 4, %i2 ! decrease count by 4
3931 3879 bnz .co_unaln3x
3932 3880 stwa %o4, [%i1-4]%asi
3933 3881 ba .co_exit
3934 3882 nop
3935 3883
3936 3884 ! destination alignment code
3937 3885 .co_big_d1:
3938 3886 ldub [%i0], %o4 ! move a byte
3939 3887 add %i0, 1, %i0
3940 3888 stba %o4, [%i1]ASI_USER
3941 3889 add %i1, 1, %i1
3942 3890 andcc %i1, 2, %o3
3943 3891 bz,pt %ncc, .co_big_d2f
3944 3892 sub %i2, 1, %i2
3945 3893 .co_big_d2:
3946 3894 ldub [%i0], %o4 ! move a half-word (src align unknown)
3947 3895 ldub [%i0+1], %o3
3948 3896 add %i0, 2, %i0
3949 3897 sll %o4, 8, %o4 ! position
3950 3898 or %o4, %o3, %o4 ! merge
3951 3899 stha %o4, [%i1]ASI_USER
3952 3900 add %i1, 2, %i1
3953 3901 andcc %i1, 4, %o3 ! is dest longword aligned
3954 3902 bz,pt %ncc, .co_big_d4f
3955 3903 sub %i2, 2, %i2
3956 3904 .co_big_d4: ! dest is at least word aligned
3957 3905 nop
3958 3906 ldub [%i0], %o4 ! move a word (src align unknown)
3959 3907 ldub [%i0+1], %o3
3960 3908 sll %o4, 24, %o4 ! position
3961 3909 sll %o3, 16, %o3 ! position
3962 3910 or %o4, %o3, %o3 ! merge
3963 3911 ldub [%i0+2], %o4
3964 3912 sll %o4, 8, %o4 ! position
3965 3913 or %o4, %o3, %o3 ! merge
3966 3914 ldub [%i0+3], %o4
3967 3915 or %o4, %o3, %o4 ! merge
3968 3916 stwa %o4,[%i1]ASI_USER ! store four bytes
3969 3917 add %i0, 4, %i0 ! adjust src by 4
3970 3918 add %i1, 4, %i1 ! adjust dest by 4
3971 3919 ba .co_big_d4f
3972 3920 sub %i2, 4, %i2 ! adjust count by 4
3973 3921
3974 3922
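The .co_big_d1/.co_big_d2/.co_big_d4 ladder below peels at most 1+2+4 bytes off the front, testing one destination address bit per step; because nothing is known about source alignment yet, the halfword and word are assembled from byte loads. A C sketch of the ladder (function name invented; big-endian merge as on SPARC):

    #include <stdint.h>
    #include <stddef.h>

    /* Destination-alignment peel modeled on .co_big_d1/d2/d4. */
    void
    align_dst8(uint8_t **dp, const uint8_t **sp, size_t *np)
    {
        uint8_t *d = *dp; const uint8_t *s = *sp; size_t n = *np;

        if ((uintptr_t)d & 1) {         /* .co_big_d1 */
            *d++ = *s++; n--;
        }
        if ((uintptr_t)d & 2) {         /* .co_big_d2 */
            *(uint16_t *)d = (uint16_t)((s[0] << 8) | s[1]);
            d += 2; s += 2; n -= 2;
        }
        if ((uintptr_t)d & 4) {         /* .co_big_d4 */
            *(uint32_t *)d = ((uint32_t)s[0] << 24) | ((uint32_t)s[1] << 16) |
                ((uint32_t)s[2] << 8) | s[3];
            d += 4; s += 4; n -= 4;
        }
        *dp = d; *sp = s; *np = n;      /* dst is now 8-byte aligned */
    }
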
3975 3923 ! Dst is on 8 byte boundary; src is not;
3976 3924 .co_big_unal8:
3977 3925 andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned?
3978 3926 bz %ncc, .co_unalnsrc
3979 3927 sub %o3, 64, %o3 ! %o3 will be multiple of 8
3980 3928 neg %o3 ! bytes until dest is 64 byte aligned
3981 3929 sub %i2, %o3, %i2 ! update cnt with bytes to be moved
3982 3930 ! Move bytes according to source alignment
3983 3931 andcc %i0, 0x1, %o4
3984 3932 bnz %ncc, .co_unalnbyte ! check for byte alignment
3985 3933 nop
3986 3934 andcc %i0, 2, %o4 ! check for half word alignment
3987 3935 bnz %ncc, .co_unalnhalf
3988 3936 nop
3989 3937 ! Src is word aligned, move bytes until dest 64 byte aligned
3990 3938 .co_unalnword:
3991 3939 ld [%i0], %o4 ! load 4 bytes
3992 3940 stwa %o4, [%i1]%asi ! and store 4 bytes
3993 3941 ld [%i0+4], %o4 ! load 4 bytes
3994 3942 add %i0, 8, %i0 ! increase src ptr by 8
3995 3943 stwa %o4, [%i1+4]%asi ! and store 4 bytes
3996 3944 subcc %o3, 8, %o3 ! decrease count by 8
3997 3945 bnz %ncc, .co_unalnword
3998 3946 add %i1, 8, %i1 ! increase dst ptr by 8
3999 3947 ba .co_unalnsrc
4000 3948 nop
4001 3949
4002 3950 ! Src is half-word aligned, move bytes until dest 64 byte aligned
4003 3951 .co_unalnhalf:
4004 3952 lduh [%i0], %o4 ! load 2 bytes
4005 3953 sllx %o4, 32, %i3 ! shift left
4006 3954 lduw [%i0+2], %o4
4007 3955 or %o4, %i3, %i3
4008 3956 sllx %i3, 16, %i3
4009 3957 lduh [%i0+6], %o4
4010 3958 or %o4, %i3, %i3
4011 3959 stxa %i3, [%i1]ASI_USER
4012 3960 add %i0, 8, %i0
4013 3961 subcc %o3, 8, %o3
4014 3962 bnz %ncc, .co_unalnhalf
4015 3963 add %i1, 8, %i1
4016 3964 ba .co_unalnsrc
4017 3965 nop
4018 3966
4019 3967 ! Src is byte aligned, move bytes until dest 64 byte aligned
4020 3968 .co_unalnbyte:
4021 3969 sub %i1, %i0, %i1 ! share pointer advance
4022 3970 .co_unalnbyte_loop:
4023 3971 ldub [%i0], %o4
4024 3972 sllx %o4, 56, %i3
4025 3973 lduh [%i0+1], %o4
4026 3974 sllx %o4, 40, %o4
4027 3975 or %o4, %i3, %i3
4028 3976 lduh [%i0+3], %o4
4029 3977 sllx %o4, 24, %o4
4030 3978 or %o4, %i3, %i3
4031 3979 lduh [%i0+5], %o4
4032 3980 sllx %o4, 8, %o4
4033 3981 or %o4, %i3, %i3
4034 3982 ldub [%i0+7], %o4
4035 3983 or %o4, %i3, %i3
4036 3984 stxa %i3, [%i1+%i0]ASI_USER
4037 3985 subcc %o3, 8, %o3
4038 3986 bnz %ncc, .co_unalnbyte_loop
4039 3987 add %i0, 8, %i0
4040 3988 add %i1, %i0, %i1 ! restore pointer
4041 3989
4042 3990 ! Destination is now block (64 byte aligned), src is not 8 byte aligned
4043 3991 .co_unalnsrc:
4044 3992 andn %i2, 0x3f, %i3 ! %i3 is multiple of block size
4045 3993 and %i2, 0x3f, %i2 ! residue bytes in %i2
4046 3994 add %i2, 64, %i2 ! Ensure we don't load beyond
4047 3995 sub %i3, 64, %i3 ! end of source buffer
4048 3996
4049 3997 andn %i0, 0x3f, %o4 ! %o4 has block aligned src address
4050 3998 prefetch [%o4 + (3 * CACHE_LINE)], #one_read
4051 3999 alignaddr %i0, %g0, %g0 ! generate %gsr
4052 4000 add %i0, %i3, %i0 ! advance %i0 to after blocks
4053 4001 !
4054 4002 ! Determine source alignment to correct 8 byte offset
4055 4003 andcc %i0, 0x20, %o3
4056 4004 brnz,pn %o3, .co_unaln_1
4057 4005 andcc %i0, 0x10, %o3
4058 4006 brnz,pn %o3, .co_unaln_01
4059 4007 andcc %i0, 0x08, %o3
4060 4008 brz,a %o3, .co_unaln_000
4061 4009 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4062 4010 ba .co_unaln_001
4063 4011 nop
4064 4012 .co_unaln_01:
4065 4013 brnz,a %o3, .co_unaln_011
4066 4014 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4067 4015 ba .co_unaln_010
4068 4016 nop
4069 4017 .co_unaln_1:
4070 4018 brnz,pn %o3, .co_unaln_11
4071 4019 andcc %i0, 0x08, %o3
4072 4020 brnz,a %o3, .co_unaln_101
4073 4021 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4074 4022 ba .co_unaln_100
4075 4023 nop
4076 4024 .co_unaln_11:
4077 4025 brz,pn %o3, .co_unaln_110
4078 4026 prefetch [%i0 + (4 * CACHE_LINE)], #one_read
4079 4027
4080 4028 .co_unaln_111:
4081 4029 ldd [%o4+56], %d14
4082 4030 .co_unaln_111_loop:
4083 4031 add %o4, 64, %o4
4084 4032 ldda [%o4]ASI_BLK_P, %d16
4085 4033 faligndata %d14, %d16, %d48
4086 4034 faligndata %d16, %d18, %d50
4087 4035 faligndata %d18, %d20, %d52
4088 4036 faligndata %d20, %d22, %d54
4089 4037 faligndata %d22, %d24, %d56
4090 4038 faligndata %d24, %d26, %d58
4091 4039 faligndata %d26, %d28, %d60
4092 4040 faligndata %d28, %d30, %d62
4093 4041 fmovd %d30, %d14
4094 4042 stda %d48, [%i1]ASI_BLK_AIUS
4095 4043 subcc %i3, 64, %i3
4096 4044 add %i1, 64, %i1
4097 4045 bgu,pt %ncc, .co_unaln_111_loop
4098 4046 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4099 4047 ba .co_unaln_done
4100 4048 nop
4101 4049
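faligndata extracts eight bytes at the offset latched into %gsr by the earlier alignaddr, taken from the 16-byte concatenation of its two operands; that is why each loop carries the previous block's tail registers as the "high" inputs. An integer model of one faligndata (valid for the nonzero offsets this path handles, since the 8-byte-aligned case went to .co_aln_*):

    #include <stdint.h>

    /* What each faligndata above computes, for off = src & 7, off != 0:
     * the 8 bytes at offset 'off' into the big-endian concatenation hi:lo. */
    static inline uint64_t
    falign(uint64_t hi, uint64_t lo, unsigned off)
    {
        return ((hi << (off * 8)) | (lo >> ((8 - off) * 8)));
    }
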
4102 4050 .co_unaln_110:
4103 4051 ldd [%o4+48], %d12
4104 4052 ldd [%o4+56], %d14
4105 4053 .co_unaln_110_loop:
4106 4054 add %o4, 64, %o4
4107 4055 ldda [%o4]ASI_BLK_P, %d16
4108 4056 faligndata %d12, %d14, %d48
4109 4057 faligndata %d14, %d16, %d50
4110 4058 faligndata %d16, %d18, %d52
4111 4059 faligndata %d18, %d20, %d54
4112 4060 faligndata %d20, %d22, %d56
4113 4061 faligndata %d22, %d24, %d58
4114 4062 faligndata %d24, %d26, %d60
4115 4063 faligndata %d26, %d28, %d62
4116 4064 fmovd %d28, %d12
4117 4065 fmovd %d30, %d14
4118 4066 stda %d48, [%i1]ASI_BLK_AIUS
4119 4067 subcc %i3, 64, %i3
4120 4068 add %i1, 64, %i1
4121 4069 bgu,pt %ncc, .co_unaln_110_loop
4122 4070 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4123 4071 ba .co_unaln_done
4124 4072 nop
4125 4073
4126 4074 .co_unaln_101:
4127 4075 ldd [%o4+40], %d10
4128 4076 ldd [%o4+48], %d12
4129 4077 ldd [%o4+56], %d14
4130 4078 .co_unaln_101_loop:
4131 4079 add %o4, 64, %o4
4132 4080 ldda [%o4]ASI_BLK_P, %d16
4133 4081 faligndata %d10, %d12, %d48
4134 4082 faligndata %d12, %d14, %d50
4135 4083 faligndata %d14, %d16, %d52
4136 4084 faligndata %d16, %d18, %d54
4137 4085 faligndata %d18, %d20, %d56
4138 4086 faligndata %d20, %d22, %d58
4139 4087 faligndata %d22, %d24, %d60
4140 4088 faligndata %d24, %d26, %d62
4141 4089 fmovd %d26, %d10
4142 4090 fmovd %d28, %d12
4143 4091 fmovd %d30, %d14
4144 4092 stda %d48, [%i1]ASI_BLK_AIUS
4145 4093 subcc %i3, 64, %i3
4146 4094 add %i1, 64, %i1
4147 4095 bgu,pt %ncc, .co_unaln_101_loop
4148 4096 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4149 4097 ba .co_unaln_done
4150 4098 nop
4151 4099
4152 4100 .co_unaln_100:
4153 4101 ldd [%o4+32], %d8
4154 4102 ldd [%o4+40], %d10
4155 4103 ldd [%o4+48], %d12
4156 4104 ldd [%o4+56], %d14
4157 4105 .co_unaln_100_loop:
4158 4106 add %o4, 64, %o4
4159 4107 ldda [%o4]ASI_BLK_P, %d16
4160 4108 faligndata %d8, %d10, %d48
4161 4109 faligndata %d10, %d12, %d50
4162 4110 faligndata %d12, %d14, %d52
4163 4111 faligndata %d14, %d16, %d54
4164 4112 faligndata %d16, %d18, %d56
4165 4113 faligndata %d18, %d20, %d58
4166 4114 faligndata %d20, %d22, %d60
4167 4115 faligndata %d22, %d24, %d62
4168 4116 fmovd %d24, %d8
4169 4117 fmovd %d26, %d10
4170 4118 fmovd %d28, %d12
4171 4119 fmovd %d30, %d14
4172 4120 stda %d48, [%i1]ASI_BLK_AIUS
4173 4121 subcc %i3, 64, %i3
4174 4122 add %i1, 64, %i1
4175 4123 bgu,pt %ncc, .co_unaln_100_loop
4176 4124 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4177 4125 ba .co_unaln_done
4178 4126 nop
4179 4127
4180 4128 .co_unaln_011:
4181 4129 ldd [%o4+24], %d6
4182 4130 ldd [%o4+32], %d8
4183 4131 ldd [%o4+40], %d10
4184 4132 ldd [%o4+48], %d12
4185 4133 ldd [%o4+56], %d14
4186 4134 .co_unaln_011_loop:
4187 4135 add %o4, 64, %o4
4188 4136 ldda [%o4]ASI_BLK_P, %d16
4189 4137 faligndata %d6, %d8, %d48
4190 4138 faligndata %d8, %d10, %d50
4191 4139 faligndata %d10, %d12, %d52
4192 4140 faligndata %d12, %d14, %d54
4193 4141 faligndata %d14, %d16, %d56
4194 4142 faligndata %d16, %d18, %d58
4195 4143 faligndata %d18, %d20, %d60
4196 4144 faligndata %d20, %d22, %d62
4197 4145 fmovd %d22, %d6
4198 4146 fmovd %d24, %d8
4199 4147 fmovd %d26, %d10
4200 4148 fmovd %d28, %d12
4201 4149 fmovd %d30, %d14
4202 4150 stda %d48, [%i1]ASI_BLK_AIUS
4203 4151 subcc %i3, 64, %i3
4204 4152 add %i1, 64, %i1
4205 4153 bgu,pt %ncc, .co_unaln_011_loop
4206 4154 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4207 4155 ba .co_unaln_done
4208 4156 nop
4209 4157
4210 4158 .co_unaln_010:
4211 4159 ldd [%o4+16], %d4
4212 4160 ldd [%o4+24], %d6
4213 4161 ldd [%o4+32], %d8
4214 4162 ldd [%o4+40], %d10
4215 4163 ldd [%o4+48], %d12
4216 4164 ldd [%o4+56], %d14
4217 4165 .co_unaln_010_loop:
4218 4166 add %o4, 64, %o4
4219 4167 ldda [%o4]ASI_BLK_P, %d16
4220 4168 faligndata %d4, %d6, %d48
4221 4169 faligndata %d6, %d8, %d50
4222 4170 faligndata %d8, %d10, %d52
4223 4171 faligndata %d10, %d12, %d54
4224 4172 faligndata %d12, %d14, %d56
4225 4173 faligndata %d14, %d16, %d58
4226 4174 faligndata %d16, %d18, %d60
4227 4175 faligndata %d18, %d20, %d62
4228 4176 fmovd %d20, %d4
4229 4177 fmovd %d22, %d6
4230 4178 fmovd %d24, %d8
4231 4179 fmovd %d26, %d10
4232 4180 fmovd %d28, %d12
4233 4181 fmovd %d30, %d14
4234 4182 stda %d48, [%i1]ASI_BLK_AIUS
4235 4183 subcc %i3, 64, %i3
4236 4184 add %i1, 64, %i1
4237 4185 bgu,pt %ncc, .co_unaln_010_loop
4238 4186 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4239 4187 ba .co_unaln_done
4240 4188 nop
4241 4189
4242 4190 .co_unaln_001:
4243 4191 ldd [%o4+8], %d2
4244 4192 ldd [%o4+16], %d4
4245 4193 ldd [%o4+24], %d6
4246 4194 ldd [%o4+32], %d8
4247 4195 ldd [%o4+40], %d10
4248 4196 ldd [%o4+48], %d12
4249 4197 ldd [%o4+56], %d14
4250 4198 .co_unaln_001_loop:
4251 4199 add %o4, 64, %o4
4252 4200 ldda [%o4]ASI_BLK_P, %d16
4253 4201 faligndata %d2, %d4, %d48
4254 4202 faligndata %d4, %d6, %d50
4255 4203 faligndata %d6, %d8, %d52
4256 4204 faligndata %d8, %d10, %d54
4257 4205 faligndata %d10, %d12, %d56
4258 4206 faligndata %d12, %d14, %d58
4259 4207 faligndata %d14, %d16, %d60
4260 4208 faligndata %d16, %d18, %d62
4261 4209 fmovd %d18, %d2
4262 4210 fmovd %d20, %d4
4263 4211 fmovd %d22, %d6
4264 4212 fmovd %d24, %d8
4265 4213 fmovd %d26, %d10
4266 4214 fmovd %d28, %d12
4267 4215 fmovd %d30, %d14
4268 4216 stda %d48, [%i1]ASI_BLK_AIUS
4269 4217 subcc %i3, 64, %i3
4270 4218 add %i1, 64, %i1
4271 4219 bgu,pt %ncc, .co_unaln_001_loop
4272 4220 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4273 4221 ba .co_unaln_done
4274 4222 nop
4275 4223
4276 4224 .co_unaln_000:
4277 4225 ldda [%o4]ASI_BLK_P, %d0
4278 4226 .co_unaln_000_loop:
4279 4227 add %o4, 64, %o4
4280 4228 ldda [%o4]ASI_BLK_P, %d16
4281 4229 faligndata %d0, %d2, %d48
4282 4230 faligndata %d2, %d4, %d50
4283 4231 faligndata %d4, %d6, %d52
4284 4232 faligndata %d6, %d8, %d54
4285 4233 faligndata %d8, %d10, %d56
4286 4234 faligndata %d10, %d12, %d58
4287 4235 faligndata %d12, %d14, %d60
4288 4236 faligndata %d14, %d16, %d62
4289 4237 fmovd %d16, %d0
4290 4238 fmovd %d18, %d2
4291 4239 fmovd %d20, %d4
4292 4240 fmovd %d22, %d6
4293 4241 fmovd %d24, %d8
4294 4242 fmovd %d26, %d10
4295 4243 fmovd %d28, %d12
4296 4244 fmovd %d30, %d14
4297 4245 stda %d48, [%i1]ASI_BLK_AIUS
4298 4246 subcc %i3, 64, %i3
4299 4247 add %i1, 64, %i1
4300 4248 bgu,pt %ncc, .co_unaln_000_loop
4301 4249 prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4302 4250
4303 4251 .co_unaln_done:
4304 4252 ! Handle trailing bytes, 64 to 127
4305 4253 ! Dest long word aligned, Src not long word aligned
4306 4254 cmp %i2, 15
4307 4255 bleu %ncc, .co_unaln_short
4308 4256
4309 4257 andn %i2, 0x7, %i3 ! %i3 is multiple of 8
4310 4258 and %i2, 0x7, %i2 ! residue bytes in %i2
4311 4259 add %i2, 8, %i2
4312 4260 sub %i3, 8, %i3 ! ensure we don't load past end of src
4313 4261 andn %i0, 0x7, %o4 ! %o4 has long word aligned src address
4314 4262 add %i0, %i3, %i0 ! advance %i0 to after multiple of 8
4315 4263 ldd [%o4], %d0 ! fetch partial word
4316 4264 .co_unaln_by8:
4317 4265 ldd [%o4+8], %d2
4318 4266 add %o4, 8, %o4
4319 4267 faligndata %d0, %d2, %d16
4320 4268 subcc %i3, 8, %i3
4321 4269 stda %d16, [%i1]%asi
4322 4270 fmovd %d2, %d0
4323 4271 bgu,pt %ncc, .co_unaln_by8
4324 4272 add %i1, 8, %i1
4325 4273
4326 4274 .co_unaln_short:
4327 4275 cmp %i2, 8
4328 4276 blt,pt %ncc, .co_unalnfin
4329 4277 nop
4330 4278 ldub [%i0], %o4
4331 4279 sll %o4, 24, %o3
4332 4280 ldub [%i0+1], %o4
4333 4281 sll %o4, 16, %o4
4334 4282 or %o4, %o3, %o3
4335 4283 ldub [%i0+2], %o4
4336 4284 sll %o4, 8, %o4
4337 4285 or %o4, %o3, %o3
4338 4286 ldub [%i0+3], %o4
4339 4287 or %o4, %o3, %o3
4340 4288 stwa %o3, [%i1]%asi
4341 4289 ldub [%i0+4], %o4
4342 4290 sll %o4, 24, %o3
4343 4291 ldub [%i0+5], %o4
4344 4292 sll %o4, 16, %o4
4345 4293 or %o4, %o3, %o3
4346 4294 ldub [%i0+6], %o4
4347 4295 sll %o4, 8, %o4
4348 4296 or %o4, %o3, %o3
4349 4297 ldub [%i0+7], %o4
4350 4298 or %o4, %o3, %o3
4351 4299 stwa %o3, [%i1+4]%asi
4352 4300 add %i0, 8, %i0
4353 4301 add %i1, 8, %i1
4354 4302 sub %i2, 8, %i2
4355 4303 .co_unalnfin:
4356 4304 cmp %i2, 4
4357 4305 blt,pt %ncc, .co_unalnz
4358 4306 tst %i2
4359 4307 ldub [%i0], %o3 ! read byte
4360 4308 subcc %i2, 4, %i2 ! reduce count by 4
4361 4309 sll %o3, 24, %o3 ! position
4362 4310 ldub [%i0+1], %o4
4363 4311 sll %o4, 16, %o4 ! position
4364 4312 or %o4, %o3, %o3 ! merge
4365 4313 ldub [%i0+2], %o4
4366 4314 sll %o4, 8, %o4 ! position
4367 4315 or %o4, %o3, %o3 ! merge
4368 4316 add %i1, 4, %i1 ! advance dst by 4
4369 4317 ldub [%i0+3], %o4
4370 4318 add %i0, 4, %i0 ! advance src by 4
4371 4319 or %o4, %o3, %o4 ! merge
4372 4320 bnz,pt %ncc, .co_unaln3x
4373 4321 stwa %o4, [%i1-4]%asi
4374 4322 ba .co_exit
4375 4323 nop
4376 4324 .co_unalnz:
4377 4325 bz,pt %ncc, .co_exit
4378 4326 wr %l5, %g0, %gsr ! restore %gsr
4379 4327 .co_unaln3x: ! Exactly 1, 2, or 3 bytes remain
4380 4328 subcc %i2, 1, %i2 ! reduce count for cc test
4381 4329 ldub [%i0], %o4 ! load one byte
4382 4330 bz,pt %ncc, .co_exit
4383 4331 stba %o4, [%i1]%asi ! store one byte
4384 4332 ldub [%i0+1], %o4 ! load second byte
4385 4333 subcc %i2, 1, %i2
4386 4334 bz,pt %ncc, .co_exit
4387 4335 stba %o4, [%i1+1]%asi ! store second byte
4388 4336 ldub [%i0+2], %o4 ! load third byte
4389 4337 stba %o4, [%i1+2]%asi ! store third byte
4390 4338 .co_exit:
4391 4339 brnz %g1, .co_fp_restore
4392 4340 nop
4393 4341 FZERO
4394 4342 wr %g1, %g0, %fprs
4395 4343 ba,pt %ncc, .co_ex2
4396 4344 membar #Sync
4397 4345 .co_fp_restore:
4398 4346 BLD_FP_FROMSTACK(%o4)
4399 4347 .co_ex2:
4400 4348 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
4401 4349 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
4402 4350 ret
4403 4351 restore %g0, 0, %o0
4404 4352
4405 4353 .copyout_err:
4406 4354 ldn [THREAD_REG + T_COPYOPS], %o4
4407 4355 brz %o4, 2f
4408 4356 nop
4409 4357 ldn [%o4 + CP_COPYOUT], %g2
4410 4358 jmp %g2
4411 4359 nop
4412 4360 2:
4413 4361 retl
4414 4362 mov -1, %o0
4415 4363
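The error tail consults t_copyops so a subsystem (watchpoints, for instance) can interpose its own copy routines; -1 is returned only when no override is registered. In C the dispatch amounts to the following (struct and member shapes are assumptions inferred from the T_COPYOPS/CP_COPYOUT offsets used above):

    #include <stddef.h>

    /* Hypothetical C rendering of the .copyout_err tail. */
    struct copyops { int (*cp_copyout)(const void *, void *, size_t); };
    struct kthread { struct copyops *t_copyops; };

    static int
    copyout_err(struct kthread *t, const void *kaddr, void *uaddr, size_t cnt)
    {
        if (t->t_copyops != NULL)       /* ldn [THREAD_REG + T_COPYOPS] */
            return (t->t_copyops->cp_copyout(kaddr, uaddr, cnt));
        return (-1);                    /* mov -1, %o0 */
    }
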
4416 4364 #else /* NIAGARA_IMPL */
4417 4365 .do_copyout:
4418 4366 !
4419 4367 ! Check the length and bail if zero.
4420 4368 !
4421 4369 tst %o2
4422 4370 bnz,pt %ncc, 1f
4423 4371 nop
4424 4372 retl
4425 4373 clr %o0
4426 4374 1:
4427 4375 sethi %hi(copyio_fault), %o4
4428 4376 or %o4, %lo(copyio_fault), %o4
4429 4377 sethi %hi(copyio_fault_nowindow), %o3
4430 4378 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
4431 4379 or %o3, %lo(copyio_fault_nowindow), %o3
4432 4380 membar #Sync
4433 4381 stn %o3, [THREAD_REG + T_LOFAULT]
4434 4382
4435 4383 mov %o0, SAVE_SRC
4436 4384 mov %o1, SAVE_DST
4437 4385 mov %o2, SAVE_COUNT
4438 4386
4439 4387 !
4440 4388 ! Check to see if we're more than SMALL_LIMIT (7 bytes).
4441 4389 ! Run in leaf mode, using the %o regs as our input regs.
4442 4390 !
4443 4391 subcc %o2, SMALL_LIMIT, %o3
4444 4392 bgu,a,pt %ncc, .dco_ns
4445 4393 or %o0, %o1, %o3
4446 4394 !
4447 4395 ! What was previously ".small_copyout"
4448 4396 ! Do full differenced copy.
4449 4397 !
4450 4398 .dcobcp:
4451 4399 sub %g0, %o2, %o3 ! negate count
4452 4400 add %o0, %o2, %o0 ! make %o0 point at the end
4453 4401 add %o1, %o2, %o1 ! make %o1 point at the end
4454 4402 ba,pt %ncc, .dcocl
4455 4403 ldub [%o0 + %o3], %o4 ! load first byte
4456 4404 !
4457 4405 ! %o0 and %o1 point at the end and remain pointing at the end
4458 4406 ! of their buffers. We pull things out by adding %o3 (which is
4459 4407 ! the negation of the length) to the buffer end which gives us
4460 4408 ! the current location in the buffers. By incrementing %o3 we walk
4461 4409 ! through both buffers without having to bump each buffer's
4462 4410 ! pointer. A very fast 4 instruction loop.
4463 4411 !
4464 4412 .align 16
4465 4413 .dcocl:
4466 4414 stba %o4, [%o1 + %o3]ASI_USER
4467 4415 inccc %o3
4468 4416 bl,a,pt %ncc, .dcocl
4469 4417 ldub [%o0 + %o3], %o4
4470 4418 !
4471 4419 ! We're done. Go home.
4472 4420 !
4473 4421 membar #Sync
4474 4422 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
4475 4423 retl
4476 4424 clr %o0
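
The "very fast 4 instruction loop" works because both buffers are addressed from their ends through one shared negative index, and the inccc that advances the index also sets the condition codes for the exit test. Directly in C (count must be nonzero, as the caller already ensured):

    #include <stddef.h>

    /* The .dcocl differenced byte loop: end pointers plus one negative
     * index; the increment doubles as the loop test (inccc/bl). */
    static void
    bytecopy(const char *src, char *dst, size_t count)
    {
        ptrdiff_t i = -(ptrdiff_t)count;

        src += count;                   /* park both at the ends */
        dst += count;
        do {
            dst[i] = src[i];            /* stba [%o1 + %o3]ASI_USER */
        } while (++i < 0);              /* inccc %o3; bl,a .dcocl */
    }
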
4477 4425 !
4478 4426 ! Try aligned copies from here.
4479 4427 !
4480 4428 .dco_ns:
4481 4429 ! %o0 = kernel addr (to be copied from)
4482 4430 ! %o1 = user addr (to be copied to)
4483 4431 ! %o2 = length
4484 4432 ! %o3 = %o1 | %o2 (used for alignment checking)
4485 4433 ! %o4 is alternate lo_fault
4486 4434 ! %o5 is original lo_fault
4487 4435 !
4488 4436 ! See if we're single byte aligned. If we are, check the
4489 4437 ! limit for single byte copies. If we're smaller or equal,
4490 4438 ! bounce to the byte for byte copy loop. Otherwise do it in
4491 4439 ! HW (if enabled).
4492 4440 !
4493 4441 btst 1, %o3
4494 4442 bz,pt %icc, .dcoh8
4495 4443 btst 7, %o3
4496 4444 !
4497 4445 ! Single byte aligned. Do we do it via HW or via
4498 4446 ! byte for byte? Do a quick no memory reference
4499 4447 ! check to pick up small copies.
4500 4448 !
4501 4449 sethi %hi(hw_copy_limit_1), %o3
4502 4450 !
4503 4451 ! Big enough that we need to check the HW limit for
4504 4452 ! this size copy.
4505 4453 !
4506 4454 ld [%o3 + %lo(hw_copy_limit_1)], %o3
4507 4455 !
4508 4456 ! Is HW copy on? If not, do everything byte for byte.
4509 4457 !
4510 4458 tst %o3
4511 4459 bz,pn %icc, .dcobcp
4512 4460 subcc %o3, %o2, %o3
4513 4461 !
4514 4462 ! If we're less than or equal to the single byte copy limit,
4515 4463 ! bop to the copy loop.
4516 4464 !
4517 4465 bge,pt %ncc, .dcobcp
4518 4466 nop
4519 4467 !
4520 4468 ! We're big enough and copy is on. Do it with HW.
4521 4469 !
4522 4470 ba,pt %ncc, .big_copyout
4523 4471 nop
4524 4472 .dcoh8:
4525 4473 !
4526 4474 ! 8 byte aligned?
4527 4475 !
4528 4476 bnz,a %ncc, .dcoh4
4529 4477 btst 3, %o3
4530 4478 !
4531 4479 ! See if we're in the "small range".
4532 4480 ! If so, go off and do the copy.
4533 4481 ! If not, load the hard limit. %o3 is
4534 4482 ! available for reuse.
4535 4483 !
4536 4484 sethi %hi(hw_copy_limit_8), %o3
4537 4485 ld [%o3 + %lo(hw_copy_limit_8)], %o3
4538 4486 !
4539 4487 ! If it's zero, there's no HW bcopy.
4540 4488 ! Bop off to the aligned copy.
4541 4489 !
4542 4490 tst %o3
4543 4491 bz,pn %icc, .dcos8
4544 4492 subcc %o3, %o2, %o3
4545 4493 !
4546 4494 ! We're negative if our size is larger than hw_copy_limit_8.
4547 4495 !
4548 4496 bge,pt %ncc, .dcos8
4549 4497 nop
4550 4498 !
4551 4499 ! HW assist is on and we're large enough. Do it.
4552 4500 !
4553 4501 ba,pt %ncc, .big_copyout
4554 4502 nop
4555 4503 .dcos8:
4556 4504 !
4557 4505 ! Housekeeping for copy loops. Uses same idea as in the byte for
4558 4506 ! byte copy loop above.
4559 4507 !
4560 4508 add %o0, %o2, %o0
4561 4509 add %o1, %o2, %o1
4562 4510 sub %g0, %o2, %o3
4563 4511 ba,pt %ncc, .dodebc
4564 4512 srl %o2, 3, %o2 ! Number of 8 byte chunks to copy
4565 4513 !
4566 4514 ! 4 byte aligned?
4567 4515 !
4568 4516 .dcoh4:
4569 4517 bnz,pn %ncc, .dcoh2
4570 4518 !
4571 4519 ! See if we're in the "small range".
4572 4520 ! If so, go off and do the copy.
4573 4521 ! If not, load the hard limit. %o3 is
4574 4522 ! available for reuse.
4575 4523 !
4576 4524 sethi %hi(hw_copy_limit_4), %o3
4577 4525 ld [%o3 + %lo(hw_copy_limit_4)], %o3
4578 4526 !
4579 4527 ! If it's zero, there's no HW bcopy.
4580 4528 ! Bop off to the aligned copy.
4581 4529 !
4582 4530 tst %o3
4583 4531 bz,pn %icc, .dcos4
4584 4532 subcc %o3, %o2, %o3
4585 4533 !
4586 4534 ! We're negative if our size is larger than hw_copy_limit_4.
4587 4535 !
4588 4536 bge,pt %ncc, .dcos4
4589 4537 nop
4590 4538 !
4591 4539 ! HW assist is on and we're large enough. Do it.
4592 4540 !
4593 4541 ba,pt %ncc, .big_copyout
4594 4542 nop
4595 4543 .dcos4:
4596 4544 add %o0, %o2, %o0
4597 4545 add %o1, %o2, %o1
4598 4546 sub %g0, %o2, %o3
4599 4547 ba,pt %ncc, .dodfbc
4600 4548 srl %o2, 2, %o2 ! Number of 4 byte chunks to copy
4601 4549 !
4602 4550 ! We must be 2 byte aligned. Off we go.
4603 4551 ! The check for small copies was done in the
4604 4552 ! delay at .dcoh4
4605 4553 !
4606 4554 .dcoh2:
4607 4555 ble %ncc, .dcos2
4608 4556 sethi %hi(hw_copy_limit_2), %o3
4609 4557 ld [%o3 + %lo(hw_copy_limit_2)], %o3
4610 4558 tst %o3
4611 4559 bz,pn %icc, .dcos2
4612 4560 subcc %o3, %o2, %o3
4613 4561 bge,pt %ncc, .dcos2
4614 4562 nop
4615 4563 !
4616 4564 ! HW is on and we're big enough. Do it.
4617 4565 !
4618 4566 ba,pt %ncc, .big_copyout
4619 4567 nop
4620 4568 .dcos2:
4621 4569 add %o0, %o2, %o0
4622 4570 add %o1, %o2, %o1
4623 4571 sub %g0, %o2, %o3
4624 4572 ba,pt %ncc, .dodtbc
4625 4573 srl %o2, 1, %o2 ! Number of 2 byte chunks to copy
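
All four .dcoh* paths repeat one decision: a zero hw_copy_limit_N disables the block-copy engine for that alignment class, and copies at or below the limit stay on the simple loop. Reduced to C (loop_copy and hw_copy are invented stand-ins for the loops above and .big_copyout; the tunable's type is assumed):

    #include <stddef.h>

    extern int hw_copy_limit_8;         /* per-alignment tunable */
    extern int loop_copy(void *, const void *, size_t);  /* stand-in */
    extern int hw_copy(void *, const void *, size_t);    /* stand-in */

    static int
    dispatch(void *dst, const void *src, size_t len)
    {
        if (hw_copy_limit_8 == 0 ||             /* HW copy disabled */
            len <= (size_t)hw_copy_limit_8)     /* too small to bother */
            return (loop_copy(dst, src, len));  /* .dcos8 */
        return (hw_copy(dst, src, len));        /* .big_copyout */
    }
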
4626 4574 .small_copyout:
4627 4575 !
4628 4576 ! Why are we doing this AGAIN? There are certain conditions in
4629 4577 ! big_copyout that will cause us to forgo the HW assisted copies
4630 4578 ! and bounce back to a non-HW assisted copy. This dispatches those
4631 4579 ! copies. Note that we branch around this in the main line code.
4632 4580 !
4633 4581 ! We make no check for limits or HW enablement here. We've
4634 4582 ! already been told that we're a poster child so just go off
4635 4583 ! and do it.
4636 4584 !
4637 4585 or %o0, %o1, %o3
4638 4586 btst 1, %o3
4639 4587 bnz %icc, .dcobcp ! Most likely
4640 4588 btst 7, %o3
4641 4589 bz %icc, .dcos8
4642 4590 btst 3, %o3
4643 4591 bz %icc, .dcos4
4644 4592 nop
4645 4593 ba,pt %ncc, .dcos2
4646 4594 nop
4647 4595 .align 32
4648 4596 .dodebc:
4649 4597 ldx [%o0 + %o3], %o4
4650 4598 deccc %o2
4651 4599 stxa %o4, [%o1 + %o3]ASI_USER
4652 4600 bg,pt %ncc, .dodebc
4653 4601 addcc %o3, 8, %o3
4654 4602 !
4655 4603 ! End of copy loop. Check to see if we're done. Most
4656 4604 ! eight byte aligned copies end here.
4657 4605 !
4658 4606 bz,pt %ncc, .dcofh
4659 4607 nop
4660 4608 !
4661 4609 ! Something is left - do it byte for byte.
4662 4610 !
4663 4611 ba,pt %ncc, .dcocl
4664 4612 ldub [%o0 + %o3], %o4 ! load next byte
4665 4613 !
4666 4614 ! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
4667 4615 !
4668 4616 .align 32
4669 4617 .dodfbc:
4670 4618 lduw [%o0 + %o3], %o4
4671 4619 deccc %o2
4672 4620 sta %o4, [%o1 + %o3]ASI_USER
4673 4621 bg,pt %ncc, .dodfbc
4674 4622 addcc %o3, 4, %o3
4675 4623 !
4676 4624 ! End of copy loop. Check to see if we're done. Most
4677 4625 ! four byte aligned copies end here.
4678 4626 !
4679 4627 bz,pt %ncc, .dcofh
4680 4628 nop
4681 4629 !
4682 4630 ! Something is left. Do it byte for byte.
4683 4631 !
4684 4632 ba,pt %ncc, .dcocl
4685 4633 ldub [%o0 + %o3], %o4 ! load next byte
4686 4634 !
4687 4635 ! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
4688 4636 ! copy.
4689 4637 !
4690 4638 .align 32
4691 4639 .dodtbc:
4692 4640 lduh [%o0 + %o3], %o4
4693 4641 deccc %o2
4694 4642 stha %o4, [%o1 + %o3]ASI_USER
4695 4643 bg,pt %ncc, .dodtbc
4696 4644 addcc %o3, 2, %o3
4697 4645 !
4698 4646 ! End of copy loop. Anything left?
4699 4647 !
4700 4648 bz,pt %ncc, .dcofh
4701 4649 nop
4702 4650 !
4703 4651 ! Deal with the last byte
4704 4652 !
4705 4653 ldub [%o0 + %o3], %o4
4706 4654 stba %o4, [%o1 + %o3]ASI_USER
4707 4655 .dcofh:
4708 4656 membar #Sync
4709 4657 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
4710 4658 retl
4711 4659 clr %o0
4712 4660
4713 4661 .big_copyout:
4714 4662 ! We're going to go off and do a block copy.
4715 4663 ! Switch fault handlers and grab a window. We
4716 4664 ! don't do a membar #Sync since we've touched only
4717 4665 ! kernel data to this point.
4718 4666 stn %o4, [THREAD_REG + T_LOFAULT]
4719 4667
4720 4668 ! Copyouts that reach here are larger than 256 bytes. The
4721 4669 ! hw_copy_limit_1 is set to 256. Never set this limit to
4722 4670 ! less than 128 bytes.
4723 4671 save %sp, -SA(MINFRAME), %sp
4724 4672 .do_block_copyout:
4725 4673
4726 4674 ! Swap src/dst since the code below is memcpy code
4727 4675 ! and memcpy/bcopy have different calling sequences
4728 4676 mov %i1, %i5
4729 4677 mov %i0, %i1
4730 4678 mov %i5, %i0
4731 4679
4732 4680 ! Block (64 bytes) align the destination.
4733 4681 andcc %i0, 0x3f, %i3 ! is dst block aligned
4734 4682 bz %ncc, copyout_blalign ! dst already block aligned
4735 4683 sub %i3, 0x40, %i3
4736 4684 neg %i3 ! bytes till dst 64 bytes aligned
4737 4685 sub %i2, %i3, %i2 ! update i2 with new count
4738 4686
4739 4687 ! Based on source and destination alignment do
4740 4688 ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
4741 4689
4742 4690 ! Is dst & src 8B aligned
4743 4691 or %i0, %i1, %o2
4744 4692 andcc %o2, 0x7, %g0
4745 4693 bz %ncc, .co_alewdcp
4746 4694 nop
4747 4695
4748 4696 ! Is dst & src 4B aligned
4749 4697 andcc %o2, 0x3, %g0
4750 4698 bz %ncc, .co_alwdcp
4751 4699 nop
4752 4700
4753 4701 ! Is dst & src 2B aligned
4754 4702 andcc %o2, 0x1, %g0
4755 4703 bz %ncc, .co_alhlfwdcp
4756 4704 nop
4757 4705
4758 4706 ! 1B aligned
4759 4707 1: ldub [%i1], %o2
4760 4708 stba %o2, [%i0]ASI_USER
4761 4709 inc %i1
4762 4710 deccc %i3
4763 4711 bgu,pt %ncc, 1b
4764 4712 inc %i0
4765 4713
4766 4714 ba copyout_blalign
4767 4715 nop
4768 4716
4769 4717 ! dst & src 4B aligned
4770 4718 .co_alwdcp:
4771 4719 ld [%i1], %o2
4772 4720 sta %o2, [%i0]ASI_USER
4773 4721 add %i1, 0x4, %i1
4774 4722 subcc %i3, 0x4, %i3
4775 4723 bgu,pt %ncc, .co_alwdcp
4776 4724 add %i0, 0x4, %i0
4777 4725
4778 4726 ba copyout_blalign
4779 4727 nop
4780 4728
4781 4729 ! dst & src 2B aligned
4782 4730 .co_alhlfwdcp:
4783 4731 lduh [%i1], %o2
4784 4732 stuha %o2, [%i0]ASI_USER
4785 4733 add %i1, 0x2, %i1
4786 4734 subcc %i3, 0x2, %i3
4787 4735 bgu,pt %ncc, .co_alhlfwdcp
4788 4736 add %i0, 0x2, %i0
4789 4737
4790 4738 ba copyout_blalign
4791 4739 nop
4792 4740
4793 4741 ! dst & src 8B aligned
4794 4742 .co_alewdcp:
4795 4743 ldx [%i1], %o2
4796 4744 stxa %o2, [%i0]ASI_USER
4797 4745 add %i1, 0x8, %i1
4798 4746 subcc %i3, 0x8, %i3
4799 4747 bgu,pt %ncc, .co_alewdcp
4800 4748 add %i0, 0x8, %i0
4801 4749
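ORing the two addresses merges their misalignments, so a single andcc per size class finds the widest chunk both pointers support; that is the whole content of the .co_alewdcp/.co_alwdcp/.co_alhlfwdcp selection above. In C:

    #include <stdint.h>

    /* The or/andcc mutual-alignment test: the low bits of (src | dst)
     * expose the coarsest chunk size both pointers can use. */
    static unsigned
    chunk_size(const void *src, const void *dst)
    {
        uintptr_t both = (uintptr_t)src | (uintptr_t)dst;

        if ((both & 0x7) == 0)
            return (8);                 /* .co_alewdcp */
        if ((both & 0x3) == 0)
            return (4);                 /* .co_alwdcp */
        if ((both & 0x1) == 0)
            return (2);                 /* .co_alhlfwdcp */
        return (1);                     /* byte loop */
    }
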
4802 4750 ! Now Destination is block (64 bytes) aligned
4803 4751 copyout_blalign:
4804 4752 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
4805 4753 sub %i2, %i3, %i2 ! Residue bytes in %i2
4806 4754
4807 4755 mov ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
4808 4756
4809 4757 andcc %i1, 0xf, %o2 ! is src quadword aligned
4810 4758 bz,pn %xcc, .co_blkcpy ! src offset in %o2 (last 4-bits)
4811 4759 nop
4812 4760 cmp %o2, 0x8
4813 4761 bg .co_upper_double
4814 4762 nop
4815 4763 bl .co_lower_double
4816 4764 nop
4817 4765
4818 4766 ! Falls through when source offset is equal to 8 i.e.
4819 4767 ! source is double word aligned.
4820 4768 ! In this case no shift/merge of data is required
4821 4769
4822 4770 sub %i1, %o2, %i1 ! align the src at 16 bytes.
4823 4771 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
4824 4772 prefetch [%l0+0x0], #one_read
4825 4773 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4826 4774 .co_loop0:
4827 4775 add %i1, 0x10, %i1
4828 4776 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4829 4777 prefetch [%l0+0x40], #one_read
4830 4778
4831 4779 stxa %l3, [%i0+0x0]%asi
4832 4780 stxa %l4, [%i0+0x8]%asi
4833 4781
4834 4782 add %i1, 0x10, %i1
4835 4783 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4836 4784
4837 4785 stxa %l5, [%i0+0x10]%asi
4838 4786 stxa %l2, [%i0+0x18]%asi
4839 4787
4840 4788 add %i1, 0x10, %i1
4841 4789 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4842 4790
4843 4791 stxa %l3, [%i0+0x20]%asi
4844 4792 stxa %l4, [%i0+0x28]%asi
4845 4793
4846 4794 add %i1, 0x10, %i1
4847 4795 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4848 4796
4849 4797 stxa %l5, [%i0+0x30]%asi
4850 4798 stxa %l2, [%i0+0x38]%asi
4851 4799
4852 4800 add %l0, 0x40, %l0
4853 4801 subcc %i3, 0x40, %i3
4854 4802 bgu,pt %xcc, .co_loop0
4855 4803 add %i0, 0x40, %i0
4856 4804 ba .co_blkdone
4857 4805 add %i1, %o2, %i1 ! increment the source by src offset
4858 4806 ! the src offset was stored in %o2
4859 4807
4860 4808 .co_lower_double:
4861 4809
4862 4810 sub %i1, %o2, %i1 ! align the src at 16 bytes.
4863 4811 sll %o2, 3, %o0 ! %o0 left shift
4864 4812 mov 0x40, %o1
4865 4813 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
4866 4814 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
4867 4815 prefetch [%l0+0x0], #one_read
4868 4816 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! %l2 has partial data;
4869 4817 ! %l3 has complete data
4870 4818 .co_loop1:
4871 4819 add %i1, 0x10, %i1
4872 4820 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has partial data
4873 4821 ! for this read.
4874 4822 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
4875 4823 ! into %l2 and %l3
4876 4824 prefetch [%l0+0x40], #one_read
4877 4825
4878 4826 stxa %l2, [%i0+0x0]%asi
4879 4827 stxa %l3, [%i0+0x8]%asi
4880 4828
4881 4829 add %i1, 0x10, %i1
4882 4830 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4883 4831 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
4884 4832 ! %l4 from previous read
4885 4833 ! into %l4 and %l5
4886 4834 stxa %l4, [%i0+0x10]%asi
4887 4835 stxa %l5, [%i0+0x18]%asi
4888 4836
4889 4837 ! Repeat the same for next 32 bytes.
4890 4838
4891 4839 add %i1, 0x10, %i1
4892 4840 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4893 4841 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
4894 4842
4895 4843 stxa %l2, [%i0+0x20]%asi
4896 4844 stxa %l3, [%i0+0x28]%asi
4897 4845
4898 4846 add %i1, 0x10, %i1
4899 4847 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4900 4848 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
4901 4849
4902 4850 stxa %l4, [%i0+0x30]%asi
4903 4851 stxa %l5, [%i0+0x38]%asi
4904 4852
4905 4853 add %l0, 0x40, %l0
4906 4854 subcc %i3, 0x40, %i3
4907 4855 bgu,pt %xcc, .co_loop1
4908 4856 add %i0, 0x40, %i0
4909 4857 ba .co_blkdone
4910 4858 add %i1, %o2, %i1 ! increment the source by src offset
4911 4859 ! the src offset was stored in %o2
4912 4860
4913 4861 .co_upper_double:
4914 4862
4915 4863 sub %i1, %o2, %i1 ! align the src at 16 bytes.
4916 4864 sub %o2, 0x8, %o0
4917 4865 sll %o0, 3, %o0 ! %o0 left shift
4918 4866 mov 0x40, %o1
4919 4867 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
4920 4868 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
4921 4869 prefetch [%l0+0x0], #one_read
4922 4870 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l3
4923 4871 ! for this read and
4924 4872 ! no data in %l2
4925 4873 .co_loop2:
4926 4874 add %i1, 0x10, %i1
4927 4875 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has complete data
4928 4876 ! and %l5 has partial
4929 4877 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
4930 4878 ! into %l3 and %l4
4931 4879 prefetch [%l0+0x40], #one_read
4932 4880
4933 4881 stxa %l3, [%i0+0x0]%asi
4934 4882 stxa %l4, [%i0+0x8]%asi
4935 4883
4936 4884 add %i1, 0x10, %i1
4937 4885 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4938 4886 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
4939 4887 ! %l5 from previous read
4940 4888 ! into %l5 and %l2
4941 4889
4942 4890 stxa %l5, [%i0+0x10]%asi
4943 4891 stxa %l2, [%i0+0x18]%asi
4944 4892
4945 4893 ! Repeat the same for next 32 bytes.
4946 4894
4947 4895 add %i1, 0x10, %i1
4948 4896 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4949 4897 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
4950 4898
4951 4899 stxa %l3, [%i0+0x20]%asi
4952 4900 stxa %l4, [%i0+0x28]%asi
4953 4901
4954 4902 add %i1, 0x10, %i1
4955 4903 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4956 4904 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
4957 4905
4958 4906 stxa %l5, [%i0+0x30]%asi
4959 4907 stxa %l2, [%i0+0x38]%asi
4960 4908
4961 4909 add %l0, 0x40, %l0
4962 4910 subcc %i3, 0x40, %i3
4963 4911 bgu,pt %xcc, .co_loop2
4964 4912 add %i0, 0x40, %i0
4965 4913 ba .co_blkdone
4966 4914 add %i1, %o2, %i1 ! increment the source by src offset
4967 4915 ! the src offset was stored in %o2
4968 4916
4969 4917
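All three .co_loop* bodies keep a prefetch one 64-byte block ahead of the quad loads ([%l0+0x40] against the current block at [%l0]), so the reads stay a cache line in front of the loop. The pattern, sketched with the GCC builtin (the quad loads and user-ASI stores collapse to memcpy here):

    #include <stdint.h>
    #include <string.h>
    #include <stddef.h>

    void
    blk_loop(uint64_t *dst, const uint64_t *src, size_t nblocks)
    {
        __builtin_prefetch(src, 0, 1);          /* prefetch [%l0+0x0] */
        while (nblocks-- > 0) {
            __builtin_prefetch(src + 8, 0, 1);  /* prefetch [%l0+0x40] */
            memcpy(dst, src, 64);               /* four 16B quad loads,
                                                   eight 8B stores above */
            src += 8;
            dst += 8;
        }
    }
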
4970 4918 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
4971 4919 .co_blkcpy:
4972 4920
4973 4921 andn %i1, 0x3f, %o0 ! %o0 has block aligned source
4974 4922 prefetch [%o0+0x0], #one_read
4975 4923 1:
4976 4924 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
4977 4925 add %i1, 0x10, %i1
4978 4926 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4979 4927 add %i1, 0x10, %i1
4980 4928
4981 4929 prefetch [%o0+0x40], #one_read
4982 4930
4983 4931 stxa %l0, [%i0+0x0]%asi
4984 4932
4985 4933 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4986 4934 add %i1, 0x10, %i1
4987 4935 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
4988 4936 add %i1, 0x10, %i1
4989 4937
4990 4938 stxa %l1, [%i0+0x8]%asi
4991 4939 stxa %l2, [%i0+0x10]%asi
4992 4940 stxa %l3, [%i0+0x18]%asi
4993 4941 stxa %l4, [%i0+0x20]%asi
4994 4942 stxa %l5, [%i0+0x28]%asi
4995 4943 stxa %l6, [%i0+0x30]%asi
4996 4944 stxa %l7, [%i0+0x38]%asi
4997 4945
4998 4946 add %o0, 0x40, %o0
4999 4947 subcc %i3, 0x40, %i3
5000 4948 bgu,pt %xcc, 1b
5001 4949 add %i0, 0x40, %i0
5002 4950
5003 4951 .co_blkdone:
5004 4952 membar #Sync
5005 4953
5006 4954 brz,pt %i2, .copyout_exit
5007 4955 nop
5008 4956
5009 4957 ! Handle trailing bytes
5010 4958 cmp %i2, 0x8
5011 4959 blu,pt %ncc, .co_residue
5012 4960 nop
5013 4961
5014 4962 ! Can we do some 8B ops
5015 4963 or %i1, %i0, %o2
5016 4964 andcc %o2, 0x7, %g0
5017 4965 bnz %ncc, .co_last4
5018 4966 nop
5019 4967
5020 4968 ! Do 8byte ops as long as possible
5021 4969 .co_last8:
5022 4970 ldx [%i1], %o2
5023 4971 stxa %o2, [%i0]ASI_USER
5024 4972 add %i1, 0x8, %i1
5025 4973 sub %i2, 0x8, %i2
5026 4974 cmp %i2, 0x8
5027 4975 bgu,pt %ncc, .co_last8
5028 4976 add %i0, 0x8, %i0
5029 4977
5030 4978 brz,pt %i2, .copyout_exit
5031 4979 nop
5032 4980
5033 4981 ba .co_residue
5034 4982 nop
5035 4983
5036 4984 .co_last4:
5037 4985 ! Can we do 4B ops
5038 4986 andcc %o2, 0x3, %g0
5039 4987 bnz %ncc, .co_last2
5040 4988 nop
5041 4989 1:
5042 4990 ld [%i1], %o2
5043 4991 sta %o2, [%i0]ASI_USER
5044 4992 add %i1, 0x4, %i1
5045 4993 sub %i2, 0x4, %i2
5046 4994 cmp %i2, 0x4
5047 4995 bgu,pt %ncc, 1b
5048 4996 add %i0, 0x4, %i0
5049 4997
5050 4998 brz,pt %i2, .copyout_exit
5051 4999 nop
5052 5000
5053 5001 ba .co_residue
5054 5002 nop
5055 5003
5056 5004 .co_last2:
5057 5005 ! Can we do 2B ops
5058 5006 andcc %o2, 0x1, %g0
5059 5007 bnz %ncc, .co_residue
5060 5008 nop
5061 5009
5062 5010 1:
5063 5011 lduh [%i1], %o2
5064 5012 stuha %o2, [%i0]ASI_USER
5065 5013 add %i1, 0x2, %i1
5066 5014 sub %i2, 0x2, %i2
5067 5015 cmp %i2, 0x2
5068 5016 bgu,pt %ncc, 1b
5069 5017 add %i0, 0x2, %i0
5070 5018
5071 5019 brz,pt %i2, .copyout_exit
5072 5020 nop
5073 5021
5074 5022 ! Copy the residue as byte copy
5075 5023 .co_residue:
5076 5024 ldub [%i1], %i4
5077 5025 stba %i4, [%i0]ASI_USER
5078 5026 inc %i1
5079 5027 deccc %i2
5080 5028 bgu,pt %xcc, .co_residue
5081 5029 inc %i0
5082 5030
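The trailing-byte ladder above (.co_last8/.co_last4/.co_last2/.co_residue)
picks the widest store that the joint alignment of both pointers allows, then
falls through to single bytes. The same policy in C, as a sketch (copy_tail is
illustrative; the real stores go through ASI_USER):

	#include <stddef.h>
	#include <stdint.h>

	static void
	copy_tail(char *dst, const char *src, size_t n)
	{
		uintptr_t both = (uintptr_t)src | (uintptr_t)dst;

		if ((both & 7) == 0)
			for (; n >= 8; n -= 8, src += 8, dst += 8)
				*(uint64_t *)dst = *(const uint64_t *)src;
		else if ((both & 3) == 0)
			for (; n >= 4; n -= 4, src += 4, dst += 4)
				*(uint32_t *)dst = *(const uint32_t *)src;
		else if ((both & 1) == 0)
			for (; n >= 2; n -= 2, src += 2, dst += 2)
				*(uint16_t *)dst = *(const uint16_t *)src;
		while (n--)
			*dst++ = *src++;
	}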
5083 5031 .copyout_exit:
5084 5032 membar #Sync
5085 5033 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
5086 5034 ret
5087 5035 restore %g0, 0, %o0
5088 5036
5089 5037 .copyout_err:
5090 5038 ldn [THREAD_REG + T_COPYOPS], %o4
5091 5039 brz %o4, 2f
5092 5040 nop
5093 5041 ldn [%o4 + CP_COPYOUT], %g2
5094 5042 jmp %g2
5095 5043 nop
5096 5044 2:
5097 5045 retl
5098 5046 mov -1, %o0
5099 5047 #endif /* NIAGARA_IMPL */
5100 5048 SET_SIZE(copyout)
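The .copyout_err path above redirects the fault to a per-thread copyops vector
when one is installed (T_COPYOPS and CP_COPYOUT are the assembly offsets for
t_copyops and cp_copyout), and fails otherwise. Roughly, in C:

	if (curthread->t_copyops != NULL)
		return (curthread->t_copyops->cp_copyout(kaddr, uaddr, count));
	return (-1);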
5101 5049
5102 -#endif /* lint */
5103 5050
5104 -
5105 -#ifdef lint
5106 -
5107 -/*ARGSUSED*/
5108 -int
5109 -xcopyout(const void *kaddr, void *uaddr, size_t count)
5110 -{ return (0); }
5111 -
5112 -#else /* lint */
5113 -
5114 5051 ENTRY(xcopyout)
5115 5052 sethi %hi(.xcopyout_err), REAL_LOFAULT
5116 5053 b .do_copyout
5117 5054 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
5118 5055 .xcopyout_err:
5119 5056 ldn [THREAD_REG + T_COPYOPS], %o4
5120 5057 brz %o4, 2f
5121 5058 nop
5122 5059 ldn [%o4 + CP_XCOPYOUT], %g2
5123 5060 jmp %g2
5124 5061 nop
5125 5062 2:
5126 5063 retl
5127 5064 mov %g1, %o0
5128 5065 SET_SIZE(xcopyout)
5129 5066
5130 -#endif /* lint */
5131 -
5132 -#ifdef lint
5133 -
5134 -/*ARGSUSED*/
5135 -int
5136 -xcopyout_little(const void *kaddr, void *uaddr, size_t count)
5137 -{ return (0); }
5138 -
5139 -#else /* lint */
5140 -
5141 5067 ENTRY(xcopyout_little)
5142 5068 sethi %hi(.little_err), %o4
5143 5069 ldn [THREAD_REG + T_LOFAULT], %o5
5144 5070 or %o4, %lo(.little_err), %o4
5145 5071 membar #Sync ! sync error barrier
5146 5072 stn %o4, [THREAD_REG + T_LOFAULT]
5147 5073
5148 5074 subcc %g0, %o2, %o3
5149 5075 add %o0, %o2, %o0
5150 5076 bz,pn %ncc, 2f ! check for zero bytes
5151 5077 sub %o2, 1, %o4
5152 5078 add %o0, %o4, %o0 ! start w/last byte
5153 5079 add %o1, %o2, %o1
5154 5080 ldub [%o0+%o3], %o4
5155 5081
5156 5082 1: stba %o4, [%o1+%o3]ASI_AIUSL
5157 5083 inccc %o3
5158 5084 sub %o0, 2, %o0 ! get next byte
5159 5085 bcc,a,pt %ncc, 1b
5160 5086 ldub [%o0+%o3], %o4
5161 5087
5162 5088 2: membar #Sync ! sync error barrier
5163 5089 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
5164 5090 retl
5165 5091 mov %g0, %o0 ! return (0)
5166 5092 SET_SIZE(xcopyout_little)
5167 5093
5168 -#endif /* lint */
5169 -
5170 5094 /*
5171 5095 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
5172 5096 */
5173 5097
5174 -#if defined(lint)
5175 -
5176 -/*ARGSUSED*/
5177 -int
5178 -copyin(const void *uaddr, void *kaddr, size_t count)
5179 -{ return (0); }
5180 -
5181 -#else /* lint */
5182 -
5183 5098 ENTRY(copyin)
5184 5099 sethi %hi(.copyin_err), REAL_LOFAULT
5185 5100 or REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
5186 5101
5187 5102 #if !defined(NIAGARA_IMPL)
5188 5103 .do_copyin:
5189 5104 tst %o2 ! check for zero count; quick exit
5190 5105 bz,pt %ncc, .ci_smallqx
5191 5106 mov %o0, SAVE_SRC
5192 5107 mov %o1, SAVE_DST
5193 5108 mov %o2, SAVE_COUNT
5194 5109 cmp %o2, FP_COPY ! check for small copy/leaf case
5195 5110 bgt,pt %ncc, .ci_copy_more
5196 5111 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
5197 5112 /*
5198 5113 * Small copy in code
5199 5114 *
5200 5115 */
5201 5116 sethi %hi(copyio_fault_nowindow), %o3
5202 5117 or %o3, %lo(copyio_fault_nowindow), %o3
5203 5118 membar #Sync
5204 5119 stn %o3, [THREAD_REG + T_LOFAULT]
5205 5120
5206 5121 mov ASI_USER, %asi
5207 5122 cmp %o2, SHORTCOPY ! make sure there is enough to align
5208 5123 ble,pt %ncc, .ci_smallest
5209 5124 andcc %o1, 0x7, %o3 ! is dest long word aligned
5210 5125 bnz,pn %ncc, .ci_align
5211 5126 andcc %o1, 1, %o3 ! is dest byte aligned
5212 5127
5213 5128 ! Destination is long word aligned
5214 5129 .ci_al_src:
5215 5130 andcc %o0, 7, %o3
5216 5131 brnz,pt %o3, .ci_src_dst_unal8
5217 5132 nop
5218 5133 /*
5219 5134 * Special case for handling when src and dest are both long word aligned
5220 5135 * and total data to move is less than FP_COPY bytes
5221 5135  * Also handles the finish-up for large block moves, so the count may be less than 32 bytes
5222 5137 */
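A hedged C rendering of the .ci_medl32 loop that follows, ignoring the
ASI_USER qualifier the real loads carry (src and dst are uint64_t pointers
here, and len counts bytes):

	while (len >= 32) {		/* four doublewords per pass */
		dst[0] = src[0];
		dst[1] = src[1];
		dst[2] = src[2];
		dst[3] = src[3];
		src += 4;
		dst += 4;
		len -= 32;
	}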
5223 5138 .ci_medlong:
5224 5139 subcc %o2, 31, %o2 ! adjust length to allow cc test
5225 5140 ble,pt %ncc, .ci_medl31
5226 5141 nop
5227 5142 .ci_medl32:
5228 5143 ldxa [%o0]%asi, %o4 ! move 32 bytes
5229 5144 subcc %o2, 32, %o2 ! decrement length count by 32
5230 5145 stx %o4, [%o1]
5231 5146 ldxa [%o0+8]%asi, %o4
5232 5147 stx %o4, [%o1+8]
5233 5148 ldxa [%o0+16]%asi, %o4
5234 5149 add %o0, 32, %o0 ! increase src ptr by 32
5235 5150 stx %o4, [%o1+16]
5236 5151 ldxa [%o0-8]%asi, %o4
5237 5152 add %o1, 32, %o1 ! increase dst ptr by 32
5238 5153 bgu,pt %ncc, .ci_medl32 ! repeat if at least 32 bytes left
5239 5154 stx %o4, [%o1-8]
5240 5155 .ci_medl31:
5241 5156 addcc %o2, 24, %o2 ! adjust count to be off by 7
5242 5157 ble,pt %ncc, .ci_medl7 ! skip if 7 or fewer bytes left
5243 5158 nop
5244 5159 .ci_medl8:
5245 5160 ldxa [%o0]%asi, %o4 ! move 8 bytes
5246 5161 add %o0, 8, %o0 ! increase src ptr by 8
5247 5162 subcc %o2, 8, %o2 ! decrease count by 8
5248 5163 add %o1, 8, %o1 ! increase dst ptr by 8
5249 5164 bgu,pt %ncc, .ci_medl8
5250 5165 stx %o4, [%o1-8]
5251 5166 .ci_medl7:
5252 5167 addcc %o2, 7, %o2 ! finish adjustment of remaining count
5253 5168 bnz,pt %ncc, .ci_small4 ! do final bytes if not finished
5254 5169 nop
5255 5170 .ci_smallx: ! finish up and exit
5256 5171 membar #Sync
5257 5172 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5258 5173 .ci_smallqx:
5259 5174 retl
5260 5175 mov %g0, %o0
5261 5176
5262 5177 .ci_small4:
5263 5178 cmp %o2, 4
5264 5179 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
5265 5180 nop !
5266 5181 lda [%o0]%asi, %o4 ! move 4 bytes
5267 5182 add %o0, 4, %o0 ! increase src ptr by 4
5268 5183 add %o1, 4, %o1 ! increase dst ptr by 4
5269 5184 subcc %o2, 4, %o2 ! decrease count by 4
5270 5185 bz %ncc, .ci_smallx
5271 5186 stw %o4, [%o1-4]
5272 5187
5273 5188 .ci_small3x: ! Exactly 1, 2, or 3 bytes remain
5274 5189 subcc %o2, 1, %o2 ! reduce count for cc test
5275 5190 lduba [%o0]%asi, %o4 ! load one byte
5276 5191 bz,pt %ncc, .ci_smallx
5277 5192 stb %o4, [%o1] ! store one byte
5278 5193 lduba [%o0+1]%asi, %o4 ! load second byte
5279 5194 subcc %o2, 1, %o2
5280 5195 bz,pt %ncc, .ci_smallx
5281 5196 stb %o4, [%o1+1] ! store second byte
5282 5197 lduba [%o0+2]%asi, %o4 ! load third byte
5283 5198 ba .ci_smallx
5284 5199 stb %o4, [%o1+2] ! store third byte
5285 5200
5286 5201 .ci_smallest: ! 7 or fewer bytes remain
5287 5202 cmp %o2, 4
5288 5203 blt,pt %ncc, .ci_small3x
5289 5204 nop
5290 5205 lduba [%o0]%asi, %o4 ! read byte
5291 5206 subcc %o2, 4, %o2 ! reduce count by 4
5292 5207 stb %o4, [%o1] ! write byte
5293 5208 lduba [%o0+1]%asi, %o4 ! repeat for total of 4 bytes
5294 5209 add %o0, 4, %o0 ! advance src by 4
5295 5210 stb %o4, [%o1+1]
5296 5211 lduba [%o0-2]%asi, %o4
5297 5212 add %o1, 4, %o1 ! advance dst by 4
5298 5213 stb %o4, [%o1-2]
5299 5214 lduba [%o0-1]%asi, %o4
5300 5215 bnz,pt %ncc, .ci_small3x
5301 5216 stb %o4, [%o1-1]
5302 5217 membar #Sync
5303 5218 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5304 5219 retl
5305 5220 mov %g0, %o0
5306 5221
5307 5222 .ci_align:
5308 5223 bnz,pt %ncc, .ci_al_d1
5309 5224 .ci_al_d1f: ! dest is now half word aligned
5310 5225 andcc %o1, 2, %o3 ! is dest word aligned
5311 5226 bnz,pt %ncc, .ci_al_d2
5312 5227 .ci_al_d2f: ! dest is now word aligned
5313 5228 andcc %o1, 4, %o3 ! is dest longword aligned?
5314 5229 bz,pt %ncc, .ci_al_src
5315 5230 nop
5316 5231 .ci_al_d4: ! dest is word aligned; src is unknown
5317 5232 lduba [%o0]%asi, %o4 ! move a word (src align unknown)
5318 5233 lduba [%o0+1]%asi, %o3
5319 5234 sll %o4, 24, %o4 ! position
5320 5235 sll %o3, 16, %o3 ! position
5321 5236 or %o4, %o3, %o3 ! merge
5322 5237 lduba [%o0+2]%asi, %o4
5323 5238 sll %o4, 8, %o4 ! position
5324 5239 or %o4, %o3, %o3 ! merge
5325 5240 lduba [%o0+3]%asi, %o4
5326 5241 or %o4, %o3, %o4 ! merge
5327 5242 stw %o4,[%o1] ! store four bytes
5328 5243 add %o0, 4, %o0 ! adjust src by 4
5329 5244 add %o1, 4, %o1 ! adjust dest by 4
5330 5245 sub %o2, 4, %o2 ! adjust count by 4
5331 5246 andcc %o0, 7, %o3 ! check for src long word alignment
5332 5247 brz,pt %o3, .ci_medlong
5333 5248 .ci_src_dst_unal8:
5334 5249 ! dst is 8-byte aligned, src is not
5335 5250 ! Size is less than FP_COPY
5336 5251 ! Following code is to select for alignment
5337 5252 andcc %o0, 0x3, %o3 ! test word alignment
5338 5253 bz,pt %ncc, .ci_medword
5339 5254 nop
5340 5255 andcc %o0, 0x1, %o3 ! test halfword alignment
5341 5256 bnz,pt %ncc, .ci_med_byte ! go to byte move if not halfword
5342 5257 andcc %o0, 0x2, %o3 ! test which byte alignment
5343 5258 ba .ci_medhalf
5344 5259 nop
5345 5260 .ci_al_d1: ! align dest to half word
5346 5261 lduba [%o0]%asi, %o4 ! move a byte
5347 5262 add %o0, 1, %o0
5348 5263 stb %o4, [%o1]
5349 5264 add %o1, 1, %o1
5350 5265 andcc %o1, 2, %o3 ! is dest word aligned
5351 5266 bz,pt %ncc, .ci_al_d2f
5352 5267 sub %o2, 1, %o2
5353 5268 .ci_al_d2: ! align dest to word
5354 5269 lduba [%o0]%asi, %o4 ! move a half-word (src align unknown)
5355 5270 lduba [%o0+1]%asi, %o3
5356 5271 sll %o4, 8, %o4 ! position
5357 5272 or %o4, %o3, %o4 ! merge
5358 5273 sth %o4, [%o1]
5359 5274 add %o0, 2, %o0
5360 5275 add %o1, 2, %o1
5361 5276 andcc %o1, 4, %o3 ! is dest longword aligned?
5362 5277 bz,pt %ncc, .ci_al_src
5363 5278 sub %o2, 2, %o2
5364 5279 ba .ci_al_d4
5365 5280 nop
5366 5281 /*
5367 5282 * Handle all cases where src and dest are aligned on word
5368 5283 * boundaries. Use unrolled loops for better performance.
5369 5284 * This option wins over standard large data move when
5370 5285 * source and destination is in cache for medium
5371 5286 * to short data moves.
5372 5287 */
5373 5288 .ci_medword:
5374 5289 subcc %o2, 31, %o2 ! adjust length to allow cc test
5375 5290 ble,pt %ncc, .ci_medw31
5376 5291 nop
5377 5292 .ci_medw32:
5378 5293 lda [%o0]%asi, %o4 ! move a block of 32 bytes
5379 5294 stw %o4, [%o1]
5380 5295 lda [%o0+4]%asi, %o4
5381 5296 stw %o4, [%o1+4]
5382 5297 lda [%o0+8]%asi, %o4
5383 5298 stw %o4, [%o1+8]
5384 5299 lda [%o0+12]%asi, %o4
5385 5300 stw %o4, [%o1+12]
5386 5301 lda [%o0+16]%asi, %o4
5387 5302 stw %o4, [%o1+16]
5388 5303 lda [%o0+20]%asi, %o4
5389 5304 subcc %o2, 32, %o2 ! decrement length count
5390 5305 stw %o4, [%o1+20]
5391 5306 lda [%o0+24]%asi, %o4
5392 5307 add %o0, 32, %o0 ! increase src ptr by 32
5393 5308 stw %o4, [%o1+24]
5394 5309 lda [%o0-4]%asi, %o4
5395 5310 add %o1, 32, %o1 ! increase dst ptr by 32
5396 5311 bgu,pt %ncc, .ci_medw32 ! repeat if at least 32 bytes left
5397 5312 stw %o4, [%o1-4]
5398 5313 .ci_medw31:
5399 5314 addcc %o2, 24, %o2 ! adjust count to be off by 7
5400 5315 ble,pt %ncc, .ci_medw7 ! skip if 7 or fewer bytes left
5401 5316 nop !
5402 5317 .ci_medw15:
5403 5318 lda [%o0]%asi, %o4 ! move a block of 8 bytes
5404 5319 subcc %o2, 8, %o2 ! decrement length count
5405 5320 stw %o4, [%o1]
5406 5321 add %o0, 8, %o0 ! increase src ptr by 8
5407 5322 lda [%o0-4]%asi, %o4
5408 5323 add %o1, 8, %o1 ! increase dst ptr by 8
5409 5324 bgu,pt %ncc, .ci_medw15
5410 5325 stw %o4, [%o1-4]
5411 5326 .ci_medw7:
5412 5327 addcc %o2, 7, %o2 ! finish adjustment of remaining count
5413 5328 bz,pt %ncc, .ci_smallx ! exit if finished
5414 5329 cmp %o2, 4
5415 5330 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
5416 5331 nop !
5417 5332 lda [%o0]%asi, %o4 ! move 4 bytes
5418 5333 add %o0, 4, %o0 ! increase src ptr by 4
5419 5334 add %o1, 4, %o1 ! increase dst ptr by 4
5420 5335 subcc %o2, 4, %o2 ! decrease count by 4
5421 5336 bnz .ci_small3x
5422 5337 stw %o4, [%o1-4]
5423 5338 membar #Sync
5424 5339 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5425 5340 retl
5426 5341 mov %g0, %o0
5427 5342
5428 5343 .ci_medhalf:
5429 5344 subcc %o2, 31, %o2 ! adjust length to allow cc test
5430 5345 ble,pt %ncc, .ci_medh31
5431 5346 nop
5432 5347 .ci_medh32: ! load and store block of 32 bytes
5433 5348 subcc %o2, 32, %o2 ! decrement length count
5434 5349
5435 5350 lduha [%o0]%asi, %o4 ! move 32 bytes
5436 5351 lduwa [%o0+2]%asi, %o3
5437 5352 sllx %o4, 48, %o4
5438 5353 sllx %o3, 16, %o3
5439 5354 or %o4, %o3, %o3
5440 5355 lduha [%o0+6]%asi, %o4
5441 5356 or %o4, %o3, %o4
5442 5357 stx %o4, [%o1]
5443 5358
5444 5359 lduha [%o0+8]%asi, %o4
5445 5360 lduwa [%o0+10]%asi, %o3
5446 5361 sllx %o4, 48, %o4
5447 5362 sllx %o3, 16, %o3
5448 5363 or %o4, %o3, %o3
5449 5364 lduha [%o0+14]%asi, %o4
5450 5365 or %o4, %o3, %o4
5451 5366 stx %o4, [%o1+8]
5452 5367
5453 5368 lduha [%o0+16]%asi, %o4
5454 5369 lduwa [%o0+18]%asi, %o3
5455 5370 sllx %o4, 48, %o4
5456 5371 sllx %o3, 16, %o3
5457 5372 or %o4, %o3, %o3
5458 5373 lduha [%o0+22]%asi, %o4
5459 5374 or %o4, %o3, %o4
5460 5375 stx %o4, [%o1+16]
5461 5376
5462 5377 add %o0, 32, %o0 ! increase src ptr by 32
5463 5378 add %o1, 32, %o1 ! increase dst ptr by 32
5464 5379
5465 5380 lduha [%o0-8]%asi, %o4
5466 5381 lduwa [%o0-6]%asi, %o3
5467 5382 sllx %o4, 48, %o4
5468 5383 sllx %o3, 16, %o3
5469 5384 or %o4, %o3, %o3
5470 5385 lduha [%o0-2]%asi, %o4
5471 5386 or %o3, %o4, %o4
5472 5387 bgu,pt %ncc, .ci_medh32 ! repeat if at least 32 bytes left
5473 5388 stx %o4, [%o1-8]
5474 5389
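Each pass of the .ci_medh32/.ci_medh15 loops above builds one aligned
doubleword out of a 2-byte, a 4-byte, and a 2-byte load merged with shifts
(big-endian). A single step, sketched in C with hypothetical load/store
helpers standing in for the lduha/lduwa/stx instructions:

	uint64_t w;

	w  = (uint64_t)load_u16(src + 0) << 48;		/* lduha */
	w |= (uint64_t)load_u32(src + 2) << 16;		/* lduwa */
	w |= (uint64_t)load_u16(src + 6);		/* lduha */
	store_u64(dst, w);				/* stx */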
5475 5390 .ci_medh31:
5476 5391 addcc %o2, 24, %o2 ! adjust count to be off by 7
5477 5392 ble,pt %ncc, .ci_medh7 ! skip if 7 or fewer bytes left
5478 5393 nop !
5479 5394 .ci_medh15:
5480 5395 	lduha	[%o0]%asi, %o4	! move 8 bytes
5481 5396 subcc %o2, 8, %o2 ! decrement length count
5482 5397 lduwa [%o0+2]%asi, %o3
5483 5398 sllx %o4, 48, %o4
5484 5399 sllx %o3, 16, %o3
5485 5400 or %o4, %o3, %o3
5486 5401 add %o1, 8, %o1 ! increase dst ptr by 8
5487 5402 lduha [%o0+6]%asi, %o4
5488 5403 add %o0, 8, %o0 ! increase src ptr by 8
5489 5404 or %o4, %o3, %o4
5490 5405 bgu,pt %ncc, .ci_medh15
5491 5406 stx %o4, [%o1-8]
5492 5407 .ci_medh7:
5493 5408 addcc %o2, 7, %o2 ! finish adjustment of remaining count
5494 5409 bz,pt %ncc, .ci_smallx ! exit if finished
5495 5410 cmp %o2, 4
5496 5411 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
5497 5412 nop !
5498 5413 lduha [%o0]%asi, %o4
5499 5414 sll %o4, 16, %o4
5500 5415 lduha [%o0+2]%asi, %o3
5501 5416 or %o3, %o4, %o4
5502 5417 subcc %o2, 4, %o2
5503 5418 add %o0, 4, %o0
5504 5419 add %o1, 4, %o1
5505 5420 bnz .ci_small3x
5506 5421 stw %o4, [%o1-4]
5507 5422 membar #Sync
5508 5423 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5509 5424 retl
5510 5425 mov %g0, %o0
5511 5426
5512 5427 .align 16
5513 5428 .ci_med_byte:
5514 5429 bnz,pt %ncc, .ci_medbh32a ! go to correct byte move
5515 5430 subcc %o2, 31, %o2 ! adjust length to allow cc test
5516 5431 ble,pt %ncc, .ci_medb31
5517 5432 nop
5518 5433 .ci_medb32: ! Alignment 1 or 5
5519 5434 subcc %o2, 32, %o2 ! decrement length count
5520 5435
5521 5436 lduba [%o0]%asi, %o4 ! load and store a block of 32 bytes
5522 5437 sllx %o4, 56, %o3
5523 5438 lduha [%o0+1]%asi, %o4
5524 5439 sllx %o4, 40, %o4
5525 5440 or %o4, %o3, %o3
5526 5441 lduwa [%o0+3]%asi, %o4
5527 5442 sllx %o4, 8, %o4
5528 5443 or %o4, %o3, %o3
5529 5444 lduba [%o0+7]%asi, %o4
5530 5445 or %o4, %o3, %o4
5531 5446 stx %o4, [%o1]
5532 5447
5533 5448 lduba [%o0+8]%asi, %o4
5534 5449 sllx %o4, 56, %o3
5535 5450 lduha [%o0+9]%asi, %o4
5536 5451 sllx %o4, 40, %o4
5537 5452 or %o4, %o3, %o3
5538 5453 lduwa [%o0+11]%asi, %o4
5539 5454 sllx %o4, 8, %o4
5540 5455 or %o4, %o3, %o3
5541 5456 lduba [%o0+15]%asi, %o4
5542 5457 or %o4, %o3, %o4
5543 5458 stx %o4, [%o1+8]
5544 5459
5545 5460 lduba [%o0+16]%asi, %o4
5546 5461 sllx %o4, 56, %o3
5547 5462 lduha [%o0+17]%asi, %o4
5548 5463 sllx %o4, 40, %o4
5549 5464 or %o4, %o3, %o3
5550 5465 lduwa [%o0+19]%asi, %o4
5551 5466 sllx %o4, 8, %o4
5552 5467 or %o4, %o3, %o3
5553 5468 lduba [%o0+23]%asi, %o4
5554 5469 or %o4, %o3, %o4
5555 5470 stx %o4, [%o1+16]
5556 5471
5557 5472 add %o0, 32, %o0 ! increase src ptr by 32
5558 5473 add %o1, 32, %o1 ! increase dst ptr by 32
5559 5474
5560 5475 lduba [%o0-8]%asi, %o4
5561 5476 sllx %o4, 56, %o3
5562 5477 lduha [%o0-7]%asi, %o4
5563 5478 sllx %o4, 40, %o4
5564 5479 or %o4, %o3, %o3
5565 5480 lduwa [%o0-5]%asi, %o4
5566 5481 sllx %o4, 8, %o4
5567 5482 or %o4, %o3, %o3
5568 5483 lduba [%o0-1]%asi, %o4
5569 5484 or %o4, %o3, %o4
5570 5485 bgu,pt %ncc, .ci_medb32 ! repeat if at least 32 bytes left
5571 5486 stx %o4, [%o1-8]
5572 5487
5573 5488 .ci_medb31: ! 31 or fewer bytes remaining
5574 5489 addcc %o2, 24, %o2 ! adjust count to be off by 7
5575 5490 ble,pt %ncc, .ci_medb7 ! skip if 7 or fewer bytes left
5576 5491 nop !
5577 5492 .ci_medb15:
5578 5493
5579 5494 lduba [%o0]%asi, %o4 ! load and store a block of 8 bytes
5580 5495 subcc %o2, 8, %o2 ! decrement length count
5581 5496 sllx %o4, 56, %o3
5582 5497 lduha [%o0+1]%asi, %o4
5583 5498 sllx %o4, 40, %o4
5584 5499 or %o4, %o3, %o3
5585 5500 lduwa [%o0+3]%asi, %o4
5586 5501 	add	%o1, 8, %o1	! increase dst ptr by 8
5587 5502 sllx %o4, 8, %o4
5588 5503 or %o4, %o3, %o3
5589 5504 lduba [%o0+7]%asi, %o4
5590 5505 	add	%o0, 8, %o0	! increase src ptr by 8
5591 5506 or %o4, %o3, %o4
5592 5507 bgu,pt %ncc, .ci_medb15
5593 5508 stx %o4, [%o1-8]
5594 5509 .ci_medb7:
5595 5510 addcc %o2, 7, %o2 ! finish adjustment of remaining count
5596 5511 bz,pt %ncc, .ci_smallx ! exit if finished
5597 5512 cmp %o2, 4
5598 5513 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left
5599 5514 nop !
5600 5515 lduba [%o0]%asi, %o4 ! move 4 bytes
5601 5516 sll %o4, 24, %o3
5602 5517 lduha [%o0+1]%asi, %o4
5603 5518 sll %o4, 8, %o4
5604 5519 or %o4, %o3, %o3
5605 5520 lduba [%o0+3]%asi, %o4
5606 5521 or %o4, %o3, %o4
5607 5522 subcc %o2, 4, %o2
5608 5523 add %o0, 4, %o0
5609 5524 add %o1, 4, %o1
5610 5525 bnz .ci_small3x
5611 5526 stw %o4, [%o1-4]
5612 5527 membar #Sync
5613 5528 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5614 5529 retl
5615 5530 mov %g0, %o0
5616 5531
5617 5532 .align 16
5618 5533 .ci_medbh32a: ! Alignment 3 or 7
5619 5534 ble,pt %ncc, .ci_medbh31
5620 5535 nop
5621 5536 .ci_medbh32: ! Alignment 3 or 7
5622 5537 subcc %o2, 32, %o2 ! decrement length count
5623 5538
5624 5539 lduba [%o0]%asi, %o4 ! load and store a block of 32 bytes
5625 5540 sllx %o4, 56, %o3
5626 5541 lduwa [%o0+1]%asi, %o4
5627 5542 sllx %o4, 24, %o4
5628 5543 or %o4, %o3, %o3
5629 5544 lduha [%o0+5]%asi, %o4
5630 5545 sllx %o4, 8, %o4
5631 5546 or %o4, %o3, %o3
5632 5547 lduba [%o0+7]%asi, %o4
5633 5548 or %o4, %o3, %o4
5634 5549 stx %o4, [%o1]
5635 5550
5636 5551 lduba [%o0+8]%asi, %o4
5637 5552 sllx %o4, 56, %o3
5638 5553 lduwa [%o0+9]%asi, %o4
5639 5554 sllx %o4, 24, %o4
5640 5555 or %o4, %o3, %o3
5641 5556 lduha [%o0+13]%asi, %o4
5642 5557 sllx %o4, 8, %o4
5643 5558 or %o4, %o3, %o3
5644 5559 lduba [%o0+15]%asi, %o4
5645 5560 or %o4, %o3, %o4
5646 5561 stx %o4, [%o1+8]
5647 5562
5648 5563 lduba [%o0+16]%asi, %o4
5649 5564 sllx %o4, 56, %o3
5650 5565 lduwa [%o0+17]%asi, %o4
5651 5566 sllx %o4, 24, %o4
5652 5567 or %o4, %o3, %o3
5653 5568 lduha [%o0+21]%asi, %o4
5654 5569 sllx %o4, 8, %o4
5655 5570 or %o4, %o3, %o3
5656 5571 lduba [%o0+23]%asi, %o4
5657 5572 or %o4, %o3, %o4
5658 5573 stx %o4, [%o1+16]
5659 5574
5660 5575 add %o0, 32, %o0 ! increase src ptr by 32
5661 5576 add %o1, 32, %o1 ! increase dst ptr by 32
5662 5577
5663 5578 lduba [%o0-8]%asi, %o4
5664 5579 sllx %o4, 56, %o3
5665 5580 lduwa [%o0-7]%asi, %o4
5666 5581 sllx %o4, 24, %o4
5667 5582 or %o4, %o3, %o3
5668 5583 lduha [%o0-3]%asi, %o4
5669 5584 sllx %o4, 8, %o4
5670 5585 or %o4, %o3, %o3
5671 5586 lduba [%o0-1]%asi, %o4
5672 5587 or %o4, %o3, %o4
5673 5588 bgu,pt %ncc, .ci_medbh32 ! repeat if at least 32 bytes left
5674 5589 stx %o4, [%o1-8]
5675 5590
5676 5591 .ci_medbh31:
5677 5592 addcc %o2, 24, %o2 ! adjust count to be off by 7
5678 5593 ble,pt %ncc, .ci_medb7 ! skip if 7 or fewer bytes left
5679 5594 nop !
5680 5595 .ci_medbh15:
5681 5596 lduba [%o0]%asi, %o4 ! load and store a block of 8 bytes
5682 5597 sllx %o4, 56, %o3
5683 5598 lduwa [%o0+1]%asi, %o4
5684 5599 sllx %o4, 24, %o4
5685 5600 or %o4, %o3, %o3
5686 5601 lduha [%o0+5]%asi, %o4
5687 5602 sllx %o4, 8, %o4
5688 5603 or %o4, %o3, %o3
5689 5604 lduba [%o0+7]%asi, %o4
5690 5605 or %o4, %o3, %o4
5691 5606 stx %o4, [%o1]
5692 5607 subcc %o2, 8, %o2 ! decrement length count
5693 5608 add %o1, 8, %o1 ! increase dst ptr by 8
5694 5609 add %o0, 8, %o0 ! increase src ptr by 8
5695 5610 bgu,pt %ncc, .ci_medbh15
5696 5611 stx %o4, [%o1-8]
5697 5612 ba .ci_medb7
5698 5613 nop
5699 5614
5700 5615 /*
5701 5616 * End of small copy in code (no window)
5702 5617 *
5703 5618 */
5704 5619
5705 5620 /*
5706 5621 * Long copy in code (using register window and fp regs)
5707 5622 *
5708 5623 */
5709 5624
5710 5625 .ci_copy_more:
5711 5626 sethi %hi(copyio_fault), %o3
5712 5627 or %o3, %lo(copyio_fault), %o3
5713 5628 membar #Sync
5714 5629 stn %o3, [THREAD_REG + T_LOFAULT]
5715 5630 /*
5716 5631 * Following code is for large copies. We know there is at
5717 5632 * least FP_COPY bytes available. FP regs are used, so
5718 5633 * we save registers and fp regs before starting
5719 5634 */
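The %fprs dance just below follows from this: if fprs.fef is already set,
another consumer's state may live in the FP registers, so they are saved to
the stack first; if not, enabling the FPU is all that is needed. In outline
(getfprs/setfprs are stand-ins for the rd/wr instructions):

	if (getfprs() & FPRS_FEF)
		save_fp_registers_to_stack();	/* BST_FP_TOSTACK */
	else
		setfprs(FPRS_FEF);		/* fprs.fef = 1 */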
5720 5635 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
5721 5636 or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
5722 5637 rd %fprs, %g1 ! check for unused fp
5723 5638 ! if fprs.fef == 0, set it.
5724 5639 ! Setting it when already set costs more than checking
5725 5640 andcc %g1, FPRS_FEF, %g1 ! test FEF, fprs.du = fprs.dl = 0
5726 5641 bz,pt %ncc, .ci_fp_unused
5727 5642 mov ASI_USER, %asi
5728 5643 BST_FP_TOSTACK(%o3)
5729 5644 ba .ci_fp_ready
5730 5645 .ci_fp_unused:
5731 5646 prefetcha [%i0 + (1 * CACHE_LINE)]%asi, #one_read
5732 5647 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1
5733 5648 .ci_fp_ready:
5734 5649 rd %gsr, %l5 ! save %gsr value
5735 5650 andcc %i1, 1, %o3 ! is dest byte aligned
5736 5651 bnz,pt %ncc, .ci_big_d1
5737 5652 .ci_big_d1f: ! dest is now half word aligned
5738 5653 andcc %i1, 2, %o3
5739 5654 bnz,pt %ncc, .ci_big_d2
5740 5655 .ci_big_d2f: ! dest is now word aligned
5741 5656 andcc %i1, 4, %o3
5742 5657 bnz,pt %ncc, .ci_big_d4
5743 5658 .ci_big_d4f: ! dest is long word aligned
5744 5659 andcc %i0, 7, %o3 ! is src long word aligned
5745 5660 brnz,pt %o3, .ci_big_unal8
5746 5661 prefetcha [%i0 + (2 * CACHE_LINE)]%asi, #one_read
5747 5662 ! Src and dst are long word aligned
5748 5663 ! align dst to 64 byte boundary
5749 5664 andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned
5750 5665 brz,pn %o3, .ci_al_to_64
5751 5666 nop
5752 5667 sub %o3, 64, %o3 ! %o3 has negative bytes to move
5753 5668 add %i2, %o3, %i2 ! adjust remaining count
5754 5669 andcc %o3, 8, %o4 ! odd long words to move?
5755 5670 brz,pt %o4, .ci_al_to_16
5756 5671 nop
5757 5672 add %o3, 8, %o3
5758 5673 ldxa [%i0]%asi, %o4
5759 5674 add %i0, 8, %i0 ! increment src ptr
5760 5675 add %i1, 8, %i1 ! increment dst ptr
5761 5676 stx %o4, [%i1-8]
5762 5677 ! Dest is aligned on 16 bytes, src 8 byte aligned
5763 5678 .ci_al_to_16:
5764 5679 andcc %o3, 0x30, %o4 ! pair of long words to move?
5765 5680 brz,pt %o4, .ci_al_to_64
5766 5681 nop
5767 5682 .ci_al_mv_16:
5768 5683 add %o3, 16, %o3
5769 5684 ldxa [%i0]%asi, %o4
5770 5685 stx %o4, [%i1]
5771 5686 add %i0, 16, %i0 ! increment src ptr
5772 5687 ldxa [%i0-8]%asi, %o4
5773 5688 stx %o4, [%i1+8]
5774 5689 andcc %o3, 0x30, %o4
5775 5690 brnz,pt %o4, .ci_al_mv_16
5776 5691 add %i1, 16, %i1 ! increment dst ptr
5777 5692 ! Dest is aligned on 64 bytes, src 8 byte aligned
5778 5693 .ci_al_to_64:
5779 5694 ! Determine source alignment
5780 5695 ! to correct 8 byte offset
5781 5696 andcc %i0, 32, %o3
5782 5697 brnz,pn %o3, .ci_aln_1
5783 5698 andcc %i0, 16, %o3
5784 5699 brnz,pn %o3, .ci_aln_01
5785 5700 andcc %i0, 8, %o3
5786 5701 brz,pn %o3, .ci_aln_000
5787 5702 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5788 5703 ba .ci_aln_001
5789 5704 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5790 5705 .ci_aln_01:
5791 5706 brnz,pn %o3, .ci_aln_011
5792 5707 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5793 5708 ba .ci_aln_010
5794 5709 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5795 5710 .ci_aln_1:
5796 5711 andcc %i0, 16, %o3
5797 5712 brnz,pn %o3, .ci_aln_11
5798 5713 andcc %i0, 8, %o3
5799 5714 brnz,pn %o3, .ci_aln_101
5800 5715 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5801 5716 ba .ci_aln_100
5802 5717 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5803 5718 .ci_aln_11:
5804 5719 brz,pn %o3, .ci_aln_110
5805 5720 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5806 5721
5807 5722 .ci_aln_111:
5808 5723 ! Alignment off by 8 bytes
5809 5724 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5810 5725 ldda [%i0]%asi, %d0
5811 5726 add %i0, 8, %i0
5812 5727 sub %i2, 8, %i2
5813 5728 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5814 5729 and %i2, 0x7f, %i2 ! residue bytes in %i2
5815 5730 sub %i1, %i0, %i1
5816 5731 .ci_aln_111_loop:
5817 5732 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5818 5733 subcc %o3, 64, %o3
5819 5734 fmovd %d16, %d2
5820 5735 fmovd %d18, %d4
5821 5736 fmovd %d20, %d6
5822 5737 fmovd %d22, %d8
5823 5738 fmovd %d24, %d10
5824 5739 fmovd %d26, %d12
5825 5740 fmovd %d28, %d14
5826 5741 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5827 5742 stda %d0,[%i0+%i1]ASI_BLK_P
5828 5743 add %i0, 64, %i0
5829 5744 fmovd %d30, %d0
5830 5745 bgt,pt %ncc, .ci_aln_111_loop
5831 5746 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5832 5747 add %i1, %i0, %i1
5833 5748
5834 5749 std %d0, [%i1]
5835 5750 ba .ci_remain_stuff
5836 5751 add %i1, 8, %i1
5837 5752 ! END OF aln_111
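The .ci_aln_* loops are software pipelined: the odd leading doublewords are
loaded before the loop, each iteration block-loads 64 fresh bytes, stores a
full aligned block assembled from the carried tail plus the new data, and
re-carries the remainder for the next pass. For the 8-byte-offset case above,
roughly (load8, load_block64, store_block64, and store8 are illustrative
helpers):

	uint64_t carry = load8(src);		/* the ldda before the loop */
	src += 8;
	while (nblocks-- > 0) {
		uint64_t b[8];
		load_block64(src, b);		/* ldda ASI_BLK_AIUS */
		store_block64(dst, carry, b);	/* carry, then b[0..6] */
		carry = b[7];
		src += 64;
		dst += 64;
	}
	store8(dst, carry);			/* the std after the loop */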
5838 5753
5839 5754 .ci_aln_110:
5840 5755 ! Alignment off by 16 bytes
5841 5756 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5842 5757 ldda [%i0]%asi, %d0
5843 5758 ldda [%i0+8]%asi, %d2
5844 5759 add %i0, 16, %i0
5845 5760 sub %i2, 16, %i2
5846 5761 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5847 5762 and %i2, 0x7f, %i2 ! residue bytes in %i2
5848 5763 sub %i1, %i0, %i1
5849 5764 .ci_aln_110_loop:
5850 5765 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5851 5766 subcc %o3, 64, %o3
5852 5767 fmovd %d16, %d4
5853 5768 fmovd %d18, %d6
5854 5769 fmovd %d20, %d8
5855 5770 fmovd %d22, %d10
5856 5771 fmovd %d24, %d12
5857 5772 fmovd %d26, %d14
5858 5773 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5859 5774 stda %d0,[%i0+%i1]ASI_BLK_P
5860 5775 add %i0, 64, %i0
5861 5776 fmovd %d28, %d0
5862 5777 fmovd %d30, %d2
5863 5778 bgt,pt %ncc, .ci_aln_110_loop
5864 5779 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5865 5780 add %i1, %i0, %i1
5866 5781
5867 5782 std %d0, [%i1]
5868 5783 std %d2, [%i1+8]
5869 5784 ba .ci_remain_stuff
5870 5785 add %i1, 16, %i1
5871 5786 ! END OF aln_110
5872 5787
5873 5788 .ci_aln_101:
5874 5789 ! Alignment off by 24 bytes
5875 5790 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5876 5791 ldda [%i0]%asi, %d0
5877 5792 ldda [%i0+8]%asi, %d2
5878 5793 ldda [%i0+16]%asi, %d4
5879 5794 add %i0, 24, %i0
5880 5795 sub %i2, 24, %i2
5881 5796 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5882 5797 and %i2, 0x7f, %i2 ! residue bytes in %i2
5883 5798 sub %i1, %i0, %i1
5884 5799 .ci_aln_101_loop:
5885 5800 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5886 5801 subcc %o3, 64, %o3
5887 5802 fmovd %d16, %d6
5888 5803 fmovd %d18, %d8
5889 5804 fmovd %d20, %d10
5890 5805 fmovd %d22, %d12
5891 5806 fmovd %d24, %d14
5892 5807 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5893 5808 stda %d0,[%i0+%i1]ASI_BLK_P
5894 5809 add %i0, 64, %i0
5895 5810 fmovd %d26, %d0
5896 5811 fmovd %d28, %d2
5897 5812 fmovd %d30, %d4
5898 5813 bgt,pt %ncc, .ci_aln_101_loop
5899 5814 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5900 5815 add %i1, %i0, %i1
5901 5816
5902 5817 std %d0, [%i1]
5903 5818 std %d2, [%i1+8]
5904 5819 std %d4, [%i1+16]
5905 5820 ba .ci_remain_stuff
5906 5821 add %i1, 24, %i1
5907 5822 ! END OF aln_101
5908 5823
5909 5824 .ci_aln_100:
5910 5825 ! Alignment off by 32 bytes
5911 5826 ldda [%i0]%asi, %d0
5912 5827 ldda [%i0+8]%asi, %d2
5913 5828 ldda [%i0+16]%asi,%d4
5914 5829 ldda [%i0+24]%asi,%d6
5915 5830 add %i0, 32, %i0
5916 5831 sub %i2, 32, %i2
5917 5832 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5918 5833 and %i2, 0x7f, %i2 ! residue bytes in %i2
5919 5834 sub %i1, %i0, %i1
5920 5835 .ci_aln_100_loop:
5921 5836 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5922 5837 subcc %o3, 64, %o3
5923 5838 fmovd %d16, %d8
5924 5839 fmovd %d18, %d10
5925 5840 fmovd %d20, %d12
5926 5841 fmovd %d22, %d14
5927 5842 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5928 5843 stda %d0,[%i0+%i1]ASI_BLK_P
5929 5844 add %i0, 64, %i0
5930 5845 fmovd %d24, %d0
5931 5846 fmovd %d26, %d2
5932 5847 fmovd %d28, %d4
5933 5848 fmovd %d30, %d6
5934 5849 bgt,pt %ncc, .ci_aln_100_loop
5935 5850 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5936 5851 add %i1, %i0, %i1
5937 5852
5938 5853 std %d0, [%i1]
5939 5854 std %d2, [%i1+8]
5940 5855 std %d4, [%i1+16]
5941 5856 std %d6, [%i1+24]
5942 5857 ba .ci_remain_stuff
5943 5858 add %i1, 32, %i1
5944 5859 ! END OF aln_100
5945 5860
5946 5861 .ci_aln_011:
5947 5862 ! Alignment off by 40 bytes
5948 5863 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5949 5864 ldda [%i0]%asi, %d0
5950 5865 ldda [%i0+8]%asi, %d2
5951 5866 ldda [%i0+16]%asi, %d4
5952 5867 ldda [%i0+24]%asi, %d6
5953 5868 ldda [%i0+32]%asi, %d8
5954 5869 add %i0, 40, %i0
5955 5870 sub %i2, 40, %i2
5956 5871 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5957 5872 and %i2, 0x7f, %i2 ! residue bytes in %i2
5958 5873 sub %i1, %i0, %i1
5959 5874 .ci_aln_011_loop:
5960 5875 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
5961 5876 subcc %o3, 64, %o3
5962 5877 fmovd %d16, %d10
5963 5878 fmovd %d18, %d12
5964 5879 fmovd %d20, %d14
5965 5880 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5966 5881 stda %d0,[%i0+%i1]ASI_BLK_P
5967 5882 add %i0, 64, %i0
5968 5883 fmovd %d22, %d0
5969 5884 fmovd %d24, %d2
5970 5885 fmovd %d26, %d4
5971 5886 fmovd %d28, %d6
5972 5887 fmovd %d30, %d8
5973 5888 bgt,pt %ncc, .ci_aln_011_loop
5974 5889 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5975 5890 add %i1, %i0, %i1
5976 5891
5977 5892 std %d0, [%i1]
5978 5893 std %d2, [%i1+8]
5979 5894 std %d4, [%i1+16]
5980 5895 std %d6, [%i1+24]
5981 5896 std %d8, [%i1+32]
5982 5897 ba .ci_remain_stuff
5983 5898 add %i1, 40, %i1
5984 5899 ! END OF aln_011
5985 5900
5986 5901 .ci_aln_010:
5987 5902 ! Alignment off by 48 bytes
5988 5903 ldda [%i0]%asi, %d0
5989 5904 ldda [%i0+8]%asi, %d2
5990 5905 ldda [%i0+16]%asi, %d4
5991 5906 ldda [%i0+24]%asi, %d6
5992 5907 ldda [%i0+32]%asi, %d8
5993 5908 ldda [%i0+40]%asi, %d10
5994 5909 add %i0, 48, %i0
5995 5910 sub %i2, 48, %i2
5996 5911 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
5997 5912 and %i2, 0x7f, %i2 ! residue bytes in %i2
5998 5913 sub %i1, %i0, %i1
5999 5914 .ci_aln_010_loop:
6000 5915 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
6001 5916 subcc %o3, 64, %o3
6002 5917 fmovd %d16, %d12
6003 5918 fmovd %d18, %d14
6004 5919 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
6005 5920 stda %d0,[%i0+%i1]ASI_BLK_P
6006 5921 add %i0, 64, %i0
6007 5922 fmovd %d20, %d0
6008 5923 fmovd %d22, %d2
6009 5924 fmovd %d24, %d4
6010 5925 fmovd %d26, %d6
6011 5926 fmovd %d28, %d8
6012 5927 fmovd %d30, %d10
6013 5928 bgt,pt %ncc, .ci_aln_010_loop
6014 5929 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6015 5930 add %i1, %i0, %i1
6016 5931
6017 5932 std %d0, [%i1]
6018 5933 std %d2, [%i1+8]
6019 5934 std %d4, [%i1+16]
6020 5935 std %d6, [%i1+24]
6021 5936 std %d8, [%i1+32]
6022 5937 std %d10, [%i1+40]
6023 5938 ba .ci_remain_stuff
6024 5939 add %i1, 48, %i1
6025 5940 ! END OF aln_010
6026 5941
6027 5942 .ci_aln_001:
6028 5943 ! Alignment off by 56 bytes
6029 5944 ldda [%i0]%asi, %d0
6030 5945 ldda [%i0+8]%asi, %d2
6031 5946 ldda [%i0+16]%asi, %d4
6032 5947 ldda [%i0+24]%asi, %d6
6033 5948 ldda [%i0+32]%asi, %d8
6034 5949 ldda [%i0+40]%asi, %d10
6035 5950 ldda [%i0+48]%asi, %d12
6036 5951 add %i0, 56, %i0
6037 5952 sub %i2, 56, %i2
6038 5953 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
6039 5954 and %i2, 0x7f, %i2 ! residue bytes in %i2
6040 5955 sub %i1, %i0, %i1
6041 5956 .ci_aln_001_loop:
6042 5957 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load
6043 5958 subcc %o3, 64, %o3
6044 5959 fmovd %d16, %d14
6045 5960 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
6046 5961 stda %d0,[%i0+%i1]ASI_BLK_P
6047 5962 add %i0, 64, %i0
6048 5963 fmovd %d18, %d0
6049 5964 fmovd %d20, %d2
6050 5965 fmovd %d22, %d4
6051 5966 fmovd %d24, %d6
6052 5967 fmovd %d26, %d8
6053 5968 fmovd %d28, %d10
6054 5969 fmovd %d30, %d12
6055 5970 bgt,pt %ncc, .ci_aln_001_loop
6056 5971 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6057 5972 add %i1, %i0, %i1
6058 5973
6059 5974 std %d0, [%i1]
6060 5975 std %d2, [%i1+8]
6061 5976 std %d4, [%i1+16]
6062 5977 std %d6, [%i1+24]
6063 5978 std %d8, [%i1+32]
6064 5979 std %d10, [%i1+40]
6065 5980 std %d12, [%i1+48]
6066 5981 ba .ci_remain_stuff
6067 5982 add %i1, 56, %i1
6068 5983 ! END OF aln_001
6069 5984
6070 5985 .ci_aln_000:
6071 5986 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6072 5987 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size
6073 5988 and %i2, 0x7f, %i2 ! residue bytes in %i2
6074 5989 sub %i1, %i0, %i1
6075 5990 .ci_aln_000_loop:
6076 5991 ldda [%i0]ASI_BLK_AIUS,%d0
6077 5992 subcc %o3, 64, %o3
6078 5993 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
6079 5994 stda %d0,[%i0+%i1]ASI_BLK_P
6080 5995 add %i0, 64, %i0
6081 5996 bgt,pt %ncc, .ci_aln_000_loop
6082 5997 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6083 5998 add %i1, %i0, %i1
6084 5999
6085 6000 ! END OF aln_000
6086 6001
6087 6002 .ci_remain_stuff:
6088 6003 subcc %i2, 31, %i2 ! adjust length to allow cc test
6089 6004 ble,pt %ncc, .ci_aln_31
6090 6005 nop
6091 6006 .ci_aln_32:
6092 6007 ldxa [%i0]%asi, %o4 ! move 32 bytes
6093 6008 subcc %i2, 32, %i2 ! decrement length count by 32
6094 6009 stx %o4, [%i1]
6095 6010 ldxa [%i0+8]%asi, %o4
6096 6011 stx %o4, [%i1+8]
6097 6012 ldxa [%i0+16]%asi, %o4
6098 6013 add %i0, 32, %i0 ! increase src ptr by 32
6099 6014 stx %o4, [%i1+16]
6100 6015 ldxa [%i0-8]%asi, %o4
6101 6016 add %i1, 32, %i1 ! increase dst ptr by 32
6102 6017 bgu,pt %ncc, .ci_aln_32 ! repeat if at least 32 bytes left
6103 6018 stx %o4, [%i1-8]
6104 6019 .ci_aln_31:
6105 6020 addcc %i2, 24, %i2 ! adjust count to be off by 7
6106 6021 ble,pt %ncc, .ci_aln_7 ! skip if 7 or fewer bytes left
6107 6022 nop !
6108 6023 .ci_aln_15:
6109 6024 ldxa [%i0]%asi, %o4 ! move 8 bytes
6110 6025 add %i0, 8, %i0 ! increase src ptr by 8
6111 6026 subcc %i2, 8, %i2 ! decrease count by 8
6112 6027 add %i1, 8, %i1 ! increase dst ptr by 8
6113 6028 bgu,pt %ncc, .ci_aln_15
6114 6029 stx %o4, [%i1-8] !
6115 6030 .ci_aln_7:
6116 6031 addcc %i2, 7, %i2 ! finish adjustment of remaining count
6117 6032 bz,pt %ncc, .ci_exit ! exit if finished
6118 6033 cmp %i2, 4
6119 6034 blt,pt %ncc, .ci_unaln3x ! skip if less than 4 bytes left
6120 6035 nop !
6121 6036 lda [%i0]%asi, %o4 ! move 4 bytes
6122 6037 add %i0, 4, %i0 ! increase src ptr by 4
6123 6038 add %i1, 4, %i1 ! increase dst ptr by 4
6124 6039 subcc %i2, 4, %i2 ! decrease count by 4
6125 6040 bnz .ci_unaln3x
6126 6041 stw %o4, [%i1-4]
6127 6042 ba .ci_exit
6128 6043 nop
6129 6044
6130 6045 ! destination alignment code
6131 6046 .ci_big_d1:
6132 6047 lduba [%i0]%asi, %o4 ! move a byte
6133 6048 add %i0, 1, %i0
6134 6049 stb %o4, [%i1]
6135 6050 add %i1, 1, %i1
6136 6051 andcc %i1, 2, %o3
6137 6052 bz,pt %ncc, .ci_big_d2f
6138 6053 sub %i2, 1, %i2
6139 6054 .ci_big_d2: ! dest is now at least half word aligned
6140 6055 lduba [%i0]%asi, %o4 ! move a half-word (src align unknown)
6141 6056 lduba [%i0+1]%asi, %o3
6142 6057 add %i0, 2, %i0
6143 6058 sll %o4, 8, %o4 ! position
6144 6059 or %o4, %o3, %o4 ! merge
6145 6060 sth %o4, [%i1]
6146 6061 add %i1, 2, %i1
6147 6062 andcc %i1, 4, %o3
6148 6063 bz,pt %ncc, .ci_big_d4f
6149 6064 sub %i2, 2, %i2
6150 6065 .ci_big_d4: ! dest is at least word aligned
6151 6066 nop
6152 6067 lduba [%i0]%asi, %o4 ! move a word (src align unknown)
6153 6068 lduba [%i0+1]%asi, %o3
6154 6069 sll %o4, 24, %o4 ! position
6155 6070 sll %o3, 16, %o3 ! position
6156 6071 or %o4, %o3, %o3 ! merge
6157 6072 lduba [%i0+2]%asi, %o4
6158 6073 sll %o4, 8, %o4 ! position
6159 6074 or %o4, %o3, %o3 ! merge
6160 6075 lduba [%i0+3]%asi, %o4
6161 6076 or %o4, %o3, %o4 ! merge
6162 6077 stw %o4,[%i1] ! store four bytes
6163 6078 add %i0, 4, %i0 ! adjust src by 4
6164 6079 add %i1, 4, %i1 ! adjust dest by 4
6165 6080 ba .ci_big_d4f
6166 6081 sub %i2, 4, %i2 ! adjust count by 4
6167 6082
6168 6083
6169 6084 ! Dst is on 8 byte boundary; src is not;
6170 6085 .ci_big_unal8:
6171 6086 andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned?
6172 6087 bz %ncc, .ci_unalnsrc
6173 6088 sub %o3, 64, %o3 ! %o3 will be multiple of 8
6174 6089 neg %o3 ! bytes until dest is 64 byte aligned
6175 6090 sub %i2, %o3, %i2 ! update cnt with bytes to be moved
6176 6091 ! Move bytes according to source alignment
6177 6092 andcc %i0, 0x1, %o4
6178 6093 bnz %ncc, .ci_unalnbyte ! check for byte alignment
6179 6094 nop
6180 6095 andcc %i0, 2, %o4 ! check for half word alignment
6181 6096 bnz %ncc, .ci_unalnhalf
6182 6097 nop
6183 6098 ! Src is word aligned, move bytes until dest 64 byte aligned
6184 6099 .ci_unalnword:
6185 6100 lda [%i0]%asi, %o4 ! load 4 bytes
6186 6101 stw %o4, [%i1] ! and store 4 bytes
6187 6102 lda [%i0+4]%asi, %o4 ! load 4 bytes
6188 6103 add %i0, 8, %i0 ! increase src ptr by 8
6189 6104 stw %o4, [%i1+4] ! and store 4 bytes
6190 6105 subcc %o3, 8, %o3 ! decrease count by 8
6191 6106 bnz %ncc, .ci_unalnword
6192 6107 add %i1, 8, %i1 ! increase dst ptr by 8
6193 6108 ba .ci_unalnsrc
6194 6109 nop
6195 6110
6196 6111 ! Src is half-word aligned, move bytes until dest 64 byte aligned
6197 6112 .ci_unalnhalf:
6198 6113 lduha [%i0]%asi, %o4 ! load 2 bytes
6199 6114 sllx %o4, 32, %i3 ! shift left
6200 6115 lduwa [%i0+2]%asi, %o4
6201 6116 or %o4, %i3, %i3
6202 6117 sllx %i3, 16, %i3
6203 6118 lduha [%i0+6]%asi, %o4
6204 6119 or %o4, %i3, %i3
6205 6120 stx %i3, [%i1]
6206 6121 add %i0, 8, %i0
6207 6122 subcc %o3, 8, %o3
6208 6123 bnz %ncc, .ci_unalnhalf
6209 6124 add %i1, 8, %i1
6210 6125 ba .ci_unalnsrc
6211 6126 nop
6212 6127
6213 6128 ! Src is Byte aligned, move bytes until dest 64 byte aligned
6214 6129 .ci_unalnbyte:
6215 6130 sub %i1, %i0, %i1 ! share pointer advance
6216 6131 .ci_unalnbyte_loop:
6217 6132 lduba [%i0]%asi, %o4
6218 6133 sllx %o4, 56, %i3
6219 6134 lduha [%i0+1]%asi, %o4
6220 6135 sllx %o4, 40, %o4
6221 6136 or %o4, %i3, %i3
6222 6137 lduha [%i0+3]%asi, %o4
6223 6138 sllx %o4, 24, %o4
6224 6139 or %o4, %i3, %i3
6225 6140 lduha [%i0+5]%asi, %o4
6226 6141 sllx %o4, 8, %o4
6227 6142 or %o4, %i3, %i3
6228 6143 lduba [%i0+7]%asi, %o4
6229 6144 or %o4, %i3, %i3
6230 6145 stx %i3, [%i1+%i0]
6231 6146 subcc %o3, 8, %o3
6232 6147 bnz %ncc, .ci_unalnbyte_loop
6233 6148 add %i0, 8, %i0
6234 6149 add %i1,%i0, %i1 ! restore pointer
6235 6150
6236 6151 ! Destination is now block (64 byte aligned), src is not 8 byte aligned
6237 6152 .ci_unalnsrc:
6238 6153 andn %i2, 0x3f, %i3 ! %i3 is multiple of block size
6239 6154 and %i2, 0x3f, %i2 ! residue bytes in %i2
6240 6155 	add	%i2, 64, %i2	! Ensure we don't load beyond
6241 6156 sub %i3, 64, %i3 ! end of source buffer
6242 6157
6243 6158 andn %i0, 0x3f, %o4 ! %o4 has block aligned src address
6244 6159 prefetcha [%o4 + (3 * CACHE_LINE)]%asi, #one_read
6245 6160 alignaddr %i0, %g0, %g0 ! generate %gsr
6246 6161 add %i0, %i3, %i0 ! advance %i0 to after blocks
6247 6162 !
6248 6163 ! Determine source alignment to correct 8 byte offset
6249 6164 andcc %i0, 0x20, %o3
6250 6165 brnz,pn %o3, .ci_unaln_1
6251 6166 andcc %i0, 0x10, %o3
6252 6167 brnz,pn %o3, .ci_unaln_01
6253 6168 andcc %i0, 0x08, %o3
6254 6169 brz,a %o3, .ci_unaln_000
6255 6170 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6256 6171 ba .ci_unaln_001
6257 6172 nop
6258 6173 .ci_unaln_01:
6259 6174 brnz,a %o3, .ci_unaln_011
6260 6175 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6261 6176 ba .ci_unaln_010
6262 6177 nop
6263 6178 .ci_unaln_1:
6264 6179 brnz,pn %o3, .ci_unaln_11
6265 6180 andcc %i0, 0x08, %o3
6266 6181 brnz,a %o3, .ci_unaln_101
6267 6182 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6268 6183 ba .ci_unaln_100
6269 6184 nop
6270 6185 .ci_unaln_11:
6271 6186 brz,pn %o3, .ci_unaln_110
6272 6187 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6273 6188
6274 6189 .ci_unaln_111:
6275 6190 ldda [%o4+56]%asi, %d14
6276 6191 .ci_unaln_111_loop:
6277 6192 add %o4, 64, %o4
6278 6193 ldda [%o4]ASI_BLK_AIUS, %d16
6279 6194 faligndata %d14, %d16, %d48
6280 6195 faligndata %d16, %d18, %d50
6281 6196 faligndata %d18, %d20, %d52
6282 6197 faligndata %d20, %d22, %d54
6283 6198 faligndata %d22, %d24, %d56
6284 6199 faligndata %d24, %d26, %d58
6285 6200 faligndata %d26, %d28, %d60
6286 6201 faligndata %d28, %d30, %d62
6287 6202 fmovd %d30, %d14
6288 6203 stda %d48, [%i1]ASI_BLK_P
6289 6204 subcc %i3, 64, %i3
6290 6205 add %i1, 64, %i1
6291 6206 bgu,pt %ncc, .ci_unaln_111_loop
6292 6207 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6293 6208 ba .ci_unaln_done
6294 6209 nop
6295 6210
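When the source is not even 8-byte aligned, the loops here switch to
alignaddr/faligndata: alignaddr latches the byte offset in %gsr, and each
faligndata extracts one aligned doubleword from a pair of adjacent loaded
ones. A conceptual C model of that extraction, assuming big-endian data and
0 < off < 8 (off == 0 is the aligned path handled earlier, so the undefined
shift by 64 never arises):

	static uint64_t
	falign(uint64_t hi, uint64_t lo, unsigned off)
	{
		return ((hi << (8 * off)) | (lo >> (64 - 8 * off)));
	}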
6296 6211 .ci_unaln_110:
6297 6212 ldda [%o4+48]%asi, %d12
6298 6213 ldda [%o4+56]%asi, %d14
6299 6214 .ci_unaln_110_loop:
6300 6215 add %o4, 64, %o4
6301 6216 ldda [%o4]ASI_BLK_AIUS, %d16
6302 6217 faligndata %d12, %d14, %d48
6303 6218 faligndata %d14, %d16, %d50
6304 6219 faligndata %d16, %d18, %d52
6305 6220 faligndata %d18, %d20, %d54
6306 6221 faligndata %d20, %d22, %d56
6307 6222 faligndata %d22, %d24, %d58
6308 6223 faligndata %d24, %d26, %d60
6309 6224 faligndata %d26, %d28, %d62
6310 6225 fmovd %d28, %d12
6311 6226 fmovd %d30, %d14
6312 6227 stda %d48, [%i1]ASI_BLK_P
6313 6228 subcc %i3, 64, %i3
6314 6229 add %i1, 64, %i1
6315 6230 bgu,pt %ncc, .ci_unaln_110_loop
6316 6231 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6317 6232 ba .ci_unaln_done
6318 6233 nop
6319 6234
6320 6235 .ci_unaln_101:
6321 6236 ldda [%o4+40]%asi, %d10
6322 6237 ldda [%o4+48]%asi, %d12
6323 6238 ldda [%o4+56]%asi, %d14
6324 6239 .ci_unaln_101_loop:
6325 6240 add %o4, 64, %o4
6326 6241 ldda [%o4]ASI_BLK_AIUS, %d16
6327 6242 faligndata %d10, %d12, %d48
6328 6243 faligndata %d12, %d14, %d50
6329 6244 faligndata %d14, %d16, %d52
6330 6245 faligndata %d16, %d18, %d54
6331 6246 faligndata %d18, %d20, %d56
6332 6247 faligndata %d20, %d22, %d58
6333 6248 faligndata %d22, %d24, %d60
6334 6249 faligndata %d24, %d26, %d62
6335 6250 fmovd %d26, %d10
6336 6251 fmovd %d28, %d12
6337 6252 fmovd %d30, %d14
6338 6253 stda %d48, [%i1]ASI_BLK_P
6339 6254 subcc %i3, 64, %i3
6340 6255 add %i1, 64, %i1
6341 6256 bgu,pt %ncc, .ci_unaln_101_loop
6342 6257 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6343 6258 ba .ci_unaln_done
6344 6259 nop
6345 6260
6346 6261 .ci_unaln_100:
6347 6262 ldda [%o4+32]%asi, %d8
6348 6263 ldda [%o4+40]%asi, %d10
6349 6264 ldda [%o4+48]%asi, %d12
6350 6265 ldda [%o4+56]%asi, %d14
6351 6266 .ci_unaln_100_loop:
6352 6267 add %o4, 64, %o4
6353 6268 ldda [%o4]ASI_BLK_AIUS, %d16
6354 6269 faligndata %d8, %d10, %d48
6355 6270 faligndata %d10, %d12, %d50
6356 6271 faligndata %d12, %d14, %d52
6357 6272 faligndata %d14, %d16, %d54
6358 6273 faligndata %d16, %d18, %d56
6359 6274 faligndata %d18, %d20, %d58
6360 6275 faligndata %d20, %d22, %d60
6361 6276 faligndata %d22, %d24, %d62
6362 6277 fmovd %d24, %d8
6363 6278 fmovd %d26, %d10
6364 6279 fmovd %d28, %d12
6365 6280 fmovd %d30, %d14
6366 6281 stda %d48, [%i1]ASI_BLK_P
6367 6282 subcc %i3, 64, %i3
6368 6283 add %i1, 64, %i1
6369 6284 bgu,pt %ncc, .ci_unaln_100_loop
6370 6285 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6371 6286 ba .ci_unaln_done
6372 6287 nop
6373 6288
6374 6289 .ci_unaln_011:
6375 6290 ldda [%o4+24]%asi, %d6
6376 6291 ldda [%o4+32]%asi, %d8
6377 6292 ldda [%o4+40]%asi, %d10
6378 6293 ldda [%o4+48]%asi, %d12
6379 6294 ldda [%o4+56]%asi, %d14
6380 6295 .ci_unaln_011_loop:
6381 6296 add %o4, 64, %o4
6382 6297 ldda [%o4]ASI_BLK_AIUS, %d16
6383 6298 faligndata %d6, %d8, %d48
6384 6299 faligndata %d8, %d10, %d50
6385 6300 faligndata %d10, %d12, %d52
6386 6301 faligndata %d12, %d14, %d54
6387 6302 faligndata %d14, %d16, %d56
6388 6303 faligndata %d16, %d18, %d58
6389 6304 faligndata %d18, %d20, %d60
6390 6305 faligndata %d20, %d22, %d62
6391 6306 fmovd %d22, %d6
6392 6307 fmovd %d24, %d8
6393 6308 fmovd %d26, %d10
6394 6309 fmovd %d28, %d12
6395 6310 fmovd %d30, %d14
6396 6311 stda %d48, [%i1]ASI_BLK_P
6397 6312 subcc %i3, 64, %i3
6398 6313 add %i1, 64, %i1
6399 6314 bgu,pt %ncc, .ci_unaln_011_loop
6400 6315 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6401 6316 ba .ci_unaln_done
6402 6317 nop
6403 6318
6404 6319 .ci_unaln_010:
6405 6320 ldda [%o4+16]%asi, %d4
6406 6321 ldda [%o4+24]%asi, %d6
6407 6322 ldda [%o4+32]%asi, %d8
6408 6323 ldda [%o4+40]%asi, %d10
6409 6324 ldda [%o4+48]%asi, %d12
6410 6325 ldda [%o4+56]%asi, %d14
6411 6326 .ci_unaln_010_loop:
6412 6327 add %o4, 64, %o4
6413 6328 ldda [%o4]ASI_BLK_AIUS, %d16
6414 6329 faligndata %d4, %d6, %d48
6415 6330 faligndata %d6, %d8, %d50
6416 6331 faligndata %d8, %d10, %d52
6417 6332 faligndata %d10, %d12, %d54
6418 6333 faligndata %d12, %d14, %d56
6419 6334 faligndata %d14, %d16, %d58
6420 6335 faligndata %d16, %d18, %d60
6421 6336 faligndata %d18, %d20, %d62
6422 6337 fmovd %d20, %d4
6423 6338 fmovd %d22, %d6
6424 6339 fmovd %d24, %d8
6425 6340 fmovd %d26, %d10
6426 6341 fmovd %d28, %d12
6427 6342 fmovd %d30, %d14
6428 6343 stda %d48, [%i1]ASI_BLK_P
6429 6344 subcc %i3, 64, %i3
6430 6345 add %i1, 64, %i1
6431 6346 bgu,pt %ncc, .ci_unaln_010_loop
6432 6347 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6433 6348 ba .ci_unaln_done
6434 6349 nop
6435 6350
6436 6351 .ci_unaln_001:
6437 6352 ldda [%o4+8]%asi, %d2
6438 6353 ldda [%o4+16]%asi, %d4
6439 6354 ldda [%o4+24]%asi, %d6
6440 6355 ldda [%o4+32]%asi, %d8
6441 6356 ldda [%o4+40]%asi, %d10
6442 6357 ldda [%o4+48]%asi, %d12
6443 6358 ldda [%o4+56]%asi, %d14
6444 6359 .ci_unaln_001_loop:
6445 6360 add %o4, 64, %o4
6446 6361 ldda [%o4]ASI_BLK_AIUS, %d16
6447 6362 faligndata %d2, %d4, %d48
6448 6363 faligndata %d4, %d6, %d50
6449 6364 faligndata %d6, %d8, %d52
6450 6365 faligndata %d8, %d10, %d54
6451 6366 faligndata %d10, %d12, %d56
6452 6367 faligndata %d12, %d14, %d58
6453 6368 faligndata %d14, %d16, %d60
6454 6369 faligndata %d16, %d18, %d62
6455 6370 fmovd %d18, %d2
6456 6371 fmovd %d20, %d4
6457 6372 fmovd %d22, %d6
6458 6373 fmovd %d24, %d8
6459 6374 fmovd %d26, %d10
6460 6375 fmovd %d28, %d12
6461 6376 fmovd %d30, %d14
6462 6377 stda %d48, [%i1]ASI_BLK_P
6463 6378 subcc %i3, 64, %i3
6464 6379 add %i1, 64, %i1
6465 6380 bgu,pt %ncc, .ci_unaln_001_loop
6466 6381 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6467 6382 ba .ci_unaln_done
6468 6383 nop
6469 6384
6470 6385 .ci_unaln_000:
6471 6386 ldda [%o4]ASI_BLK_AIUS, %d0
6472 6387 .ci_unaln_000_loop:
6473 6388 add %o4, 64, %o4
6474 6389 ldda [%o4]ASI_BLK_AIUS, %d16
6475 6390 faligndata %d0, %d2, %d48
6476 6391 faligndata %d2, %d4, %d50
6477 6392 faligndata %d4, %d6, %d52
6478 6393 faligndata %d6, %d8, %d54
6479 6394 faligndata %d8, %d10, %d56
6480 6395 faligndata %d10, %d12, %d58
6481 6396 faligndata %d12, %d14, %d60
6482 6397 faligndata %d14, %d16, %d62
6483 6398 fmovd %d16, %d0
6484 6399 fmovd %d18, %d2
6485 6400 fmovd %d20, %d4
6486 6401 fmovd %d22, %d6
6487 6402 fmovd %d24, %d8
6488 6403 fmovd %d26, %d10
6489 6404 fmovd %d28, %d12
6490 6405 fmovd %d30, %d14
6491 6406 stda %d48, [%i1]ASI_BLK_P
6492 6407 subcc %i3, 64, %i3
6493 6408 add %i1, 64, %i1
6494 6409 bgu,pt %ncc, .ci_unaln_000_loop
6495 6410 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6496 6411
6497 6412 .ci_unaln_done:
6498 6413 ! Handle trailing bytes, 64 to 127
6499 6414 ! Dest long word aligned, Src not long word aligned
6500 6415 cmp %i2, 15
6501 6416 bleu %ncc, .ci_unaln_short
6502 6417
6503 6418 andn %i2, 0x7, %i3 ! %i3 is multiple of 8
6504 6419 and %i2, 0x7, %i2 ! residue bytes in %i2
6505 6420 add %i2, 8, %i2
6506 6421 	sub	%i3, 8, %i3	! ensure we don't load past end of src
6507 6422 andn %i0, 0x7, %o4 ! %o4 has long word aligned src address
6508 6423 add %i0, %i3, %i0 ! advance %i0 to after multiple of 8
6509 6424 ldda [%o4]%asi, %d0 ! fetch partial word
6510 6425 .ci_unaln_by8:
6511 6426 ldda [%o4+8]%asi, %d2
6512 6427 add %o4, 8, %o4
6513 6428 faligndata %d0, %d2, %d16
6514 6429 subcc %i3, 8, %i3
6515 6430 std %d16, [%i1]
6516 6431 fmovd %d2, %d0
6517 6432 bgu,pt %ncc, .ci_unaln_by8
6518 6433 add %i1, 8, %i1
6519 6434
6520 6435 .ci_unaln_short:
6521 6436 cmp %i2, 8
6522 6437 blt,pt %ncc, .ci_unalnfin
6523 6438 nop
6524 6439 lduba [%i0]%asi, %o4
6525 6440 sll %o4, 24, %o3
6526 6441 lduba [%i0+1]%asi, %o4
6527 6442 sll %o4, 16, %o4
6528 6443 or %o4, %o3, %o3
6529 6444 lduba [%i0+2]%asi, %o4
6530 6445 sll %o4, 8, %o4
6531 6446 or %o4, %o3, %o3
6532 6447 lduba [%i0+3]%asi, %o4
6533 6448 or %o4, %o3, %o3
6534 6449 stw %o3, [%i1]
6535 6450 lduba [%i0+4]%asi, %o4
6536 6451 sll %o4, 24, %o3
6537 6452 lduba [%i0+5]%asi, %o4
6538 6453 sll %o4, 16, %o4
6539 6454 or %o4, %o3, %o3
6540 6455 lduba [%i0+6]%asi, %o4
6541 6456 sll %o4, 8, %o4
6542 6457 or %o4, %o3, %o3
6543 6458 lduba [%i0+7]%asi, %o4
6544 6459 or %o4, %o3, %o3
6545 6460 stw %o3, [%i1+4]
6546 6461 add %i0, 8, %i0
6547 6462 add %i1, 8, %i1
6548 6463 sub %i2, 8, %i2
6549 6464 .ci_unalnfin:
6550 6465 cmp %i2, 4
6551 6466 blt,pt %ncc, .ci_unalnz
6552 6467 tst %i2
6553 6468 lduba [%i0]%asi, %o3 ! read byte
6554 6469 subcc %i2, 4, %i2 ! reduce count by 4
6555 6470 sll %o3, 24, %o3 ! position
6556 6471 lduba [%i0+1]%asi, %o4
6557 6472 sll %o4, 16, %o4 ! position
6558 6473 or %o4, %o3, %o3 ! merge
6559 6474 lduba [%i0+2]%asi, %o4
6560 6475 sll %o4, 8, %o4 ! position
6561 6476 or %o4, %o3, %o3 ! merge
6562 6477 add %i1, 4, %i1 ! advance dst by 4
6563 6478 lduba [%i0+3]%asi, %o4
6564 6479 add %i0, 4, %i0 ! advance src by 4
6565 6480 or %o4, %o3, %o4 ! merge
6566 6481 bnz,pt %ncc, .ci_unaln3x
6567 6482 stw %o4, [%i1-4]
6568 6483 ba .ci_exit
6569 6484 nop
6570 6485 .ci_unalnz:
6571 6486 bz,pt %ncc, .ci_exit
6572 6487 wr %l5, %g0, %gsr ! restore %gsr
6573 6488 .ci_unaln3x: ! Exactly 1, 2, or 3 bytes remain
6574 6489 subcc %i2, 1, %i2 ! reduce count for cc test
6575 6490 lduba [%i0]%asi, %o4 ! load one byte
6576 6491 bz,pt %ncc, .ci_exit
6577 6492 stb %o4, [%i1] ! store one byte
6578 6493 lduba [%i0+1]%asi, %o4 ! load second byte
6579 6494 subcc %i2, 1, %i2
6580 6495 bz,pt %ncc, .ci_exit
6581 6496 stb %o4, [%i1+1] ! store second byte
6582 6497 lduba [%i0+2]%asi, %o4 ! load third byte
6583 6498 stb %o4, [%i1+2] ! store third byte
6584 6499 .ci_exit:
6585 6500 brnz %g1, .ci_fp_restore
6586 6501 nop
6587 6502 FZERO
6588 6503 wr %g1, %g0, %fprs
6589 6504 ba,pt %ncc, .ci_ex2
6590 6505 membar #Sync
6591 6506 .ci_fp_restore:
6592 6507 BLD_FP_FROMSTACK(%o4)
6593 6508 .ci_ex2:
6594 6509 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
6595 6510 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
6596 6511 ret
6597 6512 restore %g0, 0, %o0
6598 6513
6599 6514 .copyin_err:
6600 6515 ldn [THREAD_REG + T_COPYOPS], %o4
6601 6516 brz %o4, 2f
6602 6517 nop
6603 6518 ldn [%o4 + CP_COPYIN], %g2
6604 6519 jmp %g2
6605 6520 nop
6606 6521 2:
6607 6522 retl
6608 6523 mov -1, %o0
6609 6524
6610 6525 #else /* NIAGARA_IMPL */
6611 6526 .do_copyin:
6612 6527 !
6613 6528 ! Check the length and bail if zero.
6614 6529 !
6615 6530 tst %o2
6616 6531 bnz,pt %ncc, 1f
6617 6532 nop
6618 6533 retl
6619 6534 clr %o0
6620 6535 1:
6621 6536 sethi %hi(copyio_fault), %o4
6622 6537 or %o4, %lo(copyio_fault), %o4
6623 6538 sethi %hi(copyio_fault_nowindow), %o3
6624 6539 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
6625 6540 or %o3, %lo(copyio_fault_nowindow), %o3
6626 6541 membar #Sync
6627 6542 stn %o3, [THREAD_REG + T_LOFAULT]
6628 6543
6629 6544 mov %o0, SAVE_SRC
6630 6545 mov %o1, SAVE_DST
6631 6546 mov %o2, SAVE_COUNT
6632 6547
6633 6548 !
6634 6549 	! Check to see if we're copying more than SMALL_LIMIT bytes.
6635 6550 !
6636 6551 subcc %o2, SMALL_LIMIT, %o3
6637 6552 bgu,a,pt %ncc, .dci_ns
6638 6553 or %o0, %o1, %o3
6639 6554 !
6640 6555 ! What was previously ".small_copyin"
6641 6556 !
6642 6557 .dcibcp:
6643 6558 sub %g0, %o2, %o3 ! setup for copy loop
6644 6559 add %o0, %o2, %o0
6645 6560 add %o1, %o2, %o1
6646 6561 ba,pt %ncc, .dcicl
6647 6562 lduba [%o0 + %o3]ASI_USER, %o4
6648 6563 !
6649 6564 ! %o0 and %o1 point at the end and remain pointing at the end
6650 6565 ! of their buffers. We pull things out by adding %o3 (which is
6651 6566 ! the negation of the length) to the buffer end which gives us
6652 6567 	! the current location in the buffers. By incrementing %o3 we walk
6653 6568 ! through both buffers without having to bump each buffer's
6654 6569 ! pointer. A very fast 4 instruction loop.
6655 6570 !
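A compact C rendering of that loop, with both pointers already advanced to
the ends of their buffers so a single increment of the negative offset walks
both:

	src += n;
	dst += n;
	for (off = -(ssize_t)n; off != 0; off++)
		dst[off] = src[off];	/* the lduba/stb pair in .dcicl */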
6656 6571 .align 16
6657 6572 .dcicl:
6658 6573 stb %o4, [%o1 + %o3]
6659 6574 inccc %o3
6660 6575 bl,a,pt %ncc, .dcicl
6661 6576 lduba [%o0 + %o3]ASI_USER, %o4
6662 6577 !
6663 6578 ! We're done. Go home.
6664 6579 !
6665 6580 membar #Sync
6666 6581 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
6667 6582 retl
6668 6583 clr %o0
6669 6584 !
6670 6585 ! Try aligned copies from here.
6671 6586 !
6672 6587 .dci_ns:
6673 6588 !
6674 6589 ! See if we're single byte aligned. If we are, check the
6675 6590 ! limit for single byte copies. If we're smaller, or equal,
6676 6591 ! bounce to the byte for byte copy loop. Otherwise do it in
6677 6592 ! HW (if enabled).
6678 6593 !
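The policy this implements, sketched in C (hw_copy_limit_1 is the tunable
tested below, with zero meaning the HW path is disabled; byte_copy and
big_copyin are stand-ins for the .dcibcp and .big_copyin paths):

	uintptr_t both = (uintptr_t)src | (uintptr_t)dst;

	if (both & 1) {				/* single byte aligned */
		if (hw_copy_limit_1 == 0 || count <= hw_copy_limit_1)
			byte_copy();		/* .dcibcp */
		else
			big_copyin();		/* .big_copyin */
	}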
6679 6594 btst 1, %o3
6680 6595 bz,a,pt %icc, .dcih8
6681 6596 btst 7, %o3
6682 6597 !
6683 6598 ! We're single byte aligned.
6684 6599 !
6685 6600 sethi %hi(hw_copy_limit_1), %o3
6686 6601 ld [%o3 + %lo(hw_copy_limit_1)], %o3
6687 6602 !
6688 6603 ! Is HW copy on? If not do everything byte for byte.
6689 6604 !
6690 6605 tst %o3
6691 6606 bz,pn %icc, .dcibcp
6692 6607 subcc %o3, %o2, %o3
6693 6608 !
6694 6609 ! Are we bigger than the HW limit? If not
6695 6610 ! go to byte for byte.
6696 6611 !
6697 6612 bge,pt %ncc, .dcibcp
6698 6613 nop
6699 6614 !
6700 6615 ! We're big enough and copy is on. Do it with HW.
6701 6616 !
6702 6617 ba,pt %ncc, .big_copyin
6703 6618 nop
6704 6619 .dcih8:
6705 6620 !
6706 6621 ! 8 byte aligned?
6707 6622 !
6708 6623 bnz,a %ncc, .dcih4
6709 6624 btst 3, %o3
6710 6625 !
6711 6626 ! We're eight byte aligned.
6712 6627 !
6713 6628 sethi %hi(hw_copy_limit_8), %o3
6714 6629 ld [%o3 + %lo(hw_copy_limit_8)], %o3
6715 6630 !
6716 6631 ! Is HW assist on? If not, do it with the aligned copy.
6717 6632 !
6718 6633 tst %o3
6719 6634 bz,pn %icc, .dcis8
6720 6635 subcc %o3, %o2, %o3
6721 6636 bge %ncc, .dcis8
6722 6637 nop
6723 6638 ba,pt %ncc, .big_copyin
6724 6639 nop
6725 6640 .dcis8:
6726 6641 !
6727 6642 ! Housekeeping for copy loops. Uses same idea as in the byte for
6728 6643 ! byte copy loop above.
6729 6644 !
6730 6645 add %o0, %o2, %o0
6731 6646 add %o1, %o2, %o1
6732 6647 sub %g0, %o2, %o3
6733 6648 ba,pt %ncc, .didebc
6734 6649 srl %o2, 3, %o2 ! Number of 8 byte chunks to copy
6735 6650 !
6736 6651 ! 4 byte aligned?
6737 6652 !
6738 6653 .dcih4:
6739 6654 bnz %ncc, .dcih2
6740 6655 sethi %hi(hw_copy_limit_4), %o3
6741 6656 ld [%o3 + %lo(hw_copy_limit_4)], %o3
6742 6657 !
6743 6658 ! Is HW assist on? If not, do it with the aligned copy.
6744 6659 !
6745 6660 tst %o3
6746 6661 bz,pn %icc, .dcis4
6747 6662 subcc %o3, %o2, %o3
6748 6663 !
6749 6664 ! We're negative if our size is less than or equal to hw_copy_limit_4.
6750 6665 !
6751 6666 bge %ncc, .dcis4
6752 6667 nop
6753 6668 ba,pt %ncc, .big_copyin
6754 6669 nop
6755 6670 .dcis4:
6756 6671 !
6757 6672 ! Housekeeping for copy loops. Uses same idea as in the byte
6758 6673 ! for byte copy loop above.
6759 6674 !
6760 6675 add %o0, %o2, %o0
6761 6676 add %o1, %o2, %o1
6762 6677 sub %g0, %o2, %o3
6763 6678 ba,pt %ncc, .didfbc
6764 6679 srl %o2, 2, %o2 ! Number of 4 byte chunks to copy
6765 6680 .dcih2:
6766 6681 !
6767 6682 ! We're two byte aligned. Check for "smallness"
6768 6683 ! done in delay at .dcih4
6769 6684 !
6770 6685 bleu,pt %ncc, .dcis2
6771 6686 sethi %hi(hw_copy_limit_2), %o3
6772 6687 ld [%o3 + %lo(hw_copy_limit_2)], %o3
6773 6688 !
6774 6689 ! Is HW assist on? If not, do it with the aligned copy.
6775 6690 !
6776 6691 tst %o3
6777 6692 bz,pn %icc, .dcis2
6778 6693 subcc %o3, %o2, %o3
6779 6694 !
6780 6695 ! Are we larger than the HW limit?
6781 6696 !
6782 6697 bge %ncc, .dcis2
6783 6698 nop
6784 6699 !
6785 6700 ! HW assist is on and we're large enough to use it.
6786 6701 !
6787 6702 ba,pt %ncc, .big_copyin
6788 6703 nop
6789 6704 !
6790 6705 ! Housekeeping for copy loops. Uses same idea as in the byte
6791 6706 ! for byte copy loop above.
6792 6707 !
6793 6708 .dcis2:
6794 6709 add %o0, %o2, %o0
6795 6710 add %o1, %o2, %o1
6796 6711 sub %g0, %o2, %o3
6797 6712 ba,pt %ncc, .didtbc
6798 6713 srl %o2, 1, %o2 ! Number of 2 byte chunks to copy
6799 6714 !
6800 6715 .small_copyin:
6801 6716 !
6802 6717 ! Why are we doing this AGAIN? There are certain conditions in
6803 6718 	! big copyin that will cause us to forgo the HW assisted copies
6804 6719 ! and bounce back to a non-hw assisted copy. This dispatches
6805 6720 ! those copies. Note that we branch around this in the main line
6806 6721 ! code.
6807 6722 !
6808 6723 ! We make no check for limits or HW enablement here. We've
6809 6724 ! already been told that we're a poster child so just go off
6810 6725 ! and do it.
6811 6726 !
6812 6727 or %o0, %o1, %o3
6813 6728 btst 1, %o3
6814 6729 bnz %icc, .dcibcp ! Most likely
6815 6730 btst 7, %o3
6816 6731 bz %icc, .dcis8
6817 6732 btst 3, %o3
6818 6733 bz %icc, .dcis4
6819 6734 nop
6820 6735 ba,pt %ncc, .dcis2
6821 6736 nop
6822 6737 !
6823 6738 ! Eight byte aligned copies. A steal from the original .small_copyin
6824 6739 ! with modifications. %o2 is number of 8 byte chunks to copy. When
6825 6740 ! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
6826 6741 ! to copy.
6827 6742 !
6828 6743 .align 32
6829 6744 .didebc:
6830 6745 ldxa [%o0 + %o3]ASI_USER, %o4
6831 6746 deccc %o2
6832 6747 stx %o4, [%o1 + %o3]
6833 6748 bg,pt %ncc, .didebc
6834 6749 addcc %o3, 8, %o3
6835 6750 !
6836 6751 ! End of copy loop. Most 8 byte aligned copies end here.
6837 6752 !
6838 6753 bz,pt %ncc, .dcifh
6839 6754 nop
6840 6755 !
6841 6756 ! Something is left. Do it byte for byte.
6842 6757 !
6843 6758 ba,pt %ncc, .dcicl
6844 6759 lduba [%o0 + %o3]ASI_USER, %o4
6845 6760 !
6846 6761 ! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
6847 6762 !
6848 6763 .align 32
6849 6764 .didfbc:
6850 6765 lduwa [%o0 + %o3]ASI_USER, %o4
6851 6766 deccc %o2
6852 6767 st %o4, [%o1 + %o3]
6853 6768 bg,pt %ncc, .didfbc
6854 6769 addcc %o3, 4, %o3
6855 6770 !
6856 6771 ! End of copy loop. Most 4 byte aligned copies end here.
6857 6772 !
6858 6773 bz,pt %ncc, .dcifh
6859 6774 nop
6860 6775 !
6861 6776 ! Something is left. Do it byte for byte.
6862 6777 !
6863 6778 ba,pt %ncc, .dcicl
6864 6779 lduba [%o0 + %o3]ASI_USER, %o4
6865 6780 !
6866 6781 ! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
6867 6782 ! copy.
6868 6783 !
6869 6784 .align 32
6870 6785 .didtbc:
6871 6786 lduha [%o0 + %o3]ASI_USER, %o4
6872 6787 deccc %o2
6873 6788 sth %o4, [%o1 + %o3]
6874 6789 bg,pt %ncc, .didtbc
6875 6790 addcc %o3, 2, %o3
6876 6791 !
6877 6792 ! End of copy loop. Most 2 byte aligned copies end here.
6878 6793 !
6879 6794 bz,pt %ncc, .dcifh
6880 6795 nop
6881 6796 !
6882 6797 ! Deal with the last byte
6883 6798 !
6884 6799 lduba [%o0 + %o3]ASI_USER, %o4
6885 6800 stb %o4, [%o1 + %o3]
6886 6801 .dcifh:
6887 6802 membar #Sync
6888 6803 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
6889 6804 retl
6890 6805 clr %o0
6891 6806
6892 6807 .big_copyin:
6893 6808 ! We're going off to do a block copy.
6894 6809 	! Switch fault handlers and grab a window. We
6895 6810 ! don't do a membar #Sync since we've done only
6896 6811 ! kernel data to this point.
6897 6812 stn %o4, [THREAD_REG + T_LOFAULT]
6898 6813
6899 6814 	! Copyins that reach here are larger than 256 bytes. The
6900 6815 	! hw_copy_limit_1 is set to 256. Never set this limit to
6901 6816 	! less than 128 bytes.
6902 6817 save %sp, -SA(MINFRAME), %sp
6903 6818 .do_blockcopyin:
6904 6819
6905 6820 ! Swap src/dst since the code below is memcpy code
6906 6821 ! and memcpy/bcopy have different calling sequences
6907 6822 mov %i1, %i5
6908 6823 mov %i0, %i1
6909 6824 mov %i5, %i0
6910 6825
6911 6826 ! Block (64 bytes) align the destination.
6912 6827 andcc %i0, 0x3f, %i3 ! is dst block aligned
6913 6828 bz %ncc, copyin_blalign ! dst already block aligned
6914 6829 sub %i3, 0x40, %i3
6915 6830 neg %i3 ! bytes till dst 64 bytes aligned
6916 6831 sub %i2, %i3, %i2 ! update i2 with new count
6917 6832
6918 6833 ! Based on source and destination alignment do
6919 6834 ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
6920 6835
6921 6836 ! Is dst & src 8B aligned
6922 6837 or %i0, %i1, %o2
6923 6838 andcc %o2, 0x7, %g0
6924 6839 bz %ncc, .ci_alewdcp
6925 6840 nop
6926 6841
6927 6842 ! Is dst & src 4B aligned
6928 6843 andcc %o2, 0x3, %g0
6929 6844 bz %ncc, .ci_alwdcp
6930 6845 nop
6931 6846
6932 6847 ! Is dst & src 2B aligned
6933 6848 andcc %o2, 0x1, %g0
6934 6849 bz %ncc, .ci_alhlfwdcp
6935 6850 nop
6936 6851
6937 6852 ! 1B aligned
6938 6853 1: lduba [%i1]ASI_USER, %o2
6939 6854 stb %o2, [%i0]
6940 6855 inc %i1
6941 6856 deccc %i3
6942 6857 bgu,pt %ncc, 1b
6943 6858 inc %i0
6944 6859
6945 6860 ba copyin_blalign
6946 6861 nop
6947 6862
6948 6863 ! dst & src 4B aligned
6949 6864 .ci_alwdcp:
6950 6865 lda [%i1]ASI_USER, %o2
6951 6866 st %o2, [%i0]
6952 6867 add %i1, 0x4, %i1
6953 6868 subcc %i3, 0x4, %i3
6954 6869 bgu,pt %ncc, .ci_alwdcp
6955 6870 add %i0, 0x4, %i0
6956 6871
6957 6872 ba copyin_blalign
6958 6873 nop
6959 6874
6960 6875 ! dst & src 2B aligned
6961 6876 .ci_alhlfwdcp:
6962 6877 lduha [%i1]ASI_USER, %o2
6963 6878 stuh %o2, [%i0]
6964 6879 add %i1, 0x2, %i1
6965 6880 subcc %i3, 0x2, %i3
6966 6881 bgu,pt %ncc, .ci_alhlfwdcp
6967 6882 add %i0, 0x2, %i0
6968 6883
6969 6884 ba copyin_blalign
6970 6885 nop
6971 6886
6972 6887 ! dst & src 8B aligned
6973 6888 .ci_alewdcp:
6974 6889 ldxa [%i1]ASI_USER, %o2
6975 6890 stx %o2, [%i0]
6976 6891 add %i1, 0x8, %i1
6977 6892 subcc %i3, 0x8, %i3
6978 6893 bgu,pt %ncc, .ci_alewdcp
6979 6894 add %i0, 0x8, %i0
6980 6895
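A C sketch of the destination block-alignment step above (a hypothetical
helper, not the kernel code): copy just enough leading bytes to bring dst
up to a 64-byte boundary, at the widest width the joint src/dst alignment
allows, assuming the remaining count covers at least one full block:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void
    copy_until_block_aligned(const uint8_t **srcp, uint8_t **dstp, size_t *lenp)
    {
            const uint8_t *src = *srcp;
            uint8_t *dst = *dstp;
            size_t lead = (uintptr_t)dst & 0x3f;    /* offset within a block */

            if (lead != 0) {
                    uintptr_t both = (uintptr_t)src | (uintptr_t)dst;
                    size_t step = (both & 1) ? 1 : (both & 2) ? 2 :
                        (both & 4) ? 4 : 8;         /* widest common width */

                    lead = 0x40 - lead;             /* bytes until block aligned */
                    *lenp -= lead;
                    /* 'lead' is always a multiple of 'step' here */
                    for (; lead > 0; lead -= step, src += step, dst += step)
                            memcpy(dst, src, step); /* one load/store of 'step' */
            }
            *srcp = src;
            *dstp = dst;
    }
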
6981 6896 copyin_blalign:
6982 6897 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size
6983 6898 sub %i2, %i3, %i2 ! Residue bytes in %i2
6984 6899
6985 6900 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
6986 6901
6987 6902 andcc %i1, 0xf, %o2 ! is src quadword aligned
6988 6903 bz,pn %xcc, .ci_blkcpy ! src offset in %o2 (last 4-bits)
6989 6904 nop
6990 6905 cmp %o2, 0x8
6991 6906 bg .ci_upper_double
6992 6907 nop
6993 6908 bl .ci_lower_double
6994 6909 nop
6995 6910
6996 6911 	! Falls through when the source offset is equal to 8, i.e. the
6997 6912 	! source is double word aligned.
6998 6913 ! In this case no shift/merge of data is required
6999 6914
7000 6915 sub %i1, %o2, %i1 ! align the src at 16 bytes.
7001 6916 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
7002 6917 prefetcha [%l0]ASI_USER, #one_read
7003 6918 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7004 6919 add %l0, 0x40, %l0
7005 6920 .ci_loop0:
7006 6921 add %i1, 0x10, %i1
7007 6922 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7008 6923
7009 6924 prefetcha [%l0]ASI_USER, #one_read
7010 6925
7011 6926 stxa %l3, [%i0+0x0]%asi
7012 6927 stxa %l4, [%i0+0x8]%asi
7013 6928
7014 6929 add %i1, 0x10, %i1
7015 6930 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7016 6931
7017 6932 stxa %l5, [%i0+0x10]%asi
7018 6933 stxa %l2, [%i0+0x18]%asi
7019 6934
7020 6935 add %i1, 0x10, %i1
7021 6936 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7022 6937
7023 6938 stxa %l3, [%i0+0x20]%asi
7024 6939 stxa %l4, [%i0+0x28]%asi
7025 6940
7026 6941 add %i1, 0x10, %i1
7027 6942 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7028 6943
7029 6944 stxa %l5, [%i0+0x30]%asi
7030 6945 stxa %l2, [%i0+0x38]%asi
7031 6946
7032 6947 add %l0, 0x40, %l0
7033 6948 subcc %i3, 0x40, %i3
7034 6949 bgu,pt %xcc, .ci_loop0
7035 6950 add %i0, 0x40, %i0
7036 6951 ba .ci_blkdone
7037 6952 add %i1, %o2, %i1 ! increment the source by src offset
7038 6953 ! the src offset was stored in %o2
7039 6954
7040 6955 .ci_lower_double:
7041 6956
7042 6957 sub %i1, %o2, %i1 ! align the src at 16 bytes.
7043 6958 sll %o2, 3, %o0 ! %o0 left shift
7044 6959 mov 0x40, %o1
7045 6960 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
7046 6961 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
7047 6962 prefetcha [%l0]ASI_USER, #one_read
7048 6963 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l2
7049 6964 ! and %l3 has complete
7050 6965 ! data
7051 6966 add %l0, 0x40, %l0
7052 6967 .ci_loop1:
7053 6968 add %i1, 0x10, %i1
7054 6969 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has partial data
7055 6970 ! for this read.
7056 6971 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4
7057 6972 ! into %l2 and %l3
7058 6973
7059 6974 prefetcha [%l0]ASI_USER, #one_read
7060 6975
7061 6976 stxa %l2, [%i0+0x0]%asi
7062 6977 stxa %l3, [%i0+0x8]%asi
7063 6978
7064 6979 add %i1, 0x10, %i1
7065 6980 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7066 6981 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and
7067 6982 ! %l4 from previous read
7068 6983 ! into %l4 and %l5
7069 6984 stxa %l4, [%i0+0x10]%asi
7070 6985 stxa %l5, [%i0+0x18]%asi
7071 6986
7072 6987 ! Repeat the same for next 32 bytes.
7073 6988
7074 6989 add %i1, 0x10, %i1
7075 6990 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7076 6991 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
7077 6992
7078 6993 stxa %l2, [%i0+0x20]%asi
7079 6994 stxa %l3, [%i0+0x28]%asi
7080 6995
7081 6996 add %i1, 0x10, %i1
7082 6997 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7083 6998 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
7084 6999
7085 7000 stxa %l4, [%i0+0x30]%asi
7086 7001 stxa %l5, [%i0+0x38]%asi
7087 7002
7088 7003 add %l0, 0x40, %l0
7089 7004 subcc %i3, 0x40, %i3
7090 7005 bgu,pt %xcc, .ci_loop1
7091 7006 add %i0, 0x40, %i0
7092 7007 ba .ci_blkdone
7093 7008 add %i1, %o2, %i1 ! increment the source by src offset
7094 7009 ! the src offset was stored in %o2
7095 7010
7096 7011 .ci_upper_double:
7097 7012
7098 7013 sub %i1, %o2, %i1 ! align the src at 16 bytes.
7099 7014 sub %o2, 0x8, %o0
7100 7015 sll %o0, 3, %o0 ! %o0 left shift
7101 7016 mov 0x40, %o1
7102 7017 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift)
7103 7018 andn %i1, 0x3f, %l0 ! %l0 has block aligned source
7104 7019 prefetcha [%l0]ASI_USER, #one_read
7105 7020 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l3
7106 7021 ! for this read and
7107 7022 ! no data in %l2
7108 7023 add %l0, 0x40, %l0
7109 7024 .ci_loop2:
7110 7025 add %i1, 0x10, %i1
7111 7026 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has complete data
7112 7027 ! and %l5 has partial
7113 7028 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5
7114 7029 ! into %l3 and %l4
7115 7030 prefetcha [%l0]ASI_USER, #one_read
7116 7031
7117 7032 stxa %l3, [%i0+0x0]%asi
7118 7033 stxa %l4, [%i0+0x8]%asi
7119 7034
7120 7035 add %i1, 0x10, %i1
7121 7036 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7122 7037 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with
7123 7038 ! %l5 from previous read
7124 7039 ! into %l5 and %l2
7125 7040
7126 7041 stxa %l5, [%i0+0x10]%asi
7127 7042 stxa %l2, [%i0+0x18]%asi
7128 7043
7129 7044 ! Repeat the same for next 32 bytes.
7130 7045
7131 7046 add %i1, 0x10, %i1
7132 7047 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7133 7048 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
7134 7049
7135 7050 stxa %l3, [%i0+0x20]%asi
7136 7051 stxa %l4, [%i0+0x28]%asi
7137 7052
7138 7053 add %i1, 0x10, %i1
7139 7054 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7140 7055 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
7141 7056
7142 7057 stxa %l5, [%i0+0x30]%asi
7143 7058 stxa %l2, [%i0+0x38]%asi
7144 7059
7145 7060 add %l0, 0x40, %l0
7146 7061 subcc %i3, 0x40, %i3
7147 7062 bgu,pt %xcc, .ci_loop2
7148 7063 add %i0, 0x40, %i0
7149 7064 ba .ci_blkdone
7150 7065 add %i1, %o2, %i1 ! increment the source by src offset
7151 7066 ! the src offset was stored in %o2
7152 7067
7153 7068
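The two shifting loops above (.ci_loop1 and .ci_loop2) differ only in how
they stitch the 16-byte-aligned reads back together when the source is not
quadword aligned. A C sketch of the shift-and-merge that the ALIGN_DATA
macro performs; the macro itself is defined elsewhere in the kernel, so
this rendering is illustrative only:

    #include <stdint.h>

    /*
     * Big-endian view, matching SPARC: 'lo' holds the earlier bytes.
     * byte_off must be 1..7; offsets of 0 and 8 take the non-merging
     * paths (.ci_blkcpy and .ci_loop0) instead.
     */
    static uint64_t
    align_merge(uint64_t lo, uint64_t hi, unsigned int byte_off)
    {
            unsigned int lshift = byte_off * 8;     /* bits dropped from lo */
            unsigned int rshift = 64 - lshift;      /* bits taken from hi   */

            return ((lo << lshift) | (hi >> rshift));
    }
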
7154 7069 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
7155 7070 .ci_blkcpy:
7156 7071
7157 7072 andn %i1, 0x3f, %o0 ! %o0 has block aligned source
7158 7073 prefetcha [%o0]ASI_USER, #one_read
7159 7074 add %o0, 0x40, %o0
7160 7075 1:
7161 7076 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
7162 7077 add %i1, 0x10, %i1
7163 7078 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7164 7079 add %i1, 0x10, %i1
7165 7080
7166 7081 prefetcha [%o0]ASI_USER, #one_read
7167 7082
7168 7083 stxa %l0, [%i0+0x0]%asi
7169 7084
7170 7085 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7171 7086 add %i1, 0x10, %i1
7172 7087 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
7173 7088 add %i1, 0x10, %i1
7174 7089
7175 7090 stxa %l1, [%i0+0x8]%asi
7176 7091 stxa %l2, [%i0+0x10]%asi
7177 7092 stxa %l3, [%i0+0x18]%asi
7178 7093 stxa %l4, [%i0+0x20]%asi
7179 7094 stxa %l5, [%i0+0x28]%asi
7180 7095 stxa %l6, [%i0+0x30]%asi
7181 7096 stxa %l7, [%i0+0x38]%asi
7182 7097
7183 7098 add %o0, 0x40, %o0
7184 7099 subcc %i3, 0x40, %i3
7185 7100 bgu,pt %xcc, 1b
7186 7101 add %i0, 0x40, %i0
7187 7102
7188 7103 .ci_blkdone:
7189 7104 membar #Sync
7190 7105
7191 7106 brz,pt %i2, .copyin_exit
7192 7107 nop
7193 7108
7194 7109 ! Handle trailing bytes
7195 7110 cmp %i2, 0x8
7196 7111 blu,pt %ncc, .ci_residue
7197 7112 nop
7198 7113
7199 7114 ! Can we do some 8B ops
7200 7115 or %i1, %i0, %o2
7201 7116 andcc %o2, 0x7, %g0
7202 7117 bnz %ncc, .ci_last4
7203 7118 nop
7204 7119
7205 7120 ! Do 8byte ops as long as possible
7206 7121 .ci_last8:
7207 7122 ldxa [%i1]ASI_USER, %o2
7208 7123 stx %o2, [%i0]
7209 7124 add %i1, 0x8, %i1
7210 7125 sub %i2, 0x8, %i2
7211 7126 cmp %i2, 0x8
7212 7127 bgu,pt %ncc, .ci_last8
7213 7128 add %i0, 0x8, %i0
7214 7129
7215 7130 brz,pt %i2, .copyin_exit
7216 7131 nop
7217 7132
7218 7133 ba .ci_residue
7219 7134 nop
7220 7135
7221 7136 .ci_last4:
7222 7137 ! Can we do 4B ops
7223 7138 andcc %o2, 0x3, %g0
7224 7139 bnz %ncc, .ci_last2
7225 7140 nop
7226 7141 1:
7227 7142 lda [%i1]ASI_USER, %o2
7228 7143 st %o2, [%i0]
7229 7144 add %i1, 0x4, %i1
7230 7145 sub %i2, 0x4, %i2
7231 7146 cmp %i2, 0x4
7232 7147 bgu,pt %ncc, 1b
7233 7148 add %i0, 0x4, %i0
7234 7149
7235 7150 brz,pt %i2, .copyin_exit
7236 7151 nop
7237 7152
7238 7153 ba .ci_residue
7239 7154 nop
7240 7155
7241 7156 .ci_last2:
7242 7157 ! Can we do 2B ops
7243 7158 andcc %o2, 0x1, %g0
7244 7159 bnz %ncc, .ci_residue
7245 7160 nop
7246 7161
7247 7162 1:
7248 7163 lduha [%i1]ASI_USER, %o2
7249 7164 stuh %o2, [%i0]
7250 7165 add %i1, 0x2, %i1
7251 7166 sub %i2, 0x2, %i2
7252 7167 cmp %i2, 0x2
7253 7168 bgu,pt %ncc, 1b
7254 7169 add %i0, 0x2, %i0
7255 7170
7256 7171 brz,pt %i2, .copyin_exit
7257 7172 nop
7258 7173
7259 7174 ! Copy the residue as byte copy
7260 7175 .ci_residue:
7261 7176 lduba [%i1]ASI_USER, %i4
7262 7177 stb %i4, [%i0]
7263 7178 inc %i1
7264 7179 deccc %i2
7265 7180 bgu,pt %xcc, .ci_residue
7266 7181 inc %i0
7267 7182
7268 7183 .copyin_exit:
7269 7184 membar #Sync
7270 7185 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
7271 7186 ret
7272 7187 restore %g0, 0, %o0
7273 7188 .copyin_err:
7274 7189 ldn [THREAD_REG + T_COPYOPS], %o4
7275 7190 brz %o4, 2f
7276 7191 nop
7277 7192 ldn [%o4 + CP_COPYIN], %g2
7278 7193 jmp %g2
7279 7194 nop
7280 7195 2:
7281 7196 retl
7282 7197 mov -1, %o0
7283 7198 #endif /* NIAGARA_IMPL */
7284 7199 SET_SIZE(copyin)
7285 7200
7286 -#endif /* lint */
7287 -
7288 -#ifdef lint
7289 -
7290 -/*ARGSUSED*/
7291 -int
7292 -xcopyin(const void *uaddr, void *kaddr, size_t count)
7293 -{ return (0); }
7294 -
7295 -#else /* lint */
7296 -
7297 7201 ENTRY(xcopyin)
7298 7202 sethi %hi(.xcopyin_err), REAL_LOFAULT
7299 7203 b .do_copyin
7300 7204 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
7301 7205 .xcopyin_err:
7302 7206 ldn [THREAD_REG + T_COPYOPS], %o4
7303 7207 brz %o4, 2f
7304 7208 nop
7305 7209 ldn [%o4 + CP_XCOPYIN], %g2
7306 7210 jmp %g2
7307 7211 nop
7308 7212 2:
7309 7213 retl
7310 7214 mov %g1, %o0
7311 7215 SET_SIZE(xcopyin)
7312 7216
7313 -#endif /* lint */
7314 -
7315 -#ifdef lint
7316 -
7317 -/*ARGSUSED*/
7318 -int
7319 -xcopyin_little(const void *uaddr, void *kaddr, size_t count)
7320 -{ return (0); }
7321 -
7322 -#else /* lint */
7323 -
7324 7217 ENTRY(xcopyin_little)
7325 7218 sethi %hi(.little_err), %o4
7326 7219 ldn [THREAD_REG + T_LOFAULT], %o5
7327 7220 or %o4, %lo(.little_err), %o4
7328 7221 membar #Sync ! sync error barrier
7329 7222 stn %o4, [THREAD_REG + T_LOFAULT]
7330 7223
7331 7224 subcc %g0, %o2, %o3
7332 7225 add %o0, %o2, %o0
7333 7226 bz,pn %ncc, 2f ! check for zero bytes
7334 7227 sub %o2, 1, %o4
7335 7228 add %o0, %o4, %o0 ! start w/last byte
7336 7229 add %o1, %o2, %o1
7337 7230 lduba [%o0+%o3]ASI_AIUSL, %o4
7338 7231
7339 7232 1: stb %o4, [%o1+%o3]
7340 7233 inccc %o3
7341 7234 sub %o0, 2, %o0 ! get next byte
7342 7235 bcc,a,pt %ncc, 1b
7343 7236 lduba [%o0+%o3]ASI_AIUSL, %o4
7344 7237
7345 7238 2: membar #Sync ! sync error barrier
7346 7239 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
7347 7240 retl
7348 7241 mov %g0, %o0 ! return (0)
7349 7242
7350 7243 .little_err:
7351 7244 membar #Sync ! sync error barrier
7352 7245 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
7353 7246 retl
7354 7247 mov %g1, %o0
7355 7248 SET_SIZE(xcopyin_little)
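
Note the loop structure above: the user source is read from its last byte
backward (the base register drops by 2 while the negative index rises by 1,
a net -1 per pass) while the kernel destination fills forward, so the copy
reverses byte order. A simplified C equivalent (illustrative only):

    #include <stddef.h>
    #include <stdint.h>

    static void
    copy_reversed(const uint8_t *src, uint8_t *dst, size_t n)
    {
            for (size_t i = 0; i < n; i++)
                    dst[i] = src[n - 1 - i];        /* last source byte first */
    }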
7356 7249
7357 -#endif /* lint */
7358 7250
7359 -
7360 7251 /*
7361 7252 * Copy a block of storage - must not overlap (from + len <= to).
7362 7253 * No fault handler installed (to be called under on_fault())
7363 7254 */
7364 -#if defined(lint)
7365 7255
7366 -/* ARGSUSED */
7367 -void
7368 -copyin_noerr(const void *ufrom, void *kto, size_t count)
7369 -{}
7370 -
7371 -#else /* lint */
7372 -
7373 7256 ENTRY(copyin_noerr)
7374 7257 sethi %hi(.copyio_noerr), REAL_LOFAULT
7375 7258 b .do_copyin
7376 7259 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
7377 7260 .copyio_noerr:
7378 7261 jmp SAVED_LOFAULT
7379 7262 nop
7380 7263 SET_SIZE(copyin_noerr)
7381 7264
7382 -#endif /* lint */
7383 -
7384 7265 /*
7385 7266 * Copy a block of storage - must not overlap (from + len <= to).
7386 7267 * No fault handler installed (to be called under on_fault())
7387 7268 */
7388 7269
7389 -#if defined(lint)
7390 -
7391 -/* ARGSUSED */
7392 -void
7393 -copyout_noerr(const void *kfrom, void *uto, size_t count)
7394 -{}
7395 -
7396 -#else /* lint */
7397 -
7398 7270 ENTRY(copyout_noerr)
7399 7271 sethi %hi(.copyio_noerr), REAL_LOFAULT
7400 7272 b .do_copyout
7401 7273 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
7402 7274 SET_SIZE(copyout_noerr)
7403 7275
7404 -#endif /* lint */
7405 -
7406 -#if defined(lint)
7407 -
7408 -int use_hw_bcopy = 1;
7409 -int use_hw_bzero = 1;
7410 -uint_t hw_copy_limit_1 = 0x100;
7411 -uint_t hw_copy_limit_2 = 0x200;
7412 -uint_t hw_copy_limit_4 = 0x400;
7413 -uint_t hw_copy_limit_8 = 0x400;
7414 -
7415 -#else /* !lint */
7416 -
7417 7276 .align 4
7418 7277 DGDEF(use_hw_bcopy)
7419 7278 .word 1
7420 7279 DGDEF(use_hw_bzero)
7421 7280 .word 1
7422 7281 DGDEF(hw_copy_limit_1)
7423 7282 .word 0x100
7424 7283 DGDEF(hw_copy_limit_2)
7425 7284 .word 0x200
7426 7285 DGDEF(hw_copy_limit_4)
7427 7286 .word 0x400
7428 7287 DGDEF(hw_copy_limit_8)
7429 7288 .word 0x400
7430 7289
7431 7290 .align 64
7432 7291 .section ".text"
7433 -#endif /* !lint */
7434 7292
7435 7293 /*
7436 7294 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
7437 7295  * 256 bytes or longer, using Niagara's block stores/quad store.
7438 7296 * If the criteria for using this routine are not met then it calls bzero
7439 7297 * and returns 1. Otherwise 0 is returned indicating success.
7440 7298 * Caller is responsible for ensuring use_hw_bzero is true and that
7441 7299 * kpreempt_disable() has been called.
7442 7300 */
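A C rendering of the entry checks that follow (a hedged sketch: memset()
stands in for the kernel's bzero(), and hwblkclr_sketch is a made-up name):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    int
    hwblkclr_sketch(void *addr, size_t len)
    {
            if (((uintptr_t)addr & 0x3f) != 0 ||    /* not block aligned */
                len < 0x100 ||                      /* fewer than 256 bytes */
                (len & 0x3f) != 0) {                /* not a multiple of 64 */
                    memset(addr, 0, len);           /* punt (bzero in the kernel) */
                    return (1);                     /* block ops were not used */
            }

            for (uint64_t *p = addr; len != 0; len -= 0x40)
                    for (int i = 0; i < 8; i++)     /* 64 bytes per pass */
                            *p++ = 0;
            return (0);
    }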
7443 -#ifdef lint
7444 -/*ARGSUSED*/
7445 -int
7446 -hwblkclr(void *addr, size_t len)
7447 -{
7448 - return(0);
7449 -}
7450 -#else /* lint */
7451 7301 ! %i0 - start address
7452 7302 ! %i1 - length of region (multiple of 64)
7453 7303
7454 7304 ENTRY(hwblkclr)
7455 7305 save %sp, -SA(MINFRAME), %sp
7456 7306
7457 7307 ! Must be block-aligned
7458 7308 andcc %i0, 0x3f, %g0
7459 7309 bnz,pn %ncc, 1f
7460 7310 nop
7461 7311
7462 7312 ! ... and must be 256 bytes or more
7463 7313 cmp %i1, 0x100
7464 7314 blu,pn %ncc, 1f
7465 7315 nop
7466 7316
7467 7317 ! ... and length must be a multiple of 64
7468 7318 andcc %i1, 0x3f, %g0
7469 7319 bz,pn %ncc, .pz_doblock
7470 7320 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
7471 7321
7472 7322 1: ! punt, call bzero but notify the caller that bzero was used
7473 7323 mov %i0, %o0
7474 7324 call bzero
7475 7325 mov %i1, %o1
7476 7326 ret
7477 7327 restore %g0, 1, %o0 ! return (1) - did not use block operations
7478 7328
7479 7329 ! Already verified that there are at least 256 bytes to set
7480 7330 .pz_doblock:
7481 7331 stxa %g0, [%i0+0x0]%asi
7482 7332 stxa %g0, [%i0+0x40]%asi
7483 7333 stxa %g0, [%i0+0x80]%asi
7484 7334 stxa %g0, [%i0+0xc0]%asi
7485 7335
7486 7336 stxa %g0, [%i0+0x8]%asi
7487 7337 stxa %g0, [%i0+0x10]%asi
7488 7338 stxa %g0, [%i0+0x18]%asi
7489 7339 stxa %g0, [%i0+0x20]%asi
7490 7340 stxa %g0, [%i0+0x28]%asi
7491 7341 stxa %g0, [%i0+0x30]%asi
7492 7342 stxa %g0, [%i0+0x38]%asi
7493 7343
7494 7344 stxa %g0, [%i0+0x48]%asi
7495 7345 stxa %g0, [%i0+0x50]%asi
7496 7346 stxa %g0, [%i0+0x58]%asi
7497 7347 stxa %g0, [%i0+0x60]%asi
7498 7348 stxa %g0, [%i0+0x68]%asi
7499 7349 stxa %g0, [%i0+0x70]%asi
7500 7350 stxa %g0, [%i0+0x78]%asi
7501 7351
7502 7352 stxa %g0, [%i0+0x88]%asi
7503 7353 stxa %g0, [%i0+0x90]%asi
7504 7354 stxa %g0, [%i0+0x98]%asi
7505 7355 stxa %g0, [%i0+0xa0]%asi
7506 7356 stxa %g0, [%i0+0xa8]%asi
7507 7357 stxa %g0, [%i0+0xb0]%asi
7508 7358 stxa %g0, [%i0+0xb8]%asi
7509 7359
7510 7360 stxa %g0, [%i0+0xc8]%asi
7511 7361 stxa %g0, [%i0+0xd0]%asi
7512 7362 stxa %g0, [%i0+0xd8]%asi
7513 7363 stxa %g0, [%i0+0xe0]%asi
7514 7364 stxa %g0, [%i0+0xe8]%asi
7515 7365 stxa %g0, [%i0+0xf0]%asi
7516 7366 stxa %g0, [%i0+0xf8]%asi
7517 7367
7518 7368 sub %i1, 0x100, %i1
7519 7369 cmp %i1, 0x100
7520 7370 bgu,pt %ncc, .pz_doblock
7521 7371 add %i0, 0x100, %i0
7522 7372
7523 7373 2:
7524 7374 ! Check if more than 64 bytes to set
7525 7375 	cmp %i1, 0x40
7526 7376 blu %ncc, .pz_finish
7527 7377 nop
7528 7378
7529 7379 3:
7530 7380 stxa %g0, [%i0+0x0]%asi
7531 7381 stxa %g0, [%i0+0x8]%asi
7532 7382 stxa %g0, [%i0+0x10]%asi
7533 7383 stxa %g0, [%i0+0x18]%asi
7534 7384 stxa %g0, [%i0+0x20]%asi
7535 7385 stxa %g0, [%i0+0x28]%asi
7536 7386 stxa %g0, [%i0+0x30]%asi
7537 7387 stxa %g0, [%i0+0x38]%asi
7538 7388
7539 7389 subcc %i1, 0x40, %i1
7540 7390 bgu,pt %ncc, 3b
7541 7391 add %i0, 0x40, %i0
7542 7392
7543 7393 .pz_finish:
7544 7394 membar #Sync
7545 7395 ret
7546 7396 restore %g0, 0, %o0 ! return (bzero or not)
7547 7397 SET_SIZE(hwblkclr)
7548 -#endif /* lint */
7549 7398
7550 -#ifdef lint
7551 -/* Copy 32 bytes of data from src to dst using physical addresses */
7552 -/*ARGSUSED*/
7553 -void
7554 -hw_pa_bcopy32(uint64_t src, uint64_t dst)
7555 -{}
7556 -#else /*!lint */
7557 -
7558 7399 /*
7559 7400 * Copy 32 bytes of data from src (%o0) to dst (%o1)
7560 7401 * using physical addresses.
7561 7402 */
7562 7403 ENTRY_NP(hw_pa_bcopy32)
7563 7404 rdpr %pstate, %g1
7564 7405 andn %g1, PSTATE_IE, %g2
7565 7406 wrpr %g0, %g2, %pstate
7566 7407
7567 7408 ldxa [%o0]ASI_MEM, %o2
7568 7409 add %o0, 8, %o0
7569 7410 ldxa [%o0]ASI_MEM, %o3
7570 7411 add %o0, 8, %o0
7571 7412 ldxa [%o0]ASI_MEM, %o4
7572 7413 add %o0, 8, %o0
7573 7414 ldxa [%o0]ASI_MEM, %o5
7574 7415 stxa %o2, [%o1]ASI_MEM
7575 7416 add %o1, 8, %o1
7576 7417 stxa %o3, [%o1]ASI_MEM
7577 7418 add %o1, 8, %o1
7578 7419 stxa %o4, [%o1]ASI_MEM
7579 7420 add %o1, 8, %o1
7580 7421 stxa %o5, [%o1]ASI_MEM
7581 7422
7582 7423 membar #Sync
7583 7424 retl
7584 7425 wrpr %g0, %g1, %pstate
7585 7426 SET_SIZE(hw_pa_bcopy32)
7586 -#endif /* lint */
7587 7427
7588 7428 /*
7589 7429 * Zero a block of storage.
7590 7430 *
7591 7431 * uzero is used by the kernel to zero a block in user address space.
7592 7432 */
7593 7433
7594 7434 /*
7595 7435 * Control flow of the bzero/kzero/uzero routine.
7596 7436 *
7597 7437  * For stores of fewer than 7 bytes, the bytes are zeroed one at a time.
7598 7438  *
7599 7439  * For stores of fewer than 15 bytes, align the address on a 4 byte boundary,
7600 7440  * then store as many 4-byte chunks as possible, followed by the trailing bytes.
7601 7441 *
7602 7442  * For sizes greater than 15 bytes, align the address on an 8 byte boundary.
7603 7443  * if (count > 128) {
7604 7444  * store as many 8-byte chunks as needed to block align the address
7605 7445 * store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
7606 7446 * store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
7607 7447 * }
7608 7448 * Store as many 8-byte chunks, followed by trailing bytes.
7609 7449 */
7610 7450
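A plain-C sketch of the size dispatch described above (illustrative:
ordinary stores stand in for the ASI-based block-init stores the real
routine uses for the large case):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    void
    zero_sketch(char *p, size_t n)
    {
            if (n < 7) {                    /* tiny: bytes only */
                    while (n-- > 0)
                            *p++ = 0;
                    return;
            }
            if (n < 15) {                   /* small: 4-byte chunks */
                    while (((uintptr_t)p & 3) != 0) {
                            *p++ = 0;
                            n--;
                    }
                    for (; n >= 4; n -= 4, p += 4)
                            memset(p, 0, 4);        /* one 4-byte store */
            } else {                        /* large: 8-byte chunks */
                    while (((uintptr_t)p & 7) != 0) {
                            *p++ = 0;
                            n--;
                    }
                    /* the real code switches to 64-byte block-init
                     * stores here once the count reaches 128 */
                    for (; n >= 8; n -= 8, p += 8)
                            memset(p, 0, 8);        /* one 8-byte store */
            }
            while (n-- > 0)                 /* trailing bytes */
                    *p++ = 0;
    }
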
7611 -#if defined(lint)
7612 -
7613 -/* ARGSUSED */
7614 -int
7615 -kzero(void *addr, size_t count)
7616 -{ return(0); }
7617 -
7618 -/* ARGSUSED */
7619 -void
7620 -uzero(void *addr, size_t count)
7621 -{}
7622 -
7623 -#else /* lint */
7624 -
7625 7451 ENTRY(uzero)
7626 7452 !
7627 7453 ! Set a new lo_fault handler only if we came in with one
7628 7454 ! already specified.
7629 7455 !
7630 7456 wr %g0, ASI_USER, %asi
7631 7457 ldn [THREAD_REG + T_LOFAULT], %o5
7632 7458 tst %o5
7633 7459 bz,pt %ncc, .do_zero
7634 7460 sethi %hi(.zeroerr), %o2
7635 7461 or %o2, %lo(.zeroerr), %o2
7636 7462 membar #Sync
7637 7463 ba,pt %ncc, .do_zero
7638 7464 stn %o2, [THREAD_REG + T_LOFAULT]
7639 7465
7640 7466 ENTRY(kzero)
7641 7467 !
7642 7468 ! Always set a lo_fault handler
7643 7469 !
7644 7470 wr %g0, ASI_P, %asi
7645 7471 ldn [THREAD_REG + T_LOFAULT], %o5
7646 7472 sethi %hi(.zeroerr), %o2
7647 7473 or %o5, LOFAULT_SET, %o5
7648 7474 or %o2, %lo(.zeroerr), %o2
7649 7475 membar #Sync
7650 7476 ba,pt %ncc, .do_zero
7651 7477 stn %o2, [THREAD_REG + T_LOFAULT]
7652 7478
7653 7479 /*
7654 7480 * We got here because of a fault during kzero or if
7655 7481 * uzero or bzero was called with t_lofault non-zero.
7656 7482 * Otherwise we've already run screaming from the room.
7657 7483 * Errno value is in %g1. Note that we're here iff
7658 7484 * we did set t_lofault.
7659 7485 */
7660 7486 .zeroerr:
7661 7487 !
7662 7488 ! Undo asi register setting. Just set it to be the
7663 7489 ! kernel default without checking.
7664 7490 !
7665 7491 wr %g0, ASI_P, %asi
7666 7492
7667 7493 !
7668 7494 ! We did set t_lofault. It may well have been zero coming in.
7669 7495 !
7670 7496 1:
7671 7497 tst %o5
7672 7498 membar #Sync
7673 7499 bne,pn %ncc, 3f
7674 7500 andncc %o5, LOFAULT_SET, %o5
7675 7501 2:
7676 7502 !
7677 7503 ! Old handler was zero. Just return the error.
7678 7504 !
7679 7505 retl ! return
7680 7506 mov %g1, %o0 ! error code from %g1
7681 7507 3:
7682 7508 !
7683 7509 ! We're here because %o5 was non-zero. It was non-zero
7684 7510 ! because either LOFAULT_SET was present, a previous fault
7685 7511 ! handler was present or both. In all cases we need to reset
7686 7512 ! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
7687 7513 ! before we either simply return the error or we invoke the
7688 7514 ! previously specified handler.
7689 7515 !
7690 7516 be %ncc, 2b
7691 7517 stn %o5, [THREAD_REG + T_LOFAULT]
7692 7518 jmp %o5 ! goto real handler
7693 7519 nop
7694 7520 SET_SIZE(kzero)
7695 7521 SET_SIZE(uzero)
7696 7522
7697 -#endif /* lint */
7698 -
7699 7523 /*
7700 7524 * Zero a block of storage.
7701 7525 */
7702 7526
7703 -#if defined(lint)
7704 -
7705 -/* ARGSUSED */
7706 -void
7707 -bzero(void *addr, size_t count)
7708 -{}
7709 -
7710 -#else /* lint */
7711 -
7712 7527 ENTRY(bzero)
7713 7528 wr %g0, ASI_P, %asi
7714 7529
7715 7530 ldn [THREAD_REG + T_LOFAULT], %o5 ! save old vector
7716 7531 tst %o5
7717 7532 bz,pt %ncc, .do_zero
7718 7533 sethi %hi(.zeroerr), %o2
7719 7534 or %o2, %lo(.zeroerr), %o2
7720 7535 membar #Sync ! sync error barrier
7721 7536 stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector
7722 7537
7723 7538 .do_zero:
7724 7539 cmp %o1, 7
7725 7540 blu,pn %ncc, .byteclr
7726 7541 nop
7727 7542
7728 7543 cmp %o1, 15
7729 7544 blu,pn %ncc, .wdalign
7730 7545 nop
7731 7546
7732 7547 	andcc %o0, 7, %o3 ! is addr aligned on an 8 byte boundary
7733 7548 bz,pt %ncc, .blkalign ! already double aligned
7734 7549 sub %o3, 8, %o3 ! -(bytes till double aligned)
7735 7550 add %o1, %o3, %o1 ! update o1 with new count
7736 7551
7737 7552 1:
7738 7553 stba %g0, [%o0]%asi
7739 7554 inccc %o3
7740 7555 bl,pt %ncc, 1b
7741 7556 inc %o0
7742 7557
7743 7558 ! Now address is double aligned
7744 7559 .blkalign:
7745 7560 cmp %o1, 0x80 ! check if there are 128 bytes to set
7746 7561 blu,pn %ncc, .bzero_small
7747 7562 mov %o1, %o3
7748 7563
7749 7564 sethi %hi(use_hw_bzero), %o2
7750 7565 ld [%o2 + %lo(use_hw_bzero)], %o2
7751 7566 tst %o2
7752 7567 bz %ncc, .bzero_small
7753 7568 mov %o1, %o3
7754 7569
7755 7570 rd %asi, %o3
7756 7571 wr %g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
7757 7572 cmp %o3, ASI_P
7758 7573 bne,a %ncc, .algnblk
7759 7574 wr %g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
7760 7575
7761 7576 .algnblk:
7762 7577 andcc %o0, 0x3f, %o3 ! is block aligned?
7763 7578 bz,pt %ncc, .bzero_blk
7764 7579 sub %o3, 0x40, %o3 ! -(bytes till block aligned)
7765 7580 add %o1, %o3, %o1 ! o1 is the remainder
7766 7581
7767 7582 ! Clear -(%o3) bytes till block aligned
7768 7583 1:
7769 7584 stxa %g0, [%o0]%asi
7770 7585 addcc %o3, 8, %o3
7771 7586 bl,pt %ncc, 1b
7772 7587 add %o0, 8, %o0
7773 7588
7774 7589 .bzero_blk:
7775 7590 and %o1, 0x3f, %o3 ! calc bytes left after blk clear
7776 7591 andn %o1, 0x3f, %o4 ! calc size of blocks in bytes
7777 7592
7778 7593 cmp %o4, 0x100 ! 256 bytes or more
7779 7594 blu,pn %ncc, 3f
7780 7595 nop
7781 7596
7782 7597 2:
7783 7598 stxa %g0, [%o0+0x0]%asi
7784 7599 stxa %g0, [%o0+0x40]%asi
7785 7600 stxa %g0, [%o0+0x80]%asi
7786 7601 stxa %g0, [%o0+0xc0]%asi
7787 7602
7788 7603 stxa %g0, [%o0+0x8]%asi
7789 7604 stxa %g0, [%o0+0x10]%asi
7790 7605 stxa %g0, [%o0+0x18]%asi
7791 7606 stxa %g0, [%o0+0x20]%asi
7792 7607 stxa %g0, [%o0+0x28]%asi
7793 7608 stxa %g0, [%o0+0x30]%asi
7794 7609 stxa %g0, [%o0+0x38]%asi
7795 7610
7796 7611 stxa %g0, [%o0+0x48]%asi
7797 7612 stxa %g0, [%o0+0x50]%asi
7798 7613 stxa %g0, [%o0+0x58]%asi
7799 7614 stxa %g0, [%o0+0x60]%asi
7800 7615 stxa %g0, [%o0+0x68]%asi
7801 7616 stxa %g0, [%o0+0x70]%asi
7802 7617 stxa %g0, [%o0+0x78]%asi
7803 7618
7804 7619 stxa %g0, [%o0+0x88]%asi
7805 7620 stxa %g0, [%o0+0x90]%asi
7806 7621 stxa %g0, [%o0+0x98]%asi
7807 7622 stxa %g0, [%o0+0xa0]%asi
7808 7623 stxa %g0, [%o0+0xa8]%asi
7809 7624 stxa %g0, [%o0+0xb0]%asi
7810 7625 stxa %g0, [%o0+0xb8]%asi
7811 7626
7812 7627 stxa %g0, [%o0+0xc8]%asi
7813 7628 stxa %g0, [%o0+0xd0]%asi
7814 7629 stxa %g0, [%o0+0xd8]%asi
7815 7630 stxa %g0, [%o0+0xe0]%asi
7816 7631 stxa %g0, [%o0+0xe8]%asi
7817 7632 stxa %g0, [%o0+0xf0]%asi
7818 7633 stxa %g0, [%o0+0xf8]%asi
7819 7634
7820 7635 sub %o4, 0x100, %o4
7821 7636 cmp %o4, 0x100
7822 7637 bgu,pt %ncc, 2b
7823 7638 add %o0, 0x100, %o0
7824 7639
7825 7640 3:
7826 7641 ! ... check if 64 bytes to set
7827 7642 cmp %o4, 0x40
7828 7643 blu %ncc, .bzero_blk_done
7829 7644 nop
7830 7645
7831 7646 4:
7832 7647 stxa %g0, [%o0+0x0]%asi
7833 7648 stxa %g0, [%o0+0x8]%asi
7834 7649 stxa %g0, [%o0+0x10]%asi
7835 7650 stxa %g0, [%o0+0x18]%asi
7836 7651 stxa %g0, [%o0+0x20]%asi
7837 7652 stxa %g0, [%o0+0x28]%asi
7838 7653 stxa %g0, [%o0+0x30]%asi
7839 7654 stxa %g0, [%o0+0x38]%asi
7840 7655
7841 7656 subcc %o4, 0x40, %o4
7842 7657 bgu,pt %ncc, 3b
7843 7658 add %o0, 0x40, %o0
7844 7659
7845 7660 .bzero_blk_done:
7846 7661 membar #Sync
7847 7662 !
7848 7663 ! Undo asi register setting.
7849 7664 !
7850 7665 rd %asi, %o4
7851 7666 wr %g0, ASI_P, %asi
7852 7667 cmp %o4, ASI_BLK_INIT_ST_QUAD_LDD_P
7853 7668 bne,a %ncc, .bzero_small
7854 7669 wr %g0, ASI_USER, %asi
7855 7670
7856 7671 .bzero_small:
7857 7672 ! Set the remaining doubles
7858 7673 subcc %o3, 8, %o3 ! Can we store any doubles?
7859 7674 blu,pn %ncc, .byteclr
7860 7675 and %o1, 7, %o1 ! calc bytes left after doubles
7861 7676
7862 7677 .dbclr:
7863 7678 stxa %g0, [%o0]%asi ! Clear the doubles
7864 7679 subcc %o3, 8, %o3
7865 7680 bgeu,pt %ncc, .dbclr
7866 7681 add %o0, 8, %o0
7867 7682
7868 7683 ba .byteclr
7869 7684 nop
7870 7685
7871 7686 .wdalign:
7872 7687 	andcc %o0, 3, %o3 ! is addr aligned on a word boundary
7873 7688 bz,pn %ncc, .wdclr
7874 7689 andn %o1, 3, %o3 ! create word sized count in %o3
7875 7690
7876 7691 dec %o1 ! decrement count
7877 7692 stba %g0, [%o0]%asi ! clear a byte
7878 7693 ba .wdalign
7879 7694 inc %o0 ! next byte
7880 7695
7881 7696 .wdclr:
7882 7697 sta %g0, [%o0]%asi ! 4-byte clearing loop
7883 7698 subcc %o3, 4, %o3
7884 7699 bnz,pt %ncc, .wdclr
7885 7700 inc 4, %o0
7886 7701
7887 7702 and %o1, 3, %o1 ! leftover count, if any
7888 7703
7889 7704 .byteclr:
7890 7705 ! Set the leftover bytes
7891 7706 brz %o1, .bzero_exit
7892 7707 nop
7893 7708
7894 7709 7:
7895 7710 deccc %o1 ! byte clearing loop
7896 7711 stba %g0, [%o0]%asi
7897 7712 bgu,pt %ncc, 7b
7898 7713 inc %o0
7899 7714
7900 7715 .bzero_exit:
7901 7716 !
7902 7717 ! We're just concerned with whether t_lofault was set
7903 7718 ! when we came in. We end up here from either kzero()
7904 7719 ! or bzero(). kzero() *always* sets a lofault handler.
7905 7720 ! It ors LOFAULT_SET into %o5 to indicate it has done
7906 7721 ! this even if the value of %o5 is otherwise zero.
7907 7722 ! bzero() sets a lofault handler *only* if one was
7908 7723 ! previously set. Accordingly we need to examine
7909 7724 ! %o5 and if it is non-zero be sure to clear LOFAULT_SET
7910 7725 ! before resetting the error handler.
7911 7726 !
7912 7727 tst %o5
7913 7728 bz %ncc, 1f
7914 7729 andn %o5, LOFAULT_SET, %o5
7915 7730 membar #Sync ! sync error barrier
7916 7731 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
7917 7732 1:
7918 7733 retl
7919 7734 clr %o0 ! return (0)
7920 7735
7921 7736 SET_SIZE(bzero)
7922 -#endif /* lint */
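
The t_lofault bookkeeping shared by kzero/uzero/bzero follows one protocol:
the previous handler is saved in %o5, kzero() always tags it with the
LOFAULT_SET low bit, and the exit and error paths strip that bit before
deciding whether to restore. A hedged C sketch of the exit-path logic (the
flag value here is assumed for illustration):

    #include <stdint.h>

    #define LOFAULT_SET     0x2     /* assumed low-bit flag value */

    static void
    bzero_exit_sketch(uintptr_t saved_o5, uintptr_t *t_lofault)
    {
            if (saved_o5 != 0) {
                    /* a handler was installed; strip the flag, restore */
                    *t_lofault = saved_o5 & ~(uintptr_t)LOFAULT_SET;
            }
            /* saved_o5 == 0: bzero() installed nothing, leave it alone */
    }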