/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */


#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#include <sys/machasi.h>
#include <sys/niagaraasi.h>

#include "assym.h"


/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/kcopy routine.
 *
 * ! WARNING : <Register usage convention>
 * ! In kcopy(), %o5 holds the previous error handler and a flag
 * ! LOFAULT_SET (low bits). %o5 is null in bcopy().
 * ! %o5 is not available for any other use.
 *
 * On entry:
 * ! Determine whether to use the FP register version or the
 * ! leaf routine version depending on the size of the copy.
 * ! Set up error handling accordingly.
 * ! The transition point depends on FP_COPY.
 * ! For both versions %o5 is reserved.
 *
 * kcopy():
 *	if(length > FP_COPY)
 *		go to regular_kcopy
 *
 *	! Setup_leaf_rtn_error_handler
 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
 *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
 *	curthread->t_lofault = .sm_copyerr;
 *	goto small_bcopy();
 *
 * regular_kcopy:
 *	save_registers()
 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
 *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
 *	curthread->t_lofault = .copyerr;
 *	goto do_copy();
 *
 * bcopy():
 *	if(length > FP_COPY)
 *		go to regular_bcopy
 *
 *	! Setup_leaf_rtn_error_handler
 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
 *	curthread->t_lofault = .sm_copyerr;
 *	goto small_bcopy();
 *
 * regular_bcopy:
 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
 *	curthread->t_lofault = .copyerr;
 *	goto do_copy();
 *
 * small_bcopy:
 *	! handle copies smaller than FP_COPY
 *	restore t_lofault handler
 *	exit
 *
 * do_copy:
 *	! handle copies larger than FP_COPY
 *	save fp_regs
 *	blockcopy;
 *	restore fp_regs
 *	restore t_lofault handler if came from kcopy();
 *
 *
 * In leaf lofault handler:
 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
 *	return (errno)
 *
 * In lofault handler:
 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
 *	restore fp_regs
 *	return (errno)
 *
 *
 *
 * For all of bcopy/copyin/copyout the copy logic is specialized according
 * to how the src and dst are aligned and how much data needs to be moved.
 * The following comments apply to the N2/RF code (#if !defined(NIAGARA_IMPL))
 *
 * N2/RF Flow :
 *
 * if (count < FP_COPY) {  (584 bytes)
 *	set small fault handler (no register window save/restore)
 *	if count < SHORTCOPY  (7 bytes)
 *		copy bytes; go to short_exit
 *	else
 *	determine dst alignment, move minimum bytes/halfwords to
 *	get dst aligned on long word boundary
 *	if( src is on long word boundary ) {
 * medlong:					src/dst aligned on 8 bytes
 *		copy with ldx/stx in 4-way unrolled loop;
 *		copy final 0-31 bytes; go to short_exit
 *	} else {				src/dst not aligned on 8 bytes
 *	if src is word aligned, ld/st words in 32-byte chunks
 *	if src is half word aligned, ld half, ld word, ld half; pack
 *		into long word, store long words in 32-byte chunks
 *	if src is byte aligned, ld byte,half,word parts; pack into long
 *		word, store long words in 32-byte chunks
 *	move final 0-31 bytes according to src alignment; go to short_exit
 * short_exit:
 *	restore trap handler if needed, retl
 * else {					More than FP_COPY bytes
 *	set fault handler
 *	disable kernel preemption
 *	save registers, save FP registers if in use
 *	move bytes to align destination register on long word boundary
 *	if(src is on long word boundary) {	src/dst aligned on 8 bytes
 *		align dst on 64 byte boundary; use 8-way test for each of 8
 *		possible src alignments relative to a 64 byte boundary to
 *		select the 16-way unrolled loop (128 bytes) to use for
 *		block load, fmovd, block-init-store, block-store, fmovd
 *		operations then go to remain_stuff.
 * remain_stuff: move remaining bytes. go to long_exit
 *	} else {
 *		setup alignaddr for faligndata instructions
 *		align dst on 64 byte boundary; use 8-way test for each of 8
 *		possible src alignments to nearest long word relative to 64
 *		byte boundary to select the 8-way unrolled loop (64 bytes)
 *		to use for block load, falign, fmovd, block-store loop
 *		(only use block-init-store when src/dst on 8 byte boundaries.)
 *		goto unalign_done.
 * unalign_done:
 *		move remaining bytes for unaligned cases. go to long_exit
 * long_exit:
 *		restore %gsr, FP regs (either from stack or set to zero),
 *		restore trap handler, check for kernel preemption request,
 *		handle if needed, ret.
 *	}
 *
 * Other platforms include hw_bcopy_limit_[1248] to control the exact
 * point where the FP register code is used. On those platforms, the
 * FP register code did not leave data in L2 cache, potentially affecting
 * performance more than the gain/loss from the algorithm difference.
 * For N2/RF, block store places data in the L2 cache, so use or non-use
 * of the FP registers has no effect on L2 cache behavior.
 * The cost for testing hw_bcopy_limit_* according to different
 * alignments exceeds 50 cycles for all cases, even when hw_bcopy_limits
 * were not used. That cost was judged too high relative to the benefits,
 * so the hw_bcopy_limit option is omitted from this code.
 */

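/*
 * Illustrative C sketch of the error-handler protocol described above.
 * This is not part of the build; curthread, t_lofault, LOFAULT_SET and
 * errno are the kernel-context names used in the pseudo-code, and
 * fault_handler stands in for the .sm_copyerr/.copyerr labels.
 *
 *	uintptr_t saved = curthread->t_lofault;	! save prior handler
 *	if (from_kcopy)
 *		saved |= LOFAULT_SET;		! tag: errno must be returned
 *	curthread->t_lofault = fault_handler;	! copy may fault from here on
 *	... do the copy ...
 *	curthread->t_lofault = saved & ~LOFAULT_SET;	! strip flag, restore
 */
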
/*
 * Less than or equal to this number of bytes we will always copy
 * byte-for-byte
 */
#define	SMALL_LIMIT	7

/*
 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
 * handler was set
 */
#define	LOFAULT_SET 2

/*
 * This macro aligns data for the unaligned source cases.
 * data1, data2 and data3 are merged into data1 and data2.
 * data3 is preserved for the next merge.
 */
#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1	;\
	srlx	data2, rshift, tmp	;\
	or	data1, tmp, data1	;\
	sllx	data2, lshift, data2	;\
	srlx	data3, rshift, tmp	;\
	or	data2, tmp, data2
/*
 * This macro is to align the data. Basically it merges
 * data1 and data2 to form a double word.
 */
#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1	;\
	srlx	data2, rshift, tmp	;\
	or	data1, tmp, data1

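/*
 * A minimal C sketch of the shift/or merge the two macros above perform
 * (illustrative only, not part of the build). It assumes big-endian
 * 64-bit words and 0 < lshift < 64 with lshift + rshift == 64; a C shift
 * by 64 would be undefined, a restriction the assembly form does not have.
 *
 *	#include <stdint.h>
 *
 *	uint64_t
 *	align_ew(uint64_t data1, uint64_t data2, int lshift)
 *	{
 *		return ((data1 << lshift) | (data2 >> (64 - lshift)));
 *	}
 */
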
#if !defined(NIAGARA_IMPL)
/*
 * Flags set in the lower bits of the t_lofault address:
 * FPUSED_FLAG: The FP registers were in use and must be restored
 * LOFAULT_SET: Set for bcopy calls, cleared for kcopy calls
 * COPY_FLAGS: Both of the above
 *
 * Other flags:
 * KPREEMPT_FLAG: kpreempt needs to be called
 */
#define	FPUSED_FLAG	1
#define	LOFAULT_SET	2
#define	COPY_FLAGS	(FPUSED_FLAG | LOFAULT_SET)
#define	KPREEMPT_FLAG	4

#define	ALIGN_OFF_1_7			\
	faligndata %d0, %d2, %d48	;\
	faligndata %d2, %d4, %d50	;\
	faligndata %d4, %d6, %d52	;\
	faligndata %d6, %d8, %d54	;\
	faligndata %d8, %d10, %d56	;\
	faligndata %d10, %d12, %d58	;\
	faligndata %d12, %d14, %d60	;\
	faligndata %d14, %d16, %d62

#define	ALIGN_OFF_8_15			\
	faligndata %d2, %d4, %d48	;\
	faligndata %d4, %d6, %d50	;\
	faligndata %d6, %d8, %d52	;\
	faligndata %d8, %d10, %d54	;\
	faligndata %d10, %d12, %d56	;\
	faligndata %d12, %d14, %d58	;\
	faligndata %d14, %d16, %d60	;\
	faligndata %d16, %d18, %d62

#define	ALIGN_OFF_16_23			\
	faligndata %d4, %d6, %d48	;\
	faligndata %d6, %d8, %d50	;\
	faligndata %d8, %d10, %d52	;\
	faligndata %d10, %d12, %d54	;\
	faligndata %d12, %d14, %d56	;\
	faligndata %d14, %d16, %d58	;\
	faligndata %d16, %d18, %d60	;\
	faligndata %d18, %d20, %d62

#define	ALIGN_OFF_24_31			\
	faligndata %d6, %d8, %d48	;\
	faligndata %d8, %d10, %d50	;\
	faligndata %d10, %d12, %d52	;\
	faligndata %d12, %d14, %d54	;\
	faligndata %d14, %d16, %d56	;\
	faligndata %d16, %d18, %d58	;\
	faligndata %d18, %d20, %d60	;\
	faligndata %d20, %d22, %d62

#define	ALIGN_OFF_32_39			\
	faligndata %d8, %d10, %d48	;\
	faligndata %d10, %d12, %d50	;\
	faligndata %d12, %d14, %d52	;\
	faligndata %d14, %d16, %d54	;\
	faligndata %d16, %d18, %d56	;\
	faligndata %d18, %d20, %d58	;\
	faligndata %d20, %d22, %d60	;\
	faligndata %d22, %d24, %d62

#define	ALIGN_OFF_40_47			\
	faligndata %d10, %d12, %d48	;\
	faligndata %d12, %d14, %d50	;\
	faligndata %d14, %d16, %d52	;\
	faligndata %d16, %d18, %d54	;\
	faligndata %d18, %d20, %d56	;\
	faligndata %d20, %d22, %d58	;\
	faligndata %d22, %d24, %d60	;\
	faligndata %d24, %d26, %d62

#define	ALIGN_OFF_48_55			\
	faligndata %d12, %d14, %d48	;\
	faligndata %d14, %d16, %d50	;\
	faligndata %d16, %d18, %d52	;\
	faligndata %d18, %d20, %d54	;\
	faligndata %d20, %d22, %d56	;\
	faligndata %d22, %d24, %d58	;\
	faligndata %d24, %d26, %d60	;\
	faligndata %d26, %d28, %d62

#define	ALIGN_OFF_56_63			\
	faligndata %d14, %d16, %d48	;\
	faligndata %d16, %d18, %d50	;\
	faligndata %d18, %d20, %d52	;\
	faligndata %d20, %d22, %d54	;\
	faligndata %d22, %d24, %d56	;\
	faligndata %d24, %d26, %d58	;\
	faligndata %d26, %d28, %d60	;\
	faligndata %d28, %d30, %d62

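/*
 * The eight ALIGN_OFF_* variants above differ only in which register pair
 * the output stream starts from: each extra 8 bytes of source offset
 * shifts the faligndata chain down by one double. A hedged C sketch of
 * how a variant corresponds to the source offset within its 64-byte block
 * (offset 0 takes the fully aligned path and never reaches these macros):
 *
 *	int variant = ((uintptr_t)src & 0x3f) >> 3;	! 0..7
 *	! variant 0 (offsets 1-7)  -> ALIGN_OFF_1_7
 *	! variant 1 (offsets 8-15) -> ALIGN_OFF_8_15, and so on
 */
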
/*
 * FP_COPY indicates the minimum number of bytes needed
 * to justify using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry. Check that code carefully before
 * reducing FP_COPY below 256.
 */
#define	FP_COPY			584
#define	SHORTCOPY		7
#define	ASI_STBI_P		ASI_BLK_INIT_ST_QUAD_LDD_P
#define	ASI_STBI_AIUS		ASI_BLK_INIT_QUAD_LDD_AIUS
#define	CACHE_LINE		64
#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use three quadrants of fp registers; to assure a
 * block-aligned three block buffer in which to save we must reserve
 * four blocks on stack.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 3 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 4 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
#define	SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 4)
#define	SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 3) + 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)

/*
 * In FP copies if we do not have preserved data to restore over
 * the fp regs we used then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 */
#define	FZERO				\
	fzero	%f0			;\
	fzero	%f2			;\
	faddd	%f0, %f2, %f4		;\
	fmuld	%f0, %f2, %f6		;\
	faddd	%f0, %f2, %f8		;\
	fmuld	%f0, %f2, %f10		;\
	faddd	%f0, %f2, %f12		;\
	fmuld	%f0, %f2, %f14		;\
	faddd	%f0, %f2, %f16		;\
	fmuld	%f0, %f2, %f18		;\
	faddd	%f0, %f2, %f20		;\
	fmuld	%f0, %f2, %f22		;\
	faddd	%f0, %f2, %f24		;\
	fmuld	%f0, %f2, %f26		;\
	faddd	%f0, %f2, %f28		;\
	fmuld	%f0, %f2, %f30		;\
	faddd	%f0, %f2, %f48		;\
	fmuld	%f0, %f2, %f50		;\
	faddd	%f0, %f2, %f52		;\
	fmuld	%f0, %f2, %f54		;\
	faddd	%f0, %f2, %f56		;\
	fmuld	%f0, %f2, %f58		;\
	faddd	%f0, %f2, %f60		;\
	fmuld	%f0, %f2, %f62

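/*
 * A hedged C sketch of the save-area address computation performed by the
 * BST/BLD macros below (illustrative only, not part of the build): the
 * 4 * VIS_BLOCKSIZE area reserved above guarantees a 3-block region can
 * always be carved out of it at 64-byte alignment.
 *
 *	#include <stdint.h>
 *
 *	uint8_t *
 *	fp_save_area(uint8_t *fp)	! %fp from the copy's register window
 *	{
 *		uintptr_t a = (uintptr_t)fp + STACK_BIAS - SAVED_FPREGS_ADJUST;
 *		return ((uint8_t *)(a & -(uintptr_t)VIS_BLOCKSIZE));
 *	}
 */
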
/*
 * Macros to save and restore fp registers to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP.
 */
#define	BST_FP_TOSTACK(tmp1)	\
	/* membar #Sync	*/	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P	;\
	add	tmp1, VIS_BLOCKSIZE, tmp1	;\
	stda	%f16, [tmp1]ASI_BLK_P	;\
	add	tmp1, VIS_BLOCKSIZE, tmp1	;\
	stda	%f48, [tmp1]ASI_BLK_P	;\
	membar	#Sync

#define	BLD_FP_FROMSTACK(tmp1)	\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0	;\
	add	tmp1, VIS_BLOCKSIZE, tmp1	;\
	ldda	[tmp1]ASI_BLK_P, %f16	;\
	add	tmp1, VIS_BLOCKSIZE, tmp1	;\
	ldda	[tmp1]ASI_BLK_P, %f48	;\
	membar	#Sync

#endif
/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok
 */

	.seg	".text"
	.align	4

	ENTRY(kcopy)
#if !defined(NIAGARA_IMPL)
	cmp	%o2, FP_COPY			! check for small copy/leaf case
	bgt,pt	%ncc, .kcopy_more		!
	nop
.kcopy_small:					! setup error handler
	sethi	%hi(.sm_copyerr), %o4
	or	%o4, %lo(.sm_copyerr), %o4	! .sm_copyerr is lofault value
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	! Note that we carefully do *not* flag the setting of
	! t_lofault.
	membar	#Sync				! sync error barrier
	b	.sm_do_copy			! common code
	stn	%o4, [THREAD_REG + T_LOFAULT]	! set t_lofault


.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	! Note that we carefully do *not* flag the setting of
	! t_lofault.
	membar	#Sync				! sync error barrier
	b	.do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

/*
 * We got here because of a fault during a small kcopy, or during a small
 * bcopy if a fault handler existed when bcopy was called.
 * No floating point registers are used by the small copies.
 * Small copies are from a leaf routine.
 * Errno value is in %g1.
 */
.sm_copyerr:
	! The kcopy will always set a t_lofault handler. If it fires,
	! we're expected to just return the error code and not to
	! invoke any existing error handler. As far as bcopy is concerned,
	! we only set t_lofault if there was an existing lofault handler.
	! In that case we're expected to invoke the previously existing
	! handler after resetting the t_lofault value.
	btst	LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	andn	%o5, LOFAULT_SET, %o5		! clear fault flag
	bnz,pn	%ncc, 3f
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
3:
	! We're here via bcopy. There must have been an error handler
	! in place otherwise we would have died a nasty death already.
	jmp	%o5				! goto real handler
	mov	%g0, %o0
/*
 * end of .sm_copyerr
 */

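/*
 * A hedged C rendering of the leaf-fault unwind just above (illustrative
 * only; saved stands for %o5 and errno for %g1). In the small-copy path
 * the flag bit doubles as the came-from-bcopy marker:
 *
 *	curthread->t_lofault = saved & ~LOFAULT_SET;	! restore old handler
 *	if ((saved & LOFAULT_SET) == 0)
 *		return (errno);		! small kcopy: report the fault
 *	goto *old_handler;		! small bcopy: chain to prior handler
 */
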
/*
 * We got here because of a fault during kcopy, or during bcopy if a fault
 * handler existed when bcopy was called.
 * stack and fp registers need to be restored
 * Errno value is in %g1.
 */
.copyerr:
	sethi	%hi(.copyerr2), %l1
	or	%l1, %lo(.copyerr2), %l1
	membar	#Sync				! sync error barrier
	stn	%l1, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %o5
	bz,pt	%xcc, 1f
	and	%o5, LOFAULT_SET, %l1		! copy flag to %l1

	membar	#Sync				! sync error barrier
	wr	%l5, 0, %gsr
	btst	FPRS_FEF, %g5
	bz,pt	%icc, 4f
	nop
	! restore fpregs from stack
	BLD_FP_FROMSTACK(%o2)
	ba,pt	%ncc, 2f
	wr	%g5, 0, %fprs			! restore fprs
4:
	FZERO
	wr	%g5, 0, %fprs			! restore fprs
2:
	ldn	[THREAD_REG + T_LWP], %o2
	brnz,pt	%o2, 1f
	nop

	ldsb	[THREAD_REG + T_PREEMPT], %l0
	deccc	%l0
	bnz,pn	%ncc, 1f
	stb	%l0, [THREAD_REG + T_PREEMPT]

	! Check for a kernel preemption request
	ldn	[THREAD_REG + T_CPU], %l0
	ldub	[%l0 + CPU_KPRUNRUN], %l0
	brnz,a,pt %l0, 1f			! Need to call kpreempt?
	or	%l1, KPREEMPT_FLAG, %l1		! If so, set the flag

	! The kcopy will always set a t_lofault handler. If it fires,
	! we're expected to just return the error code and not to
	! invoke any existing error handler. As far as bcopy is concerned,
	! we only set t_lofault if there was an existing lofault handler.
	! In that case we're expected to invoke the previously existing
	! handler after resetting the t_lofault value.
1:
	andn	%o5, COPY_FLAGS, %o5		! remove flags from lofault address
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault

	! call kpreempt if necessary
	btst	KPREEMPT_FLAG, %l1
	bz,pt	%icc, 2f
	nop
	call	kpreempt
	rdpr	%pil, %o0			! pass %pil
2:
	btst	LOFAULT_SET, %l1
	bnz,pn	%ncc, 3f
	nop
	ret
	restore	%g1, 0, %o0
3:
	! We're here via bcopy. There must have been an error handler
	! in place otherwise we would have died a nasty death already.
	jmp	%o5				! goto real handler
	restore	%g0, 0, %o0			! dispose of copy window

/*
 * We got here because of a fault in .copyerr. We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	nop
/*
 * end of .copyerr
 */

#else	/* NIAGARA_IMPL */
	save	%sp, -SA(MINFRAME), %sp
	set	.copyerr, %l7			! copyerr is lofault value
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	or	%o5, LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	b	.do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

/*
 * We got here because of a fault during kcopy.
 * Errno value is in %g1.
 */
.copyerr:
	! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
	! into %o5 to indicate it has set t_lofault handler. Need to clear
	! LOFAULT_SET flag before restoring the error handler.
	andn	%o5, LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ret
	restore	%g1, 0, %o0
#endif	/* NIAGARA_IMPL */

	SET_SIZE(kcopy)

/*
 * Copy a block of storage - must not overlap (from + len <= to).
 */

	ENTRY(bcopy)
#if !defined(NIAGARA_IMPL)
	cmp	%o2, FP_COPY			! check for small copy/leaf case
	bgt,pt	%ncc, .bcopy_more		!
	nop
.bcopy_small:					! setup error handler
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	tst	%o5
	bz,pt	%icc, .sm_do_copy
	sethi	%hi(.sm_copyerr), %o4
	or	%o4, %lo(.sm_copyerr), %o4	! .sm_copyerr is lofault value
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! set t_lofault
	or	%o5, LOFAULT_SET, %o5		! Error should trampoline
.sm_do_copy:
	mov	%o0, %g1			! save %o0
	cmp	%o2, SHORTCOPY			! make sure there is enough to align
	ble,pt	%ncc, .bc_smallest
	andcc	%o1, 0x7, %o3			! is dest long aligned
	bnz,pn	%ncc, .bc_align
	andcc	%o1, 1, %o3			! is dest byte aligned

	! Destination is long word aligned
.bc_al_src:
	andcc	%o0, 7, %o3
	brnz,pt	%o3, .bc_src_dst_unal8
	nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than FP_COPY bytes
 * Also handles finish up for large block moves, so may be less than 32 bytes
 */
.bc_medlong:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medl31
	nop
.bc_medl32:
	ldx	[%o0], %o4		! move 32 bytes
	subcc	%o2, 32, %o2		! decrement length count by 32
	stx	%o4, [%o1]
	ldx	[%o0+8], %o4
	stx	%o4, [%o1+8]
	ldx	[%o0+16], %o4
	add	%o0, 32, %o0		! increase src ptr by 32
	stx	%o4, [%o1+16]
	ldx	[%o0-8], %o4
	add	%o1, 32, %o1		! increase dst ptr by 32
	bgu,pt	%ncc, .bc_medl32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]
.bc_medl31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medl7		! skip if 7 or fewer bytes left
	nop
.bc_medl8:
	ldx	[%o0], %o4		! move 8 bytes
	add	%o0, 8, %o0		! increase src ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	add	%o1, 8, %o1		! increase dst ptr by 8
	bgu,pt	%ncc, .bc_medl8
	stx	%o4, [%o1-8]
.bc_medl7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bnz,pt	%ncc, .bc_small4	! do final bytes if not finished

.bc_smallx:				! finish up and exit
	tst	%o5
	bz,pt	%ncc, .bc_sm_done
	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
.bc_sm_done:
	retl
	mov	%g0, %o0

.bc_small4:
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	ld	[%o0], %o4		! move 4 bytes
	add	%o0, 4, %o0		! increase src ptr by 4
	add	%o1, 4, %o1		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bz,pt	%ncc, .bc_smallx
	stw	%o4, [%o1-4]

.bc_small3x:				! Exactly 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2		! reduce count for cc test
	ldub	[%o0], %o4		! load one byte
	bz,pt	%ncc, .bc_smallx
	stb	%o4, [%o1]		! store one byte
	ldub	[%o0+1], %o4		! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%ncc, .bc_smallx
	stb	%o4, [%o1+1]		! store second byte
	ldub	[%o0+2], %o4		! load third byte
	ba	.bc_smallx
	stb	%o4, [%o1+2]		! store third byte

.bc_smallest:				! 7 or fewer bytes remain
	tst	%o2
	bz,pt	%ncc, .bc_smallx
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x
	nop
	ldub	[%o0], %o4		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o4, [%o1]		! write byte
	ldub	[%o0+1], %o4		! repeat for total of 4 bytes
	add	%o0, 4, %o0		! advance src by 4
	stb	%o4, [%o1+1]
	ldub	[%o0-2], %o4
	add	%o1, 4, %o1		! advance dst by 4
	stb	%o4, [%o1-2]
	ldub	[%o0-1], %o4
	bnz,pt	%ncc, .bc_small3x
	stb	%o4, [%o1-1]
	ba	.bc_smallx
	nop

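/*
 * A hedged C equivalent of the .bc_medl32 loop above (illustrative only,
 * not part of the build): with both pointers 8-byte aligned, four ldx/stx
 * pairs move 32 bytes per iteration, and the count is biased so a single
 * flag-setting subtract both steps and tests the loop.
 *
 *	#include <stdint.h>
 *
 *	void
 *	medl32(uint64_t *d, const uint64_t *s, long n)	! n >= 32 on entry
 *	{
 *		for (; n >= 32; n -= 32, s += 4, d += 4) {
 *			d[0] = s[0];
 *			d[1] = s[1];
 *			d[2] = s[2];
 *			d[3] = s[3];
 *		}
 *	}
 */
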
/*
 * Align destination to long word boundary
 */
.bc_align:				! byte align test in prior branch delay
	bnz,pt	%ncc, .bc_al_d1
.bc_al_d1f:				! dest is now half word aligned
	andcc	%o1, 2, %o3
	bnz,pt	%ncc, .bc_al_d2
.bc_al_d2f:				! dest is now word aligned
	andcc	%o1, 4, %o3		! is dest longword aligned?
	bz,pt	%ncc, .bc_al_src
	nop
.bc_al_d4:				! dest is word aligned; src is unknown
	ldub	[%o0], %o4		! move a word (src align unknown)
	ldub	[%o0+1], %o3
	sll	%o4, 24, %o4		! position
	sll	%o3, 16, %o3		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%o0+2], %o4
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%o0+3], %o4
	or	%o4, %o3, %o4		! merge
	stw	%o4, [%o1]		! store four bytes
	add	%o0, 4, %o0		! adjust src by 4
	add	%o1, 4, %o1		! adjust dest by 4
	sub	%o2, 4, %o2		! adjust count by 4
	andcc	%o0, 7, %o3		! check for src long word alignment
	brz,pt	%o3, .bc_medlong
.bc_src_dst_unal8:
	! dst is 8-byte aligned, src is not
	! Size is less than FP_COPY
	! Following code is to select for alignment
	andcc	%o0, 0x3, %o3		! test word alignment
	bz,pt	%ncc, .bc_medword
	nop
	andcc	%o0, 0x1, %o3		! test halfword alignment
	bnz,pt	%ncc, .bc_med_byte	! go to byte move if not halfword
	andcc	%o0, 0x2, %o3		! test which byte alignment
	ba	.bc_medhalf
	nop
.bc_al_d1:				! align dest to half word
	ldub	[%o0], %o4		! move a byte
	add	%o0, 1, %o0
	stb	%o4, [%o1]
	add	%o1, 1, %o1
	andcc	%o1, 2, %o3
	bz,pt	%ncc, .bc_al_d2f
	sub	%o2, 1, %o2
.bc_al_d2:				! align dest to word
	ldub	[%o0], %o4		! move a half-word (src align unknown)
	ldub	[%o0+1], %o3
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o4		! merge
	sth	%o4, [%o1]
	add	%o0, 2, %o0
	add	%o1, 2, %o1
	andcc	%o1, 4, %o3		! is dest longword aligned?
	bz,pt	%ncc, .bc_al_src
	sub	%o2, 2, %o2
	ba	.bc_al_d4
	nop
/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination are in cache for medium
 * to short data moves.
 */
.bc_medword:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medw31
	nop
.bc_medw32:
	ld	[%o0], %o4		! move a block of 32 bytes
	stw	%o4, [%o1]
	ld	[%o0+4], %o4
	stw	%o4, [%o1+4]
	ld	[%o0+8], %o4
	stw	%o4, [%o1+8]
	ld	[%o0+12], %o4
	stw	%o4, [%o1+12]
	ld	[%o0+16], %o4
	stw	%o4, [%o1+16]
	ld	[%o0+20], %o4
	subcc	%o2, 32, %o2		! decrement length count
	stw	%o4, [%o1+20]
	ld	[%o0+24], %o4
	add	%o0, 32, %o0		! increase src ptr by 32
	stw	%o4, [%o1+24]
	ld	[%o0-4], %o4
	add	%o1, 32, %o1		! increase dst ptr by 32
	bgu,pt	%ncc, .bc_medw32	! repeat if at least 32 bytes left
	stw	%o4, [%o1-4]
.bc_medw31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medw7		! skip if 7 or fewer bytes left
	nop				!
.bc_medw15:
	ld	[%o0], %o4		! move a block of 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	stw	%o4, [%o1]
	add	%o0, 8, %o0		! increase src ptr by 8
	ld	[%o0-4], %o4
	add	%o1, 8, %o1		! increase dst ptr by 8
	bgu,pt	%ncc, .bc_medw15
	stw	%o4, [%o1-4]
.bc_medw7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	ld	[%o0], %o4		! move 4 bytes
	add	%o0, 4, %o0		! increase src ptr by 4
	add	%o1, 4, %o1		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.bc_small3x
	stw	%o4, [%o1-4]
	ba	.bc_smallx
	nop

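/*
 * A hedged C sketch of the byte-merge in .bc_al_d4 above (illustrative
 * only): with the destination word aligned but the source alignment
 * unknown, four byte loads are shifted and ORed into one big-endian word
 * so a single aligned stw can be issued.
 *
 *	#include <stdint.h>
 *
 *	void
 *	store_word(uint32_t *d, const uint8_t *s)	! d is 4-byte aligned
 *	{
 *		*d = ((uint32_t)s[0] << 24) | ((uint32_t)s[1] << 16) |
 *		    ((uint32_t)s[2] << 8) | s[3];
 *	}
 */
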
.bc_medhalf:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medh31
	nop
.bc_medh32:				! load and store block of 32 bytes
	subcc	%o2, 32, %o2		! decrement length count

	lduh	[%o0], %o4		! move 32 bytes
	lduw	[%o0+2], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0+6], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	lduh	[%o0+8], %o4
	lduw	[%o0+10], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0+14], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	lduh	[%o0+16], %o4
	lduw	[%o0+18], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0+22], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	lduh	[%o0-8], %o4
	lduw	[%o0-6], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0-2], %o4
	or	%o3, %o4, %o4
	bgu,pt	%ncc, .bc_medh32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.bc_medh31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medh7		! skip if 7 or fewer bytes left
	nop				!
.bc_medh15:
	lduh	[%o0], %o4		! move 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	lduw	[%o0+2], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	add	%o1, 8, %o1		! increase dst ptr by 8
	lduh	[%o0+6], %o4
	add	%o0, 8, %o0		! increase src ptr by 8
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medh15
	stx	%o4, [%o1-8]
.bc_medh7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	lduh	[%o0], %o4
	sll	%o4, 16, %o4
	lduh	[%o0+2], %o3
	or	%o3, %o4, %o4
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	add	%o1, 4, %o1
	bnz	.bc_small3x
	stw	%o4, [%o1-4]
	ba	.bc_smallx
	nop

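/*
 * A hedged C sketch of one 8-byte step of .bc_medh32/.bc_medh15 above
 * (illustrative only): the source is halfword aligned but not word
 * aligned, i.e. (uintptr_t)s % 4 == 2, so the inner four bytes at s + 2
 * are word aligned and a half/word/half load triple can be packed into
 * one big-endian 64-bit store.
 *
 *	#include <stdint.h>
 *
 *	void
 *	store_long_h(uint64_t *d, const uint16_t *s)	! d 8-byte aligned
 *	{
 *		*d = ((uint64_t)s[0] << 48) |
 *		    ((uint64_t)*(const uint32_t *)(s + 1) << 16) |
 *		    s[3];
 *	}
 */
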
	.align	16
.bc_med_byte:
	bnz,pt	%ncc, .bc_medbh32a	! go to correct byte move
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medb31
	nop
.bc_medb32:				! Alignment 1 or 5
	subcc	%o2, 32, %o2		! decrement length count

	ldub	[%o0], %o4		! load and store a block of 32 bytes
	sllx	%o4, 56, %o3
	lduh	[%o0+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+3], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	ldub	[%o0+8], %o4
	sllx	%o4, 56, %o3
	lduh	[%o0+9], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+11], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+15], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	ldub	[%o0+16], %o4
	sllx	%o4, 56, %o3
	lduh	[%o0+17], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+19], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+23], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	ldub	[%o0-8], %o4
	sllx	%o4, 56, %o3
	lduh	[%o0-7], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0-5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0-1], %o4
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medb32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.bc_medb31:				! 31 or fewer bytes remaining
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medb7		! skip if 7 or fewer bytes left
	nop				!
.bc_medb15:

	ldub	[%o0], %o4		! load and store a block of 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	sllx	%o4, 56, %o3
	lduh	[%o0+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+3], %o4
	add	%o1, 8, %o1		! increase dst ptr by 8
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	add	%o0, 8, %o0		! increase src ptr by 8
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medb15
	stx	%o4, [%o1-8]
.bc_medb7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	ldub	[%o0], %o4		! move 4 bytes
	sll	%o4, 24, %o3
	lduh	[%o0+1], %o4
	sll	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+3], %o4
	or	%o4, %o3, %o4
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	add	%o1, 4, %o1
	bnz	.bc_small3x
	stw	%o4, [%o1-4]
	ba	.bc_smallx
	nop

	.align	16
.bc_medbh32a:				! Alignment 3 or 7
	ble,pt	%ncc, .bc_medbh31
	nop
.bc_medbh32:				! Alignment 3 or 7
	subcc	%o2, 32, %o2		! decrement length count

	ldub	[%o0], %o4		! load and store a block of 32 bytes
	sllx	%o4, 56, %o3
	lduw	[%o0+1], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	ldub	[%o0+8], %o4
	sllx	%o4, 56, %o3
	lduw	[%o0+9], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+13], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+15], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	ldub	[%o0+16], %o4
	sllx	%o4, 56, %o3
	lduw	[%o0+17], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+21], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+23], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	ldub	[%o0-8], %o4
	sllx	%o4, 56, %o3
	lduw	[%o0-7], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0-3], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0-1], %o4
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medbh32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.bc_medbh31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medb7		! skip if 7 or fewer bytes left
	nop				!
.bc_medbh15:
	ldub	[%o0], %o4		! load and store a block of 8 bytes
	sllx	%o4, 56, %o3
	lduw	[%o0+1], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]
	subcc	%o2, 8, %o2		! decrement length count
	add	%o1, 8, %o1		! increase dst ptr by 8
	add	%o0, 8, %o0		! increase src ptr by 8
	bgu,pt	%ncc, .bc_medbh15
	stx	%o4, [%o1-8]
	ba	.bc_medb7
	nop

	SET_SIZE(bcopy)
/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file. They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 */
	ENTRY(bcopy_more)
.bcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	brz,pt	%o5, .do_copy
	nop
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	membar	#Sync				! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path. If LOFAULT_SET is
	! set and the saved lofault was zero, we won't reset lofault on
	! returning.
	or	%o5, LOFAULT_SET, %o5
.do_copy:
	ldn	[THREAD_REG + T_LWP], %o3
	brnz,pt	%o3, 1f
	nop
	/*
	 * kpreempt_disable();
	 */
	ldsb	[THREAD_REG + T_PREEMPT], %o3
	inc	%o3
	stb	%o3, [THREAD_REG + T_PREEMPT]
1:
	/*
	 * Following code is for large copies. We know there is at
	 * least FP_COPY bytes available. FP regs are used, so
	 * we save registers and fp regs before starting
	 */
	rd	%fprs, %g5			! check for unused fp
	or	%o5, FPUSED_FLAG, %o5
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g5, FPRS_FEF, %g5		! test FEF, fprs.du = fprs.dl = 0
	bz,pt	%ncc, .bc_fp_unused
	prefetch [%i0 + (1 * CACHE_LINE)], #one_read
	BST_FP_TOSTACK(%o3)
	ba	.bc_fp_ready
.bc_fp_unused:
	andcc	%i1, 1, %o3			! is dest byte aligned
	wr	%g0, FPRS_FEF, %fprs		! fprs.fef = 1
.bc_fp_ready:
	rd	%gsr, %l5			! save %gsr value
	bnz,pt	%ncc, .bc_big_d1
.bc_big_d1f:					! dest is now half word aligned
	andcc	%i1, 2, %o3
	bnz,pt	%ncc, .bc_big_d2
.bc_big_d2f:					! dest is now word aligned
	andcc	%i1, 4, %o3
	bnz,pt	%ncc, .bc_big_d4
.bc_big_d4f:					! dest is now long word aligned
	andcc	%i0, 7, %o3			! is src long word aligned
	brnz,pt	%o3, .bc_big_unal8
	prefetch [%i0 + (2 * CACHE_LINE)], #one_read

	! Src and dst are long word aligned
	! align dst to 64 byte boundary
	andcc	%i1, 0x3f, %o3			! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .bc_al_to_64
	nop
	sub	%o3, 64, %o3			! %o3 has negative bytes to move
	add	%i2, %o3, %i2			! adjust remaining count
	andcc	%o3, 8, %o4			! odd long words to move?
	brz,pt	%o4, .bc_al_to_16
	nop
	add	%o3, 8, %o3
	ldx	[%i0], %o4
	add	%i0, 8, %i0			! increment src ptr
	add	%i1, 8, %i1			! increment dst ptr
	stx	%o4, [%i1-8]
	! Dest is aligned on 16 bytes, src 8 byte aligned
.bc_al_to_16:
	andcc	%o3, 0x30, %o4			! pair of long words to move?
	brz,pt	%o4, .bc_al_to_64
	nop
.bc_al_mv_16:
	add	%o3, 16, %o3
	ldx	[%i0], %o4
	stx	%o4, [%i1]
	ldx	[%i0+8], %o4
	add	%i0, 16, %i0			! increment src ptr
	stx	%o4, [%i1+8]
	andcc	%o3, 48, %o4
	brnz,pt	%o4, .bc_al_mv_16
	add	%i1, 16, %i1			! increment dst ptr

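/*
 * A hedged C sketch of the FP-state decision just made above (illustrative
 * only; rd_fprs()/wr_fprs() stand in for the rd/wr %fprs instructions and
 * bst_fp_to_stack() for BST_FP_TOSTACK). %g5 keeps the original FPRS_FEF
 * bit so the exit path knows whether to reload the saved image or FZERO
 * the registers.
 *
 *	fprs = rd_fprs();
 *	flags |= FPUSED_FLAG;		! error path must fix FP state
 *	if (fprs & FPRS_FEF)
 *		bst_fp_to_stack();	! FP live: preserve 3 quadrants
 *	else
 *		wr_fprs(FPRS_FEF);	! FP idle: just enable it
 */
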
	! Dest is aligned on 64 bytes, src 8 byte aligned
.bc_al_to_64:
	! Determine source alignment
	! to correct 8 byte offset
	andcc	%i0, 32, %o3
	brnz,pn	%o3, .bc_aln_1
	andcc	%i0, 16, %o3
	brnz,pn	%o3, .bc_aln_01
	andcc	%i0, 8, %o3
	brz,pn	%o3, .bc_aln_000
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
	ba	.bc_aln_001
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read

.bc_aln_01:
	brnz,pn	%o3, .bc_aln_011
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
	ba	.bc_aln_010
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_1:
	andcc	%i0, 16, %o3
	brnz,pn	%o3, .bc_aln_11
	andcc	%i0, 8, %o3
	brnz,pn	%o3, .bc_aln_101
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
	ba	.bc_aln_100
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_11:
	brz,pn	%o3, .bc_aln_110
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read

.bc_aln_111:
	! Alignment off by 8 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	add	%i0, 8, %i0
	sub	%i2, 8, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_111_loop:
	ldda	[%i0]ASI_BLK_P, %d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d30, %d0
	bgt,pt	%ncc, .bc_aln_111_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	ba	.bc_remain_stuff
	add	%i1, 8, %i1
	! END OF aln_111

.bc_aln_110:
	! Alignment off by 16 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	add	%i0, 16, %i0
	sub	%i2, 16, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_110_loop:
	ldda	[%i0]ASI_BLK_P, %d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d28, %d0
	fmovd	%d30, %d2
	bgt,pt	%ncc, .bc_aln_110_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	ba	.bc_remain_stuff
	add	%i1, 16, %i1
	! END OF aln_110

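/*
 * A hedged C model of the software pipeline in the .bc_aln_* loops, shown
 * for the .bc_aln_110 shape (illustrative only; memcpy stands in for the
 * 64-byte block load/store). The source runs 16 bytes ahead of a 64-byte
 * boundary, so each iteration emits two carried doubles plus six fresh
 * ones and carries the last two forward; the drain after the loop matches
 * the trailing std pair.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	void
 *	aln_110(uint64_t *d, const uint64_t *s, long nblk)
 *	{
 *		uint64_t c0 = s[0], c1 = s[1], blk[8];	! preload 16 bytes
 *		for (s += 2; nblk-- > 0; s += 8, d += 8) {
 *			memcpy(blk, s, 64);		! block load
 *			d[0] = c0;			! fmovd rotation
 *			d[1] = c1;
 *			memcpy(d + 2, blk, 48);		! block store
 *			c0 = blk[6];
 *			c1 = blk[7];
 *		}
 *		d[0] = c0;				! drain the pipeline
 *		d[1] = c1;
 *	}
 */
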
.bc_aln_101:
	! Alignment off by 24 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	add	%i0, 24, %i0
	sub	%i2, 24, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_101_loop:
	ldda	[%i0]ASI_BLK_P, %d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	fmovd	%d30, %d4
	bgt,pt	%ncc, .bc_aln_101_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	ba	.bc_remain_stuff
	add	%i1, 24, %i1
	! END OF aln_101

.bc_aln_100:
	! Alignment off by 32 bytes
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	ldd	[%i0+24], %d6
	add	%i0, 32, %i0
	sub	%i2, 32, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_100_loop:
	ldda	[%i0]ASI_BLK_P, %d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	fmovd	%d30, %d6
	bgt,pt	%ncc, .bc_aln_100_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	ba	.bc_remain_stuff
	add	%i1, 32, %i1
	! END OF aln_100

.bc_aln_011:
	! Alignment off by 40 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	ldd	[%i0+24], %d6
	ldd	[%i0+32], %d8
	add	%i0, 40, %i0
	sub	%i2, 40, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_011_loop:
	ldda	[%i0]ASI_BLK_P, %d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	fmovd	%d30, %d8
	bgt,pt	%ncc, .bc_aln_011_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	std	%d8, [%i1+32]
	ba	.bc_remain_stuff
	add	%i1, 40, %i1
	! END OF aln_011

.bc_aln_010:
	! Alignment off by 48 bytes
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	ldd	[%i0+24], %d6
	ldd	[%i0+32], %d8
	ldd	[%i0+40], %d10
	add	%i0, 48, %i0
	sub	%i2, 48, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_010_loop:
	ldda	[%i0]ASI_BLK_P, %d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	fmovd	%d30, %d10
	bgt,pt	%ncc, .bc_aln_010_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	std	%d8, [%i1+32]
	std	%d10, [%i1+40]
	ba	.bc_remain_stuff
	add	%i1, 48, %i1
	! END OF aln_010

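/*
 * Design note, with a hedged C sketch (illustrative only): each loop above
 * first rewrites the destination as "sub %i1, %i0, %i1", so only the
 * source pointer is advanced in the loop and every store addresses
 * [%i0 + %i1]. Keeping one induction variable saves an add per iteration
 * and frees the delay slots for prefetches. Assumes both pointers 8-byte
 * aligned and n a multiple of 8:
 *
 *	#include <stdint.h>
 *
 *	void
 *	delta_copy(uint8_t *dst, const uint8_t *src, long n)
 *	{
 *		uintptr_t delta = (uintptr_t)dst - (uintptr_t)src;
 *		for (; n > 0; n -= 8, src += 8)		! one counter for both
 *			*(uint64_t *)((uintptr_t)src + delta) =
 *			    *(const uint64_t *)src;
 *	}
 */
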
.bc_aln_001:
	! Alignment off by 56 bytes
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	ldd	[%i0+24], %d6
	ldd	[%i0+32], %d8
	ldd	[%i0+40], %d10
	ldd	[%i0+48], %d12
	add	%i0, 56, %i0
	sub	%i2, 56, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_001_loop:
	ldda	[%i0]ASI_BLK_P, %d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d14
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	fmovd	%d30, %d12
	bgt,pt	%ncc, .bc_aln_001_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	std	%d8, [%i1+32]
	std	%d10, [%i1+40]
	std	%d12, [%i1+48]
	ba	.bc_remain_stuff
	add	%i1, 56, %i1
	! END OF aln_001

.bc_aln_000:
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_000_loop:
	ldda	[%i0]ASI_BLK_P, %d0
	subcc	%o3, 64, %o3
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	bgt,pt	%ncc, .bc_aln_000_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	! END OF aln_000

.bc_remain_stuff:
	subcc	%i2, 31, %i2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_aln_31
	nop
.bc_aln_32:
	ldx	[%i0], %o4		! move 32 bytes
	subcc	%i2, 32, %i2		! decrement length count by 32
	stx	%o4, [%i1]
	ldx	[%i0+8], %o4
	stx	%o4, [%i1+8]
	ldx	[%i0+16], %o4
	add	%i0, 32, %i0		! increase src ptr by 32
	stx	%o4, [%i1+16]
	ldx	[%i0-8], %o4
	add	%i1, 32, %i1		! increase dst ptr by 32
	bgu,pt	%ncc, .bc_aln_32	! repeat if at least 32 bytes left
	stx	%o4, [%i1-8]
.bc_aln_31:
	addcc	%i2, 24, %i2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_aln_7		! skip if 7 or fewer bytes left
	nop				!
.bc_aln_15:
	ldx	[%i0], %o4		! move 8 bytes
	add	%i0, 8, %i0		! increase src ptr by 8
	subcc	%i2, 8, %i2		! decrease count by 8
	add	%i1, 8, %i1		! increase dst ptr by 8
	bgu,pt	%ncc, .bc_aln_15
	stx	%o4, [%i1-8]		!
.bc_aln_7:
	addcc	%i2, 7, %i2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_exit		! exit if finished
	cmp	%i2, 4
	blt,pt	%ncc, .bc_unaln3x	! skip if less than 4 bytes left
	nop				!
	ld	[%i0], %o4		! move 4 bytes
	add	%i0, 4, %i0		! increase src ptr by 4
	add	%i1, 4, %i1		! increase dst ptr by 4
	subcc	%i2, 4, %i2		! decrease count by 4
	bnz	.bc_unaln3x
	stw	%o4, [%i1-4]
	ba	.bc_exit
	nop

	! destination alignment code
.bc_big_d1:
	ldub	[%i0], %o4		! move a byte
	add	%i0, 1, %i0
	stb	%o4, [%i1]
	add	%i1, 1, %i1
	andcc	%i1, 2, %o3
	bz,pt	%ncc, .bc_big_d2f
	sub	%i2, 1, %i2
.bc_big_d2:
	ldub	[%i0], %o4		! move a half-word (src align unknown)
	ldub	[%i0+1], %o3
	add	%i0, 2, %i0
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o4		! merge
	sth	%o4, [%i1]
	add	%i1, 2, %i1
	andcc	%i1, 4, %o3
	bz,pt	%ncc, .bc_big_d4f
	sub	%i2, 2, %i2
.bc_big_d4:
	ldub	[%i0], %o4		! move a word (src align unknown)
	ldub	[%i0+1], %o3
	sll	%o4, 24, %o4		! position
	sll	%o3, 16, %o3		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%i0+2], %o4
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%i0+3], %o4
	or	%o4, %o3, %o4		! merge
	stw	%o4, [%i1]		! store four bytes
	add	%i0, 4, %i0		! adjust src by 4
	add	%i1, 4, %i1		! adjust dest by 4
	ba	.bc_big_d4f
	sub	%i2, 4, %i2		! adjust count by 4


	! Dst is on 8 byte boundary; src is not;
.bc_big_unal8:
	andcc	%i1, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%ncc, .bc_unalnsrc
	sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%i2, %o3, %i2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%i0, 0x1, %o4
	bnz	%ncc, .bc_unalnbyte	! check for byte alignment
	nop
	andcc	%i0, 2, %o4		! check for half word alignment
	bnz	%ncc, .bc_unalnhalf
	nop
	! Src is word aligned, move bytes until dest 64 byte aligned
.bc_unalnword:
	ld	[%i0], %o4		! load 4 bytes
	stw	%o4, [%i1]		! and store 4 bytes
	ld	[%i0+4], %o4		! load 4 bytes
	add	%i0, 8, %i0		! increase src ptr by 8
	stw	%o4, [%i1+4]		! and store 4 bytes
	subcc	%o3, 8, %o3		! decrease count by 8
	bnz	%ncc, .bc_unalnword
	add	%i1, 8, %i1		! increase dst ptr by 8
	ba	.bc_unalnsrc
	nop

	! Src is half-word aligned, move bytes until dest 64 byte aligned
.bc_unalnhalf:
	lduh	[%i0], %o4		! load 2 bytes
	sllx	%o4, 32, %i3		! shift left
	lduw	[%i0+2], %o4
	or	%o4, %i3, %i3
	sllx	%i3, 16, %i3
	lduh	[%i0+6], %o4
	or	%o4, %i3, %i3
	stx	%i3, [%i1]
	add	%i0, 8, %i0
	subcc	%o3, 8, %o3
	bnz	%ncc, .bc_unalnhalf
	add	%i1, 8, %i1
	ba	.bc_unalnsrc
	nop

	! Src is Byte aligned, move bytes until dest 64 byte aligned
.bc_unalnbyte:
	sub	%i1, %i0, %i1		! share pointer advance
.bc_unalnbyte_loop:
	ldub	[%i0], %o4
	sllx	%o4, 56, %i3
	lduh	[%i0+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %i3, %i3
	lduh	[%i0+3], %o4
	sllx	%o4, 24, %o4
	or	%o4, %i3, %i3
	lduh	[%i0+5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %i3, %i3
	ldub	[%i0+7], %o4
	or	%o4, %i3, %i3
	stx	%i3, [%i1+%i0]
	subcc	%o3, 8, %o3
	bnz	%ncc, .bc_unalnbyte_loop
	add	%i0, 8, %i0
	add	%i1, %i0, %i1		! restore pointer

	! Destination is now block (64 byte aligned), src is not 8 byte aligned
.bc_unalnsrc:
	andn	%i2, 0x3f, %i3		! %i3 is multiple of block size
	and	%i2, 0x3f, %i2		! residue bytes in %i2
	add	%i2, 64, %i2		! Ensure we don't load beyond
	sub	%i3, 64, %i3		! end of source buffer

	andn	%i0, 0x3f, %o4		! %o4 has block aligned src address
	prefetch [%o4 + (3 * CACHE_LINE)], #one_read
	alignaddr %i0, %g0, %g0		! generate %gsr
	add	%i0, %i3, %i0		! advance %i0 to after blocks

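/*
 * A hedged C model of what alignaddr/faligndata compute in the loops below
 * (illustrative only). alignaddr %i0, %g0, %g0 latches the low three bits
 * of the source address into %gsr.align; each faligndata then extracts the
 * 8 bytes starting at that byte offset from a 16-byte window formed by two
 * adjacent big-endian doublewords. Assumes 0 < off < 8, which the
 * unaligned path guarantees (off == 0 takes the aligned loops instead).
 *
 *	#include <stdint.h>
 *
 *	static int off;			! models %gsr.align
 *
 *	uint64_t
 *	falign(uint64_t hi, uint64_t lo)
 *	{
 *		return ((hi << (8 * off)) | (lo >> (64 - 8 * off)));
 *	}
 */
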
	!
	! Determine source alignment to correct 8 byte offset
	andcc	%i0, 0x20, %o3
	brnz,pn	%o3, .bc_unaln_1
	andcc	%i0, 0x10, %o3
	brnz,pn	%o3, .bc_unaln_01
	andcc	%i0, 0x08, %o3
	brz,a	%o3, .bc_unaln_000
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_001
	nop
.bc_unaln_01:
	brnz,a	%o3, .bc_unaln_011
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_010
	nop
.bc_unaln_1:
	brnz,pn	%o3, .bc_unaln_11
	andcc	%i0, 0x08, %o3
	brnz,a	%o3, .bc_unaln_101
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_100
	nop
.bc_unaln_11:
	brz,pn	%o3, .bc_unaln_110
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read

.bc_unaln_111:
	ldd	[%o4+56], %d14
.bc_unaln_111_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d14, %d16, %d48
	faligndata %d16, %d18, %d50
	faligndata %d18, %d20, %d52
	faligndata %d20, %d22, %d54
	faligndata %d22, %d24, %d56
	faligndata %d24, %d26, %d58
	faligndata %d26, %d28, %d60
	faligndata %d28, %d30, %d62
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_111_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_110:
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_110_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d12, %d14, %d48
	faligndata %d14, %d16, %d50
	faligndata %d16, %d18, %d52
	faligndata %d18, %d20, %d54
	faligndata %d20, %d22, %d56
	faligndata %d22, %d24, %d58
	faligndata %d24, %d26, %d60
	faligndata %d26, %d28, %d62
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_110_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_101:
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_101_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d10, %d12, %d48
	faligndata %d12, %d14, %d50
	faligndata %d14, %d16, %d52
	faligndata %d16, %d18, %d54
	faligndata %d18, %d20, %d56
	faligndata %d20, %d22, %d58
	faligndata %d22, %d24, %d60
	faligndata %d24, %d26, %d62
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_101_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_100:
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_100_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d8, %d10, %d48
	faligndata %d10, %d12, %d50
	faligndata %d12, %d14, %d52
	faligndata %d14, %d16, %d54
	faligndata %d16, %d18, %d56
	faligndata %d18, %d20, %d58
	faligndata %d20, %d22, %d60
	faligndata %d22, %d24, %d62
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_100_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_011:
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_011_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d6, %d8, %d48
	faligndata %d8, %d10, %d50
	faligndata %d10, %d12, %d52
	faligndata %d12, %d14, %d54
	faligndata %d14, %d16, %d56
	faligndata %d16, %d18, %d58
	faligndata %d18, %d20, %d60
	faligndata %d20, %d22, %d62
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_011_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_010:
	ldd	[%o4+16], %d4
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_010_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d4, %d6, %d48
	faligndata %d6, %d8, %d50
	faligndata %d8, %d10, %d52
	faligndata %d10, %d12, %d54
	faligndata %d12, %d14, %d56
	faligndata %d14, %d16, %d58
	faligndata %d16, %d18, %d60
	faligndata %d18, %d20, %d62
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_010_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_001:
	ldd	[%o4+8], %d2
	ldd	[%o4+16], %d4
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_001_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d2, %d4, %d48
	faligndata %d4, %d6, %d50
	faligndata %d6, %d8, %d52
	faligndata %d8, %d10, %d54
	faligndata %d10, %d12, %d56
	faligndata %d12, %d14, %d58
	faligndata %d14, %d16, %d60
	faligndata %d16, %d18, %d62
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_001_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_000:
	ldda	[%o4]ASI_BLK_P, %d0
.bc_unaln_000_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d0, %d2, %d48
	faligndata %d2, %d4, %d50
	faligndata %d4, %d6, %d52
	faligndata %d6, %d8, %d54
	faligndata %d8, %d10, %d56
	faligndata %d10, %d12, %d58
	faligndata %d12, %d14, %d60
	faligndata %d14, %d16, %d62
	fmovd	%d16, %d0
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_000_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read

advance %i0 to after multiple of 8 1923 ldd [%o4], %d0 ! fetch partial word 1924 .bc_unaln_by8: 1925 ldd [%o4+8], %d2 1926 add %o4, 8, %o4 1927 faligndata %d0, %d2, %d16 1928 subcc %i3, 8, %i3 1929 std %d16, [%i1] 1930 fmovd %d2, %d0 1931 bgu,pt %ncc, .bc_unaln_by8 1932 add %i1, 8, %i1 1933 1934 .bc_unaln_short: 1935 cmp %i2, 8 1936 blt,pt %ncc, .bc_unalnfin 1937 nop 1938 ldub [%i0], %o4 1939 sll %o4, 24, %o3 1940 ldub [%i0+1], %o4 1941 sll %o4, 16, %o4 1942 or %o4, %o3, %o3 1943 ldub [%i0+2], %o4 1944 sll %o4, 8, %o4 1945 or %o4, %o3, %o3 1946 ldub [%i0+3], %o4 1947 or %o4, %o3, %o3 1948 stw %o3, [%i1] 1949 ldub [%i0+4], %o4 1950 sll %o4, 24, %o3 1951 ldub [%i0+5], %o4 1952 sll %o4, 16, %o4 1953 or %o4, %o3, %o3 1954 ldub [%i0+6], %o4 1955 sll %o4, 8, %o4 1956 or %o4, %o3, %o3 1957 ldub [%i0+7], %o4 1958 or %o4, %o3, %o3 1959 stw %o3, [%i1+4] 1960 add %i0, 8, %i0 1961 add %i1, 8, %i1 1962 sub %i2, 8, %i2 1963 .bc_unalnfin: 1964 cmp %i2, 4 1965 blt,pt %ncc, .bc_unalnz 1966 tst %i2 1967 ldub [%i0], %o3 ! read byte 1968 subcc %i2, 4, %i2 ! reduce count by 4 1969 sll %o3, 24, %o3 ! position 1970 ldub [%i0+1], %o4 1971 sll %o4, 16, %o4 ! position 1972 or %o4, %o3, %o3 ! merge 1973 ldub [%i0+2], %o4 1974 sll %o4, 8, %o4 ! position 1975 or %o4, %o3, %o3 ! merge 1976 add %i1, 4, %i1 ! advance dst by 4 1977 ldub [%i0+3], %o4 1978 add %i0, 4, %i0 ! advance src by 4 1979 or %o4, %o3, %o4 ! merge 1980 bnz,pt %ncc, .bc_unaln3x 1981 stw %o4, [%i1-4] 1982 ba .bc_exit 1983 nop 1984 .bc_unalnz: 1985 bz,pt %ncc, .bc_exit 1986 .bc_unaln3x: ! Exactly 1, 2, or 3 bytes remain 1987 subcc %i2, 1, %i2 ! reduce count for cc test 1988 ldub [%i0], %o4 ! load one byte 1989 bz,pt %ncc, .bc_exit 1990 stb %o4, [%i1] ! store one byte 1991 ldub [%i0+1], %o4 ! load second byte 1992 subcc %i2, 1, %i2 1993 bz,pt %ncc, .bc_exit 1994 stb %o4, [%i1+1] ! store second byte 1995 ldub [%i0+2], %o4 ! load third byte 1996 stb %o4, [%i1+2] ! store third byte 1997 .bc_exit: 1998 wr %l5, %g0, %gsr ! restore %gsr 1999 brnz %g5, .bc_fp_restore 2000 and %o5, COPY_FLAGS, %l1 ! save flags in %l1 2001 FZERO 2002 wr %g5, %g0, %fprs 2003 ba,pt %ncc, .bc_ex2 2004 nop 2005 .bc_fp_restore: 2006 BLD_FP_FROMSTACK(%o4) 2007 .bc_ex2: 2008 ldn [THREAD_REG + T_LWP], %o2 2009 brnz,pt %o2, 1f 2010 nop 2011 2012 ldsb [THREAD_REG + T_PREEMPT], %l0 2013 deccc %l0 2014 bnz,pn %ncc, 1f 2015 stb %l0, [THREAD_REG + T_PREEMPT] 2016 2017 ! Check for a kernel preemption request 2018 ldn [THREAD_REG + T_CPU], %l0 2019 ldub [%l0 + CPU_KPRUNRUN], %l0 2020 brnz,a,pt %l0, 1f ! Need to call kpreempt? 2021 or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag 2022 1: 2023 btst LOFAULT_SET, %l1 2024 bz,pn %icc, 3f 2025 andncc %o5, COPY_FLAGS, %o5 2026 ! Here via bcopy. Check to see if the handler was NULL. 2027 ! If so, just return quietly. Otherwise, reset the 2028 ! handler and return. 2029 bz,pn %ncc, 2f 2030 nop 2031 membar #Sync 2032 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2033 2: 2034 btst KPREEMPT_FLAG, %l1 2035 bz,pt %icc, 3f 2036 nop 2037 call kpreempt 2038 rdpr %pil, %o0 ! pass %pil 2039 3: 2040 ret 2041 restore %g0, 0, %o0 2042 2043 SET_SIZE(bcopy_more) 2044 2045 2046 #else /* NIAGARA_IMPL */ 2047 save %sp, -SA(MINFRAME), %sp 2048 clr %o5 ! flag LOFAULT_SET is not set for bcopy 2049 .do_copy: 2050 cmp %i2, 12 ! for small counts 2051 blu %ncc, .bytecp ! just copy bytes 2052 .empty 2053 2054 cmp %i2, 128 ! for less than 128 bytes 2055 blu,pn %ncc, .bcb_punt ! 
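no block st/quad ld
!
! In rough C, the size/overlap dispatch around this point is (a sketch,
! not the exact code; use_hw_bcopy is the kernel tunable tested below):
!
!	if (count < 12)
!		goto bytecp;		! just copy bytes
!	if (count < 128 || use_hw_bcopy == 0 || abs(dst - src) < 256)
!		goto bcb_punt;		! plain ld/st copy loops
!	goto do_blockcopy;		! block init store / quad load
!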
2056 nop 2057 2058 set use_hw_bcopy, %o2 2059 ld [%o2], %o2 2060 brz,pn %o2, .bcb_punt 2061 nop 2062 2063 subcc %i1, %i0, %i3 2064 bneg,a,pn %ncc, 1f 2065 neg %i3 2066 1: 2067 /* 2068 * Compare against 256 since we should be checking block addresses 2069 * and (dest & ~63) - (src & ~63) can be 3 blocks even if 2070 * src = dest + (64 * 3) + 63; 256 or more guarantees at least 4 blocks. 2071 */ 2072 cmp %i3, 256 2073 blu,pn %ncc, .bcb_punt 2074 nop 2075 2076 /* 2077 * Copies that reach here have at least 2 blocks of data to copy. 2078 */ 2079 .do_blockcopy: 2080 ! Swap src/dst since the code below is memcpy code 2081 ! and memcpy/bcopy have different calling sequences 2082 mov %i1, %i5 2083 mov %i0, %i1 2084 mov %i5, %i0 2085 2086 ! Block (64 bytes) align the destination. 2087 andcc %i0, 0x3f, %i3 ! is dst aligned on a 64 byte boundary 2088 bz %xcc, .chksrc ! dst is already block aligned 2089 sub %i3, 0x40, %i3 2090 neg %i3 ! bytes till dst 64 bytes aligned 2091 sub %i2, %i3, %i2 ! update i2 with new count 2092 2093 ! Based on source and destination alignment do 2094 ! either an 8 byte, 4 byte, 2 byte or byte copy. 2095 2096 ! Is dst & src 8B aligned 2097 or %i0, %i1, %o2 2098 andcc %o2, 0x7, %g0 2099 bz %ncc, .alewdcp 2100 nop 2101 2102 ! Is dst & src 4B aligned 2103 andcc %o2, 0x3, %g0 2104 bz %ncc, .alwdcp 2105 nop 2106 2107 ! Is dst & src 2B aligned 2108 andcc %o2, 0x1, %g0 2109 bz %ncc, .alhlfwdcp 2110 nop 2111 2112 ! 1B aligned 2113 1: ldub [%i1], %o2 2114 stb %o2, [%i0] 2115 inc %i1 2116 deccc %i3 2117 bgu,pt %ncc, 1b 2118 inc %i0 2119 2120 ba .chksrc 2121 nop 2122 2123 ! dst & src 4B aligned 2124 .alwdcp: 2125 ld [%i1], %o2 2126 st %o2, [%i0] 2127 add %i1, 0x4, %i1 2128 subcc %i3, 0x4, %i3 2129 bgu,pt %ncc, .alwdcp 2130 add %i0, 0x4, %i0 2131 2132 ba .chksrc 2133 nop 2134 2135 ! dst & src 2B aligned 2136 .alhlfwdcp: 2137 lduh [%i1], %o2 2138 stuh %o2, [%i0] 2139 add %i1, 0x2, %i1 2140 subcc %i3, 0x2, %i3 2141 bgu,pt %ncc, .alhlfwdcp 2142 add %i0, 0x2, %i0 2143 2144 ba .chksrc 2145 nop 2146 2147 ! dst & src 8B aligned 2148 .alewdcp: 2149 ldx [%i1], %o2 2150 stx %o2, [%i0] 2151 add %i1, 0x8, %i1 2152 subcc %i3, 0x8, %i3 2153 bgu,pt %ncc, .alewdcp 2154 add %i0, 0x8, %i0 2155 2156 ! Now Destination is block (64 bytes) aligned 2157 .chksrc: 2158 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size 2159 sub %i2, %i3, %i2 ! Residue bytes in %i2 2160 2161 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 2162 2163 andcc %i1, 0xf, %o2 ! is src quadword aligned 2164 bz,pn %xcc, .blkcpy ! src offset in %o2 2165 nop 2166 cmp %o2, 0x8 2167 bg .cpy_upper_double 2168 nop 2169 bl .cpy_lower_double 2170 nop 2171 2172 ! Falls through when source offset is equal to 8 i.e. 2173 ! source is double word aligned. 2174 ! In this case no shift/merge of data is required 2175 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2176 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2177 prefetch [%l0+0x0], #one_read 2178 ldda [%i1+0x0]%asi, %l2 2179 loop0: 2180 ldda [%i1+0x10]%asi, %l4 2181 prefetch [%l0+0x40], #one_read 2182 2183 stxa %l3, [%i0+0x0]%asi 2184 stxa %l4, [%i0+0x8]%asi 2185 2186 ldda [%i1+0x20]%asi, %l2 2187 stxa %l5, [%i0+0x10]%asi 2188 stxa %l2, [%i0+0x18]%asi 2189 2190 ldda [%i1+0x30]%asi, %l4 2191 stxa %l3, [%i0+0x20]%asi 2192 stxa %l4, [%i0+0x28]%asi 2193 2194 ldda [%i1+0x40]%asi, %l2 2195 stxa %l5, [%i0+0x30]%asi 2196 stxa %l2, [%i0+0x38]%asi 2197 2198 add %l0, 0x40, %l0 2199 add %i1, 0x40, %i1 2200 subcc %i3, 0x40, %i3 2201 bgu,pt %xcc, loop0 2202 add %i0, 0x40, %i0 2203 ba .blkdone 2204 add %i1, %o2, %i1 !
increment the source by src offset 2205 ! the src offset was stored in %o2 2206 2207 .cpy_lower_double: 2208 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2209 sll %o2, 3, %o0 ! %o0 left shift 2210 mov 0x40, %o1 2211 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 2212 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2213 prefetch [%l0+0x0], #one_read 2214 ldda [%i1+0x0]%asi, %l2 ! partial data in %l2 and %l3 has 2215 ! complete data 2216 loop1: 2217 ldda [%i1+0x10]%asi, %l4 ! %l4 has partial data for this read. 2218 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4 2219 ! into %l2 and %l3 2220 prefetch [%l0+0x40], #one_read 2221 stxa %l2, [%i0+0x0]%asi 2222 stxa %l3, [%i0+0x8]%asi 2223 2224 ldda [%i1+0x20]%asi, %l2 2225 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and 2226 stxa %l4, [%i0+0x10]%asi ! %l4 from previous read 2227 stxa %l5, [%i0+0x18]%asi ! into %l4 and %l5 2228 2229 ! Repeat the same for next 32 bytes. 2230 2231 ldda [%i1+0x30]%asi, %l4 2232 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) 2233 stxa %l2, [%i0+0x20]%asi 2234 stxa %l3, [%i0+0x28]%asi 2235 2236 ldda [%i1+0x40]%asi, %l2 2237 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) 2238 stxa %l4, [%i0+0x30]%asi 2239 stxa %l5, [%i0+0x38]%asi 2240 2241 add %l0, 0x40, %l0 2242 add %i1, 0x40, %i1 2243 subcc %i3, 0x40, %i3 2244 bgu,pt %xcc, loop1 2245 add %i0, 0x40, %i0 2246 ba .blkdone 2247 add %i1, %o2, %i1 ! increment the source by src offset 2248 ! the src offset was stored in %o2 2249 2250 .cpy_upper_double: 2251 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2252 mov 0x8, %o0 2253 sub %o2, %o0, %o0 2254 sll %o0, 3, %o0 ! %o0 left shift 2255 mov 0x40, %o1 2256 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 2257 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2258 prefetch [%l0+0x0], #one_read 2259 ldda [%i1+0x0]%asi, %l2 ! partial data in %l3 for this read and 2260 ! no data in %l2 2261 loop2: 2262 ldda [%i1+0x10]%asi, %l4 ! %l4 has complete data and %l5 has 2263 ! partial 2264 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5 2265 ! into %l3 and %l4 2266 prefetch [%l0+0x40], #one_read 2267 stxa %l3, [%i0+0x0]%asi 2268 stxa %l4, [%i0+0x8]%asi 2269 2270 ldda [%i1+0x20]%asi, %l2 2271 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with 2272 stxa %l5, [%i0+0x10]%asi ! %l5 from previous read 2273 stxa %l2, [%i0+0x18]%asi ! into %l5 and %l2 2274 2275 ! Repeat the same for next 32 bytes. 2276 2277 ldda [%i1+0x30]%asi, %l4 2278 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) 2279 stxa %l3, [%i0+0x20]%asi 2280 stxa %l4, [%i0+0x28]%asi 2281 2282 ldda [%i1+0x40]%asi, %l2 2283 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) 2284 stxa %l5, [%i0+0x30]%asi 2285 stxa %l2, [%i0+0x38]%asi 2286 2287 add %l0, 0x40, %l0 2288 add %i1, 0x40, %i1 2289 subcc %i3, 0x40, %i3 2290 bgu,pt %xcc, loop2 2291 add %i0, 0x40, %i0 2292 ba .blkdone 2293 add %i1, %o2, %i1 ! increment the source by src offset 2294 ! the src offset was stored in %o2 2295 2296 2297 ! Both Source and Destination are block aligned. 2298 ! 
Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P 2299 .blkcpy: 2300 prefetch [%i1+0x0], #one_read 2301 1: 2302 ldda [%i1+0x0]%asi, %l0 2303 ldda [%i1+0x10]%asi, %l2 2304 prefetch [%i1+0x40], #one_read 2305 2306 stxa %l0, [%i0+0x0]%asi 2307 ldda [%i1+0x20]%asi, %l4 2308 ldda [%i1+0x30]%asi, %l6 2309 2310 stxa %l1, [%i0+0x8]%asi 2311 stxa %l2, [%i0+0x10]%asi 2312 stxa %l3, [%i0+0x18]%asi 2313 stxa %l4, [%i0+0x20]%asi 2314 stxa %l5, [%i0+0x28]%asi 2315 stxa %l6, [%i0+0x30]%asi 2316 stxa %l7, [%i0+0x38]%asi 2317 2318 add %i1, 0x40, %i1 2319 subcc %i3, 0x40, %i3 2320 bgu,pt %xcc, 1b 2321 add %i0, 0x40, %i0 2322 2323 .blkdone: 2324 membar #Sync 2325 2326 brz,pt %i2, .blkexit 2327 nop 2328 2329 ! Handle trailing bytes 2330 cmp %i2, 0x8 2331 blu,pt %ncc, .residue 2332 nop 2333 2334 ! Can we do some 8B ops 2335 or %i1, %i0, %o2 2336 andcc %o2, 0x7, %g0 2337 bnz %ncc, .last4 2338 nop 2339 2340 ! Do 8byte ops as long as possible 2341 .last8: 2342 ldx [%i1], %o2 2343 stx %o2, [%i0] 2344 add %i1, 0x8, %i1 2345 sub %i2, 0x8, %i2 2346 cmp %i2, 0x8 2347 bgu,pt %ncc, .last8 2348 add %i0, 0x8, %i0 2349 2350 brz,pt %i2, .blkexit 2351 nop 2352 2353 ba .residue 2354 nop 2355 2356 .last4: 2357 ! Can we do 4B ops 2358 andcc %o2, 0x3, %g0 2359 bnz %ncc, .last2 2360 nop 2361 1: 2362 ld [%i1], %o2 2363 st %o2, [%i0] 2364 add %i1, 0x4, %i1 2365 sub %i2, 0x4, %i2 2366 cmp %i2, 0x4 2367 bgu,pt %ncc, 1b 2368 add %i0, 0x4, %i0 2369 2370 brz,pt %i2, .blkexit 2371 nop 2372 2373 ba .residue 2374 nop 2375 2376 .last2: 2377 ! Can we do 2B ops 2378 andcc %o2, 0x1, %g0 2379 bnz %ncc, .residue 2380 nop 2381 2382 1: 2383 lduh [%i1], %o2 2384 stuh %o2, [%i0] 2385 add %i1, 0x2, %i1 2386 sub %i2, 0x2, %i2 2387 cmp %i2, 0x2 2388 bgu,pt %ncc, 1b 2389 add %i0, 0x2, %i0 2390 2391 brz,pt %i2, .blkexit 2392 nop 2393 2394 .residue: 2395 ldub [%i1], %o2 2396 stb %o2, [%i0] 2397 inc %i1 2398 deccc %i2 2399 bgu,pt %ncc, .residue 2400 inc %i0 2401 2402 .blkexit: 2403 2404 membar #Sync ! sync error barrier 2405 ! Restore t_lofault handler, if came here from kcopy(). 2406 tst %o5 2407 bz %ncc, 1f 2408 andn %o5, LOFAULT_SET, %o5 2409 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2410 1: 2411 ret 2412 restore %g0, 0, %o0 2413 2414 2415 .bcb_punt: 2416 ! 2417 ! use aligned transfers where possible 2418 ! 2419 xor %i0, %i1, %o4 ! xor from and to address 2420 btst 7, %o4 ! if lower three bits zero 2421 bz .aldoubcp ! can align on double boundary 2422 .empty ! assembler complains about label 2423 2424 xor %i0, %i1, %o4 ! xor from and to address 2425 btst 3, %o4 ! if lower two bits zero 2426 bz .alwordcp ! can align on word boundary 2427 btst 3, %i0 ! delay slot, from address unaligned? 2428 ! 2429 ! use aligned reads and writes where possible 2430 ! this differs from wordcp in that it copes 2431 ! with odd alignment between source and destination 2432 ! using word reads and writes with the proper shifts 2433 ! in between to align transfers to and from memory 2434 ! i0 - src address, i1 - dest address, i2 - count 2435 ! i3, i4 - temps used for generating a complete word 2436 ! i5 (word to write) 2437 ! l0 size in bits of upper part of source word (US) 2438 ! l1 size in bits of lower part of source word (LS = 32 - US) 2439 ! l2 size in bits of upper part of destination word (UD) 2440 ! l3 size in bits of lower part of destination word (LD = 32 - UD) 2441 ! l4 number of bytes leftover after aligned transfers complete 2442 ! l5 the number 32 2443 ! 2444 mov 32, %l5 ! load an oft-needed constant 2445 bz .align_dst_only 2446 btst 3, %i1 ! is destination address aligned?
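!
! In essence, the loops that follow merge each newly loaded source
! word with the leftover bits of the previous one.  A C-like sketch
! using the US/LS names from the table above (not the exact register
! assignments; 'left' initially carries the bytes consumed while
! aligning):
!
!	while (aligned_count > 0) {
!		w = *src++;			! ld
!		*dst++ = left | (w >> US);	! srl; or; st
!		left = w << LS;			! LS = 32 - US
!		aligned_count -= 4;
!	}
!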
2447 clr %i4 ! clear registers used in either case 2448 bz .align_src_only 2449 clr %l0 2450 ! 2451 ! both source and destination addresses are unaligned 2452 ! 2453 1: ! align source 2454 ldub [%i0], %i3 ! read a byte from source address 2455 add %i0, 1, %i0 ! increment source address 2456 or %i4, %i3, %i4 ! or in with previous bytes (if any) 2457 btst 3, %i0 ! is source aligned? 2458 add %l0, 8, %l0 ! increment size of upper source (US) 2459 bnz,a 1b 2460 sll %i4, 8, %i4 ! make room for next byte 2461 2462 sub %l5, %l0, %l1 ! generate shift left count (LS) 2463 sll %i4, %l1, %i4 ! prepare to get rest 2464 ld [%i0], %i3 ! read a word 2465 add %i0, 4, %i0 ! increment source address 2466 srl %i3, %l0, %i5 ! upper src bits into lower dst bits 2467 or %i4, %i5, %i5 ! merge 2468 mov 24, %l3 ! align destination 2469 1: 2470 srl %i5, %l3, %i4 ! prepare to write a single byte 2471 stb %i4, [%i1] ! write a byte 2472 add %i1, 1, %i1 ! increment destination address 2473 sub %i2, 1, %i2 ! decrement count 2474 btst 3, %i1 ! is destination aligned? 2475 bnz,a 1b 2476 sub %l3, 8, %l3 ! delay slot, decrement shift count (LD) 2477 sub %l5, %l3, %l2 ! generate shift left count (UD) 2478 sll %i5, %l2, %i5 ! move leftover into upper bytes 2479 cmp %l2, %l0 ! cmp # reqd to fill dst w old src left 2480 bgu %ncc, .more_needed ! need more to fill than we have 2481 nop 2482 2483 sll %i3, %l1, %i3 ! clear upper used byte(s) 2484 srl %i3, %l1, %i3 2485 ! get the odd bytes between alignments 2486 sub %l0, %l2, %l0 ! regenerate shift count 2487 sub %l5, %l0, %l1 ! generate new shift left count (LS) 2488 and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0 2489 andn %i2, 3, %i2 ! # of aligned bytes that can be moved 2490 srl %i3, %l0, %i4 2491 or %i5, %i4, %i5 2492 st %i5, [%i1] ! write a word 2493 subcc %i2, 4, %i2 ! decrement count 2494 bz %ncc, .unalign_out 2495 add %i1, 4, %i1 ! increment destination address 2496 2497 b 2f 2498 sll %i3, %l1, %i5 ! get leftover into upper bits 2499 .more_needed: 2500 sll %i3, %l0, %i3 ! save remaining byte(s) 2501 srl %i3, %l0, %i3 2502 sub %l2, %l0, %l1 ! regenerate shift count 2503 sub %l5, %l1, %l0 ! generate new shift left count 2504 sll %i3, %l1, %i4 ! move to fill empty space 2505 b 3f 2506 or %i5, %i4, %i5 ! merge to complete word 2507 ! 2508 ! the source address is aligned and destination is not 2509 ! 2510 .align_dst_only: 2511 ld [%i0], %i4 ! read a word 2512 add %i0, 4, %i0 ! increment source address 2513 mov 24, %l0 ! initial shift alignment count 2514 1: 2515 srl %i4, %l0, %i3 ! prepare to write a single byte 2516 stb %i3, [%i1] ! write a byte 2517 add %i1, 1, %i1 ! increment destination address 2518 sub %i2, 1, %i2 ! decrement count 2519 btst 3, %i1 ! is destination aligned? 2520 bnz,a 1b 2521 sub %l0, 8, %l0 ! delay slot, decrement shift count 2522 .xfer: 2523 sub %l5, %l0, %l1 ! generate shift left count 2524 sll %i4, %l1, %i5 ! get leftover 2525 3: 2526 and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0 2527 andn %i2, 3, %i2 ! # of aligned bytes that can be moved 2528 2: 2529 ld [%i0], %i3 ! read a source word 2530 add %i0, 4, %i0 ! increment source address 2531 srl %i3, %l0, %i4 ! upper src bits into lower dst bits 2532 or %i5, %i4, %i5 ! merge with upper dest bits (leftover) 2533 st %i5, [%i1] ! write a destination word 2534 subcc %i2, 4, %i2 ! decrement count 2535 bz %ncc, .unalign_out ! check if done 2536 add %i1, 4, %i1 ! increment destination address 2537 b 2b ! loop 2538 sll %i3, %l1, %i5 !
get leftover 2539 .unalign_out: 2540 tst %l4 ! any bytes leftover? 2541 bz %ncc, .cpdone 2542 .empty ! allow next instruction in delay slot 2543 1: 2544 sub %l0, 8, %l0 ! decrement shift 2545 srl %i3, %l0, %i4 ! upper src byte into lower dst byte 2546 stb %i4, [%i1] ! write a byte 2547 subcc %l4, 1, %l4 ! decrement count 2548 bz %ncc, .cpdone ! done? 2549 add %i1, 1, %i1 ! increment destination 2550 tst %l0 ! any more previously read bytes 2551 bnz %ncc, 1b ! we have leftover bytes 2552 mov %l4, %i2 ! delay slot, mv cnt where dbytecp wants 2553 b .dbytecp ! let dbytecp do the rest 2554 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst 2555 ! 2556 ! the destination address is aligned and the source is not 2557 ! 2558 .align_src_only: 2559 ldub [%i0], %i3 ! read a byte from source address 2560 add %i0, 1, %i0 ! increment source address 2561 or %i4, %i3, %i4 ! or in with previous bytes (if any) 2562 btst 3, %i0 ! is source aligned? 2563 add %l0, 8, %l0 ! increment shift count (US) 2564 bnz,a .align_src_only 2565 sll %i4, 8, %i4 ! make room for next byte 2566 b,a .xfer 2567 ! 2568 ! if from address unaligned for double-word moves, 2569 ! move bytes till it is, if count is < 56 it could take 2570 ! longer to align the thing than to do the transfer 2571 ! in word size chunks right away 2572 ! 2573 .aldoubcp: 2574 cmp %i2, 56 ! if count < 56, use wordcp, it takes 2575 blu,a %ncc, .alwordcp ! longer to align doubles than words 2576 mov 3, %o0 ! mask for word alignment 2577 call .alignit ! copy bytes until aligned 2578 mov 7, %o0 ! mask for double alignment 2579 ! 2580 ! source and destination are now double-word aligned 2581 ! i3 has aligned count returned by alignit 2582 ! 2583 and %i2, 7, %i2 ! unaligned leftover count 2584 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst 2585 5: 2586 ldx [%i0+%i1], %o4 ! read from address 2587 stx %o4, [%i1] ! write at destination address 2588 subcc %i3, 8, %i3 ! dec count 2589 bgu %ncc, 5b 2590 add %i1, 8, %i1 ! delay slot, inc to address 2591 cmp %i2, 4 ! see if we can copy a word 2592 blu %ncc, .dbytecp ! if 3 or less bytes use bytecp 2593 .empty 2594 ! 2595 ! for leftover bytes we fall into wordcp, if needed 2596 ! 2597 .wordcp: 2598 and %i2, 3, %i2 ! unaligned leftover count 2599 5: 2600 ld [%i0+%i1], %o4 ! read from address 2601 st %o4, [%i1] ! write at destination address 2602 subcc %i3, 4, %i3 ! dec count 2603 bgu %ncc, 5b 2604 add %i1, 4, %i1 ! delay slot, inc to address 2605 b,a .dbytecp 2606 2607 ! we come here to align copies on word boundaries 2608 .alwordcp: 2609 call .alignit ! go word-align it 2610 mov 3, %o0 ! bits that must be zero to be aligned 2611 b .wordcp 2612 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst 2613 2614 ! 2615 ! byte copy, works with any alignment 2616 ! 2617 .bytecp: 2618 b .dbytecp 2619 sub %i0, %i1, %i0 ! i0 gets difference of src and dst 2620 2621 ! 2622 ! differenced byte copy, works with any alignment 2623 ! assumes dest in %i1 and (source - dest) in %i0 2624 ! 2625 1: 2626 stb %o4, [%i1] ! write to address 2627 inc %i1 ! inc to address 2628 .dbytecp: 2629 deccc %i2 ! dec count 2630 bgeu,a %ncc, 1b ! loop till done 2631 ldub [%i0+%i1], %o4 ! read from address 2632 .cpdone: 2633 2634 membar #Sync ! sync error barrier 2635 ! Restore t_lofault handler, if came here from kcopy(). 2636 tst %o5 2637 bz %ncc, 1f 2638 andn %o5, LOFAULT_SET, %o5 2639 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2640 1: 2641 ret 2642 restore %g0, 0, %o0 ! 
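return (0)

/*
 * A note on the "differenced" copy loops above: once %i0 has been
 * replaced by (src - dst), a single pointer walks the destination and
 * the matching source byte is always at [%i0 + %i1].  In rough C
 * (a sketch, not the exact code):
 *
 *	ptrdiff_t d = src - dst;
 *	while (count-- > 0) {
 *		*dst = *(dst + d);	(ldub [%i0 + %i1]; stb [%i1])
 *		dst++;
 *	}
 */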
2643 2644 /* 2645 * Common code used to align transfers on word and doubleword 2646 * boundaries. Aligns source and destination and returns a count 2647 * of aligned bytes to transfer in %i3 2648 */ 2649 1: 2650 inc %i0 ! inc from 2651 stb %o4, [%i1] ! write a byte 2652 inc %i1 ! inc to 2653 dec %i2 ! dec count 2654 .alignit: 2655 btst %o0, %i0 ! %o0 is bit mask to check for alignment 2656 bnz,a 1b 2657 ldub [%i0], %o4 ! read next byte 2658 2659 retl 2660 andn %i2, %o0, %i3 ! return size of aligned bytes 2661 2662 SET_SIZE(bcopy) 2663 2664 #endif /* NIAGARA_IMPL */ 2665 2666 /* 2667 * Block copy with possibly overlapped operands. 2668 */ 2669 2670 ENTRY(ovbcopy) 2671 tst %o2 ! check count 2672 bgu,a %ncc, 1f ! nothing to do or bad arguments 2673 subcc %o0, %o1, %o3 ! difference of from and to address 2674 2675 retl ! return 2676 nop 2677 1: 2678 bneg,a %ncc, 2f 2679 neg %o3 ! if < 0, make it positive 2680 2: cmp %o2, %o3 ! cmp size and abs(from - to) 2681 bleu %ncc, bcopy ! if size <= abs(diff): use bcopy, 2682 .empty ! no overlap 2683 cmp %o0, %o1 ! compare from and to addresses 2684 blu %ncc, .ov_bkwd ! if from < to, copy backwards 2685 nop 2686 ! 2687 ! Copy forwards. 2688 ! 2689 .ov_fwd: 2690 ldub [%o0], %o3 ! read from address 2691 inc %o0 ! inc from address 2692 stb %o3, [%o1] ! write to address 2693 deccc %o2 ! dec count 2694 bgu %ncc, .ov_fwd ! loop till done 2695 inc %o1 ! inc to address 2696 2697 retl ! return 2698 nop 2699 ! 2700 ! Copy backwards. 2701 ! 2702 .ov_bkwd: 2703 deccc %o2 ! dec count 2704 ldub [%o0 + %o2], %o3 ! get byte at end of src 2705 bgu %ncc, .ov_bkwd ! loop till done 2706 stb %o3, [%o1 + %o2] ! delay slot, store at end of dst 2707 2708 retl ! return 2709 nop 2710 SET_SIZE(ovbcopy) 2711 2712 /* 2713 * hwblkpagecopy() 2714 * 2715 * Copies exactly one page. This routine assumes the caller (ppcopy) 2716 * has already disabled kernel preemption and has checked 2717 * use_hw_bcopy. 2718 */ 2719 ENTRY(hwblkpagecopy) 2720 save %sp, -SA(MINFRAME), %sp 2721 2722 ! %i0 - source address (arg) 2723 ! %i1 - destination address (arg) 2724 ! %i2 - length of region (not arg) 2725 2726 set PAGESIZE, %i2 2727 2728 /* 2729 * Copying exactly one page; PAGESIZE is a multiple of 0x80.
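 *
 * The loop below therefore moves two 64-byte blocks per iteration:
 * eight quad loads and sixteen 8-byte stores through
 * ASI_BLK_INIT_ST_QUAD_LDD_P, prefetching two blocks ahead.  Roughly
 * (a sketch of the loop's shape; copy_128_bytes is a hypothetical
 * helper, not code in this file):
 *
 *	for (off = 0; off < PAGESIZE; off += 0x80) {
 *		prefetch(src + off + 0x80);
 *		prefetch(src + off + 0xc0);
 *		copy_128_bytes(dst + off, src + off);
 *	}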
2730 */ 2731 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 2732 prefetch [%i0+0x0], #one_read 2733 prefetch [%i0+0x40], #one_read 2734 1: 2735 prefetch [%i0+0x80], #one_read 2736 prefetch [%i0+0xc0], #one_read 2737 ldda [%i0+0x0]%asi, %l0 2738 ldda [%i0+0x10]%asi, %l2 2739 ldda [%i0+0x20]%asi, %l4 2740 ldda [%i0+0x30]%asi, %l6 2741 stxa %l0, [%i1+0x0]%asi 2742 stxa %l1, [%i1+0x8]%asi 2743 stxa %l2, [%i1+0x10]%asi 2744 stxa %l3, [%i1+0x18]%asi 2745 stxa %l4, [%i1+0x20]%asi 2746 stxa %l5, [%i1+0x28]%asi 2747 stxa %l6, [%i1+0x30]%asi 2748 stxa %l7, [%i1+0x38]%asi 2749 ldda [%i0+0x40]%asi, %l0 2750 ldda [%i0+0x50]%asi, %l2 2751 ldda [%i0+0x60]%asi, %l4 2752 ldda [%i0+0x70]%asi, %l6 2753 stxa %l0, [%i1+0x40]%asi 2754 stxa %l1, [%i1+0x48]%asi 2755 stxa %l2, [%i1+0x50]%asi 2756 stxa %l3, [%i1+0x58]%asi 2757 stxa %l4, [%i1+0x60]%asi 2758 stxa %l5, [%i1+0x68]%asi 2759 stxa %l6, [%i1+0x70]%asi 2760 stxa %l7, [%i1+0x78]%asi 2761 2762 add %i0, 0x80, %i0 2763 subcc %i2, 0x80, %i2 2764 bgu,pt %xcc, 1b 2765 add %i1, 0x80, %i1 2766 2767 membar #Sync 2768 ret 2769 restore %g0, 0, %o0 2770 SET_SIZE(hwblkpagecopy) 2771 2772 2773 /* 2774 * Transfer data to and from user space. 2775 * Note that these routines can cause faults; it is assumed 2776 * that the kernel has nothing at less than KERNELBASE 2777 * in the virtual address space. 2778 * 2779 * Note that copyin(9F) and copyout(9F) are part of the 2780 * DDI/DKI which specifies that they return '-1' on "errors." 2781 * 2782 * Sigh. 2783 * 2784 * So there are two extremely similar routines - xcopyin() and xcopyout() 2785 * which return the errno that we've faithfully computed. This 2786 * allows other callers (e.g. uiomove(9F)) to work correctly. 2787 * Given that these are used pretty heavily, we expand the calling 2788 * sequences inline for all flavours (rather than making wrappers). 2789 * 2790 * There are also stub routines for xcopyout_little and xcopyin_little, 2791 * which currently are intended to handle requests of <= 16 bytes from 2792 * do_unaligned. Future enhancement to make them handle 8k pages efficiently 2793 * is left as an exercise... 2794 */ 2795 2796 /* 2797 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr) 2798 * 2799 * General theory of operation: 2800 * 2801 * None of the copyops routines grab a window until it's decided that 2802 * we need to do a HW block copy operation. This saves a window 2803 * spill/fill when we're called during socket ops. The typical IO 2804 * path won't cause spill/fill traps. 2805 * 2806 * This code uses a set of 4 limits for the maximum size that will 2807 * be copied given a particular input/output address alignment. 2808 * The default limits are: 2809 * 2810 * single byte aligned - 256 (hw_copy_limit_1) 2811 * two byte aligned - 512 (hw_copy_limit_2) 2812 * four byte aligned - 1024 (hw_copy_limit_4) 2813 * eight byte aligned - 1024 (hw_copy_limit_8) 2814 * 2815 * If the value for a particular limit is zero, the copy will be done 2816 * via the copy loops rather than block store/quad load instructions. 2817 * 2818 * Flow: 2819 * 2820 * If count == zero return zero. 2821 * 2822 * Store the previous lofault handler into %g6. 2823 * Place our secondary lofault handler into %g5. 2824 * Place the address of our nowindow fault handler into %o3. 2825 * Place the address of the windowed fault handler into %o4.
2826 * --> We'll use this handler if we end up grabbing a window 2827 * --> before we use block initializing store and quad load ASIs 2828 * 2829 * If count is less than or equal to SMALL_LIMIT (7) we 2830 * always do a byte for byte copy. 2831 * 2832 * If count is > SMALL_LIMIT, we check the alignment of the input 2833 * and output pointers. Based on the alignment we check count 2834 * against a limit based on detected alignment. If we exceed the 2835 * alignment value we copy via block initializing store and quad 2836 * load instructions. 2837 * 2838 * If we don't exceed one of the limits, we store -count in %o3, 2839 * we store the number of chunks (8, 4, 2 or 1 byte) operated 2840 * on in our basic copy loop in %o2. Following this we branch 2841 * to the appropriate copy loop and copy that many chunks. 2842 * Since we've been adding the chunk size to %o3 each time through 2843 * as well as decrementing %o2, we can tell if any data 2844 * is left to be copied by examining %o3. If that is zero, we're 2845 * done and can go home. If not, we figure out what the largest 2846 * chunk size left to be copied is and branch to that copy loop 2847 * unless there's only one byte left. We load that as we're 2848 * branching to code that stores it just before we return. 2849 * 2850 * Fault handlers are invoked if we reference memory that has no 2851 * current mapping. All forms share the same copyio_fault handler. 2852 * This routine handles fixing up the stack and general housecleaning. 2853 * Each copy operation has a simple fault handler that is then called 2854 * to do the work specific to the individual operation. The handlers 2855 * for copyOP and xcopyOP are found at the end of the individual functions. 2856 * The handlers for xcopyOP_little are found at the end of xcopyin_little. 2857 * The handlers for copyOP_noerr are found at the end of copyin_noerr. 2858 */ 2859 2860 /* 2861 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little). 2862 */ 2863 2864 /* 2865 * We save the arguments in the following registers in case of a fault: 2866 * kaddr - %g2 2867 * uaddr - %g3 2868 * count - %g4 2869 */ 2870 #define SAVE_SRC %g2 2871 #define SAVE_DST %g3 2872 #define SAVE_COUNT %g4 2873 2874 #define REAL_LOFAULT %g5 2875 #define SAVED_LOFAULT %g6 2876 2877 /* 2878 * Generic copyio fault handler. This is the first line of defense when a 2879 * fault occurs in (x)copyin/(x)copyout. In order for this to function 2880 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT. 2881 * This allows us to share common code for all the flavors of the copy 2882 * operations, including the _noerr versions. 2883 * 2884 * Note that this function will restore the original input parameters before 2885 * calling REAL_LOFAULT. So the real handler can vector to the appropriate 2886 * member of the t_copyop structure, if needed. 2887 */ 2888 ENTRY(copyio_fault) 2889 #if !defined(NIAGARA_IMPL) 2890 btst FPUSED_FLAG, SAVED_LOFAULT 2891 bz 1f 2892 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 2893 2894 wr %l5, 0, %gsr ! restore gsr 2895 2896 btst FPRS_FEF, %g1 2897 bz %icc, 4f 2898 nop 2899 2900 ! restore fpregs from stack 2901 BLD_FP_FROMSTACK(%o2) 2902 2903 ba,pt %ncc, 1f 2904 nop 2905 4: 2906 FZERO ! zero all of the fpregs 2907 wr %g1, %g0, %fprs ! restore fprs 2908 1: 2909 restore 2910 mov SAVE_SRC, %o0 2911 mov SAVE_DST, %o1 2912 jmp REAL_LOFAULT 2913 mov SAVE_COUNT, %o2 2914 2915 #else /* NIAGARA_IMPL */ 2916 membar #Sync 2917 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] !
restore old t_lofault 2918 restore 2919 mov SAVE_SRC, %o0 2920 mov SAVE_DST, %o1 2921 jmp REAL_LOFAULT 2922 mov SAVE_COUNT, %o2 2923 2924 #endif /* NIAGARA_IMPL */ 2925 2926 SET_SIZE(copyio_fault) 2927 2928 ENTRY(copyio_fault_nowindow) 2929 membar #Sync 2930 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2931 2932 mov SAVE_SRC, %o0 2933 mov SAVE_DST, %o1 2934 jmp REAL_LOFAULT 2935 mov SAVE_COUNT, %o2 2936 SET_SIZE(copyio_fault_nowindow) 2937 2938 ENTRY(copyout) 2939 sethi %hi(.copyout_err), REAL_LOFAULT 2940 or REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT 2941 2942 #if !defined(NIAGARA_IMPL) 2943 .do_copyout: 2944 tst %o2 ! check for zero count; quick exit 2945 bz,pt %ncc, .co_smallqx 2946 mov %o0, SAVE_SRC 2947 mov %o1, SAVE_DST 2948 mov %o2, SAVE_COUNT 2949 cmp %o2, FP_COPY ! check for small copy/leaf case 2950 bgt,pt %ncc, .co_copy_more 2951 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT 2952 /* 2953 * Small copy out code 2954 * 2955 */ 2956 sethi %hi(copyio_fault_nowindow), %o3 2957 or %o3, %lo(copyio_fault_nowindow), %o3 2958 membar #Sync 2959 stn %o3, [THREAD_REG + T_LOFAULT] 2960 2961 mov ASI_USER, %asi 2962 cmp %o2, SHORTCOPY ! make sure there is enough to align 2963 ble,pt %ncc, .co_smallest 2964 andcc %o1, 0x7, %o3 ! is dest long word aligned 2965 bnz,pn %ncc, .co_align 2966 andcc %o1, 1, %o3 ! is dest byte aligned 2967 2968 ! Destination is long word aligned 2969 ! 8 cases for src alignment; load parts, store long words 2970 .co_al_src: 2971 andcc %o0, 7, %o3 2972 brnz,pt %o3, .co_src_dst_unal8 2973 nop 2974 /* 2975 * Special case for handling when src and dest are both long word aligned 2976 * and total data to move is less than FP_COPY bytes 2977 * Also handles finish up for large block moves, so may be less than 32 bytes 2978 */ 2979 .co_medlong: 2980 subcc %o2, 31, %o2 ! adjust length to allow cc test 2981 ble,pt %ncc, .co_medl31 2982 nop 2983 .co_medl32: 2984 ldx [%o0], %o4 ! move 32 bytes 2985 subcc %o2, 32, %o2 ! decrement length count by 32 2986 stxa %o4, [%o1]%asi 2987 ldx [%o0+8], %o4 2988 stxa %o4, [%o1+8]%asi 2989 ldx [%o0+16], %o4 2990 add %o0, 32, %o0 ! increase src ptr by 32 2991 stxa %o4, [%o1+16]%asi 2992 ldx [%o0-8], %o4 2993 add %o1, 32, %o1 ! increase dst ptr by 32 2994 bgu,pt %ncc, .co_medl32 ! repeat if at least 32 bytes left 2995 stxa %o4, [%o1-8]%asi 2996 .co_medl31: 2997 addcc %o2, 24, %o2 ! adjust count to be off by 7 2998 ble,pt %ncc, .co_medl7 ! skip if 7 or fewer bytes left 2999 nop 3000 .co_medl8: 3001 ldx [%o0], %o4 ! move 8 bytes 3002 add %o0, 8, %o0 ! increase src ptr by 8 3003 subcc %o2, 8, %o2 ! decrease count by 8 3004 add %o1, 8, %o1 ! increase dst ptr by 8 3005 bgu,pt %ncc, .co_medl8 3006 stxa %o4, [%o1-8]%asi 3007 .co_medl7: 3008 addcc %o2, 7, %o2 ! finish adjustment of remaining count 3009 bnz,pt %ncc, .co_small4 ! do final bytes if not finished 3010 3011 .co_smallx: ! finish up and exit 3012 membar #Sync 3013 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3014 .co_smallqx: 3015 retl 3016 mov %g0, %o0 3017 3018 .co_small4: 3019 cmp %o2, 4 3020 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left 3021 nop ! 3022 ld [%o0], %o4 ! move 4 bytes 3023 add %o0, 4, %o0 ! increase src ptr by 4 3024 add %o1, 4, %o1 ! increase dst ptr by 4 3025 subcc %o2, 4, %o2 ! decrease count by 4 3026 bz,pt %ncc, .co_smallx 3027 stwa %o4, [%o1-4]%asi 3028 3029 .co_small3x: ! Exactly 1, 2, or 3 bytes remain 3030 subcc %o2, 1, %o2 ! reduce count for cc test 3031 ldub [%o0], %o4 ! 
load one byte 3032 bz,pt %ncc, .co_smallx 3033 stba %o4, [%o1]%asi ! store one byte 3034 ldub [%o0+1], %o4 ! load second byte 3035 subcc %o2, 1, %o2 3036 bz,pt %ncc, .co_smallx 3037 stba %o4, [%o1+1]%asi ! store second byte 3038 ldub [%o0+2], %o4 ! load third byte 3039 ba .co_smallx 3040 stba %o4, [%o1+2]%asi ! store third byte 3041 3042 .co_smallest: ! 7 or fewer bytes remain 3043 cmp %o2, 4 3044 blt,pt %ncc, .co_small3x 3045 nop 3046 ldub [%o0], %o4 ! read byte 3047 subcc %o2, 4, %o2 ! reduce count by 4 3048 stba %o4, [%o1]%asi ! write byte 3049 ldub [%o0+1], %o4 ! repeat for total of 4 bytes 3050 add %o0, 4, %o0 ! advance src by 4 3051 stba %o4, [%o1+1]%asi 3052 ldub [%o0-2], %o4 3053 add %o1, 4, %o1 ! advance dst by 4 3054 stba %o4, [%o1-2]%asi 3055 ldub [%o0-1], %o4 3056 bnz,pt %ncc, .co_small3x 3057 stba %o4, [%o1-1]%asi 3058 membar #Sync 3059 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3060 retl 3061 mov %g0, %o0 3062 3063 .co_align: ! byte align test in prior branch delay 3064 bnz,pt %ncc, .co_al_d1 3065 .co_al_d1f: ! dest is now half word aligned 3066 andcc %o1, 2, %o3 3067 bnz,pt %ncc, .co_al_d2 3068 .co_al_d2f: ! dest is now word aligned 3069 andcc %o1, 4, %o3 ! is dest longword aligned? 3070 bz,pt %ncc, .co_al_src 3071 nop 3072 .co_al_d4: ! dest is word aligned; src is unknown 3073 ldub [%o0], %o4 ! move a word (src align unknown) 3074 ldub [%o0+1], %o3 3075 sll %o4, 24, %o4 ! position 3076 sll %o3, 16, %o3 ! position 3077 or %o4, %o3, %o3 ! merge 3078 ldub [%o0+2], %o4 3079 sll %o4, 8, %o4 ! position 3080 or %o4, %o3, %o3 ! merge 3081 ldub [%o0+3], %o4 3082 or %o4, %o3, %o4 ! merge 3083 stwa %o4,[%o1]%asi ! store four bytes 3084 add %o0, 4, %o0 ! adjust src by 4 3085 add %o1, 4, %o1 ! adjust dest by 4 3086 sub %o2, 4, %o2 ! adjust count by 4 3087 andcc %o0, 7, %o3 ! check for src long word alignment 3088 brz,pt %o3, .co_medlong 3089 .co_src_dst_unal8: 3090 ! dst is 8-byte aligned, src is not 3091 ! Size is less than FP_COPY 3092 ! Following code is to select for alignment 3093 andcc %o0, 0x3, %o3 ! test word alignment 3094 bz,pt %ncc, .co_medword 3095 nop 3096 andcc %o0, 0x1, %o3 ! test halfword alignment 3097 bnz,pt %ncc, .co_med_byte ! go to byte move if not halfword 3098 andcc %o0, 0x2, %o3 ! test which byte alignment 3099 ba .co_medhalf 3100 nop 3101 .co_al_d1: ! align dest to half word 3102 ldub [%o0], %o4 ! move a byte 3103 add %o0, 1, %o0 3104 stba %o4, [%o1]%asi 3105 add %o1, 1, %o1 3106 andcc %o1, 2, %o3 3107 bz,pt %ncc, .co_al_d2f 3108 sub %o2, 1, %o2 3109 .co_al_d2: ! align dest to word 3110 ldub [%o0], %o4 ! move a half-word (src align unknown) 3111 ldub [%o0+1], %o3 3112 sll %o4, 8, %o4 ! position 3113 or %o4, %o3, %o4 ! merge 3114 stha %o4, [%o1]%asi 3115 add %o0, 2, %o0 3116 add %o1, 2, %o1 3117 andcc %o1, 4, %o3 ! is dest longword aligned? 3118 bz,pt %ncc, .co_al_src 3119 sub %o2, 2, %o2 3120 ba .co_al_d4 3121 nop 3122 /* 3123 * Handle all cases where src and dest are aligned on word 3124 * boundaries. Use unrolled loops for better performance. 3125 * This option wins over standard large data move when 3126 * source and destination is in cache for medium 3127 * to short data moves. 3128 */ 3129 .co_medword: 3130 subcc %o2, 31, %o2 ! adjust length to allow cc test 3131 ble,pt %ncc, .co_medw31 3132 nop 3133 .co_medw32: 3134 ld [%o0], %o4 ! 
move a block of 32 bytes 3135 stwa %o4, [%o1]%asi 3136 ld [%o0+4], %o4 3137 stwa %o4, [%o1+4]%asi 3138 ld [%o0+8], %o4 3139 stwa %o4, [%o1+8]%asi 3140 ld [%o0+12], %o4 3141 stwa %o4, [%o1+12]%asi 3142 ld [%o0+16], %o4 3143 stwa %o4, [%o1+16]%asi 3144 ld [%o0+20], %o4 3145 subcc %o2, 32, %o2 ! decrement length count 3146 stwa %o4, [%o1+20]%asi 3147 ld [%o0+24], %o4 3148 add %o0, 32, %o0 ! increase src ptr by 32 3149 stwa %o4, [%o1+24]%asi 3150 ld [%o0-4], %o4 3151 add %o1, 32, %o1 ! increase dst ptr by 32 3152 bgu,pt %ncc, .co_medw32 ! repeat if at least 32 bytes left 3153 stwa %o4, [%o1-4]%asi 3154 .co_medw31: 3155 addcc %o2, 24, %o2 ! adjust count to be off by 7 3156 ble,pt %ncc, .co_medw7 ! skip if 7 or fewer bytes left 3157 nop ! 3158 .co_medw15: 3159 ld [%o0], %o4 ! move a block of 8 bytes 3160 subcc %o2, 8, %o2 ! decrement length count 3161 stwa %o4, [%o1]%asi 3162 add %o0, 8, %o0 ! increase src ptr by 8 3163 ld [%o0-4], %o4 3164 add %o1, 8, %o1 ! increase dst ptr by 8 3165 bgu,pt %ncc, .co_medw15 3166 stwa %o4, [%o1-4]%asi 3167 .co_medw7: 3168 addcc %o2, 7, %o2 ! finish adjustment of remaining count 3169 bz,pt %ncc, .co_smallx ! exit if finished 3170 cmp %o2, 4 3171 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left 3172 nop ! 3173 ld [%o0], %o4 ! move 4 bytes 3174 add %o0, 4, %o0 ! increase src ptr by 4 3175 add %o1, 4, %o1 ! increase dst ptr by 4 3176 subcc %o2, 4, %o2 ! decrease count by 4 3177 bnz .co_small3x 3178 stwa %o4, [%o1-4]%asi 3179 membar #Sync 3180 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3181 retl 3182 mov %g0, %o0 3183 3184 .co_medhalf: 3185 subcc %o2, 31, %o2 ! adjust length to allow cc test 3186 ble,pt %ncc, .co_medh31 3187 nop 3188 .co_medh32: ! load and store block of 32 bytes 3189 3190 lduh [%o0], %o4 ! move 32 bytes 3191 subcc %o2, 32, %o2 ! decrement length count 3192 lduw [%o0+2], %o3 3193 sllx %o4, 48, %o4 3194 sllx %o3, 16, %o3 3195 or %o4, %o3, %o3 3196 lduh [%o0+6], %o4 3197 or %o4, %o3, %o4 3198 stxa %o4, [%o1]%asi 3199 3200 lduh [%o0+8], %o4 3201 lduw [%o0+10], %o3 3202 sllx %o4, 48, %o4 3203 sllx %o3, 16, %o3 3204 or %o4, %o3, %o3 3205 lduh [%o0+14], %o4 3206 or %o4, %o3, %o4 3207 stxa %o4, [%o1+8]%asi 3208 3209 lduh [%o0+16], %o4 3210 lduw [%o0+18], %o3 3211 sllx %o4, 48, %o4 3212 sllx %o3, 16, %o3 3213 or %o4, %o3, %o3 3214 lduh [%o0+22], %o4 3215 or %o4, %o3, %o4 3216 stxa %o4, [%o1+16]%asi 3217 3218 add %o0, 32, %o0 ! increase src ptr by 32 3219 add %o1, 32, %o1 ! increase dst ptr by 32 3220 3221 lduh [%o0-8], %o4 3222 lduw [%o0-6], %o3 3223 sllx %o4, 48, %o4 3224 sllx %o3, 16, %o3 3225 or %o4, %o3, %o3 3226 lduh [%o0-2], %o4 3227 or %o3, %o4, %o4 3228 bgu,pt %ncc, .co_medh32 ! repeat if at least 32 bytes left 3229 stxa %o4, [%o1-8]%asi 3230 3231 .co_medh31: 3232 addcc %o2, 24, %o2 ! adjust count to be off by 7 3233 ble,pt %ncc, .co_medh7 ! skip if 7 or fewer bytes left 3234 nop ! 3235 .co_medh15: 3236 lduh [%o0], %o4 ! move 8 bytes 3237 subcc %o2, 8, %o2 ! decrement length count 3238 lduw [%o0+2], %o3 3239 sllx %o4, 48, %o4 3240 sllx %o3, 16, %o3 3241 or %o4, %o3, %o3 3242 add %o1, 8, %o1 ! increase dst ptr by 8 3243 lduh [%o0+6], %o4 3244 add %o0, 8, %o0 ! increase src ptr by 8 3245 or %o4, %o3, %o4 3246 bgu,pt %ncc, .co_medh15 3247 stxa %o4, [%o1-8]%asi 3248 .co_medh7: 3249 addcc %o2, 7, %o2 ! finish adjustment of remaining count 3250 bz,pt %ncc, .co_smallx ! exit if finished 3251 cmp %o2, 4 3252 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left 3253 nop !
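!
! The half-word aligned path above assembles each 8-byte store from
! three loads.  In rough C (a sketch; h0/h1 are the leading/trailing
! half-words and w the middle word of an 8-byte source chunk):
!
!	x = ((uint64_t)h0 << 48) | ((uint64_t)w << 16) | h1;
!	store8(dst, x);			! stxa [%o1]%asi
!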
3254 lduh [%o0], %o4 3255 sll %o4, 16, %o4 3256 lduh [%o0+2], %o3 3257 or %o3, %o4, %o4 3258 subcc %o2, 4, %o2 3259 add %o0, 4, %o0 3260 add %o1, 4, %o1 3261 bnz .co_small3x 3262 stwa %o4, [%o1-4]%asi 3263 membar #Sync 3264 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3265 retl 3266 mov %g0, %o0 3267 3268 .align 16 3269 .co_med_byte: 3270 bnz,pt %ncc, .co_medbh32a ! go to correct byte move 3271 subcc %o2, 31, %o2 ! adjust length to allow cc test 3272 ble,pt %ncc, .co_medb31 3273 nop 3274 .co_medb32: ! Alignment 1 or 5 3275 subcc %o2, 32, %o2 ! decrement length count 3276 3277 ldub [%o0], %o4 ! load and store a block of 32 bytes 3278 sllx %o4, 56, %o3 3279 lduh [%o0+1], %o4 3280 sllx %o4, 40, %o4 3281 or %o4, %o3, %o3 3282 lduw [%o0+3], %o4 3283 sllx %o4, 8, %o4 3284 or %o4, %o3, %o3 3285 ldub [%o0+7], %o4 3286 or %o4, %o3, %o4 3287 stxa %o4, [%o1]%asi 3288 3289 ldub [%o0+8], %o4 3290 sllx %o4, 56, %o3 3291 lduh [%o0+9], %o4 3292 sllx %o4, 40, %o4 3293 or %o4, %o3, %o3 3294 lduw [%o0+11], %o4 3295 sllx %o4, 8, %o4 3296 or %o4, %o3, %o3 3297 ldub [%o0+15], %o4 3298 or %o4, %o3, %o4 3299 stxa %o4, [%o1+8]%asi 3300 3301 ldub [%o0+16], %o4 3302 sllx %o4, 56, %o3 3303 lduh [%o0+17], %o4 3304 sllx %o4, 40, %o4 3305 or %o4, %o3, %o3 3306 lduw [%o0+19], %o4 3307 sllx %o4, 8, %o4 3308 or %o4, %o3, %o3 3309 ldub [%o0+23], %o4 3310 or %o4, %o3, %o4 3311 stxa %o4, [%o1+16]%asi 3312 3313 add %o0, 32, %o0 ! increase src ptr by 32 3314 add %o1, 32, %o1 ! increase dst ptr by 32 3315 3316 ldub [%o0-8], %o4 3317 sllx %o4, 56, %o3 3318 lduh [%o0-7], %o4 3319 sllx %o4, 40, %o4 3320 or %o4, %o3, %o3 3321 lduw [%o0-5], %o4 3322 sllx %o4, 8, %o4 3323 or %o4, %o3, %o3 3324 ldub [%o0-1], %o4 3325 or %o4, %o3, %o4 3326 bgu,pt %ncc, .co_medb32 ! repeat if at least 32 bytes left 3327 stxa %o4, [%o1-8]%asi 3328 3329 .co_medb31: ! 31 or fewer bytes remaining 3330 addcc %o2, 24, %o2 ! adjust count to be off by 7 3331 ble,pt %ncc, .co_medb7 ! skip if 7 or fewer bytes left 3332 nop ! 3333 .co_medb15: 3334 3335 ldub [%o0], %o4 ! load and store a block of 8 bytes 3336 subcc %o2, 8, %o2 ! decrement length count 3337 sllx %o4, 56, %o3 3338 lduh [%o0+1], %o4 3339 sllx %o4, 40, %o4 3340 or %o4, %o3, %o3 3341 lduw [%o0+3], %o4 3342 add %o1, 8, %o1 ! increase dst ptr by 8 3343 sllx %o4, 8, %o4 3344 or %o4, %o3, %o3 3345 ldub [%o0+7], %o4 3346 add %o0, 8, %o0 ! increase src ptr by 8 3347 or %o4, %o3, %o4 3348 bgu,pt %ncc, .co_medb15 3349 stxa %o4, [%o1-8]%asi 3350 .co_medb7: 3351 addcc %o2, 7, %o2 ! finish adjustment of remaining count 3352 bz,pt %ncc, .co_smallx ! exit if finished 3353 cmp %o2, 4 3354 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left 3355 nop ! 3356 ldub [%o0], %o4 ! move 4 bytes 3357 sll %o4, 24, %o3 3358 lduh [%o0+1], %o4 3359 sll %o4, 8, %o4 3360 or %o4, %o3, %o3 3361 ldub [%o0+3], %o4 3362 or %o4, %o3, %o4 3363 subcc %o2, 4, %o2 3364 add %o0, 4, %o0 3365 add %o1, 4, %o1 3366 bnz .co_small3x 3367 stwa %o4, [%o1-4]%asi 3368 membar #Sync 3369 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3370 retl 3371 mov %g0, %o0 3372 3373 .align 16 3374 .co_medbh32a: 3375 ble,pt %ncc, .co_medbh31 3376 nop 3377 .co_medbh32: ! Alignment 3 or 7 3378 subcc %o2, 32, %o2 ! decrement length count 3379 3380 ldub [%o0], %o4 !
load and store a block of 32 bytes 3381 sllx %o4, 56, %o3 3382 lduw [%o0+1], %o4 3383 sllx %o4, 24, %o4 3384 or %o4, %o3, %o3 3385 lduh [%o0+5], %o4 3386 sllx %o4, 8, %o4 3387 or %o4, %o3, %o3 3388 ldub [%o0+7], %o4 3389 or %o4, %o3, %o4 3390 stxa %o4, [%o1]%asi 3391 3392 ldub [%o0+8], %o4 3393 sllx %o4, 56, %o3 3394 lduw [%o0+9], %o4 3395 sllx %o4, 24, %o4 3396 or %o4, %o3, %o3 3397 lduh [%o0+13], %o4 3398 sllx %o4, 8, %o4 3399 or %o4, %o3, %o3 3400 ldub [%o0+15], %o4 3401 or %o4, %o3, %o4 3402 stxa %o4, [%o1+8]%asi 3403 3404 ldub [%o0+16], %o4 3405 sllx %o4, 56, %o3 3406 lduw [%o0+17], %o4 3407 sllx %o4, 24, %o4 3408 or %o4, %o3, %o3 3409 lduh [%o0+21], %o4 3410 sllx %o4, 8, %o4 3411 or %o4, %o3, %o3 3412 ldub [%o0+23], %o4 3413 or %o4, %o3, %o4 3414 stxa %o4, [%o1+16]%asi 3415 3416 add %o0, 32, %o0 ! increase src ptr by 32 3417 add %o1, 32, %o1 ! increase dst ptr by 32 3418 3419 ldub [%o0-8], %o4 3420 sllx %o4, 56, %o3 3421 lduw [%o0-7], %o4 3422 sllx %o4, 24, %o4 3423 or %o4, %o3, %o3 3424 lduh [%o0-3], %o4 3425 sllx %o4, 8, %o4 3426 or %o4, %o3, %o3 3427 ldub [%o0-1], %o4 3428 or %o4, %o3, %o4 3429 bgu,pt %ncc, .co_medbh32 ! repeat if at least 32 bytes left 3430 stxa %o4, [%o1-8]%asi 3431 3432 .co_medbh31: 3433 addcc %o2, 24, %o2 ! adjust count to be off by 7 3434 ble,pt %ncc, .co_medb7 ! skip if 7 or fewer bytes left 3435 nop ! 3436 .co_medbh15: 3437 ldub [%o0], %o4 ! load and store a block of 8 bytes 3438 sllx %o4, 56, %o3 3439 lduw [%o0+1], %o4 3440 sllx %o4, 24, %o4 3441 or %o4, %o3, %o3 3442 lduh [%o0+5], %o4 3443 sllx %o4, 8, %o4 3444 or %o4, %o3, %o3 3445 ldub [%o0+7], %o4 3446 or %o4, %o3, %o4 3447 stxa %o4, [%o1]%asi 3448 subcc %o2, 8, %o2 ! decrement length count 3449 add %o1, 8, %o1 ! increase dst ptr by 8 3450 add %o0, 8, %o0 ! increase src ptr by 8 3451 bgu,pt %ncc, .co_medbh15 3452 stxa %o4, [%o1-8]%asi 3453 ba .co_medb7 3454 nop 3455 /* 3456 * End of small copy (no window) code 3457 */ 3458 3459 /* 3460 * Long copy code 3461 */ 3462 .co_copy_more: 3463 sethi %hi(copyio_fault), %o3 3464 or %o3, %lo(copyio_fault), %o3 3465 membar #Sync 3466 stn %o3, [THREAD_REG + T_LOFAULT] 3467 3468 /* 3469 * Following code is for large copies. We know there is at 3470 * least FP_COPY bytes available. FP regs are used, so 3471 * we save registers and fp regs before starting 3472 */ 3473 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3474 or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 3475 rd %fprs, %g1 ! check for unused fp 3476 ! if fprs.fef == 0, set it. 3477 ! Setting it when already set costs more than checking 3478 andcc %g1, FPRS_FEF, %g1 ! test FEF, fprs.du = fprs.dl = 0 3479 bz,pt %ncc, .co_fp_unused 3480 mov ASI_USER, %asi 3481 BST_FP_TOSTACK(%o3) 3482 ba .co_fp_ready 3483 .co_fp_unused: 3484 prefetch [%i0 + (1 * CACHE_LINE)], #one_read 3485 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 3486 .co_fp_ready: 3487 rd %gsr, %l5 ! save %gsr value 3488 andcc %i1, 1, %o3 ! is dest byte aligned 3489 bnz,pt %ncc, .co_big_d1 3490 .co_big_d1f: ! dest is now half word aligned 3491 andcc %i1, 2, %o3 3492 bnz,pt %ncc, .co_big_d2 3493 .co_big_d2f: ! dest is now word aligned 3494 andcc %i1, 4, %o3 ! is dest longword aligned 3495 bnz,pt %ncc, .co_big_d4 3496 .co_big_d4f: ! dest is now long word aligned 3497 andcc %i0, 7, %o3 ! is src long word aligned 3498 brnz,pt %o3, .co_big_unal8 3499 prefetch [%i0 + (2 * CACHE_LINE)], #one_read 3500 ! Src and dst are long word aligned 3501 ! align dst to 64 byte boundary 3502 andcc %i1, 0x3f, %o3 ! 
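%o3 == 0 means dst is 64 byte aligned
!
! The pre-alignment below keeps a negative byte count in %o3 and walks
! it up toward zero.  In rough C (a sketch of the bookkeeping only):
!
!	pre = (dst & 0x3f) - 64;	! -(bytes to next 64-byte boundary)
!	count += pre;			! those bytes come off the count
!	if (pre & 0x8) move one long word;
!	while (pre & 0x30) move 16 bytes;
!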
3503 brz,pn %o3, .co_al_to_64 3504 nop 3505 sub %o3, 64, %o3 ! %o3 has negative bytes to move 3506 add %i2, %o3, %i2 ! adjust remaining count 3507 andcc %o3, 8, %o4 ! odd long words to move? 3508 brz,pt %o4, .co_al_to_16 3509 nop 3510 add %o3, 8, %o3 3511 ldx [%i0], %o4 3512 add %i0, 8, %i0 ! increment src ptr 3513 stxa %o4, [%i1]ASI_USER 3514 add %i1, 8, %i1 ! increment dst ptr 3515 ! Dest is aligned on 16 bytes, src 8 byte aligned 3516 .co_al_to_16: 3517 andcc %o3, 0x30, %o4 ! more to move? 3518 brz,pt %o4, .co_al_to_64 3519 nop 3520 .co_al_mv_16: 3521 add %o3, 16, %o3 3522 ldx [%i0], %o4 3523 stxa %o4, [%i1]ASI_USER 3524 add %i0, 16, %i0 ! increment src ptr 3525 ldx [%i0-8], %o4 3526 add %i1, 8, %i1 ! increment dst ptr 3527 stxa %o4, [%i1]ASI_USER 3528 andcc %o3, 0x30, %o4 3529 brnz,pt %o4, .co_al_mv_16 3530 add %i1, 8, %i1 ! increment dst ptr 3531 ! Dest is aligned on 64 bytes, src 8 byte aligned 3532 .co_al_to_64: 3533 ! Determine source alignment 3534 ! to correct 8 byte offset 3535 andcc %i0, 32, %o3 3536 brnz,pn %o3, .co_aln_1 3537 andcc %i0, 16, %o3 3538 brnz,pn %o3, .co_aln_01 3539 andcc %i0, 8, %o3 3540 brz,pn %o3, .co_aln_000 3541 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 3542 ba .co_aln_001 3543 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3544 .co_aln_01: 3545 brnz,pn %o3, .co_aln_011 3546 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 3547 ba .co_aln_010 3548 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3549 .co_aln_1: 3550 andcc %i0, 16, %o3 3551 brnz,pn %o3, .co_aln_11 3552 andcc %i0, 8, %o3 3553 brnz,pn %o3, .co_aln_101 3554 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 3555 ba .co_aln_100 3556 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3557 .co_aln_11: 3558 brz,pn %o3, .co_aln_110 3559 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 3560 3561 .co_aln_111: 3562 ! Alignment off by 8 bytes 3563 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3564 ldd [%i0], %d0 3565 add %i0, 8, %i0 3566 sub %i2, 8, %i2 3567 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3568 and %i2, 0x7f, %i2 ! residue bytes in %i2 3569 sub %i1, %i0, %i1 3570 .co_aln_111_loop: 3571 ldda [%i0]ASI_BLK_P,%d16 ! block load 3572 subcc %o3, 64, %o3 3573 fmovd %d16, %d2 3574 fmovd %d18, %d4 3575 fmovd %d20, %d6 3576 fmovd %d22, %d8 3577 fmovd %d24, %d10 3578 fmovd %d26, %d12 3579 fmovd %d28, %d14 3580 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3581 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3582 add %i0, 64, %i0 3583 fmovd %d30, %d0 3584 bgt,pt %ncc, .co_aln_111_loop 3585 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3586 add %i1, %i0, %i1 3587 3588 stda %d0, [%i1]ASI_USER 3589 ba .co_remain_stuff 3590 add %i1, 8, %i1 3591 ! END OF aln_111 3592 3593 .co_aln_110: 3594 ! Alignment off by 16 bytes 3595 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3596 ldd [%i0], %d0 3597 ldd [%i0+8], %d2 3598 add %i0, 16, %i0 3599 sub %i2, 16, %i2 3600 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3601 and %i2, 0x7f, %i2 ! residue bytes in %i2 3602 sub %i1, %i0, %i1 3603 .co_aln_110_loop: 3604 ldda [%i0]ASI_BLK_P,%d16 ! block load 3605 subcc %o3, 64, %o3 3606 fmovd %d16, %d4 3607 fmovd %d18, %d6 3608 fmovd %d20, %d8 3609 fmovd %d22, %d10 3610 fmovd %d24, %d12 3611 fmovd %d26, %d14 3612 stxa %g0,[%i0+%i1]ASI_STBI_AIUS !
block initializing store 3613 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3614 add %i0, 64, %i0 3615 fmovd %d28, %d0 3616 fmovd %d30, %d2 3617 bgt,pt %ncc, .co_aln_110_loop 3618 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3619 add %i1, %i0, %i1 3620 3621 stda %d0, [%i1]%asi 3622 stda %d2, [%i1+8]%asi 3623 ba .co_remain_stuff 3624 add %i1, 16, %i1 3625 ! END OF aln_110 3626 3627 .co_aln_101: 3628 ! Alignment off by 24 bytes 3629 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3630 ldd [%i0], %d0 3631 ldd [%i0+8], %d2 3632 ldd [%i0+16], %d4 3633 add %i0, 24, %i0 3634 sub %i2, 24, %i2 3635 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3636 and %i2, 0x7f, %i2 ! residue bytes in %i2 3637 sub %i1, %i0, %i1 3638 .co_aln_101_loop: 3639 ldda [%i0]ASI_BLK_P,%d16 ! block load 3640 subcc %o3, 64, %o3 3641 fmovd %d16, %d6 3642 fmovd %d18, %d8 3643 fmovd %d20, %d10 3644 fmovd %d22, %d12 3645 fmovd %d24, %d14 3646 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3647 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3648 add %i0, 64, %i0 3649 fmovd %d26, %d0 3650 fmovd %d28, %d2 3651 fmovd %d30, %d4 3652 bgt,pt %ncc, .co_aln_101_loop 3653 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3654 add %i1, %i0, %i1 3655 3656 stda %d0, [%i1]%asi 3657 stda %d2, [%i1+8]%asi 3658 stda %d4, [%i1+16]%asi 3659 ba .co_remain_stuff 3660 add %i1, 24, %i1 3661 ! END OF aln_101 3662 3663 .co_aln_100: 3664 ! Alignment off by 32 bytes 3665 ldd [%i0], %d0 3666 ldd [%i0+8], %d2 3667 ldd [%i0+16],%d4 3668 ldd [%i0+24],%d6 3669 add %i0, 32, %i0 3670 sub %i2, 32, %i2 3671 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3672 and %i2, 0x7f, %i2 ! residue bytes in %i2 3673 sub %i1, %i0, %i1 3674 .co_aln_100_loop: 3675 ldda [%i0]ASI_BLK_P,%d16 ! block load 3676 subcc %o3, 64, %o3 3677 fmovd %d16, %d8 3678 fmovd %d18, %d10 3679 fmovd %d20, %d12 3680 fmovd %d22, %d14 3681 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3682 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3683 add %i0, 64, %i0 3684 fmovd %d24, %d0 3685 fmovd %d26, %d2 3686 fmovd %d28, %d4 3687 fmovd %d30, %d6 3688 bgt,pt %ncc, .co_aln_100_loop 3689 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3690 add %i1, %i0, %i1 3691 3692 stda %d0, [%i1]%asi 3693 stda %d2, [%i1+8]%asi 3694 stda %d4, [%i1+16]%asi 3695 stda %d6, [%i1+24]%asi 3696 ba .co_remain_stuff 3697 add %i1, 32, %i1 3698 ! END OF aln_100 3699 3700 .co_aln_011: 3701 ! Alignment off by 40 bytes 3702 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3703 ldd [%i0], %d0 3704 ldd [%i0+8], %d2 3705 ldd [%i0+16], %d4 3706 ldd [%i0+24], %d6 3707 ldd [%i0+32], %d8 3708 add %i0, 40, %i0 3709 sub %i2, 40, %i2 3710 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3711 and %i2, 0x7f, %i2 ! residue bytes in %i2 3712 sub %i1, %i0, %i1 3713 .co_aln_011_loop: 3714 ldda [%i0]ASI_BLK_P,%d16 ! block load 3715 subcc %o3, 64, %o3 3716 fmovd %d16, %d10 3717 fmovd %d18, %d12 3718 fmovd %d20, %d14 3719 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3720 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3721 add %i0, 64, %i0 3722 fmovd %d22, %d0 3723 fmovd %d24, %d2 3724 fmovd %d26, %d4 3725 fmovd %d28, %d6 3726 fmovd %d30, %d8 3727 bgt,pt %ncc, .co_aln_011_loop 3728 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3729 add %i1, %i0, %i1 3730 3731 stda %d0, [%i1]%asi 3732 stda %d2, [%i1+8]%asi 3733 stda %d4, [%i1+16]%asi 3734 stda %d6, [%i1+24]%asi 3735 stda %d8, [%i1+32]%asi 3736 ba .co_remain_stuff 3737 add %i1, 40, %i1 3738 ! END OF aln_011 3739 3740 .co_aln_010: 3741 ! 
Alignment off by 48 bytes 3742 ldd [%i0], %d0 3743 ldd [%i0+8], %d2 3744 ldd [%i0+16], %d4 3745 ldd [%i0+24], %d6 3746 ldd [%i0+32], %d8 3747 ldd [%i0+40], %d10 3748 add %i0, 48, %i0 3749 sub %i2, 48, %i2 3750 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3751 and %i2, 0x7f, %i2 ! residue bytes in %i2 3752 sub %i1, %i0, %i1 3753 .co_aln_010_loop: 3754 ldda [%i0]ASI_BLK_P,%d16 ! block load 3755 subcc %o3, 64, %o3 3756 fmovd %d16, %d12 3757 fmovd %d18, %d14 3758 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3759 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3760 add %i0, 64, %i0 3761 fmovd %d20, %d0 3762 fmovd %d22, %d2 3763 fmovd %d24, %d4 3764 fmovd %d26, %d6 3765 fmovd %d28, %d8 3766 fmovd %d30, %d10 3767 bgt,pt %ncc, .co_aln_010_loop 3768 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3769 add %i1, %i0, %i1 3770 3771 stda %d0, [%i1]%asi 3772 stda %d2, [%i1+8]%asi 3773 stda %d4, [%i1+16]%asi 3774 stda %d6, [%i1+24]%asi 3775 stda %d8, [%i1+32]%asi 3776 stda %d10, [%i1+40]%asi 3777 ba .co_remain_stuff 3778 add %i1, 48, %i1 3779 ! END OF aln_010 3780 3781 .co_aln_001: 3782 ! Alignment off by 56 bytes 3783 ldd [%i0], %d0 3784 ldd [%i0+8], %d2 3785 ldd [%i0+16], %d4 3786 ldd [%i0+24], %d6 3787 ldd [%i0+32], %d8 3788 ldd [%i0+40], %d10 3789 ldd [%i0+48], %d12 3790 add %i0, 56, %i0 3791 sub %i2, 56, %i2 3792 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3793 and %i2, 0x7f, %i2 ! residue bytes in %i2 3794 sub %i1, %i0, %i1 3795 .co_aln_001_loop: 3796 ldda [%i0]ASI_BLK_P,%d16 ! block load 3797 subcc %o3, 64, %o3 3798 fmovd %d16, %d14 3799 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3800 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3801 add %i0, 64, %i0 3802 fmovd %d18, %d0 3803 fmovd %d20, %d2 3804 fmovd %d22, %d4 3805 fmovd %d24, %d6 3806 fmovd %d26, %d8 3807 fmovd %d28, %d10 3808 fmovd %d30, %d12 3809 bgt,pt %ncc, .co_aln_001_loop 3810 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3811 add %i1, %i0, %i1 3812 3813 stda %d0, [%i1]%asi 3814 stda %d2, [%i1+8]%asi 3815 stda %d4, [%i1+16]%asi 3816 stda %d6, [%i1+24]%asi 3817 stda %d8, [%i1+32]%asi 3818 stda %d10, [%i1+40]%asi 3819 stda %d12, [%i1+48]%asi 3820 ba .co_remain_stuff 3821 add %i1, 56, %i1 3822 ! END OF aln_001 3823 3824 .co_aln_000: 3825 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3826 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3827 and %i2, 0x7f, %i2 ! residue bytes in %i2 3828 sub %i1, %i0, %i1 3829 .co_aln_000_loop: 3830 ldda [%i0]ASI_BLK_P,%d0 3831 subcc %o3, 64, %o3 3832 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3833 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3834 add %i0, 64, %i0 3835 bgt,pt %ncc, .co_aln_000_loop 3836 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3837 add %i1, %i0, %i1 3838 3839 ! END OF aln_000 3840 3841 .co_remain_stuff: 3842 subcc %i2, 31, %i2 ! adjust length to allow cc test 3843 ble,pt %ncc, .co_aln_31 3844 nop 3845 .co_aln_32: 3846 ldx [%i0], %o4 ! move 32 bytes 3847 subcc %i2, 32, %i2 ! decrement length count by 32 3848 stxa %o4, [%i1]%asi 3849 ldx [%i0+8], %o4 3850 stxa %o4, [%i1+8]%asi 3851 ldx [%i0+16], %o4 3852 add %i0, 32, %i0 ! increase src ptr by 32 3853 stxa %o4, [%i1+16]%asi 3854 ldx [%i0-8], %o4 3855 add %i1, 32, %i1 ! increase dst ptr by 32 3856 bgu,pt %ncc, .co_aln_32 ! repeat if at least 32 bytes left 3857 stxa %o4, [%i1-8]%asi 3858 .co_aln_31: 3859 addcc %i2, 24, %i2 ! adjust count to be off by 7 3860 ble,pt %ncc, .co_aln_7 ! skip if 7 or fewer bytes left 3861 nop ! 3862 .co_aln_15: 3863 ldx [%i0], %o4 ! move 8 bytes 3864 add %i0, 8, %i0 ! 
increase src ptr by 8 3865 subcc %i2, 8, %i2 ! decrease count by 8 3866 add %i1, 8, %i1 ! increase dst ptr by 8 3867 bgu,pt %ncc, .co_aln_15 3868 stxa %o4, [%i1-8]%asi 3869 .co_aln_7: 3870 addcc %i2, 7, %i2 ! finish adjustment of remaining count 3871 bz,pt %ncc, .co_exit ! exit if finished 3872 cmp %i2, 4 3873 blt,pt %ncc, .co_unaln3x ! skip if less than 4 bytes left 3874 nop ! 3875 ld [%i0], %o4 ! move 4 bytes 3876 add %i0, 4, %i0 ! increase src ptr by 4 3877 add %i1, 4, %i1 ! increase dst ptr by 4 3878 subcc %i2, 4, %i2 ! decrease count by 4 3879 bnz .co_unaln3x 3880 stwa %o4, [%i1-4]%asi 3881 ba .co_exit 3882 nop 3883 3884 ! destination alignment code 3885 .co_big_d1: 3886 ldub [%i0], %o4 ! move a byte 3887 add %i0, 1, %i0 3888 stba %o4, [%i1]ASI_USER 3889 add %i1, 1, %i1 3890 andcc %i1, 2, %o3 3891 bz,pt %ncc, .co_big_d2f 3892 sub %i2, 1, %i2 3893 .co_big_d2: 3894 ldub [%i0], %o4 ! move a half-word (src align unknown) 3895 ldub [%i0+1], %o3 3896 add %i0, 2, %i0 3897 sll %o4, 8, %o4 ! position 3898 or %o4, %o3, %o4 ! merge 3899 stha %o4, [%i1]ASI_USER 3900 add %i1, 2, %i1 3901 andcc %i1, 4, %o3 ! is dest longword aligned 3902 bz,pt %ncc, .co_big_d4f 3903 sub %i2, 2, %i2 3904 .co_big_d4: ! dest is at least word aligned 3905 nop 3906 ldub [%i0], %o4 ! move a word (src align unknown) 3907 ldub [%i0+1], %o3 3908 sll %o4, 24, %o4 ! position 3909 sll %o3, 16, %o3 ! position 3910 or %o4, %o3, %o3 ! merge 3911 ldub [%i0+2], %o4 3912 sll %o4, 8, %o4 ! position 3913 or %o4, %o3, %o3 ! merge 3914 ldub [%i0+3], %o4 3915 or %o4, %o3, %o4 ! merge 3916 stwa %o4,[%i1]ASI_USER ! store four bytes 3917 add %i0, 4, %i0 ! adjust src by 4 3918 add %i1, 4, %i1 ! adjust dest by 4 3919 ba .co_big_d4f 3920 sub %i2, 4, %i2 ! adjust count by 4 3921 3922 3923 ! Dst is on 8 byte boundary; src is not; 3924 .co_big_unal8: 3925 andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned? 3926 bz %ncc, .co_unalnsrc 3927 sub %o3, 64, %o3 ! %o3 will be multiple of 8 3928 neg %o3 ! bytes until dest is 64 byte aligned 3929 sub %i2, %o3, %i2 ! update cnt with bytes to be moved 3930 ! Move bytes according to source alignment 3931 andcc %i0, 0x1, %o4 3932 bnz %ncc, .co_unalnbyte ! check for byte alignment 3933 nop 3934 andcc %i0, 2, %o4 ! check for half word alignment 3935 bnz %ncc, .co_unalnhalf 3936 nop 3937 ! Src is word aligned, move bytes until dest 64 byte aligned 3938 .co_unalnword: 3939 ld [%i0], %o4 ! load 4 bytes 3940 stwa %o4, [%i1]%asi ! and store 4 bytes 3941 ld [%i0+4], %o4 ! load 4 bytes 3942 add %i0, 8, %i0 ! increase src ptr by 8 3943 stwa %o4, [%i1+4]%asi ! and store 4 bytes 3944 subcc %o3, 8, %o3 ! decrease count by 8 3945 bnz %ncc, .co_unalnword 3946 add %i1, 8, %i1 ! increase dst ptr by 8 3947 ba .co_unalnsrc 3948 nop 3949 3950 ! Src is half-word aligned, move bytes until dest 64 byte aligned 3951 .co_unalnhalf: 3952 lduh [%i0], %o4 ! load 2 bytes 3953 sllx %o4, 32, %i3 ! shift left 3954 lduw [%i0+2], %o4 3955 or %o4, %i3, %i3 3956 sllx %i3, 16, %i3 3957 lduh [%i0+6], %o4 3958 or %o4, %i3, %i3 3959 stxa %i3, [%i1]ASI_USER 3960 add %i0, 8, %i0 3961 subcc %o3, 8, %o3 3962 bnz %ncc, .co_unalnhalf 3963 add %i1, 8, %i1 3964 ba .co_unalnsrc 3965 nop 3966 3967 ! Src is Byte aligned, move bytes until dest 64 byte aligned 3968 .co_unalnbyte: 3969 sub %i1, %i0, %i1 ! 
share pointer advance 3970 .co_unalnbyte_loop: 3971 ldub [%i0], %o4 3972 sllx %o4, 56, %i3 3973 lduh [%i0+1], %o4 3974 sllx %o4, 40, %o4 3975 or %o4, %i3, %i3 3976 lduh [%i0+3], %o4 3977 sllx %o4, 24, %o4 3978 or %o4, %i3, %i3 3979 lduh [%i0+5], %o4 3980 sllx %o4, 8, %o4 3981 or %o4, %i3, %i3 3982 ldub [%i0+7], %o4 3983 or %o4, %i3, %i3 3984 stxa %i3, [%i1+%i0]ASI_USER 3985 subcc %o3, 8, %o3 3986 bnz %ncc, .co_unalnbyte_loop 3987 add %i0, 8, %i0 3988 add %i1,%i0, %i1 ! restore pointer 3989 3990 ! Destination is now block (64 byte aligned), src is not 8 byte aligned 3991 .co_unalnsrc: 3992 andn %i2, 0x3f, %i3 ! %i3 is multiple of block size 3993 and %i2, 0x3f, %i2 ! residue bytes in %i2 3994 add %i2, 64, %i2 ! Insure we don't load beyond 3995 sub %i3, 64, %i3 ! end of source buffer 3996 3997 andn %i0, 0x3f, %o4 ! %o4 has block aligned src address 3998 prefetch [%o4 + (3 * CACHE_LINE)], #one_read 3999 alignaddr %i0, %g0, %g0 ! generate %gsr 4000 add %i0, %i3, %i0 ! advance %i0 to after blocks 4001 ! 4002 ! Determine source alignment to correct 8 byte offset 4003 andcc %i0, 0x20, %o3 4004 brnz,pn %o3, .co_unaln_1 4005 andcc %i0, 0x10, %o3 4006 brnz,pn %o3, .co_unaln_01 4007 andcc %i0, 0x08, %o3 4008 brz,a %o3, .co_unaln_000 4009 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4010 ba .co_unaln_001 4011 nop 4012 .co_unaln_01: 4013 brnz,a %o3, .co_unaln_011 4014 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4015 ba .co_unaln_010 4016 nop 4017 .co_unaln_1: 4018 brnz,pn %o3, .co_unaln_11 4019 andcc %i0, 0x08, %o3 4020 brnz,a %o3, .co_unaln_101 4021 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4022 ba .co_unaln_100 4023 nop 4024 .co_unaln_11: 4025 brz,pn %o3, .co_unaln_110 4026 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 4027 4028 .co_unaln_111: 4029 ldd [%o4+56], %d14 4030 .co_unaln_111_loop: 4031 add %o4, 64, %o4 4032 ldda [%o4]ASI_BLK_P, %d16 4033 faligndata %d14, %d16, %d48 4034 faligndata %d16, %d18, %d50 4035 faligndata %d18, %d20, %d52 4036 faligndata %d20, %d22, %d54 4037 faligndata %d22, %d24, %d56 4038 faligndata %d24, %d26, %d58 4039 faligndata %d26, %d28, %d60 4040 faligndata %d28, %d30, %d62 4041 fmovd %d30, %d14 4042 stda %d48, [%i1]ASI_BLK_AIUS 4043 subcc %i3, 64, %i3 4044 add %i1, 64, %i1 4045 bgu,pt %ncc, .co_unaln_111_loop 4046 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4047 ba .co_unaln_done 4048 nop 4049 4050 .co_unaln_110: 4051 ldd [%o4+48], %d12 4052 ldd [%o4+56], %d14 4053 .co_unaln_110_loop: 4054 add %o4, 64, %o4 4055 ldda [%o4]ASI_BLK_P, %d16 4056 faligndata %d12, %d14, %d48 4057 faligndata %d14, %d16, %d50 4058 faligndata %d16, %d18, %d52 4059 faligndata %d18, %d20, %d54 4060 faligndata %d20, %d22, %d56 4061 faligndata %d22, %d24, %d58 4062 faligndata %d24, %d26, %d60 4063 faligndata %d26, %d28, %d62 4064 fmovd %d28, %d12 4065 fmovd %d30, %d14 4066 stda %d48, [%i1]ASI_BLK_AIUS 4067 subcc %i3, 64, %i3 4068 add %i1, 64, %i1 4069 bgu,pt %ncc, .co_unaln_110_loop 4070 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4071 ba .co_unaln_done 4072 nop 4073 4074 .co_unaln_101: 4075 ldd [%o4+40], %d10 4076 ldd [%o4+48], %d12 4077 ldd [%o4+56], %d14 4078 .co_unaln_101_loop: 4079 add %o4, 64, %o4 4080 ldda [%o4]ASI_BLK_P, %d16 4081 faligndata %d10, %d12, %d48 4082 faligndata %d12, %d14, %d50 4083 faligndata %d14, %d16, %d52 4084 faligndata %d16, %d18, %d54 4085 faligndata %d18, %d20, %d56 4086 faligndata %d20, %d22, %d58 4087 faligndata %d22, %d24, %d60 4088 faligndata %d24, %d26, %d62 4089 fmovd %d26, %d10 4090 fmovd %d28, %d12 4091 fmovd %d30, %d14 4092 stda %d48, [%i1]ASI_BLK_AIUS 4093 
subcc %i3, 64, %i3 4094 add %i1, 64, %i1 4095 bgu,pt %ncc, .co_unaln_101_loop 4096 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4097 ba .co_unaln_done 4098 nop 4099 4100 .co_unaln_100: 4101 ldd [%o4+32], %d8 4102 ldd [%o4+40], %d10 4103 ldd [%o4+48], %d12 4104 ldd [%o4+56], %d14 4105 .co_unaln_100_loop: 4106 add %o4, 64, %o4 4107 ldda [%o4]ASI_BLK_P, %d16 4108 faligndata %d8, %d10, %d48 4109 faligndata %d10, %d12, %d50 4110 faligndata %d12, %d14, %d52 4111 faligndata %d14, %d16, %d54 4112 faligndata %d16, %d18, %d56 4113 faligndata %d18, %d20, %d58 4114 faligndata %d20, %d22, %d60 4115 faligndata %d22, %d24, %d62 4116 fmovd %d24, %d8 4117 fmovd %d26, %d10 4118 fmovd %d28, %d12 4119 fmovd %d30, %d14 4120 stda %d48, [%i1]ASI_BLK_AIUS 4121 subcc %i3, 64, %i3 4122 add %i1, 64, %i1 4123 bgu,pt %ncc, .co_unaln_100_loop 4124 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4125 ba .co_unaln_done 4126 nop 4127 4128 .co_unaln_011: 4129 ldd [%o4+24], %d6 4130 ldd [%o4+32], %d8 4131 ldd [%o4+40], %d10 4132 ldd [%o4+48], %d12 4133 ldd [%o4+56], %d14 4134 .co_unaln_011_loop: 4135 add %o4, 64, %o4 4136 ldda [%o4]ASI_BLK_P, %d16 4137 faligndata %d6, %d8, %d48 4138 faligndata %d8, %d10, %d50 4139 faligndata %d10, %d12, %d52 4140 faligndata %d12, %d14, %d54 4141 faligndata %d14, %d16, %d56 4142 faligndata %d16, %d18, %d58 4143 faligndata %d18, %d20, %d60 4144 faligndata %d20, %d22, %d62 4145 fmovd %d22, %d6 4146 fmovd %d24, %d8 4147 fmovd %d26, %d10 4148 fmovd %d28, %d12 4149 fmovd %d30, %d14 4150 stda %d48, [%i1]ASI_BLK_AIUS 4151 subcc %i3, 64, %i3 4152 add %i1, 64, %i1 4153 bgu,pt %ncc, .co_unaln_011_loop 4154 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4155 ba .co_unaln_done 4156 nop 4157 4158 .co_unaln_010: 4159 ldd [%o4+16], %d4 4160 ldd [%o4+24], %d6 4161 ldd [%o4+32], %d8 4162 ldd [%o4+40], %d10 4163 ldd [%o4+48], %d12 4164 ldd [%o4+56], %d14 4165 .co_unaln_010_loop: 4166 add %o4, 64, %o4 4167 ldda [%o4]ASI_BLK_P, %d16 4168 faligndata %d4, %d6, %d48 4169 faligndata %d6, %d8, %d50 4170 faligndata %d8, %d10, %d52 4171 faligndata %d10, %d12, %d54 4172 faligndata %d12, %d14, %d56 4173 faligndata %d14, %d16, %d58 4174 faligndata %d16, %d18, %d60 4175 faligndata %d18, %d20, %d62 4176 fmovd %d20, %d4 4177 fmovd %d22, %d6 4178 fmovd %d24, %d8 4179 fmovd %d26, %d10 4180 fmovd %d28, %d12 4181 fmovd %d30, %d14 4182 stda %d48, [%i1]ASI_BLK_AIUS 4183 subcc %i3, 64, %i3 4184 add %i1, 64, %i1 4185 bgu,pt %ncc, .co_unaln_010_loop 4186 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4187 ba .co_unaln_done 4188 nop 4189 4190 .co_unaln_001: 4191 ldd [%o4+8], %d2 4192 ldd [%o4+16], %d4 4193 ldd [%o4+24], %d6 4194 ldd [%o4+32], %d8 4195 ldd [%o4+40], %d10 4196 ldd [%o4+48], %d12 4197 ldd [%o4+56], %d14 4198 .co_unaln_001_loop: 4199 add %o4, 64, %o4 4200 ldda [%o4]ASI_BLK_P, %d16 4201 faligndata %d2, %d4, %d48 4202 faligndata %d4, %d6, %d50 4203 faligndata %d6, %d8, %d52 4204 faligndata %d8, %d10, %d54 4205 faligndata %d10, %d12, %d56 4206 faligndata %d12, %d14, %d58 4207 faligndata %d14, %d16, %d60 4208 faligndata %d16, %d18, %d62 4209 fmovd %d18, %d2 4210 fmovd %d20, %d4 4211 fmovd %d22, %d6 4212 fmovd %d24, %d8 4213 fmovd %d26, %d10 4214 fmovd %d28, %d12 4215 fmovd %d30, %d14 4216 stda %d48, [%i1]ASI_BLK_AIUS 4217 subcc %i3, 64, %i3 4218 add %i1, 64, %i1 4219 bgu,pt %ncc, .co_unaln_001_loop 4220 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4221 ba .co_unaln_done 4222 nop 4223 4224 .co_unaln_000: 4225 ldda [%o4]ASI_BLK_P, %d0 4226 .co_unaln_000_loop: 4227 add %o4, 64, %o4 4228 ldda [%o4]ASI_BLK_P, %d16 4229 faligndata %d0, 
%d2, %d48 4230 faligndata %d2, %d4, %d50 4231 faligndata %d4, %d6, %d52 4232 faligndata %d6, %d8, %d54 4233 faligndata %d8, %d10, %d56 4234 faligndata %d10, %d12, %d58 4235 faligndata %d12, %d14, %d60 4236 faligndata %d14, %d16, %d62 4237 fmovd %d16, %d0 4238 fmovd %d18, %d2 4239 fmovd %d20, %d4 4240 fmovd %d22, %d6 4241 fmovd %d24, %d8 4242 fmovd %d26, %d10 4243 fmovd %d28, %d12 4244 fmovd %d30, %d14 4245 stda %d48, [%i1]ASI_BLK_AIUS 4246 subcc %i3, 64, %i3 4247 add %i1, 64, %i1 4248 bgu,pt %ncc, .co_unaln_000_loop 4249 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4250 4251 .co_unaln_done: 4252 ! Handle trailing bytes, 64 to 127 4253 ! Dest long word aligned, Src not long word aligned 4254 cmp %i2, 15 4255 bleu %ncc, .co_unaln_short 4256 4257 andn %i2, 0x7, %i3 ! %i3 is multiple of 8 4258 and %i2, 0x7, %i2 ! residue bytes in %i2 4259 add %i2, 8, %i2 4260 sub %i3, 8, %i3 ! insure we don't load past end of src 4261 andn %i0, 0x7, %o4 ! %o4 has long word aligned src address 4262 add %i0, %i3, %i0 ! advance %i0 to after multiple of 8 4263 ldd [%o4], %d0 ! fetch partial word 4264 .co_unaln_by8: 4265 ldd [%o4+8], %d2 4266 add %o4, 8, %o4 4267 faligndata %d0, %d2, %d16 4268 subcc %i3, 8, %i3 4269 stda %d16, [%i1]%asi 4270 fmovd %d2, %d0 4271 bgu,pt %ncc, .co_unaln_by8 4272 add %i1, 8, %i1 4273 4274 .co_unaln_short: 4275 cmp %i2, 8 4276 blt,pt %ncc, .co_unalnfin 4277 nop 4278 ldub [%i0], %o4 4279 sll %o4, 24, %o3 4280 ldub [%i0+1], %o4 4281 sll %o4, 16, %o4 4282 or %o4, %o3, %o3 4283 ldub [%i0+2], %o4 4284 sll %o4, 8, %o4 4285 or %o4, %o3, %o3 4286 ldub [%i0+3], %o4 4287 or %o4, %o3, %o3 4288 stwa %o3, [%i1]%asi 4289 ldub [%i0+4], %o4 4290 sll %o4, 24, %o3 4291 ldub [%i0+5], %o4 4292 sll %o4, 16, %o4 4293 or %o4, %o3, %o3 4294 ldub [%i0+6], %o4 4295 sll %o4, 8, %o4 4296 or %o4, %o3, %o3 4297 ldub [%i0+7], %o4 4298 or %o4, %o3, %o3 4299 stwa %o3, [%i1+4]%asi 4300 add %i0, 8, %i0 4301 add %i1, 8, %i1 4302 sub %i2, 8, %i2 4303 .co_unalnfin: 4304 cmp %i2, 4 4305 blt,pt %ncc, .co_unalnz 4306 tst %i2 4307 ldub [%i0], %o3 ! read byte 4308 subcc %i2, 4, %i2 ! reduce count by 4 4309 sll %o3, 24, %o3 ! position 4310 ldub [%i0+1], %o4 4311 sll %o4, 16, %o4 ! position 4312 or %o4, %o3, %o3 ! merge 4313 ldub [%i0+2], %o4 4314 sll %o4, 8, %o4 ! position 4315 or %o4, %o3, %o3 ! merge 4316 add %i1, 4, %i1 ! advance dst by 4 4317 ldub [%i0+3], %o4 4318 add %i0, 4, %i0 ! advance src by 4 4319 or %o4, %o3, %o4 ! merge 4320 bnz,pt %ncc, .co_unaln3x 4321 stwa %o4, [%i1-4]%asi 4322 ba .co_exit 4323 nop 4324 .co_unalnz: 4325 bz,pt %ncc, .co_exit 4326 wr %l5, %g0, %gsr ! restore %gsr 4327 .co_unaln3x: ! Exactly 1, 2, or 3 bytes remain 4328 subcc %i2, 1, %i2 ! reduce count for cc test 4329 ldub [%i0], %o4 ! load one byte 4330 bz,pt %ncc, .co_exit 4331 stba %o4, [%i1]%asi ! store one byte 4332 ldub [%i0+1], %o4 ! load second byte 4333 subcc %i2, 1, %i2 4334 bz,pt %ncc, .co_exit 4335 stba %o4, [%i1+1]%asi ! store second byte 4336 ldub [%i0+2], %o4 ! load third byte 4337 stba %o4, [%i1+2]%asi ! store third byte 4338 .co_exit: 4339 brnz %g1, .co_fp_restore 4340 nop 4341 FZERO 4342 wr %g1, %g0, %fprs 4343 ba,pt %ncc, .co_ex2 4344 membar #Sync 4345 .co_fp_restore: 4346 BLD_FP_FROMSTACK(%o4) 4347 .co_ex2: 4348 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 4349 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault
	ret
	restore	%g0, 0, %o0

.copyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_COPYOUT], %g2
	jmp	%g2
	nop
2:
	retl
	mov	-1, %o0

#else	/* NIAGARA_IMPL */
.do_copyout:
	!
	! Check the length and bail if zero.
	!
	tst	%o2
	bnz,pt	%ncc, 1f
	nop
	retl
	clr	%o0
1:
	sethi	%hi(copyio_fault), %o4
	or	%o4, %lo(copyio_fault), %o4
	sethi	%hi(copyio_fault_nowindow), %o3
	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
	or	%o3, %lo(copyio_fault_nowindow), %o3
	membar	#Sync
	stn	%o3, [THREAD_REG + T_LOFAULT]

	mov	%o0, SAVE_SRC
	mov	%o1, SAVE_DST
	mov	%o2, SAVE_COUNT

	!
	! Check to see if we're more than SMALL_LIMIT (7 bytes).
	! Run in leaf mode, using the %o regs as our input regs.
	!
	subcc	%o2, SMALL_LIMIT, %o3
	bgu,a,pt %ncc, .dco_ns
	or	%o0, %o1, %o3
	!
	! What was previously ".small_copyout"
	! Do full differenced copy.
	!
.dcobcp:
	sub	%g0, %o2, %o3		! negate count
	add	%o0, %o2, %o0		! make %o0 point at the end
	add	%o1, %o2, %o1		! make %o1 point at the end
	ba,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4	! load first byte
	!
	! %o0 and %o1 point at the end and remain pointing at the end
	! of their buffers. We pull things out by adding %o3 (which is
	! the negation of the length) to the buffer end which gives us
	! the current location in the buffers. By incrementing %o3 we walk
	! through both buffers without having to bump each buffer's
	! pointer. A very fast 4 instruction loop.
	!
	.align 16
.dcocl:
	stba	%o4, [%o1 + %o3]ASI_USER
	inccc	%o3
	bl,a,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4
	!
	! We're done. Go home.
	!
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
	retl
	clr	%o0
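/*
 * The negated-count loop above deserves a note. A rough C equivalent
 * (a sketch only; the real loop also runs under the lofault handler
 * and uses an ASI_USER store for the user-space side):
 *
 *	ssize_t off = -(ssize_t)count;	// %o3 holds -count
 *	src += count;			// %o0 points at the end
 *	dst += count;			// %o1 points at the end
 *	while (off < 0) {
 *		dst[off] = src[off];	// end plus negative offset
 *		off++;			// one increment walks both
 *	}
 *
 * A single offset register advances both buffers, and the condition
 * codes set by the increment drive the branch, so the loop body needs
 * no separate compare instruction.
 */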
	!
	! Try aligned copies from here.
	!
.dco_ns:
	! %o0 = kernel addr (to be copied from)
	! %o1 = user addr (to be copied to)
	! %o2 = length
	! %o3 = %o0 | %o1 (used for alignment checking)
	! %o4 is alternate lo_fault
	! %o5 is original lo_fault
	!
	! See if we're single byte aligned. If we are, check the
	! limit for single byte copies. If we're smaller or equal,
	! bounce to the byte for byte copy loop. Otherwise do it in
	! HW (if enabled).
	!
	btst	1, %o3
	bz,pt	%icc, .dcoh8
	btst	7, %o3
	!
	! Single byte aligned. Do we do it via HW or via
	! byte for byte? Do a quick no memory reference
	! check to pick up small copies.
	!
	sethi	%hi(hw_copy_limit_1), %o3
	!
	! Big enough that we need to check the HW limit for
	! this size copy.
	!
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	!
	! Is HW copy on? If not, do everything byte for byte.
	!
	tst	%o3
	bz,pn	%icc, .dcobcp
	subcc	%o3, %o2, %o3
	!
	! If we're less than or equal to the single byte copy limit,
	! bop to the copy loop.
	!
	bge,pt	%ncc, .dcobcp
	nop
	!
	! We're big enough and copy is on. Do it with HW.
	!
	ba,pt	%ncc, .big_copyout
	nop
.dcoh8:
	!
	! 8 byte aligned?
	!
	bnz,a	%ncc, .dcoh4
	btst	3, %o3
	!
	! See if we're in the "small range".
	! If so, go off and do the copy.
	! If not, load the hard limit. %o3 is
	! available for reuse.
	!
	sethi	%hi(hw_copy_limit_8), %o3
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	!
	! If it's zero, there's no HW bcopy.
	! Bop off to the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcos8
	subcc	%o3, %o2, %o3
	!
	! We're negative if our size is larger than hw_copy_limit_8.
	!
	bge,pt	%ncc, .dcos8
	nop
	!
	! HW assist is on and we're large enough. Do it.
	!
	ba,pt	%ncc, .big_copyout
	nop
.dcos8:
	!
	! Housekeeping for copy loops. Uses same idea as in the byte for
	! byte copy loop above.
	!
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .dodebc
	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
	!
	! 4 byte aligned?
	!
.dcoh4:
	bnz,pn	%ncc, .dcoh2
	!
	! See if we're in the "small range".
	! If so, go off and do the copy.
	! If not, load the hard limit. %o3 is
	! available for reuse.
	!
	sethi	%hi(hw_copy_limit_4), %o3
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	!
	! If it's zero, there's no HW bcopy.
	! Bop off to the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcos4
	subcc	%o3, %o2, %o3
	!
	! We're negative if our size is larger than hw_copy_limit_4.
	!
	bge,pt	%ncc, .dcos4
	nop
	!
	! HW assist is on and we're large enough. Do it.
	!
	ba,pt	%ncc, .big_copyout
	nop
.dcos4:
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .dodfbc
	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
	!
	! We must be 2 byte aligned. Off we go.
	! The check for small copies was done in the
	! delay at .dcoh4
	!
.dcoh2:
	ble	%ncc, .dcos2
	sethi	%hi(hw_copy_limit_2), %o3
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .dcos2
	subcc	%o3, %o2, %o3
	bge,pt	%ncc, .dcos2
	nop
	!
	! HW is on and we're big enough. Do it.
	!
	ba,pt	%ncc, .big_copyout
	nop
.dcos2:
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .dodtbc
	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
.small_copyout:
	!
	! Why are we doing this AGAIN? There are certain conditions in
	! big_copyout that will cause us to forego the HW assisted copies
	! and bounce back to a non-HW assisted copy. This dispatches those
	! copies. Note that we branch around this in the main line code.
	!
	! We make no check for limits or HW enablement here. We've
	! already been told that we're a poster child so just go off
	! and do it.
	!
	or	%o0, %o1, %o3
	btst	1, %o3
	bnz	%icc, .dcobcp		! Most likely
	btst	7, %o3
	bz	%icc, .dcos8
	btst	3, %o3
	bz	%icc, .dcos4
	nop
	ba,pt	%ncc, .dcos2
	nop
	.align 32
.dodebc:
	ldx	[%o0 + %o3], %o4
	deccc	%o2
	stxa	%o4, [%o1 + %o3]ASI_USER
	bg,pt	%ncc, .dodebc
	addcc	%o3, 8, %o3
	!
	! End of copy loop. Check to see if we're done. Most
	! eight byte aligned copies end here.
	!
	bz,pt	%ncc, .dcofh
	nop
	!
	! Something is left - do it byte for byte.
	!
	ba,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4	! load next byte
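/*
 * In C terms, the path selection above behaves roughly as follows
 * (a sketch; hw_copy_limit_[1248] are the existing tunables, with 0
 * meaning the HW-assisted block-copy path is disabled):
 *
 *	uintptr_t both = (uintptr_t)src | (uintptr_t)dst;
 *	uint_t limit;
 *
 *	if (both & 1)
 *		limit = hw_copy_limit_1;	// only byte aligned
 *	else if ((both & 7) == 0)
 *		limit = hw_copy_limit_8;	// 8-byte aligned
 *	else if ((both & 3) == 0)
 *		limit = hw_copy_limit_4;	// 4-byte aligned
 *	else
 *		limit = hw_copy_limit_2;	// 2-byte aligned
 *
 *	if (limit != 0 && count > limit)
 *		goto big_copyout;	// HW assisted block copy
 *	// else fall into the simple loop for that alignment
 */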
	!
	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
	!
	.align 32
.dodfbc:
	lduw	[%o0 + %o3], %o4
	deccc	%o2
	sta	%o4, [%o1 + %o3]ASI_USER
	bg,pt	%ncc, .dodfbc
	addcc	%o3, 4, %o3
	!
	! End of copy loop. Check to see if we're done. Most
	! four byte aligned copies end here.
	!
	bz,pt	%ncc, .dcofh
	nop
	!
	! Something is left. Do it byte for byte.
	!
	ba,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4	! load next byte
	!
	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
	! copy.
	!
	.align 32
.dodtbc:
	lduh	[%o0 + %o3], %o4
	deccc	%o2
	stha	%o4, [%o1 + %o3]ASI_USER
	bg,pt	%ncc, .dodtbc
	addcc	%o3, 2, %o3
	!
	! End of copy loop. Anything left?
	!
	bz,pt	%ncc, .dcofh
	nop
	!
	! Deal with the last byte
	!
	ldub	[%o0 + %o3], %o4
	stba	%o4, [%o1 + %o3]ASI_USER
.dcofh:
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	clr	%o0

.big_copyout:
	! We're going to go off and do a block copy.
	! Switch fault handlers and grab a window. We
	! don't do a membar #Sync since we've done only
	! kernel data to this point.
	stn	%o4, [THREAD_REG + T_LOFAULT]

	! Copyouts that reach here are larger than 256 bytes. The
	! hw_copy_limit_1 is set to 256. Never set this limit to less
	! than 128 bytes.
	save	%sp, -SA(MINFRAME), %sp
.do_block_copyout:

	! Swap src/dst since the code below is memcpy code
	! and memcpy/bcopy have different calling sequences
	mov	%i1, %i5
	mov	%i0, %i1
	mov	%i5, %i0

	! Block (64 bytes) align the destination.
	andcc	%i0, 0x3f, %i3		! is dst block aligned
	bz	%ncc, copyout_blalign	! dst already block aligned
	sub	%i3, 0x40, %i3
	neg	%i3			! bytes till dst 64 bytes aligned
	sub	%i2, %i3, %i2		! update i2 with new count

	! Based on source and destination alignment do
	! either 8 bytes, 4 bytes, 2 bytes or byte copy.

	! Is dst & src 8B aligned
	or	%i0, %i1, %o2
	andcc	%o2, 0x7, %g0
	bz	%ncc, .co_alewdcp
	nop

	! Is dst & src 4B aligned
	andcc	%o2, 0x3, %g0
	bz	%ncc, .co_alwdcp
	nop

	! Is dst & src 2B aligned
	andcc	%o2, 0x1, %g0
	bz	%ncc, .co_alhlfwdcp
	nop

	! 1B aligned
1:	ldub	[%i1], %o2
	stba	%o2, [%i0]ASI_USER
	inc	%i1
	deccc	%i3
	bgu,pt	%ncc, 1b
	inc	%i0

	ba	copyout_blalign
	nop

	! dst & src 4B aligned
.co_alwdcp:
	ld	[%i1], %o2
	sta	%o2, [%i0]ASI_USER
	add	%i1, 0x4, %i1
	subcc	%i3, 0x4, %i3
	bgu,pt	%ncc, .co_alwdcp
	add	%i0, 0x4, %i0

	ba	copyout_blalign
	nop

	! dst & src 2B aligned
.co_alhlfwdcp:
	lduh	[%i1], %o2
	stuha	%o2, [%i0]ASI_USER
	add	%i1, 0x2, %i1
	subcc	%i3, 0x2, %i3
	bgu,pt	%ncc, .co_alhlfwdcp
	add	%i0, 0x2, %i0

	ba	copyout_blalign
	nop

	! dst & src 8B aligned
.co_alewdcp:
	ldx	[%i1], %o2
	stxa	%o2, [%i0]ASI_USER
	add	%i1, 0x8, %i1
	subcc	%i3, 0x8, %i3
	bgu,pt	%ncc, .co_alewdcp
	add	%i0, 0x8, %i0

	! Now Destination is block (64 bytes) aligned
copyout_blalign:
	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
	sub	%i2, %i3, %i2		! Residue bytes in %i2

	mov	ASI_BLK_INIT_QUAD_LDD_AIUS, %asi

	andcc	%i1, 0xf, %o2		! is src quadword aligned
	bz,pn	%xcc, .co_blkcpy	! src offset in %o2 (last 4-bits)
	nop
	cmp	%o2, 0x8
	bg	.co_upper_double
	nop
	bl	.co_lower_double
	nop

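/*
 * The three cases below differ only in how the 16-byte quad loads
 * line up with the source. A sketch of the merge step in C (the
 * ALIGN_DATA macro does this on register pairs; "off" stands for the
 * low four bits of the source address saved in %o2):
 *
 *	int lshift = (off & 7) * 8;	// bits taken from one word
 *	int rshift = 64 - lshift;	// bits taken from the next
 *
 *	// each aligned 8-byte destination word is built from two
 *	// neighboring source words:
 *	dst_word = (w0 << lshift) | (w1 >> rshift);
 *
 * When off == 8 the source is doubleword aligned within the quad, so
 * the loaded words are stored as-is and no shift/merge is needed.
 */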
	! Falls through when the source offset is equal to 8, i.e. the
	! source is doubleword aligned.
	! In this case no shift/merge of data is required

	sub	%i1, %o2, %i1		! align the src at 16 bytes.
	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
	prefetch [%l0+0x0], #one_read
	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
.co_loop0:
	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
	prefetch [%l0+0x40], #one_read

	stxa	%l3, [%i0+0x0]%asi
	stxa	%l4, [%i0+0x8]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2

	stxa	%l5, [%i0+0x10]%asi
	stxa	%l2, [%i0+0x18]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4

	stxa	%l3, [%i0+0x20]%asi
	stxa	%l4, [%i0+0x28]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2

	stxa	%l5, [%i0+0x30]%asi
	stxa	%l2, [%i0+0x38]%asi

	add	%l0, 0x40, %l0
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, .co_loop0
	add	%i0, 0x40, %i0
	ba	.co_blkdone
	add	%i1, %o2, %i1		! increment the source by src offset
					! the src offset was stored in %o2

.co_lower_double:

	sub	%i1, %o2, %i1		! align the src at 16 bytes.
	sll	%o2, 3, %o0		! %o0 left shift
	mov	0x40, %o1
	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
	prefetch [%l0+0x0], #one_read
	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l2
							! and %l3 has complete
							! data
.co_loop1:
	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has partial data
							! for this read.
	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
							! into %l2 and %l3
	prefetch [%l0+0x40], #one_read

	stxa	%l2, [%i0+0x0]%asi
	stxa	%l3, [%i0+0x8]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
							! %l4 from previous read
							! into %l4 and %l5
	stxa	%l4, [%i0+0x10]%asi
	stxa	%l5, [%i0+0x18]%asi

	! Repeat the same for next 32 bytes.

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)

	stxa	%l2, [%i0+0x20]%asi
	stxa	%l3, [%i0+0x28]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)

	stxa	%l4, [%i0+0x30]%asi
	stxa	%l5, [%i0+0x38]%asi

	add	%l0, 0x40, %l0
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, .co_loop1
	add	%i0, 0x40, %i0
	ba	.co_blkdone
	add	%i1, %o2, %i1		! increment the source by src offset
					! the src offset was stored in %o2

.co_upper_double:

	sub	%i1, %o2, %i1		! align the src at 16 bytes.
	sub	%o2, 0x8, %o0
	sll	%o0, 3, %o0		! %o0 left shift
	mov	0x40, %o1
	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
	prefetch [%l0+0x0], #one_read
	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l3
							! for this read and
							! no data in %l2
.co_loop2:
	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has complete data
							! and %l5 has partial
	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
							! 
into %l3 and %l4 4879 prefetch [%l0+0x40], #one_read 4880 4881 stxa %l3, [%i0+0x0]%asi 4882 stxa %l4, [%i0+0x8]%asi 4883 4884 add %i1, 0x10, %i1 4885 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4886 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with 4887 ! %l5 from previous read 4888 ! into %l5 and %l2 4889 4890 stxa %l5, [%i0+0x10]%asi 4891 stxa %l2, [%i0+0x18]%asi 4892 4893 ! Repeat the same for next 32 bytes. 4894 4895 add %i1, 0x10, %i1 4896 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 4897 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) 4898 4899 stxa %l3, [%i0+0x20]%asi 4900 stxa %l4, [%i0+0x28]%asi 4901 4902 add %i1, 0x10, %i1 4903 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4904 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) 4905 4906 stxa %l5, [%i0+0x30]%asi 4907 stxa %l2, [%i0+0x38]%asi 4908 4909 add %l0, 0x40, %l0 4910 subcc %i3, 0x40, %i3 4911 bgu,pt %xcc, .co_loop2 4912 add %i0, 0x40, %i0 4913 ba .co_blkdone 4914 add %i1, %o2, %i1 ! increment the source by src offset 4915 ! the src offset was stored in %o2 4916 4917 4918 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P 4919 .co_blkcpy: 4920 4921 andn %i1, 0x3f, %o0 ! %o0 has block aligned source 4922 prefetch [%o0+0x0], #one_read 4923 1: 4924 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0 4925 add %i1, 0x10, %i1 4926 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4927 add %i1, 0x10, %i1 4928 4929 prefetch [%o0+0x40], #one_read 4930 4931 stxa %l0, [%i0+0x0]%asi 4932 4933 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 4934 add %i1, 0x10, %i1 4935 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6 4936 add %i1, 0x10, %i1 4937 4938 stxa %l1, [%i0+0x8]%asi 4939 stxa %l2, [%i0+0x10]%asi 4940 stxa %l3, [%i0+0x18]%asi 4941 stxa %l4, [%i0+0x20]%asi 4942 stxa %l5, [%i0+0x28]%asi 4943 stxa %l6, [%i0+0x30]%asi 4944 stxa %l7, [%i0+0x38]%asi 4945 4946 add %o0, 0x40, %o0 4947 subcc %i3, 0x40, %i3 4948 bgu,pt %xcc, 1b 4949 add %i0, 0x40, %i0 4950 4951 .co_blkdone: 4952 membar #Sync 4953 4954 brz,pt %i2, .copyout_exit 4955 nop 4956 4957 ! Handle trailing bytes 4958 cmp %i2, 0x8 4959 blu,pt %ncc, .co_residue 4960 nop 4961 4962 ! Can we do some 8B ops 4963 or %i1, %i0, %o2 4964 andcc %o2, 0x7, %g0 4965 bnz %ncc, .co_last4 4966 nop 4967 4968 ! Do 8byte ops as long as possible 4969 .co_last8: 4970 ldx [%i1], %o2 4971 stxa %o2, [%i0]ASI_USER 4972 add %i1, 0x8, %i1 4973 sub %i2, 0x8, %i2 4974 cmp %i2, 0x8 4975 bgu,pt %ncc, .co_last8 4976 add %i0, 0x8, %i0 4977 4978 brz,pt %i2, .copyout_exit 4979 nop 4980 4981 ba .co_residue 4982 nop 4983 4984 .co_last4: 4985 ! Can we do 4B ops 4986 andcc %o2, 0x3, %g0 4987 bnz %ncc, .co_last2 4988 nop 4989 1: 4990 ld [%i1], %o2 4991 sta %o2, [%i0]ASI_USER 4992 add %i1, 0x4, %i1 4993 sub %i2, 0x4, %i2 4994 cmp %i2, 0x4 4995 bgu,pt %ncc, 1b 4996 add %i0, 0x4, %i0 4997 4998 brz,pt %i2, .copyout_exit 4999 nop 5000 5001 ba .co_residue 5002 nop 5003 5004 .co_last2: 5005 ! Can we do 2B ops 5006 andcc %o2, 0x1, %g0 5007 bnz %ncc, .co_residue 5008 nop 5009 5010 1: 5011 lduh [%i1], %o2 5012 stuha %o2, [%i0]ASI_USER 5013 add %i1, 0x2, %i1 5014 sub %i2, 0x2, %i2 5015 cmp %i2, 0x2 5016 bgu,pt %ncc, 1b 5017 add %i0, 0x2, %i0 5018 5019 brz,pt %i2, .copyout_exit 5020 nop 5021 5022 ! Copy the residue as byte copy 5023 .co_residue: 5024 ldub [%i1], %i4 5025 stba %i4, [%i0]ASI_USER 5026 inc %i1 5027 deccc %i2 5028 bgu,pt %xcc, .co_residue 5029 inc %i0 5030 5031 .copyout_exit: 5032 membar #Sync 5033 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault 5034 ret 5035 restore %g0, 0, %o0 5036 5037 .copyout_err: 5038 ldn [THREAD_REG + T_COPYOPS], %o4 5039 brz %o4, 2f 5040 nop 5041 ldn [%o4 + CP_COPYOUT], %g2 5042 jmp %g2 5043 nop 5044 2: 5045 retl 5046 mov -1, %o0 5047 #endif /* NIAGARA_IMPL */ 5048 SET_SIZE(copyout) 5049 5050 5051 ENTRY(xcopyout) 5052 sethi %hi(.xcopyout_err), REAL_LOFAULT 5053 b .do_copyout 5054 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT 5055 .xcopyout_err: 5056 ldn [THREAD_REG + T_COPYOPS], %o4 5057 brz %o4, 2f 5058 nop 5059 ldn [%o4 + CP_XCOPYOUT], %g2 5060 jmp %g2 5061 nop 5062 2: 5063 retl 5064 mov %g1, %o0 5065 SET_SIZE(xcopyout) 5066 5067 ENTRY(xcopyout_little) 5068 sethi %hi(.little_err), %o4 5069 ldn [THREAD_REG + T_LOFAULT], %o5 5070 or %o4, %lo(.little_err), %o4 5071 membar #Sync ! sync error barrier 5072 stn %o4, [THREAD_REG + T_LOFAULT] 5073 5074 subcc %g0, %o2, %o3 5075 add %o0, %o2, %o0 5076 bz,pn %ncc, 2f ! check for zero bytes 5077 sub %o2, 1, %o4 5078 add %o0, %o4, %o0 ! start w/last byte 5079 add %o1, %o2, %o1 5080 ldub [%o0+%o3], %o4 5081 5082 1: stba %o4, [%o1+%o3]ASI_AIUSL 5083 inccc %o3 5084 sub %o0, 2, %o0 ! get next byte 5085 bcc,a,pt %ncc, 1b 5086 ldub [%o0+%o3], %o4 5087 5088 2: membar #Sync ! sync error barrier 5089 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 5090 retl 5091 mov %g0, %o0 ! return (0) 5092 SET_SIZE(xcopyout_little) 5093 5094 /* 5095 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) 5096 */ 5097 5098 ENTRY(copyin) 5099 sethi %hi(.copyin_err), REAL_LOFAULT 5100 or REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT 5101 5102 #if !defined(NIAGARA_IMPL) 5103 .do_copyin: 5104 tst %o2 ! check for zero count; quick exit 5105 bz,pt %ncc, .ci_smallqx 5106 mov %o0, SAVE_SRC 5107 mov %o1, SAVE_DST 5108 mov %o2, SAVE_COUNT 5109 cmp %o2, FP_COPY ! check for small copy/leaf case 5110 bgt,pt %ncc, .ci_copy_more 5111 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT 5112 /* 5113 * Small copy in code 5114 * 5115 */ 5116 sethi %hi(copyio_fault_nowindow), %o3 5117 or %o3, %lo(copyio_fault_nowindow), %o3 5118 membar #Sync 5119 stn %o3, [THREAD_REG + T_LOFAULT] 5120 5121 mov ASI_USER, %asi 5122 cmp %o2, SHORTCOPY ! make sure there is enough to align 5123 ble,pt %ncc, .ci_smallest 5124 andcc %o1, 0x7, %o3 ! is dest long word aligned 5125 bnz,pn %ncc, .ci_align 5126 andcc %o1, 1, %o3 ! is dest byte aligned 5127 5128 ! Destination is long word aligned 5129 .ci_al_src: 5130 andcc %o0, 7, %o3 5131 brnz,pt %o3, .ci_src_dst_unal8 5132 nop 5133 /* 5134 * Special case for handling when src and dest are both long word aligned 5135 * and total data to move is less than FP_COPY bytes 5136 * Also handles finish up for large block moves, so may be less than 32 bytes 5137 */ 5138 .ci_medlong: 5139 subcc %o2, 31, %o2 ! adjust length to allow cc test 5140 ble,pt %ncc, .ci_medl31 5141 nop 5142 .ci_medl32: 5143 ldxa [%o0]%asi, %o4 ! move 32 bytes 5144 subcc %o2, 32, %o2 ! decrement length count by 32 5145 stx %o4, [%o1] 5146 ldxa [%o0+8]%asi, %o4 5147 stx %o4, [%o1+8] 5148 ldxa [%o0+16]%asi, %o4 5149 add %o0, 32, %o0 ! increase src ptr by 32 5150 stx %o4, [%o1+16] 5151 ldxa [%o0-8]%asi, %o4 5152 add %o1, 32, %o1 ! increase dst ptr by 32 5153 bgu,pt %ncc, .ci_medl32 ! repeat if at least 32 bytes left 5154 stx %o4, [%o1-8] 5155 .ci_medl31: 5156 addcc %o2, 24, %o2 ! adjust count to be off by 7 5157 ble,pt %ncc, .ci_medl7 ! skip if 7 or fewer bytes left 5158 nop 5159 .ci_medl8: 5160 ldxa [%o0]%asi, %o4 ! move 8 bytes 5161 add %o0, 8, %o0 ! 
increase src ptr by 8 5162 subcc %o2, 8, %o2 ! decrease count by 8 5163 add %o1, 8, %o1 ! increase dst ptr by 8 5164 bgu,pt %ncc, .ci_medl8 5165 stx %o4, [%o1-8] 5166 .ci_medl7: 5167 addcc %o2, 7, %o2 ! finish adjustment of remaining count 5168 bnz,pt %ncc, .ci_small4 ! do final bytes if not finished 5169 nop 5170 .ci_smallx: ! finish up and exit 5171 membar #Sync 5172 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 5173 .ci_smallqx: 5174 retl 5175 mov %g0, %o0 5176 5177 .ci_small4: 5178 cmp %o2, 4 5179 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left 5180 nop ! 5181 lda [%o0]%asi, %o4 ! move 4 bytes 5182 add %o0, 4, %o0 ! increase src ptr by 4 5183 add %o1, 4, %o1 ! increase dst ptr by 4 5184 subcc %o2, 4, %o2 ! decrease count by 4 5185 bz %ncc, .ci_smallx 5186 stw %o4, [%o1-4] 5187 5188 .ci_small3x: ! Exactly 1, 2, or 3 bytes remain 5189 subcc %o2, 1, %o2 ! reduce count for cc test 5190 lduba [%o0]%asi, %o4 ! load one byte 5191 bz,pt %ncc, .ci_smallx 5192 stb %o4, [%o1] ! store one byte 5193 lduba [%o0+1]%asi, %o4 ! load second byte 5194 subcc %o2, 1, %o2 5195 bz,pt %ncc, .ci_smallx 5196 stb %o4, [%o1+1] ! store second byte 5197 lduba [%o0+2]%asi, %o4 ! load third byte 5198 ba .ci_smallx 5199 stb %o4, [%o1+2] ! store third byte 5200 5201 .ci_smallest: ! 7 or fewer bytes remain 5202 cmp %o2, 4 5203 blt,pt %ncc, .ci_small3x 5204 nop 5205 lduba [%o0]%asi, %o4 ! read byte 5206 subcc %o2, 4, %o2 ! reduce count by 4 5207 stb %o4, [%o1] ! write byte 5208 lduba [%o0+1]%asi, %o4 ! repeat for total of 4 bytes 5209 add %o0, 4, %o0 ! advance src by 4 5210 stb %o4, [%o1+1] 5211 lduba [%o0-2]%asi, %o4 5212 add %o1, 4, %o1 ! advance dst by 4 5213 stb %o4, [%o1-2] 5214 lduba [%o0-1]%asi, %o4 5215 bnz,pt %ncc, .ci_small3x 5216 stb %o4, [%o1-1] 5217 membar #Sync 5218 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 5219 retl 5220 mov %g0, %o0 5221 5222 .ci_align: 5223 bnz,pt %ncc, .ci_al_d1 5224 .ci_al_d1f: ! dest is now half word aligned 5225 andcc %o1, 2, %o3 ! is dest word aligned 5226 bnz,pt %ncc, .ci_al_d2 5227 .ci_al_d2f: ! dest is now word aligned 5228 andcc %o1, 4, %o3 ! is dest longword aligned? 5229 bz,pt %ncc, .ci_al_src 5230 nop 5231 .ci_al_d4: ! dest is word aligned; src is unknown 5232 lduba [%o0]%asi, %o4 ! move a word (src align unknown) 5233 lduba [%o0+1]%asi, %o3 5234 sll %o4, 24, %o4 ! position 5235 sll %o3, 16, %o3 ! position 5236 or %o4, %o3, %o3 ! merge 5237 lduba [%o0+2]%asi, %o4 5238 sll %o4, 8, %o4 ! position 5239 or %o4, %o3, %o3 ! merge 5240 lduba [%o0+3]%asi, %o4 5241 or %o4, %o3, %o4 ! merge 5242 stw %o4,[%o1] ! store four bytes 5243 add %o0, 4, %o0 ! adjust src by 4 5244 add %o1, 4, %o1 ! adjust dest by 4 5245 sub %o2, 4, %o2 ! adjust count by 4 5246 andcc %o0, 7, %o3 ! check for src long word alignment 5247 brz,pt %o3, .ci_medlong 5248 .ci_src_dst_unal8: 5249 ! dst is 8-byte aligned, src is not 5250 ! Size is less than FP_COPY 5251 ! Following code is to select for alignment 5252 andcc %o0, 0x3, %o3 ! test word alignment 5253 bz,pt %ncc, .ci_medword 5254 nop 5255 andcc %o0, 0x1, %o3 ! test halfword alignment 5256 bnz,pt %ncc, .ci_med_byte ! go to byte move if not halfword 5257 andcc %o0, 0x2, %o3 ! test which byte alignment 5258 ba .ci_medhalf 5259 nop 5260 .ci_al_d1: ! align dest to half word 5261 lduba [%o0]%asi, %o4 ! move a byte 5262 add %o0, 1, %o0 5263 stb %o4, [%o1] 5264 add %o1, 1, %o1 5265 andcc %o1, 2, %o3 ! is dest word aligned 5266 bz,pt %ncc, .ci_al_d2f 5267 sub %o2, 1, %o2 5268 .ci_al_d2: ! align dest to word 5269 lduba [%o0]%asi, %o4 ! 
move a half-word (src align unknown) 5270 lduba [%o0+1]%asi, %o3 5271 sll %o4, 8, %o4 ! position 5272 or %o4, %o3, %o4 ! merge 5273 sth %o4, [%o1] 5274 add %o0, 2, %o0 5275 add %o1, 2, %o1 5276 andcc %o1, 4, %o3 ! is dest longword aligned? 5277 bz,pt %ncc, .ci_al_src 5278 sub %o2, 2, %o2 5279 ba .ci_al_d4 5280 nop 5281 /* 5282 * Handle all cases where src and dest are aligned on word 5283 * boundaries. Use unrolled loops for better performance. 5284 * This option wins over standard large data move when 5285 * source and destination is in cache for medium 5286 * to short data moves. 5287 */ 5288 .ci_medword: 5289 subcc %o2, 31, %o2 ! adjust length to allow cc test 5290 ble,pt %ncc, .ci_medw31 5291 nop 5292 .ci_medw32: 5293 lda [%o0]%asi, %o4 ! move a block of 32 bytes 5294 stw %o4, [%o1] 5295 lda [%o0+4]%asi, %o4 5296 stw %o4, [%o1+4] 5297 lda [%o0+8]%asi, %o4 5298 stw %o4, [%o1+8] 5299 lda [%o0+12]%asi, %o4 5300 stw %o4, [%o1+12] 5301 lda [%o0+16]%asi, %o4 5302 stw %o4, [%o1+16] 5303 lda [%o0+20]%asi, %o4 5304 subcc %o2, 32, %o2 ! decrement length count 5305 stw %o4, [%o1+20] 5306 lda [%o0+24]%asi, %o4 5307 add %o0, 32, %o0 ! increase src ptr by 32 5308 stw %o4, [%o1+24] 5309 lda [%o0-4]%asi, %o4 5310 add %o1, 32, %o1 ! increase dst ptr by 32 5311 bgu,pt %ncc, .ci_medw32 ! repeat if at least 32 bytes left 5312 stw %o4, [%o1-4] 5313 .ci_medw31: 5314 addcc %o2, 24, %o2 ! adjust count to be off by 7 5315 ble,pt %ncc, .ci_medw7 ! skip if 7 or fewer bytes left 5316 nop ! 5317 .ci_medw15: 5318 lda [%o0]%asi, %o4 ! move a block of 8 bytes 5319 subcc %o2, 8, %o2 ! decrement length count 5320 stw %o4, [%o1] 5321 add %o0, 8, %o0 ! increase src ptr by 8 5322 lda [%o0-4]%asi, %o4 5323 add %o1, 8, %o1 ! increase dst ptr by 8 5324 bgu,pt %ncc, .ci_medw15 5325 stw %o4, [%o1-4] 5326 .ci_medw7: 5327 addcc %o2, 7, %o2 ! finish adjustment of remaining count 5328 bz,pt %ncc, .ci_smallx ! exit if finished 5329 cmp %o2, 4 5330 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left 5331 nop ! 5332 lda [%o0]%asi, %o4 ! move 4 bytes 5333 add %o0, 4, %o0 ! increase src ptr by 4 5334 add %o1, 4, %o1 ! increase dst ptr by 4 5335 subcc %o2, 4, %o2 ! decrease count by 4 5336 bnz .ci_small3x 5337 stw %o4, [%o1-4] 5338 membar #Sync 5339 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 5340 retl 5341 mov %g0, %o0 5342 5343 .ci_medhalf: 5344 subcc %o2, 31, %o2 ! adjust length to allow cc test 5345 ble,pt %ncc, .ci_medh31 5346 nop 5347 .ci_medh32: ! load and store block of 32 bytes 5348 subcc %o2, 32, %o2 ! decrement length count 5349 5350 lduha [%o0]%asi, %o4 ! move 32 bytes 5351 lduwa [%o0+2]%asi, %o3 5352 sllx %o4, 48, %o4 5353 sllx %o3, 16, %o3 5354 or %o4, %o3, %o3 5355 lduha [%o0+6]%asi, %o4 5356 or %o4, %o3, %o4 5357 stx %o4, [%o1] 5358 5359 lduha [%o0+8]%asi, %o4 5360 lduwa [%o0+10]%asi, %o3 5361 sllx %o4, 48, %o4 5362 sllx %o3, 16, %o3 5363 or %o4, %o3, %o3 5364 lduha [%o0+14]%asi, %o4 5365 or %o4, %o3, %o4 5366 stx %o4, [%o1+8] 5367 5368 lduha [%o0+16]%asi, %o4 5369 lduwa [%o0+18]%asi, %o3 5370 sllx %o4, 48, %o4 5371 sllx %o3, 16, %o3 5372 or %o4, %o3, %o3 5373 lduha [%o0+22]%asi, %o4 5374 or %o4, %o3, %o4 5375 stx %o4, [%o1+16] 5376 5377 add %o0, 32, %o0 ! increase src ptr by 32 5378 add %o1, 32, %o1 ! increase dst ptr by 32 5379 5380 lduha [%o0-8]%asi, %o4 5381 lduwa [%o0-6]%asi, %o3 5382 sllx %o4, 48, %o4 5383 sllx %o3, 16, %o3 5384 or %o4, %o3, %o3 5385 lduha [%o0-2]%asi, %o4 5386 or %o3, %o4, %o4 5387 bgu,pt %ncc, .ci_medh32 ! 
repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.ci_medh31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .ci_medh7		! skip if 7 or fewer bytes left
	nop				!
.ci_medh15:
	lduha	[%o0]%asi, %o4		! move 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	lduwa	[%o0+2]%asi, %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	add	%o1, 8, %o1		! increase dst ptr by 8
	lduha	[%o0+6]%asi, %o4
	add	%o0, 8, %o0		! increase src ptr by 8
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .ci_medh15
	stx	%o4, [%o1-8]
.ci_medh7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .ci_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .ci_small3x	! skip if less than 4 bytes left
	nop				!
	lduha	[%o0]%asi, %o4
	sll	%o4, 16, %o4
	lduha	[%o0+2]%asi, %o3
	or	%o3, %o4, %o4
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	add	%o1, 4, %o1
	bnz	.ci_small3x
	stw	%o4, [%o1-4]
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
	retl
	mov	%g0, %o0

	.align 16
.ci_med_byte:
	bnz,pt	%ncc, .ci_medbh32a	! go to correct byte move
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .ci_medb31
	nop
.ci_medb32:				! Alignment 1 or 5
	subcc	%o2, 32, %o2		! decrement length count

	lduba	[%o0]%asi, %o4		! load and store a block of 32 bytes
	sllx	%o4, 56, %o3
	lduha	[%o0+1]%asi, %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduwa	[%o0+3]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+7]%asi, %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	lduba	[%o0+8]%asi, %o4
	sllx	%o4, 56, %o3
	lduha	[%o0+9]%asi, %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduwa	[%o0+11]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+15]%asi, %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	lduba	[%o0+16]%asi, %o4
	sllx	%o4, 56, %o3
	lduha	[%o0+17]%asi, %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduwa	[%o0+19]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+23]%asi, %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	lduba	[%o0-8]%asi, %o4
	sllx	%o4, 56, %o3
	lduha	[%o0-7]%asi, %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduwa	[%o0-5]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0-1]%asi, %o4
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .ci_medb32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.ci_medb31:				! 31 or fewer bytes remaining
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .ci_medb7		! skip if 7 or fewer bytes left
	nop				!
.ci_medb15:

	lduba	[%o0]%asi, %o4		! load and store a block of 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	sllx	%o4, 56, %o3
	lduha	[%o0+1]%asi, %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduwa	[%o0+3]%asi, %o4
	add	%o1, 8, %o1		! increase dst ptr by 8
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+7]%asi, %o4
	add	%o0, 8, %o0		! increase src ptr by 8
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .ci_medb15
	stx	%o4, [%o1-8]
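/*
 * Each 8-byte store in the loops above is assembled from aligned
 * pieces of the unaligned source. For an alignment-1 source the
 * merge is, in C terms (a sketch; u8/u16/u32 stand in for the
 * lduba/lduha/lduwa loads):
 *
 *	uint64_t w;
 *	w  = (uint64_t)u8(src)      << 56;	// 1 byte
 *	w |= (uint64_t)u16(src + 1) << 40;	// 2 bytes
 *	w |= (uint64_t)u32(src + 3) << 8;	// 4 bytes
 *	w |= (uint64_t)u8(src + 7);		// 1 byte
 *	st64(dst, w);				// one aligned 8-byte store
 *
 * All component loads are naturally aligned, so no alignment traps
 * are taken, and the shifts and ors reassemble the big-endian word.
 */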
.ci_medb7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .ci_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .ci_small3x	! skip if less than 4 bytes left
	nop				!
	lduba	[%o0]%asi, %o4		! move 4 bytes
	sll	%o4, 24, %o3
	lduha	[%o0+1]%asi, %o4
	sll	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+3]%asi, %o4
	or	%o4, %o3, %o4
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	add	%o1, 4, %o1
	bnz	.ci_small3x
	stw	%o4, [%o1-4]
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
	retl
	mov	%g0, %o0

	.align 16
.ci_medbh32a:				! Alignment 3 or 7
	ble,pt	%ncc, .ci_medbh31
	nop
.ci_medbh32:				! Alignment 3 or 7
	subcc	%o2, 32, %o2		! decrement length count

	lduba	[%o0]%asi, %o4		! load and store a block of 32 bytes
	sllx	%o4, 56, %o3
	lduwa	[%o0+1]%asi, %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduha	[%o0+5]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+7]%asi, %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	lduba	[%o0+8]%asi, %o4
	sllx	%o4, 56, %o3
	lduwa	[%o0+9]%asi, %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduha	[%o0+13]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+15]%asi, %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	lduba	[%o0+16]%asi, %o4
	sllx	%o4, 56, %o3
	lduwa	[%o0+17]%asi, %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduha	[%o0+21]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+23]%asi, %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	lduba	[%o0-8]%asi, %o4
	sllx	%o4, 56, %o3
	lduwa	[%o0-7]%asi, %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduha	[%o0-3]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0-1]%asi, %o4
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .ci_medbh32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.ci_medbh31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .ci_medb7		! skip if 7 or fewer bytes left
	nop				!
.ci_medbh15:
	lduba	[%o0]%asi, %o4		! load and store a block of 8 bytes
	sllx	%o4, 56, %o3
	lduwa	[%o0+1]%asi, %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduha	[%o0+5]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+7]%asi, %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]
	subcc	%o2, 8, %o2		! decrement length count
	add	%o1, 8, %o1		! increase dst ptr by 8
	add	%o0, 8, %o0		! increase src ptr by 8
	bgu,pt	%ncc, .ci_medbh15
	stx	%o4, [%o1-8]
	ba	.ci_medb7
	nop

/*
 * End of small copy in code (no window)
 *
 */

/*
 * Long copy in code (using register window and fp regs)
 *
 */

.ci_copy_more:
	sethi	%hi(copyio_fault), %o3
	or	%o3, %lo(copyio_fault), %o3
	membar	#Sync
	stn	%o3, [THREAD_REG + T_LOFAULT]
/*
 * Following code is for large copies. We know there are at
 * least FP_COPY bytes available. FP regs are used, so
 * we save registers and fp regs before starting
 */
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
	rd	%fprs, %g1		! check for unused fp
					! if fprs.fef == 0, set it.
					! Setting it when already set costs
					! more than checking
	andcc	%g1, FPRS_FEF, %g1	! 
test FEF, fprs.du = fprs.dl = 0 5641 bz,pt %ncc, .ci_fp_unused 5642 mov ASI_USER, %asi 5643 BST_FP_TOSTACK(%o3) 5644 ba .ci_fp_ready 5645 .ci_fp_unused: 5646 prefetcha [%i0 + (1 * CACHE_LINE)]%asi, #one_read 5647 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 5648 .ci_fp_ready: 5649 rd %gsr, %l5 ! save %gsr value 5650 andcc %i1, 1, %o3 ! is dest byte aligned 5651 bnz,pt %ncc, .ci_big_d1 5652 .ci_big_d1f: ! dest is now half word aligned 5653 andcc %i1, 2, %o3 5654 bnz,pt %ncc, .ci_big_d2 5655 .ci_big_d2f: ! dest is now word aligned 5656 andcc %i1, 4, %o3 5657 bnz,pt %ncc, .ci_big_d4 5658 .ci_big_d4f: ! dest is long word aligned 5659 andcc %i0, 7, %o3 ! is src long word aligned 5660 brnz,pt %o3, .ci_big_unal8 5661 prefetcha [%i0 + (2 * CACHE_LINE)]%asi, #one_read 5662 ! Src and dst are long word aligned 5663 ! align dst to 64 byte boundary 5664 andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned 5665 brz,pn %o3, .ci_al_to_64 5666 nop 5667 sub %o3, 64, %o3 ! %o3 has negative bytes to move 5668 add %i2, %o3, %i2 ! adjust remaining count 5669 andcc %o3, 8, %o4 ! odd long words to move? 5670 brz,pt %o4, .ci_al_to_16 5671 nop 5672 add %o3, 8, %o3 5673 ldxa [%i0]%asi, %o4 5674 add %i0, 8, %i0 ! increment src ptr 5675 add %i1, 8, %i1 ! increment dst ptr 5676 stx %o4, [%i1-8] 5677 ! Dest is aligned on 16 bytes, src 8 byte aligned 5678 .ci_al_to_16: 5679 andcc %o3, 0x30, %o4 ! pair of long words to move? 5680 brz,pt %o4, .ci_al_to_64 5681 nop 5682 .ci_al_mv_16: 5683 add %o3, 16, %o3 5684 ldxa [%i0]%asi, %o4 5685 stx %o4, [%i1] 5686 add %i0, 16, %i0 ! increment src ptr 5687 ldxa [%i0-8]%asi, %o4 5688 stx %o4, [%i1+8] 5689 andcc %o3, 0x30, %o4 5690 brnz,pt %o4, .ci_al_mv_16 5691 add %i1, 16, %i1 ! increment dst ptr 5692 ! Dest is aligned on 64 bytes, src 8 byte aligned 5693 .ci_al_to_64: 5694 ! Determine source alignment 5695 ! to correct 8 byte offset 5696 andcc %i0, 32, %o3 5697 brnz,pn %o3, .ci_aln_1 5698 andcc %i0, 16, %o3 5699 brnz,pn %o3, .ci_aln_01 5700 andcc %i0, 8, %o3 5701 brz,pn %o3, .ci_aln_000 5702 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read 5703 ba .ci_aln_001 5704 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5705 .ci_aln_01: 5706 brnz,pn %o3, .ci_aln_011 5707 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read 5708 ba .ci_aln_010 5709 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5710 .ci_aln_1: 5711 andcc %i0, 16, %o3 5712 brnz,pn %o3, .ci_aln_11 5713 andcc %i0, 8, %o3 5714 brnz,pn %o3, .ci_aln_101 5715 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read 5716 ba .ci_aln_100 5717 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5718 .ci_aln_11: 5719 brz,pn %o3, .ci_aln_110 5720 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read 5721 5722 .ci_aln_111: 5723 ! Alignment off by 8 bytes 5724 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5725 ldda [%i0]%asi, %d0 5726 add %i0, 8, %i0 5727 sub %i2, 8, %i2 5728 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5729 and %i2, 0x7f, %i2 ! residue bytes in %i2 5730 sub %i1, %i0, %i1 5731 .ci_aln_111_loop: 5732 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5733 subcc %o3, 64, %o3 5734 fmovd %d16, %d2 5735 fmovd %d18, %d4 5736 fmovd %d20, %d6 5737 fmovd %d22, %d8 5738 fmovd %d24, %d10 5739 fmovd %d26, %d12 5740 fmovd %d28, %d14 5741 stxa %g0,[%i0+%i1]ASI_STBI_P ! 
block initializing store 5742 stda %d0,[%i0+%i1]ASI_BLK_P 5743 add %i0, 64, %i0 5744 fmovd %d30, %d0 5745 bgt,pt %ncc, .ci_aln_111_loop 5746 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5747 add %i1, %i0, %i1 5748 5749 std %d0, [%i1] 5750 ba .ci_remain_stuff 5751 add %i1, 8, %i1 5752 ! END OF aln_111 5753 5754 .ci_aln_110: 5755 ! Alignment off by 16 bytes 5756 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5757 ldda [%i0]%asi, %d0 5758 ldda [%i0+8]%asi, %d2 5759 add %i0, 16, %i0 5760 sub %i2, 16, %i2 5761 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5762 and %i2, 0x7f, %i2 ! residue bytes in %i2 5763 sub %i1, %i0, %i1 5764 .ci_aln_110_loop: 5765 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5766 subcc %o3, 64, %o3 5767 fmovd %d16, %d4 5768 fmovd %d18, %d6 5769 fmovd %d20, %d8 5770 fmovd %d22, %d10 5771 fmovd %d24, %d12 5772 fmovd %d26, %d14 5773 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5774 stda %d0,[%i0+%i1]ASI_BLK_P 5775 add %i0, 64, %i0 5776 fmovd %d28, %d0 5777 fmovd %d30, %d2 5778 bgt,pt %ncc, .ci_aln_110_loop 5779 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5780 add %i1, %i0, %i1 5781 5782 std %d0, [%i1] 5783 std %d2, [%i1+8] 5784 ba .ci_remain_stuff 5785 add %i1, 16, %i1 5786 ! END OF aln_110 5787 5788 .ci_aln_101: 5789 ! Alignment off by 24 bytes 5790 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5791 ldda [%i0]%asi, %d0 5792 ldda [%i0+8]%asi, %d2 5793 ldda [%i0+16]%asi, %d4 5794 add %i0, 24, %i0 5795 sub %i2, 24, %i2 5796 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5797 and %i2, 0x7f, %i2 ! residue bytes in %i2 5798 sub %i1, %i0, %i1 5799 .ci_aln_101_loop: 5800 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5801 subcc %o3, 64, %o3 5802 fmovd %d16, %d6 5803 fmovd %d18, %d8 5804 fmovd %d20, %d10 5805 fmovd %d22, %d12 5806 fmovd %d24, %d14 5807 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5808 stda %d0,[%i0+%i1]ASI_BLK_P 5809 add %i0, 64, %i0 5810 fmovd %d26, %d0 5811 fmovd %d28, %d2 5812 fmovd %d30, %d4 5813 bgt,pt %ncc, .ci_aln_101_loop 5814 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5815 add %i1, %i0, %i1 5816 5817 std %d0, [%i1] 5818 std %d2, [%i1+8] 5819 std %d4, [%i1+16] 5820 ba .ci_remain_stuff 5821 add %i1, 24, %i1 5822 ! END OF aln_101 5823 5824 .ci_aln_100: 5825 ! Alignment off by 32 bytes 5826 ldda [%i0]%asi, %d0 5827 ldda [%i0+8]%asi, %d2 5828 ldda [%i0+16]%asi,%d4 5829 ldda [%i0+24]%asi,%d6 5830 add %i0, 32, %i0 5831 sub %i2, 32, %i2 5832 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5833 and %i2, 0x7f, %i2 ! residue bytes in %i2 5834 sub %i1, %i0, %i1 5835 .ci_aln_100_loop: 5836 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5837 subcc %o3, 64, %o3 5838 fmovd %d16, %d8 5839 fmovd %d18, %d10 5840 fmovd %d20, %d12 5841 fmovd %d22, %d14 5842 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5843 stda %d0,[%i0+%i1]ASI_BLK_P 5844 add %i0, 64, %i0 5845 fmovd %d24, %d0 5846 fmovd %d26, %d2 5847 fmovd %d28, %d4 5848 fmovd %d30, %d6 5849 bgt,pt %ncc, .ci_aln_100_loop 5850 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5851 add %i1, %i0, %i1 5852 5853 std %d0, [%i1] 5854 std %d2, [%i1+8] 5855 std %d4, [%i1+16] 5856 std %d6, [%i1+24] 5857 ba .ci_remain_stuff 5858 add %i1, 32, %i1 5859 ! END OF aln_100 5860 5861 .ci_aln_011: 5862 ! 
Alignment off by 40 bytes 5863 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5864 ldda [%i0]%asi, %d0 5865 ldda [%i0+8]%asi, %d2 5866 ldda [%i0+16]%asi, %d4 5867 ldda [%i0+24]%asi, %d6 5868 ldda [%i0+32]%asi, %d8 5869 add %i0, 40, %i0 5870 sub %i2, 40, %i2 5871 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5872 and %i2, 0x7f, %i2 ! residue bytes in %i2 5873 sub %i1, %i0, %i1 5874 .ci_aln_011_loop: 5875 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5876 subcc %o3, 64, %o3 5877 fmovd %d16, %d10 5878 fmovd %d18, %d12 5879 fmovd %d20, %d14 5880 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5881 stda %d0,[%i0+%i1]ASI_BLK_P 5882 add %i0, 64, %i0 5883 fmovd %d22, %d0 5884 fmovd %d24, %d2 5885 fmovd %d26, %d4 5886 fmovd %d28, %d6 5887 fmovd %d30, %d8 5888 bgt,pt %ncc, .ci_aln_011_loop 5889 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5890 add %i1, %i0, %i1 5891 5892 std %d0, [%i1] 5893 std %d2, [%i1+8] 5894 std %d4, [%i1+16] 5895 std %d6, [%i1+24] 5896 std %d8, [%i1+32] 5897 ba .ci_remain_stuff 5898 add %i1, 40, %i1 5899 ! END OF aln_011 5900 5901 .ci_aln_010: 5902 ! Alignment off by 48 bytes 5903 ldda [%i0]%asi, %d0 5904 ldda [%i0+8]%asi, %d2 5905 ldda [%i0+16]%asi, %d4 5906 ldda [%i0+24]%asi, %d6 5907 ldda [%i0+32]%asi, %d8 5908 ldda [%i0+40]%asi, %d10 5909 add %i0, 48, %i0 5910 sub %i2, 48, %i2 5911 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5912 and %i2, 0x7f, %i2 ! residue bytes in %i2 5913 sub %i1, %i0, %i1 5914 .ci_aln_010_loop: 5915 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5916 subcc %o3, 64, %o3 5917 fmovd %d16, %d12 5918 fmovd %d18, %d14 5919 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5920 stda %d0,[%i0+%i1]ASI_BLK_P 5921 add %i0, 64, %i0 5922 fmovd %d20, %d0 5923 fmovd %d22, %d2 5924 fmovd %d24, %d4 5925 fmovd %d26, %d6 5926 fmovd %d28, %d8 5927 fmovd %d30, %d10 5928 bgt,pt %ncc, .ci_aln_010_loop 5929 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5930 add %i1, %i0, %i1 5931 5932 std %d0, [%i1] 5933 std %d2, [%i1+8] 5934 std %d4, [%i1+16] 5935 std %d6, [%i1+24] 5936 std %d8, [%i1+32] 5937 std %d10, [%i1+40] 5938 ba .ci_remain_stuff 5939 add %i1, 48, %i1 5940 ! END OF aln_010 5941 5942 .ci_aln_001: 5943 ! Alignment off by 56 bytes 5944 ldda [%i0]%asi, %d0 5945 ldda [%i0+8]%asi, %d2 5946 ldda [%i0+16]%asi, %d4 5947 ldda [%i0+24]%asi, %d6 5948 ldda [%i0+32]%asi, %d8 5949 ldda [%i0+40]%asi, %d10 5950 ldda [%i0+48]%asi, %d12 5951 add %i0, 56, %i0 5952 sub %i2, 56, %i2 5953 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5954 and %i2, 0x7f, %i2 ! residue bytes in %i2 5955 sub %i1, %i0, %i1 5956 .ci_aln_001_loop: 5957 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5958 subcc %o3, 64, %o3 5959 fmovd %d16, %d14 5960 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5961 stda %d0,[%i0+%i1]ASI_BLK_P 5962 add %i0, 64, %i0 5963 fmovd %d18, %d0 5964 fmovd %d20, %d2 5965 fmovd %d22, %d4 5966 fmovd %d24, %d6 5967 fmovd %d26, %d8 5968 fmovd %d28, %d10 5969 fmovd %d30, %d12 5970 bgt,pt %ncc, .ci_aln_001_loop 5971 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5972 add %i1, %i0, %i1 5973 5974 std %d0, [%i1] 5975 std %d2, [%i1+8] 5976 std %d4, [%i1+16] 5977 std %d6, [%i1+24] 5978 std %d8, [%i1+32] 5979 std %d10, [%i1+40] 5980 std %d12, [%i1+48] 5981 ba .ci_remain_stuff 5982 add %i1, 56, %i1 5983 ! END OF aln_001 5984 5985 .ci_aln_000: 5986 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5987 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5988 and %i2, 0x7f, %i2 ! 
.ci_remain_stuff:
	subcc	%i2, 31, %i2	! adjust length to allow cc test
	ble,pt	%ncc, .ci_aln_31
	nop
.ci_aln_32:
	ldxa	[%i0]%asi, %o4	! move 32 bytes
	subcc	%i2, 32, %i2	! decrement length count by 32
	stx	%o4, [%i1]
	ldxa	[%i0+8]%asi, %o4
	stx	%o4, [%i1+8]
	ldxa	[%i0+16]%asi, %o4
	add	%i0, 32, %i0	! increase src ptr by 32
	stx	%o4, [%i1+16]
	ldxa	[%i0-8]%asi, %o4
	add	%i1, 32, %i1	! increase dst ptr by 32
	bgu,pt	%ncc, .ci_aln_32	! repeat if at least 32 bytes left
	stx	%o4, [%i1-8]
.ci_aln_31:
	addcc	%i2, 24, %i2	! adjust count to be off by 7
	ble,pt	%ncc, .ci_aln_7	! skip if 7 or fewer bytes left
	nop
.ci_aln_15:
	ldxa	[%i0]%asi, %o4	! move 8 bytes
	add	%i0, 8, %i0	! increase src ptr by 8
	subcc	%i2, 8, %i2	! decrease count by 8
	add	%i1, 8, %i1	! increase dst ptr by 8
	bgu,pt	%ncc, .ci_aln_15
	stx	%o4, [%i1-8]
.ci_aln_7:
	addcc	%i2, 7, %i2	! finish adjustment of remaining count
	bz,pt	%ncc, .ci_exit	! exit if finished
	cmp	%i2, 4
	blt,pt	%ncc, .ci_unaln3x	! skip if less than 4 bytes left
	nop
	lda	[%i0]%asi, %o4	! move 4 bytes
	add	%i0, 4, %i0	! increase src ptr by 4
	add	%i1, 4, %i1	! increase dst ptr by 4
	subcc	%i2, 4, %i2	! decrease count by 4
	bnz	.ci_unaln3x
	stw	%o4, [%i1-4]
	ba	.ci_exit
	nop

	! destination alignment code
.ci_big_d1:
	lduba	[%i0]%asi, %o4	! move a byte
	add	%i0, 1, %i0
	stb	%o4, [%i1]
	add	%i1, 1, %i1
	andcc	%i1, 2, %o3
	bz,pt	%ncc, .ci_big_d2f
	sub	%i2, 1, %i2
.ci_big_d2:	! dest is now at least half word aligned
	lduba	[%i0]%asi, %o4	! move a half-word (src align unknown)
	lduba	[%i0+1]%asi, %o3
	add	%i0, 2, %i0
	sll	%o4, 8, %o4	! position
	or	%o4, %o3, %o4	! merge
	sth	%o4, [%i1]
	add	%i1, 2, %i1
	andcc	%i1, 4, %o3
	bz,pt	%ncc, .ci_big_d4f
	sub	%i2, 2, %i2
.ci_big_d4:	! dest is at least word aligned
	nop
	lduba	[%i0]%asi, %o4	! move a word (src align unknown)
	lduba	[%i0+1]%asi, %o3
	sll	%o4, 24, %o4	! position
	sll	%o3, 16, %o3	! position
	or	%o4, %o3, %o3	! merge
	lduba	[%i0+2]%asi, %o4
	sll	%o4, 8, %o4	! position
	or	%o4, %o3, %o3	! merge
	lduba	[%i0+3]%asi, %o4
	or	%o4, %o3, %o4	! merge
	stw	%o4,[%i1]	! store four bytes
	add	%i0, 4, %i0	! adjust src by 4
	add	%i1, 4, %i1	! adjust dest by 4
	ba	.ci_big_d4f
	sub	%i2, 4, %i2	! adjust count by 4


	! Dst is on 8 byte boundary; src is not;
.ci_big_unal8:
	andcc	%i1, 0x3f, %o3	! is dst 64-byte block aligned?
	bz	%ncc, .ci_unalnsrc
	sub	%o3, 64, %o3	! %o3 will be multiple of 8
	neg	%o3	! bytes until dest is 64 byte aligned
	sub	%i2, %o3, %i2	! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%i0, 0x1, %o4
	bnz	%ncc, .ci_unalnbyte	! check for byte alignment
	nop
	andcc	%i0, 2, %o4	! check for half word alignment
	bnz	%ncc, .ci_unalnhalf
	nop
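
/*
 * The three movers below raise the destination to a 64 byte boundary
 * when the source is only word, half-word, or byte aligned. Roughly,
 * in C-style pseudo-code (illustration only; %o3 already holds the
 * byte count to the boundary, a multiple of 8):
 *
 *	while (o3 > 0) {
 *		gather 8 source bytes with the widest loads the
 *		    source alignment allows;
 *		store them to dst as one aligned 8 byte word;
 *		o3 -= 8;
 *	}
 */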
	! Src is word aligned, move bytes until dest 64 byte aligned
.ci_unalnword:
	lda	[%i0]%asi, %o4	! load 4 bytes
	stw	%o4, [%i1]	! and store 4 bytes
	lda	[%i0+4]%asi, %o4	! load 4 bytes
	add	%i0, 8, %i0	! increase src ptr by 8
	stw	%o4, [%i1+4]	! and store 4 bytes
	subcc	%o3, 8, %o3	! decrease count by 8
	bnz	%ncc, .ci_unalnword
	add	%i1, 8, %i1	! increase dst ptr by 8
	ba	.ci_unalnsrc
	nop

	! Src is half-word aligned, move bytes until dest 64 byte aligned
.ci_unalnhalf:
	lduha	[%i0]%asi, %o4	! load 2 bytes
	sllx	%o4, 32, %i3	! shift left
	lduwa	[%i0+2]%asi, %o4
	or	%o4, %i3, %i3
	sllx	%i3, 16, %i3
	lduha	[%i0+6]%asi, %o4
	or	%o4, %i3, %i3
	stx	%i3, [%i1]
	add	%i0, 8, %i0
	subcc	%o3, 8, %o3
	bnz	%ncc, .ci_unalnhalf
	add	%i1, 8, %i1
	ba	.ci_unalnsrc
	nop

	! Src is Byte aligned, move bytes until dest 64 byte aligned
.ci_unalnbyte:
	sub	%i1, %i0, %i1	! share pointer advance
.ci_unalnbyte_loop:
	lduba	[%i0]%asi, %o4
	sllx	%o4, 56, %i3
	lduha	[%i0+1]%asi, %o4
	sllx	%o4, 40, %o4
	or	%o4, %i3, %i3
	lduha	[%i0+3]%asi, %o4
	sllx	%o4, 24, %o4
	or	%o4, %i3, %i3
	lduha	[%i0+5]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %i3, %i3
	lduba	[%i0+7]%asi, %o4
	or	%o4, %i3, %i3
	stx	%i3, [%i1+%i0]
	subcc	%o3, 8, %o3
	bnz	%ncc, .ci_unalnbyte_loop
	add	%i0, 8, %i0
	add	%i1,%i0, %i1	! restore pointer

	! Destination is now block (64 byte aligned), src is not 8 byte aligned
.ci_unalnsrc:
	andn	%i2, 0x3f, %i3	! %i3 is multiple of block size
	and	%i2, 0x3f, %i2	! residue bytes in %i2
	add	%i2, 64, %i2	! Insure we don't load beyond
	sub	%i3, 64, %i3	! end of source buffer

	andn	%i0, 0x3f, %o4	! %o4 has block aligned src address
	prefetcha [%o4 + (3 * CACHE_LINE)]%asi, #one_read
	alignaddr %i0, %g0, %g0	! generate %gsr
	add	%i0, %i3, %i0	! advance %i0 to after blocks
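
/*
 * From here on the copy runs on faligndata. The alignaddr above has
 * already deposited the source's offset within a doubleword into
 * %gsr, so each faligndata below extracts one aligned doubleword
 * from a pair of neighbors. The dispatch that follows is, in rough
 * C-style pseudo-code (illustration only):
 *
 *	switch ((src >> 3) & 7) {	! doubleword slot in the block
 *	case k:
 *		preload the (8 - k) trailing doublewords of the block;
 *		run the matching .ci_unaln_* loop;
 *	}
 */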
	!
	! Determine source alignment to correct 8 byte offset
	andcc	%i0, 0x20, %o3
	brnz,pn	%o3, .ci_unaln_1
	andcc	%i0, 0x10, %o3
	brnz,pn	%o3, .ci_unaln_01
	andcc	%i0, 0x08, %o3
	brz,a	%o3, .ci_unaln_000
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_001
	nop
.ci_unaln_01:
	brnz,a	%o3, .ci_unaln_011
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_010
	nop
.ci_unaln_1:
	brnz,pn	%o3, .ci_unaln_11
	andcc	%i0, 0x08, %o3
	brnz,a	%o3, .ci_unaln_101
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_100
	nop
.ci_unaln_11:
	brz,pn	%o3, .ci_unaln_110
	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read

.ci_unaln_111:
	ldda	[%o4+56]%asi, %d14
.ci_unaln_111_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d14, %d16, %d48
	faligndata %d16, %d18, %d50
	faligndata %d18, %d20, %d52
	faligndata %d20, %d22, %d54
	faligndata %d22, %d24, %d56
	faligndata %d24, %d26, %d58
	faligndata %d26, %d28, %d60
	faligndata %d28, %d30, %d62
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_111_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_done
	nop

.ci_unaln_110:
	ldda	[%o4+48]%asi, %d12
	ldda	[%o4+56]%asi, %d14
.ci_unaln_110_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d12, %d14, %d48
	faligndata %d14, %d16, %d50
	faligndata %d16, %d18, %d52
	faligndata %d18, %d20, %d54
	faligndata %d20, %d22, %d56
	faligndata %d22, %d24, %d58
	faligndata %d24, %d26, %d60
	faligndata %d26, %d28, %d62
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_110_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_done
	nop

.ci_unaln_101:
	ldda	[%o4+40]%asi, %d10
	ldda	[%o4+48]%asi, %d12
	ldda	[%o4+56]%asi, %d14
.ci_unaln_101_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d10, %d12, %d48
	faligndata %d12, %d14, %d50
	faligndata %d14, %d16, %d52
	faligndata %d16, %d18, %d54
	faligndata %d18, %d20, %d56
	faligndata %d20, %d22, %d58
	faligndata %d22, %d24, %d60
	faligndata %d24, %d26, %d62
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_101_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_done
	nop

.ci_unaln_100:
	ldda	[%o4+32]%asi, %d8
	ldda	[%o4+40]%asi, %d10
	ldda	[%o4+48]%asi, %d12
	ldda	[%o4+56]%asi, %d14
.ci_unaln_100_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d8, %d10, %d48
	faligndata %d10, %d12, %d50
	faligndata %d12, %d14, %d52
	faligndata %d14, %d16, %d54
	faligndata %d16, %d18, %d56
	faligndata %d18, %d20, %d58
	faligndata %d20, %d22, %d60
	faligndata %d22, %d24, %d62
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_100_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_done
	nop
.ci_unaln_011:
	ldda	[%o4+24]%asi, %d6
	ldda	[%o4+32]%asi, %d8
	ldda	[%o4+40]%asi, %d10
	ldda	[%o4+48]%asi, %d12
	ldda	[%o4+56]%asi, %d14
.ci_unaln_011_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d6, %d8, %d48
	faligndata %d8, %d10, %d50
	faligndata %d10, %d12, %d52
	faligndata %d12, %d14, %d54
	faligndata %d14, %d16, %d56
	faligndata %d16, %d18, %d58
	faligndata %d18, %d20, %d60
	faligndata %d20, %d22, %d62
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_011_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_done
	nop

.ci_unaln_010:
	ldda	[%o4+16]%asi, %d4
	ldda	[%o4+24]%asi, %d6
	ldda	[%o4+32]%asi, %d8
	ldda	[%o4+40]%asi, %d10
	ldda	[%o4+48]%asi, %d12
	ldda	[%o4+56]%asi, %d14
.ci_unaln_010_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d4, %d6, %d48
	faligndata %d6, %d8, %d50
	faligndata %d8, %d10, %d52
	faligndata %d10, %d12, %d54
	faligndata %d12, %d14, %d56
	faligndata %d14, %d16, %d58
	faligndata %d16, %d18, %d60
	faligndata %d18, %d20, %d62
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_010_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_done
	nop

.ci_unaln_001:
	ldda	[%o4+8]%asi, %d2
	ldda	[%o4+16]%asi, %d4
	ldda	[%o4+24]%asi, %d6
	ldda	[%o4+32]%asi, %d8
	ldda	[%o4+40]%asi, %d10
	ldda	[%o4+48]%asi, %d12
	ldda	[%o4+56]%asi, %d14
.ci_unaln_001_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d2, %d4, %d48
	faligndata %d4, %d6, %d50
	faligndata %d6, %d8, %d52
	faligndata %d8, %d10, %d54
	faligndata %d10, %d12, %d56
	faligndata %d12, %d14, %d58
	faligndata %d14, %d16, %d60
	faligndata %d16, %d18, %d62
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_001_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_done
	nop

.ci_unaln_000:
	ldda	[%o4]ASI_BLK_AIUS, %d0
.ci_unaln_000_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d0, %d2, %d48
	faligndata %d2, %d4, %d50
	faligndata %d4, %d6, %d52
	faligndata %d6, %d8, %d54
	faligndata %d8, %d10, %d56
	faligndata %d10, %d12, %d58
	faligndata %d12, %d14, %d60
	faligndata %d14, %d16, %d62
	fmovd	%d16, %d0
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_000_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
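
/*
 * What remains after the unaligned block loops is 0-127 bytes with
 * dst long word aligned and src still unaligned. A rough model of
 * .ci_unaln_done below (illustration only):
 *
 *	if (count > 15)
 *		faligndata 8 bytes at a time from the long word
 *		    aligned address below src;	! .ci_unaln_by8
 *	move 8, then 4 bytes, assembling words from single bytes;
 *	restore %gsr; store the final 1-3 bytes individually;
 */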
.ci_unaln_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%i2, 15
	bleu	%ncc, .ci_unaln_short

	andn	%i2, 0x7, %i3	! %i3 is multiple of 8
	and	%i2, 0x7, %i2	! residue bytes in %i2
	add	%i2, 8, %i2
	sub	%i3, 8, %i3	! insure we don't load past end of src
	andn	%i0, 0x7, %o4	! %o4 has long word aligned src address
	add	%i0, %i3, %i0	! advance %i0 to after multiple of 8
	ldda	[%o4]%asi, %d0	! fetch partial word
.ci_unaln_by8:
	ldda	[%o4+8]%asi, %d2
	add	%o4, 8, %o4
	faligndata %d0, %d2, %d16
	subcc	%i3, 8, %i3
	std	%d16, [%i1]
	fmovd	%d2, %d0
	bgu,pt	%ncc, .ci_unaln_by8
	add	%i1, 8, %i1

.ci_unaln_short:
	cmp	%i2, 8
	blt,pt	%ncc, .ci_unalnfin
	nop
	lduba	[%i0]%asi, %o4
	sll	%o4, 24, %o3
	lduba	[%i0+1]%asi, %o4
	sll	%o4, 16, %o4
	or	%o4, %o3, %o3
	lduba	[%i0+2]%asi, %o4
	sll	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%i0+3]%asi, %o4
	or	%o4, %o3, %o3
	stw	%o3, [%i1]
	lduba	[%i0+4]%asi, %o4
	sll	%o4, 24, %o3
	lduba	[%i0+5]%asi, %o4
	sll	%o4, 16, %o4
	or	%o4, %o3, %o3
	lduba	[%i0+6]%asi, %o4
	sll	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%i0+7]%asi, %o4
	or	%o4, %o3, %o3
	stw	%o3, [%i1+4]
	add	%i0, 8, %i0
	add	%i1, 8, %i1
	sub	%i2, 8, %i2
.ci_unalnfin:
	cmp	%i2, 4
	blt,pt	%ncc, .ci_unalnz
	tst	%i2
	lduba	[%i0]%asi, %o3	! read byte
	subcc	%i2, 4, %i2	! reduce count by 4
	sll	%o3, 24, %o3	! position
	lduba	[%i0+1]%asi, %o4
	sll	%o4, 16, %o4	! position
	or	%o4, %o3, %o3	! merge
	lduba	[%i0+2]%asi, %o4
	sll	%o4, 8, %o4	! position
	or	%o4, %o3, %o3	! merge
	add	%i1, 4, %i1	! advance dst by 4
	lduba	[%i0+3]%asi, %o4
	add	%i0, 4, %i0	! advance src by 4
	or	%o4, %o3, %o4	! merge
	bnz,pt	%ncc, .ci_unaln3x
	stw	%o4, [%i1-4]
	ba	.ci_exit
	nop
.ci_unalnz:
	bz,pt	%ncc, .ci_exit
	wr	%l5, %g0, %gsr	! restore %gsr
.ci_unaln3x:	! Exactly 1, 2, or 3 bytes remain
	subcc	%i2, 1, %i2	! reduce count for cc test
	lduba	[%i0]%asi, %o4	! load one byte
	bz,pt	%ncc, .ci_exit
	stb	%o4, [%i1]	! store one byte
	lduba	[%i0+1]%asi, %o4	! load second byte
	subcc	%i2, 1, %i2
	bz,pt	%ncc, .ci_exit
	stb	%o4, [%i1+1]	! store second byte
	lduba	[%i0+2]%asi, %o4	! load third byte
	stb	%o4, [%i1+2]	! store third byte
.ci_exit:
	brnz	%g1, .ci_fp_restore
	nop
	FZERO
	wr	%g1, %g0, %fprs
	ba,pt	%ncc, .ci_ex2
	membar	#Sync
.ci_fp_restore:
	BLD_FP_FROMSTACK(%o4)
.ci_ex2:
	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ret
	restore	%g0, 0, %o0

.copyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_COPYIN], %g2
	jmp	%g2
	nop
2:
	retl
	mov	-1, %o0

#else	/* NIAGARA_IMPL */
.do_copyin:
	!
	! Check the length and bail if zero.
	!
	tst	%o2
	bnz,pt	%ncc, 1f
	nop
	retl
	clr	%o0
1:
	sethi	%hi(copyio_fault), %o4
	or	%o4, %lo(copyio_fault), %o4
	sethi	%hi(copyio_fault_nowindow), %o3
	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
	or	%o3, %lo(copyio_fault_nowindow), %o3
	membar	#Sync
	stn	%o3, [THREAD_REG + T_LOFAULT]

	mov	%o0, SAVE_SRC
	mov	%o1, SAVE_DST
	mov	%o2, SAVE_COUNT

	!
	! Check to see if we're more than SMALL_LIMIT.
	!
	subcc	%o2, SMALL_LIMIT, %o3
	bgu,a,pt %ncc, .dci_ns
	or	%o0, %o1, %o3
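
/*
 * C equivalent of the negative-index walk used by the byte copy
 * loop at .dcicl below (illustration only; "off" plays the role
 * of %o3):
 *
 *	off = -count;			! %o3
 *	src += count;			! both pointers park one past
 *	dst += count;			! the end and never move again
 *	do {
 *		dst[off] = src[off];
 *	} while (++off < 0);		! one induction variable
 */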
	!
	! What was previously ".small_copyin"
	!
.dcibcp:
	sub	%g0, %o2, %o3	! setup for copy loop
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	ba,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! %o0 and %o1 point at the end and remain pointing at the end
	! of their buffers. We pull things out by adding %o3 (which is
	! the negation of the length) to the buffer end which gives us
	! the current location in the buffers. By incrementing %o3 we walk
	! through both buffers without having to bump each buffer's
	! pointer. A very fast 4 instruction loop.
	!
	.align 16
.dcicl:
	stb	%o4, [%o1 + %o3]
	inccc	%o3
	bl,a,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! We're done. Go home.
	!
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
	retl
	clr	%o0
	!
	! Try aligned copies from here.
	!
.dci_ns:
	!
	! See if we're single byte aligned. If we are, check the
	! limit for single byte copies. If we're smaller, or equal,
	! bounce to the byte for byte copy loop. Otherwise do it in
	! HW (if enabled).
	!
	btst	1, %o3
	bz,a,pt	%icc, .dcih8
	btst	7, %o3
	!
	! We're single byte aligned.
	!
	sethi	%hi(hw_copy_limit_1), %o3
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	!
	! Is HW copy on? If not do everything byte for byte.
	!
	tst	%o3
	bz,pn	%icc, .dcibcp
	subcc	%o3, %o2, %o3
	!
	! Are we bigger than the HW limit? If not
	! go to byte for byte.
	!
	bge,pt	%ncc, .dcibcp
	nop
	!
	! We're big enough and copy is on. Do it with HW.
	!
	ba,pt	%ncc, .big_copyin
	nop
.dcih8:
	!
	! 8 byte aligned?
	!
	bnz,a	%ncc, .dcih4
	btst	3, %o3
	!
	! We're eight byte aligned.
	!
	sethi	%hi(hw_copy_limit_8), %o3
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	!
	! Is HW assist on? If not, do it with the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcis8
	subcc	%o3, %o2, %o3
	bge	%ncc, .dcis8
	nop
	ba,pt	%ncc, .big_copyin
	nop
.dcis8:
	!
	! Housekeeping for copy loops. Uses same idea as in the byte for
	! byte copy loop above.
	!
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .didebc
	srl	%o2, 3, %o2	! Number of 8 byte chunks to copy
	!
	! 4 byte aligned?
	!
.dcih4:
	bnz	%ncc, .dcih2
	sethi	%hi(hw_copy_limit_4), %o3
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	!
	! Is HW assist on? If not, do it with the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcis4
	subcc	%o3, %o2, %o3
	!
	! We're negative if our size is less than or equal to hw_copy_limit_4.
	!
	bge	%ncc, .dcis4
	nop
	ba,pt	%ncc, .big_copyin
	nop
.dcis4:
	!
	! Housekeeping for copy loops. Uses same idea as in the byte
	! for byte copy loop above.
	!
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .didfbc
	srl	%o2, 2, %o2	! Number of 4 byte chunks to copy
.dcih2:
	!
	! We're two byte aligned. Check for "smallness"
	! done in delay at .dcih4
	!
	bleu,pt	%ncc, .dcis2
	sethi	%hi(hw_copy_limit_2), %o3
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	!
	! Is HW assist on? If not, do it with the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcis2
	subcc	%o3, %o2, %o3
	!
	! Are we larger than the HW limit?
	!
	bge	%ncc, .dcis2
	nop
	!
	! HW assist is on and we're large enough to use it.
	!
	ba,pt	%ncc, .big_copyin
	nop
	!
	! Housekeeping for copy loops. Uses same idea as in the byte
	! for byte copy loop above.
	!
.dcis2:
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .didtbc
	srl	%o2, 1, %o2	! Number of 2 byte chunks to copy
	!
.small_copyin:
	!
	! Why are we doing this AGAIN? There are certain conditions in
	! big copyin that will cause us to forgo the HW assisted copies
	! and bounce back to a non-hw assisted copy. This dispatches
	! those copies. Note that we branch around this in the main line
	! code.
	!
	! We make no check for limits or HW enablement here. We've
	! already been told that we're a poster child so just go off
	! and do it.
	!
	or	%o0, %o1, %o3
	btst	1, %o3
	bnz	%icc, .dcibcp	! Most likely
	btst	7, %o3
	bz	%icc, .dcis8
	btst	3, %o3
	bz	%icc, .dcis4
	nop
	ba,pt	%ncc, .dcis2
	nop
	!
	! Eight byte aligned copies. A steal from the original .small_copyin
	! with modifications. %o2 is number of 8 byte chunks to copy. When
	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
	! to copy.
	!
	.align 32
.didebc:
	ldxa	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	stx	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didebc
	addcc	%o3, 8, %o3
	!
	! End of copy loop. Most 8 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Something is left. Do it byte for byte.
	!
	ba,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
	!
	.align 32
.didfbc:
	lduwa	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	st	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didfbc
	addcc	%o3, 4, %o3
	!
	! End of copy loop. Most 4 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Something is left. Do it byte for byte.
	!
	ba,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
	! copy.
	!
	.align 32
.didtbc:
	lduha	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	sth	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didtbc
	addcc	%o3, 2, %o3
	!
	! End of copy loop. Most 2 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Deal with the last byte
	!
	lduba	[%o0 + %o3]ASI_USER, %o4
	stb	%o4, [%o1 + %o3]
.dcifh:
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	clr	%o0
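
/*
 * Shape of the block copyin path that follows, in rough C-style
 * pseudo-code (a sketch, not the exact sequence):
 *
 *	swap src/dst into memcpy argument order;
 *	copy 1/2/4/8 byte units until dst is 64 byte aligned;
 *	if (src is now 16 byte aligned)
 *		straight quad-load/store loop;		! .ci_blkcpy
 *	else
 *		quad-load and shift/merge pairs		! .ci_lower_double,
 *		    (ALIGN_DATA) before storing;	! .ci_upper_double
 *	finish the 0-63 residue bytes with 8/4/2/1 byte moves;
 */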
.big_copyin:
	! We're going off to do a block copy.
	! Switch fault handlers and grab a window. We
	! don't do a membar #Sync since we've done only
	! kernel data to this point.
	stn	%o4, [THREAD_REG + T_LOFAULT]

	! Copies that reach here are larger than 256 bytes. The
	! hw_copy_limit_1 is set to 256. Never set this limit to less
	! than 128 bytes.
	save	%sp, -SA(MINFRAME), %sp
.do_blockcopyin:

	! Swap src/dst since the code below is memcpy code
	! and memcpy/bcopy have different calling sequences
	mov	%i1, %i5
	mov	%i0, %i1
	mov	%i5, %i0

	! Block (64 bytes) align the destination.
	andcc	%i0, 0x3f, %i3	! is dst block aligned
	bz	%ncc, copyin_blalign	! dst already block aligned
	sub	%i3, 0x40, %i3
	neg	%i3	! bytes till dst 64 bytes aligned
	sub	%i2, %i3, %i2	! update i2 with new count

	! Based on source and destination alignment do
	! either 8 bytes, 4 bytes, 2 bytes or byte copy.

	! Is dst & src 8B aligned
	or	%i0, %i1, %o2
	andcc	%o2, 0x7, %g0
	bz	%ncc, .ci_alewdcp
	nop

	! Is dst & src 4B aligned
	andcc	%o2, 0x3, %g0
	bz	%ncc, .ci_alwdcp
	nop

	! Is dst & src 2B aligned
	andcc	%o2, 0x1, %g0
	bz	%ncc, .ci_alhlfwdcp
	nop

	! 1B aligned
1:	lduba	[%i1]ASI_USER, %o2
	stb	%o2, [%i0]
	inc	%i1
	deccc	%i3
	bgu,pt	%ncc, 1b
	inc	%i0

	ba	copyin_blalign
	nop

	! dst & src 4B aligned
.ci_alwdcp:
	lda	[%i1]ASI_USER, %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	subcc	%i3, 0x4, %i3
	bgu,pt	%ncc, .ci_alwdcp
	add	%i0, 0x4, %i0

	ba	copyin_blalign
	nop

	! dst & src 2B aligned
.ci_alhlfwdcp:
	lduha	[%i1]ASI_USER, %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	subcc	%i3, 0x2, %i3
	bgu,pt	%ncc, .ci_alhlfwdcp
	add	%i0, 0x2, %i0

	ba	copyin_blalign
	nop

	! dst & src 8B aligned
.ci_alewdcp:
	ldxa	[%i1]ASI_USER, %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	subcc	%i3, 0x8, %i3
	bgu,pt	%ncc, .ci_alewdcp
	add	%i0, 0x8, %i0

copyin_blalign:
	andn	%i2, 0x3f, %i3	! %i3 count is multiple of block size
	sub	%i2, %i3, %i2	! Residue bytes in %i2

	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi

	andcc	%i1, 0xf, %o2	! is src quadword aligned
	bz,pn	%xcc, .ci_blkcpy	! src offset in %o2 (last 4-bits)
	nop
	cmp	%o2, 0x8
	bg	.ci_upper_double
	nop
	bl	.ci_lower_double
	nop

	! Falls through when source offset is equal to 8 i.e.
	! source is double word aligned.
	! In this case no shift/merge of data is required

	sub	%i1, %o2, %i1	! align the src at 16 bytes.
	andn	%i1, 0x3f, %l0	! %l0 has block aligned source
	prefetcha [%l0]ASI_USER, #one_read
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
	add	%l0, 0x40, %l0
.ci_loop0:
	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4

	prefetcha [%l0]ASI_USER, #one_read

	stxa	%l3, [%i0+0x0]%asi
	stxa	%l4, [%i0+0x8]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2

	stxa	%l5, [%i0+0x10]%asi
	stxa	%l2, [%i0+0x18]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4

	stxa	%l3, [%i0+0x20]%asi
	stxa	%l4, [%i0+0x28]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2

	stxa	%l5, [%i0+0x30]%asi
	stxa	%l2, [%i0+0x38]%asi

	add	%l0, 0x40, %l0
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, .ci_loop0
	add	%i0, 0x40, %i0
	ba	.ci_blkdone
	add	%i1, %o2, %i1	! increment the source by src offset
				! the src offset was stored in %o2
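
/*
 * The two merging loops below differ only in which half of each
 * 16 byte load is valid on entry. Roughly (illustration only):
 *
 *	lshift = (src & 0xf) * 8;	! lower: offset < 8
 *					! upper: (offset - 8) * 8
 *	rshift = 64 - lshift;
 *	merged = (prev << lshift) | (next >> rshift);	! ALIGN_DATA
 *
 * Each ALIGN_DATA folds three 8 byte registers into two aligned
 * ones, so every 16 byte quad load yields 16 store-ready bytes.
 */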
.ci_lower_double:

	sub	%i1, %o2, %i1	! align the src at 16 bytes.
	sll	%o2, 3, %o0	! %o0 left shift
	mov	0x40, %o1
	sub	%o1, %o0, %o1	! %o1 right shift = (64 - left shift)
	andn	%i1, 0x3f, %l0	! %l0 has block aligned source
	prefetcha [%l0]ASI_USER, #one_read
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l2
							! and %l3 has complete
							! data
	add	%l0, 0x40, %l0
.ci_loop1:
	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has partial data
							! for this read.
	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
							! into %l2 and %l3

	prefetcha [%l0]ASI_USER, #one_read

	stxa	%l2, [%i0+0x0]%asi
	stxa	%l3, [%i0+0x8]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
							! %l4 from previous read
							! into %l4 and %l5
	stxa	%l4, [%i0+0x10]%asi
	stxa	%l5, [%i0+0x18]%asi

	! Repeat the same for next 32 bytes.

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)

	stxa	%l2, [%i0+0x20]%asi
	stxa	%l3, [%i0+0x28]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)

	stxa	%l4, [%i0+0x30]%asi
	stxa	%l5, [%i0+0x38]%asi

	add	%l0, 0x40, %l0
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, .ci_loop1
	add	%i0, 0x40, %i0
	ba	.ci_blkdone
	add	%i1, %o2, %i1	! increment the source by src offset
				! the src offset was stored in %o2

.ci_upper_double:

	sub	%i1, %o2, %i1	! align the src at 16 bytes.
	sub	%o2, 0x8, %o0
	sll	%o0, 3, %o0	! %o0 left shift
	mov	0x40, %o1
	sub	%o1, %o0, %o1	! %o1 right shift = (64 - left shift)
	andn	%i1, 0x3f, %l0	! %l0 has block aligned source
	prefetcha [%l0]ASI_USER, #one_read
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l3
							! for this read and
							! no data in %l2
	add	%l0, 0x40, %l0
.ci_loop2:
	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has complete data
							! and %l5 has partial
	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
							! into %l3 and %l4
	prefetcha [%l0]ASI_USER, #one_read

	stxa	%l3, [%i0+0x0]%asi
	stxa	%l4, [%i0+0x8]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
							! %l5 from previous read
							! into %l5 and %l2

	stxa	%l5, [%i0+0x10]%asi
	stxa	%l2, [%i0+0x18]%asi

	! Repeat the same for next 32 bytes.

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)

	stxa	%l3, [%i0+0x20]%asi
	stxa	%l4, [%i0+0x28]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)

	stxa	%l5, [%i0+0x30]%asi
	stxa	%l2, [%i0+0x38]%asi

	add	%l0, 0x40, %l0
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, .ci_loop2
	add	%i0, 0x40, %i0
	ba	.ci_blkdone
	add	%i1, %o2, %i1	! increment the source by src offset
				! the src offset was stored in %o2


	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.ci_blkcpy:
	andn	%i1, 0x3f, %o0	! %o0 has block aligned source
	prefetcha [%o0]ASI_USER, #one_read
	add	%o0, 0x40, %o0
1:
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
	add	%i1, 0x10, %i1

	prefetcha [%o0]ASI_USER, #one_read

	stxa	%l0, [%i0+0x0]%asi

	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
	add	%i1, 0x10, %i1

	stxa	%l1, [%i0+0x8]%asi
	stxa	%l2, [%i0+0x10]%asi
	stxa	%l3, [%i0+0x18]%asi
	stxa	%l4, [%i0+0x20]%asi
	stxa	%l5, [%i0+0x28]%asi
	stxa	%l6, [%i0+0x30]%asi
	stxa	%l7, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, 1b
	add	%i0, 0x40, %i0

.ci_blkdone:
	membar	#Sync

	brz,pt	%i2, .copyin_exit
	nop

	! Handle trailing bytes
	cmp	%i2, 0x8
	blu,pt	%ncc, .ci_residue
	nop

	! Can we do some 8B ops
	or	%i1, %i0, %o2
	andcc	%o2, 0x7, %g0
	bnz	%ncc, .ci_last4
	nop

	! Do 8 byte ops as long as possible
.ci_last8:
	ldxa	[%i1]ASI_USER, %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	sub	%i2, 0x8, %i2
	cmp	%i2, 0x8
	bgu,pt	%ncc, .ci_last8
	add	%i0, 0x8, %i0

	brz,pt	%i2, .copyin_exit
	nop

	ba	.ci_residue
	nop

.ci_last4:
	! Can we do 4B ops
	andcc	%o2, 0x3, %g0
	bnz	%ncc, .ci_last2
	nop
1:
	lda	[%i1]ASI_USER, %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	sub	%i2, 0x4, %i2
	cmp	%i2, 0x4
	bgu,pt	%ncc, 1b
	add	%i0, 0x4, %i0

	brz,pt	%i2, .copyin_exit
	nop

	ba	.ci_residue
	nop

.ci_last2:
	! Can we do 2B ops
	andcc	%o2, 0x1, %g0
	bnz	%ncc, .ci_residue
	nop

1:
	lduha	[%i1]ASI_USER, %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	sub	%i2, 0x2, %i2
	cmp	%i2, 0x2
	bgu,pt	%ncc, 1b
	add	%i0, 0x2, %i0

	brz,pt	%i2, .copyin_exit
	nop

	! Copy the residue as byte copy
.ci_residue:
	lduba	[%i1]ASI_USER, %i4
	stb	%i4, [%i0]
	inc	%i1
	deccc	%i2
	bgu,pt	%xcc, .ci_residue
	inc	%i0

.copyin_exit:
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ret
	restore	%g0, 0, %o0
.copyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_COPYIN], %g2
	jmp	%g2
	nop
2:
	retl
	mov	-1, %o0
#endif	/* NIAGARA_IMPL */
	SET_SIZE(copyin)

	ENTRY(xcopyin)
	sethi	%hi(.xcopyin_err), REAL_LOFAULT
	b	.do_copyin
	or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
.xcopyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_XCOPYIN], %g2
	jmp	%g2
	nop
2:
	retl
	mov	%g1, %o0
	SET_SIZE(xcopyin)
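
/*
 * xcopyin_little below fetches user bytes through ASI_AIUSL (the
 * little-endian user ASI), walking the source from its last byte
 * back to its first. A rough C model (illustration only):
 *
 *	for (i = 0; i < count; i++)
 *		dst[i] = src[count - 1 - i];
 *
 * which hands the big-endian kernel a byte-reversed (little-endian)
 * image of the user buffer.
 */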
	ENTRY(xcopyin_little)
	sethi	%hi(.little_err), %o4
	ldn	[THREAD_REG + T_LOFAULT], %o5
	or	%o4, %lo(.little_err), %o4
	membar	#Sync	! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]

	subcc	%g0, %o2, %o3
	add	%o0, %o2, %o0
	bz,pn	%ncc, 2f	! check for zero bytes
	sub	%o2, 1, %o4
	add	%o0, %o4, %o0	! start w/last byte
	add	%o1, %o2, %o1
	lduba	[%o0+%o3]ASI_AIUSL, %o4

1:	stb	%o4, [%o1+%o3]
	inccc	%o3
	sub	%o0, 2, %o0	! get next byte
	bcc,a,pt %ncc, 1b
	lduba	[%o0+%o3]ASI_AIUSL, %o4

2:	membar	#Sync	! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0	! return (0)

.little_err:
	membar	#Sync	! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
	SET_SIZE(xcopyin_little)


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */

	ENTRY(copyin_noerr)
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	b	.do_copyin
	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
.copyio_noerr:
	jmp	SAVED_LOFAULT
	nop
	SET_SIZE(copyin_noerr)

/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */

	ENTRY(copyout_noerr)
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	b	.do_copyout
	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
	SET_SIZE(copyout_noerr)

	.align	4
	DGDEF(use_hw_bcopy)
	.word	1
	DGDEF(use_hw_bzero)
	.word	1
	DGDEF(hw_copy_limit_1)
	.word	0x100
	DGDEF(hw_copy_limit_2)
	.word	0x200
	DGDEF(hw_copy_limit_4)
	.word	0x400
	DGDEF(hw_copy_limit_8)
	.word	0x400

	.align	64
	.section ".text"

/*
 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
 * longer than 256 bytes in length using Niagara's block stores/quad store.
 * If the criteria for using this routine are not met then it calls bzero
 * and returns 1. Otherwise 0 is returned indicating success.
 * Caller is responsible for ensuring use_hw_bzero is true and that
 * kpreempt_disable() has been called.
 */
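/*
 * The same contract as a C model (illustration only; the real code
 * below falls through to .pz_doblock when all three checks pass):
 *
 *	if ((addr & 0x3f) != 0 ||	! not block aligned
 *	    len < 0x100 ||		! under 256 bytes
 *	    (len & 0x3f) != 0) {	! not a multiple of 64
 *		bzero(addr, len);
 *		return (1);		! punted to bzero
 *	}
 *	block-initializing stores clear the region;
 *	return (0);
 */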
	! %i0 - start address
	! %i1 - length of region (multiple of 64)

	ENTRY(hwblkclr)
	save	%sp, -SA(MINFRAME), %sp

	! Must be block-aligned
	andcc	%i0, 0x3f, %g0
	bnz,pn	%ncc, 1f
	nop

	! ... and must be 256 bytes or more
	cmp	%i1, 0x100
	blu,pn	%ncc, 1f
	nop

	! ... and length must be a multiple of 64
	andcc	%i1, 0x3f, %g0
	bz,pn	%ncc, .pz_doblock
	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi

1:	! punt, call bzero but notify the caller that bzero was used
	mov	%i0, %o0
	call	bzero
	mov	%i1, %o1
	ret
	restore	%g0, 1, %o0	! return (1) - did not use block operations

	! Already verified that there are at least 256 bytes to set
.pz_doblock:
	stxa	%g0, [%i0+0x0]%asi
	stxa	%g0, [%i0+0x40]%asi
	stxa	%g0, [%i0+0x80]%asi
	stxa	%g0, [%i0+0xc0]%asi

	stxa	%g0, [%i0+0x8]%asi
	stxa	%g0, [%i0+0x10]%asi
	stxa	%g0, [%i0+0x18]%asi
	stxa	%g0, [%i0+0x20]%asi
	stxa	%g0, [%i0+0x28]%asi
	stxa	%g0, [%i0+0x30]%asi
	stxa	%g0, [%i0+0x38]%asi

	stxa	%g0, [%i0+0x48]%asi
	stxa	%g0, [%i0+0x50]%asi
	stxa	%g0, [%i0+0x58]%asi
	stxa	%g0, [%i0+0x60]%asi
	stxa	%g0, [%i0+0x68]%asi
	stxa	%g0, [%i0+0x70]%asi
	stxa	%g0, [%i0+0x78]%asi

	stxa	%g0, [%i0+0x88]%asi
	stxa	%g0, [%i0+0x90]%asi
	stxa	%g0, [%i0+0x98]%asi
	stxa	%g0, [%i0+0xa0]%asi
	stxa	%g0, [%i0+0xa8]%asi
	stxa	%g0, [%i0+0xb0]%asi
	stxa	%g0, [%i0+0xb8]%asi

	stxa	%g0, [%i0+0xc8]%asi
	stxa	%g0, [%i0+0xd0]%asi
	stxa	%g0, [%i0+0xd8]%asi
	stxa	%g0, [%i0+0xe0]%asi
	stxa	%g0, [%i0+0xe8]%asi
	stxa	%g0, [%i0+0xf0]%asi
	stxa	%g0, [%i0+0xf8]%asi

	sub	%i1, 0x100, %i1
	cmp	%i1, 0x100
	bgu,pt	%ncc, .pz_doblock
	add	%i0, 0x100, %i0

2:
	! Check if more than 64 bytes to set
	cmp	%i1, 0x40
	blu	%ncc, .pz_finish
	nop

3:
	stxa	%g0, [%i0+0x0]%asi
	stxa	%g0, [%i0+0x8]%asi
	stxa	%g0, [%i0+0x10]%asi
	stxa	%g0, [%i0+0x18]%asi
	stxa	%g0, [%i0+0x20]%asi
	stxa	%g0, [%i0+0x28]%asi
	stxa	%g0, [%i0+0x30]%asi
	stxa	%g0, [%i0+0x38]%asi

	subcc	%i1, 0x40, %i1
	bgu,pt	%ncc, 3b
	add	%i0, 0x40, %i0

.pz_finish:
	membar	#Sync
	ret
	restore	%g0, 0, %o0	! return (bzero or not)
	SET_SIZE(hwblkclr)

/*
 * Copy 32 bytes of data from src (%o0) to dst (%o1)
 * using physical addresses.
 */
	ENTRY_NP(hw_pa_bcopy32)
	rdpr	%pstate, %g1
	andn	%g1, PSTATE_IE, %g2
	wrpr	%g0, %g2, %pstate

	ldxa	[%o0]ASI_MEM, %o2
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o3
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o4
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o5
	stxa	%o2, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o3, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o4, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o5, [%o1]ASI_MEM

	membar	#Sync
	retl
	wrpr	%g0, %g1, %pstate
	SET_SIZE(hw_pa_bcopy32)

/*
 * Zero a block of storage.
 *
 * uzero is used by the kernel to zero a block in user address space.
 */

/*
 * Control flow of the bzero/kzero/uzero routine.
 *
 *	For fewer than 7 bytes stores, bytes will be zeroed.
 *
 *	For less than 15 bytes stores, align the address on 4 byte boundary.
 *	Then store as many 4-byte chunks, followed by trailing bytes.
 *
 *	For sizes greater than 15 bytes, align the address on 8 byte boundary.
 *	if (count > 128) {
 *		store as many 8-byte chunks to block align the address
 *		store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
 *		store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
 *	}
 *	Store as many 8-byte chunks, followed by trailing bytes.
 */
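
/*
 * How the entry points below juggle t_lofault, as a rough C model
 * (illustration only; %o5 carries the saved handler throughout):
 *
 *	uzero/bzero:
 *		o5 = t_lofault;
 *		if (o5 != 0)		! only shadow an existing handler
 *			t_lofault = .zeroerr;
 *	kzero:
 *		o5 = t_lofault | LOFAULT_SET;	! always install
 *		t_lofault = .zeroerr;
 *
 * On the way out (normal exit or .zeroerr), LOFAULT_SET is stripped
 * from %o5 before the old handler is put back.
 */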
	ENTRY(uzero)
	!
	! Set a new lo_fault handler only if we came in with one
	! already specified.
	!
	wr	%g0, ASI_USER, %asi
	ldn	[THREAD_REG + T_LOFAULT], %o5
	tst	%o5
	bz,pt	%ncc, .do_zero
	sethi	%hi(.zeroerr), %o2
	or	%o2, %lo(.zeroerr), %o2
	membar	#Sync
	ba,pt	%ncc, .do_zero
	stn	%o2, [THREAD_REG + T_LOFAULT]

	ENTRY(kzero)
	!
	! Always set a lo_fault handler
	!
	wr	%g0, ASI_P, %asi
	ldn	[THREAD_REG + T_LOFAULT], %o5
	sethi	%hi(.zeroerr), %o2
	or	%o5, LOFAULT_SET, %o5
	or	%o2, %lo(.zeroerr), %o2
	membar	#Sync
	ba,pt	%ncc, .do_zero
	stn	%o2, [THREAD_REG + T_LOFAULT]

/*
 * We got here because of a fault during kzero or if
 * uzero or bzero was called with t_lofault non-zero.
 * Otherwise we've already run screaming from the room.
 * Errno value is in %g1. Note that we're here iff
 * we did set t_lofault.
 */
.zeroerr:
	!
	! Undo asi register setting. Just set it to be the
	! kernel default without checking.
	!
	wr	%g0, ASI_P, %asi

	!
	! We did set t_lofault. It may well have been zero coming in.
	!
1:
	tst	%o5
	membar	#Sync
	bne,pn	%ncc, 3f
	andncc	%o5, LOFAULT_SET, %o5
2:
	!
	! Old handler was zero. Just return the error.
	!
	retl	! return
	mov	%g1, %o0	! error code from %g1
3:
	!
	! We're here because %o5 was non-zero. It was non-zero
	! because either LOFAULT_SET was present, a previous fault
	! handler was present or both. In all cases we need to reset
	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
	! before we either simply return the error or we invoke the
	! previously specified handler.
	!
	be	%ncc, 2b
	stn	%o5, [THREAD_REG + T_LOFAULT]
	jmp	%o5	! goto real handler
	nop
	SET_SIZE(kzero)
	SET_SIZE(uzero)

/*
 * Zero a block of storage.
 */

	ENTRY(bzero)
	wr	%g0, ASI_P, %asi

	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
	tst	%o5
	bz,pt	%ncc, .do_zero
	sethi	%hi(.zeroerr), %o2
	or	%o2, %lo(.zeroerr), %o2
	membar	#Sync	! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector

.do_zero:
	cmp	%o1, 7
	blu,pn	%ncc, .byteclr
	nop

	cmp	%o1, 15
	blu,pn	%ncc, .wdalign
	nop

	andcc	%o0, 7, %o3	! is addr aligned on an 8 byte bound
	bz,pt	%ncc, .blkalign	! already double aligned
	sub	%o3, 8, %o3	! -(bytes till double aligned)
	add	%o1, %o3, %o1	! update o1 with new count

1:
	stba	%g0, [%o0]%asi
	inccc	%o3
	bl,pt	%ncc, 1b
	inc	%o0

	! Now address is double aligned
.blkalign:
	cmp	%o1, 0x80	! check if there are 128 bytes to set
	blu,pn	%ncc, .bzero_small
	mov	%o1, %o3

	sethi	%hi(use_hw_bzero), %o2
	ld	[%o2 + %lo(use_hw_bzero)], %o2
	tst	%o2
	bz	%ncc, .bzero_small
	mov	%o1, %o3

	rd	%asi, %o3
	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
	cmp	%o3, ASI_P
	bne,a	%ncc, .algnblk
	wr	%g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi

.algnblk:
	andcc	%o0, 0x3f, %o3	! is block aligned?
	bz,pt	%ncc, .bzero_blk
	sub	%o3, 0x40, %o3	! -(bytes till block aligned)
	add	%o1, %o3, %o1	! o1 is the remainder

	! Clear -(%o3) bytes till block aligned
1:
	stxa	%g0, [%o0]%asi
	addcc	%o3, 8, %o3
	bl,pt	%ncc, 1b
	add	%o0, 8, %o0

.bzero_blk:
	and	%o1, 0x3f, %o3	! calc bytes left after blk clear
	andn	%o1, 0x3f, %o4	! calc size of blocks in bytes

	cmp	%o4, 0x100	! 256 bytes or more
	blu,pn	%ncc, 3f
	nop

2:
	stxa	%g0, [%o0+0x0]%asi
	stxa	%g0, [%o0+0x40]%asi
	stxa	%g0, [%o0+0x80]%asi
	stxa	%g0, [%o0+0xc0]%asi

	stxa	%g0, [%o0+0x8]%asi
	stxa	%g0, [%o0+0x10]%asi
	stxa	%g0, [%o0+0x18]%asi
	stxa	%g0, [%o0+0x20]%asi
	stxa	%g0, [%o0+0x28]%asi
	stxa	%g0, [%o0+0x30]%asi
	stxa	%g0, [%o0+0x38]%asi

	stxa	%g0, [%o0+0x48]%asi
	stxa	%g0, [%o0+0x50]%asi
	stxa	%g0, [%o0+0x58]%asi
	stxa	%g0, [%o0+0x60]%asi
	stxa	%g0, [%o0+0x68]%asi
	stxa	%g0, [%o0+0x70]%asi
	stxa	%g0, [%o0+0x78]%asi

	stxa	%g0, [%o0+0x88]%asi
	stxa	%g0, [%o0+0x90]%asi
	stxa	%g0, [%o0+0x98]%asi
	stxa	%g0, [%o0+0xa0]%asi
	stxa	%g0, [%o0+0xa8]%asi
	stxa	%g0, [%o0+0xb0]%asi
	stxa	%g0, [%o0+0xb8]%asi

	stxa	%g0, [%o0+0xc8]%asi
	stxa	%g0, [%o0+0xd0]%asi
	stxa	%g0, [%o0+0xd8]%asi
	stxa	%g0, [%o0+0xe0]%asi
	stxa	%g0, [%o0+0xe8]%asi
	stxa	%g0, [%o0+0xf0]%asi
	stxa	%g0, [%o0+0xf8]%asi

	sub	%o4, 0x100, %o4
	cmp	%o4, 0x100
	bgu,pt	%ncc, 2b
	add	%o0, 0x100, %o0

3:
	! ... check if 64 bytes to set
	cmp	%o4, 0x40
	blu	%ncc, .bzero_blk_done
	nop

4:
	stxa	%g0, [%o0+0x0]%asi
	stxa	%g0, [%o0+0x8]%asi
	stxa	%g0, [%o0+0x10]%asi
	stxa	%g0, [%o0+0x18]%asi
	stxa	%g0, [%o0+0x20]%asi
	stxa	%g0, [%o0+0x28]%asi
	stxa	%g0, [%o0+0x30]%asi
	stxa	%g0, [%o0+0x38]%asi

	subcc	%o4, 0x40, %o4
	bgu,pt	%ncc, 3b
	add	%o0, 0x40, %o0

.bzero_blk_done:
	membar	#Sync
	!
	! Undo asi register setting.
	!
	rd	%asi, %o4
	wr	%g0, ASI_P, %asi
	cmp	%o4, ASI_BLK_INIT_ST_QUAD_LDD_P
	bne,a	%ncc, .bzero_small
	wr	%g0, ASI_USER, %asi

.bzero_small:
	! Set the remaining doubles
	subcc	%o3, 8, %o3	! Can we store any doubles?
	blu,pn	%ncc, .byteclr
	and	%o1, 7, %o1	! calc bytes left after doubles

.dbclr:
	stxa	%g0, [%o0]%asi	! Clear the doubles
	subcc	%o3, 8, %o3
	bgeu,pt	%ncc, .dbclr
	add	%o0, 8, %o0

	ba	.byteclr
	nop

.wdalign:
	andcc	%o0, 3, %o3	! is addr aligned on a word boundary
	bz,pn	%ncc, .wdclr
	andn	%o1, 3, %o3	! create word sized count in %o3

	dec	%o1	! decrement count
	stba	%g0, [%o0]%asi	! clear a byte
	ba	.wdalign
	inc	%o0	! next byte

.wdclr:
	sta	%g0, [%o0]%asi	! 4-byte clearing loop
	subcc	%o3, 4, %o3
	bnz,pt	%ncc, .wdclr
	inc	4, %o0

	and	%o1, 3, %o1	! leftover count, if any

.byteclr:
	! Set the leftover bytes
	brz	%o1, .bzero_exit
	nop

7:
	deccc	%o1	! byte clearing loop
	stba	%g0, [%o0]%asi
	bgu,pt	%ncc, 7b
	inc	%o0

.bzero_exit:
	!
	! We're just concerned with whether t_lofault was set
	! when we came in. We end up here from either kzero()
	! or bzero(). kzero() *always* sets a lofault handler.
	! It ors LOFAULT_SET into %o5 to indicate it has done
	! this even if the value of %o5 is otherwise zero.
	! bzero() sets a lofault handler *only* if one was
	! previously set. Accordingly we need to examine
	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
	! before resetting the error handler.
	!
	tst	%o5
	bz	%ncc, 1f
	andn	%o5, LOFAULT_SET, %o5
	membar	#Sync	! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1:
	retl
	clr	%o0	! return (0)

	SET_SIZE(bzero)