/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/machparam.h>

#if defined(lint)
#include <sys/types.h>
#else	/* lint */
#include "assym.h"
#endif	/* lint */

/*
 * Prefetch considerations
 *
 * We prefetch one cacheline ahead. This may not be enough on Serengeti
 * systems - see default_copyout() etc which prefetch 5 lines ahead.
 * On the other hand, we expect most of the source buffers to be
 * recently used enough to be cached.
 *
 * On US-I the prefetches are inoperative. On US-II they preload the E$;
 * the mainloop unrolling and load-buffer should cover loads from E$.
 * The stores appear to be the slow point on US-II.
 *
 * On US-IIICu the prefetch preloads the L2$ too, but there is no load
 * buffer so the loads will stall for D$ miss, L2$ hit. The hardware
 * auto-prefetch is not activated by integer loads. No solution
 * in sight for this, barring odd games with FP read, write, integer read.
 *
 * US-IV (Panther) appears similar to US-IIICu, except that a strong
 * variant of prefetch is available which can take TLB traps. We don't
 * use this. The h/w prefetch stride can be set to 64, 128 or 192,
 * and they only reach to the L2$ (we don't use these either).
 * L2$ load-to-use latency is 15 cycles (best).
 */


/*
 * ip_ocsum(address, halfword_count, sum)
 * Do a 16 bit one's complement sum of a given number of (16-bit)
 * halfwords. The halfword pointer must not be odd.
 *	%o0 address; %o1 count; %o2 sum accumulator; %o4 temp
 *	%g2 and %g3 used in main loop
 *
 * The incoming sum seeds the 64-bit accumulator, 32-bit pieces of the
 * data are added into it, and the total is folded with end-around carry
 * down to 16 bits and returned in %o0.
 *
 * Counts of 32 halfwords or more are handed to ip_ocsum_long below;
 * smaller counts are summed here without taking a register window.
 * Both paths may load a whole aligned dword that contains the first or
 * last halfwords of the buffer and mask off the bytes outside it.
 *
 * (from @(#)ocsum.s 1.3 89/02/24 SMI)
 *
 */

#if defined(lint)

/* ARGSUSED */
unsigned int
ip_ocsum(u_short *address, int halfword_count, unsigned int sum)
{ return (0); }

#else	/* lint */

	ENTRY(ip_ocsum)

/*
 * On ttcp transmits, called once per ocsum_copyin but with a small
 * block ( >99.9% ). Could be the tx hdrs? How many acks/seg are we rxing?
 * On ttcp receives, called more than once per ocsum_copyout. Rx hdrs
 * and tx acks?
 *
 * To do: telnet and nfs traffic
 *
 * On an NCA'd webserver about 10% of the calls are >64 bytes
 *	about 10% of those start on a 64byte boundary
 *	about 30% are >5*64 bytes.
 * The NCA numbers & proportions don't change with h/w cksum on.
 *
 * Tx hdrs are likely to be already in cache.
 * Rx hdrs depends if already inspected.
 */

	!
	! Entry point for checksum-only.
	! %o0 contains buffer address
	! %o1 contains count of 16bit words
	! %o2 contains sum
	!
	! %o3 temporary
	! %o4 temporary
	! %g1 32bit mask
	! %g4 16bit mask
	! %g5 64bit mask (all 1s)
	!
	not	%g0, %g5		! all 1's
	prefetch [%o0], #n_reads	! first hword, dword, cacheline

	clruw	%g5, %g1		! 32 1's at low end
	srl	%g5, 16, %g4		! 16 1's at low end

	cmp	%o1, 32			! at least a cacheline (64 bytes)?
	bge,pn	%icc, ip_ocsum_long	! yes, do the whole works
	andn	%o0, 7, %o5		! delay: base src addr


	cmp	%o1, 4			! < 4 halfwords?
	bl,pn	%icc, .tiny		! < 4 halfwords, just do them
	inc	8, %o5			! delay: next addr (no matter for .tiny)

	/* leading dword with 1-4 hwords: 9 clocks */
	/* Assumes ok to read the entire dword with the leading hwords */

	ldx	[%o5-8], %o3		! NB base addr
	sub	%o5, %o0, %g2		! byte count: 2/4/6/8
	mov	%o5, %o0

	sll	%g2, 2, %g2		! 8/16/24/32 for mask

	! The mask is built with two shifts of half the distance each,
	! so that the full-dword case ends up shifting out all 64 bits.
	sllx	%g5, %g2, %o5

	sllx	%o5, %g2, %o5		! mask: 16/32/48/64 0's at low end

	srl	%g2, 3, %g2		! hw count
	andn	%o3, %o5, %o3		! select hw's from src

	srlx	%o3, 32, %o4		! hi32
	b	9f
	sub	%o1, %g2, %o1		! delay: decr count, 1-4 halfwords

.short_dw:			! max 7 iters of 4 clocks; 1 mispred of 4
	ldx	[%o0], %o3		! tmp64 = *src++ (groups with the branch)

	inc	8, %o0			! (D-cache load-use delay)
	dec	4, %o1			! decrement count, 4 halfwords

	srlx	%o3, 32, %o4		! hi32
9:	and	%o3, %g1, %o3		! lo32

	add	%o4, %o2, %o2		! accumulator
	andncc	%o1, 3, %g0		! more than 3 hwords left?

	bnz,pt	%icc, .short_dw
	add	%o3, %o2, %o2		! accumulator

.short_hw:			! trailing dw: 0-3 hwords
	tst	%o1			! 0 seems fairly common...
	bz,a	.short_fold
	srlx	%o2, 32, %o4		! delay: hi32
					! mispredict 4 + 7 clocks for 1-3
	ldx	[%o0], %o3
	sll	%o1, 4, %o1		! bitcount: 16/32/48

	srlx	%g5, %o1, %o5		! mask: 16/32/48 0's at high end

	andn	%o3, %o5, %o3		! select hw's from src

	srlx	%o3, 32, %o4		! hi32
	and	%o3, %g1, %o3		! lo32

	add	%o4, %o2, %o2		! accumulator

	add	%o3, %o2, %o2		! accumulator

	! at this point the 64-bit accumulator
	! has the result that needs to be returned in 16-bits
	srlx	%o2, 32, %o4		! hi32
.short_fold:
	and	%o2, %g1, %o2		! lo32

	add	%o4, %o2, %o2		! 33b

	srlx	%o2, 16, %o3		! hi17
	and	%o2, %g4, %o2		! lo16

	add	%o3, %o2, %o2		! 18b

	srlx	%o2, 16, %o3		! hi2
	and	%o2, %g4, %o2		! lo16

	! hi2 + lo16 is 0x10000 when the 18b value was 0x1ffff, so one
	! more end-around carry is needed for a true 16 bit result.
	add	%o3, %o2, %o2		! 17b: at most 0x10000

	srlx	%o2, 16, %o3		! hi1
	and	%o2, %g4, %o2		! lo16

	retl				! return
	add	%o3, %o2, %o0		! 16b result in %o0

.tiny:				! almost never: less than 4 halfwords total.
	tst	%o1
	bz,a	.short_fold

	srlx	%o2, 32, %o4		! delay: hi32

	lduh	[%o0], %o3		! tmp16 = *src++
1:
	inc	2, %o0
					! stall for D-cache

	add	%o3, %o2, %o2		! accumulator

	deccc	%o1			! decrement count
	bnz,a,pt %icc, 1b
	lduh	[%o0], %o3		! tmp16 = *src++

	! at this point the 64-bit accumulator
	! has the result that needs to be returned in 16-bits
	b	.short_fold
	srlx	%o2, 32, %o4		! hi32

	SET_SIZE(ip_ocsum)		! 64-bit version


/*
 * ip_ocsum_long: large-block continuation of ip_ocsum.
 *
 * In this file it is entered by the branch from ip_ocsum, not by a call
 * of its own.  It relies on state that ip_ocsum set up before branching:
 * the masks in %g1, %g4 and %g5, and the source address rounded down to
 * a dword in %o5 (seen here as %i5 after the save).
 *
 * It sums leading dwords until the source is 64-byte aligned, then runs
 * an unrolled loop over 64-byte lines, then handles the trailing dwords
 * and halfwords, and finally folds the accumulator to 16 bits.
 */
	ENTRY(ip_ocsum_long)		! 64-bit, large blocks
	save	%sp, -SA(MINFRAME), %sp	! get another window
	!
	! %i0 contains buffer address
	! %i1 contains count of 16bit words
	! %i2 contains sum
	! %i4 contains the mainloop count
	! %i5 comes in with the buffer address rounded down to the first dword
	!
	! %g1 32bit mask
	! %g4 16bit mask
	! %g5 64bit mask (all 1s)
	! %g6 fetch-ahead offset for Ecache (not referenced by the code below)
	!
	! %l0-7,%o0-5,%g2-3 mainloop temporaries
	!
	!
					! 1 clock overhead
	btst	63, %i0			! src 64-byte aligned?
	bz,a,pt	%icc, .mainsection	! aligned blocks are fairly common
	andncc	%i1, 31, %i4		! at least 64 bytes for main loop?


	! Leading dword, with 1-4 hwords: 9 clocks
	! Assumes ok to read the entire dword with the leading bytes
	ldx	[%i5], %l0		! NB base addr
	inc	8, %i5			! next addr

	sub	%i5, %i0, %l2		! byte count: 2/4/6/8
	mov	%i5, %i0

	sll	%l2, 2, %l2		! 8/16/24/32 for mask

	! Two half-distance shifts, so the full-dword case clears all 64 bits.
	sllx	%g5, %l2, %l4

	sllx	%l4, %l2, %l4		! mask: 16, 32, 48, 64 0's at lsb

	srl	%l2, 3, %l2		! 1/2/3/4 for count
	andn	%l0, %l4, %l0		! select hw's from src

	srlx	%l0, 32, %o0		! hi32
	b	9f
	sub	%i1, %l2, %i1		! decr count, 1-4 halfwords

	! Do dwords until source is 64-byte aligned, 0-6 iterations
	! 4 clocks per + 4 for 1 mispred = 16 clocks avg
.dw:	ldx	[%i0], %l0		! tmp64 = *src++ (groups with the branch below)

	inc	8, %i0			! (Dcache load-use delay)
	dec	4, %i1			! decrement count, 4 halfwords

	srlx	%l0, 32, %o0		! hi32
9:	and	%l0, %g1, %l0		! lo32

	add	%o0, %i2, %i2		! accumulator
	btst	63, %i0			! src 64-byte aligned?

	bnz,pt	%icc, .dw
	add	%l0, %i2, %i2		! accumulator


	! At this point source address is 64 byte aligned
	! and we've dealt with 1-32 halfwords.
	andncc	%i1, 31, %i4		! at least 64 bytes for main loop?
.mainsection:			! total 18n + 21 clocks
	bz,pn	%icc, .postamble
	and	%i1, 31, %i1		! count for postamble

	! preload for main loop - 9 clocks assuming D$ hits at 1 per
	ldx	[%i0+0], %l0
	ldx	[%i0+8], %l1
	ldx	[%i0+16], %l2		! %l0 could be used here if Dcache hit
	ldx	[%i0+24], %l3		! but US-II prefetch only loads Ecache
	ldx	[%i0+32], %l4		! check on US-III: could mix preloads & splits?
	ldx	[%i0+40], %l5
	ldx	[%i0+48], %l6
	ldx	[%i0+56], %l7
	inc	64, %i0
	prefetch [%i0], #n_reads

	! main loop. Read 64 bytes at a time - 18 clocks per iteration
5:				! plus 4 for the exit mispredict
	srlx	%l0, 32, %o0		! hi32 to %o0
	and	%l0, %g1, %l0		! lo32 to %l0

	srlx	%l1, 32, %o1		! hi32 to %o1
	and	%l1, %g1, %l1		! lo32 to %l1

	srlx	%l2, 32, %o2		! hi32 to %o2
	and	%l2, %g1, %l2		! lo32 to %l2

	srlx	%l3, 32, %o3		! hi32 to %o3
	and	%l3, %g1, %l3		! lo32 to %l3

	srlx	%l4, 32, %o4		! hi32 to %o4
	and	%l4, %g1, %l4		! lo32 to %l4

	srlx	%l5, 32, %o5		! hi32 to %o5
	and	%l5, %g1, %l5		! lo32 to %l5

	srlx	%l6, 32, %g2		! hi32 to %g2
	and	%l6, %g1, %l6		! lo32 to %l6

	srlx	%l7, 32, %g3		! hi32 to %g3
	and	%l7, %g1, %l7		! lo32 to %l7
					! splits gave 16 off 32b vals
	deccc	32, %i4			! mv early,avoid mispredicts? nohelp US-II.
	bz,pn	%icc, .looptidy		! count now zero?
	add	%l0, %o0, %o0		! delay

	! Not the last line: finish this line's adds while loading the next.
	ldx	[%i0+0], %l0
	add	%l1, %o1, %o1		! adds and loads
	add	%l2, %o2, %o2

	ldx	[%i0+8], %l1
	add	%l3, %o3, %o3
	add	%l4, %o4, %o4

	ldx	[%i0+16], %l2
	add	%l5, %o5, %o5
	add	%l6, %g2, %g2

	ldx	[%i0+24], %l3
	add	%l7, %g3, %g3		! now 8 off 33b vals
	add	%o0, %o1, %o0

	ldx	[%i0+32], %l4
	add	%o2, %o3, %o1
	add	%o4, %o5, %o2

	ldx	[%i0+40], %l5
	add	%g2, %g3, %o3		! now 4 off 34b vals
	add	%o0, %o1, %o0

	ldx	[%i0+48], %l6
	add	%o2, %o3, %o1		! 2 off 35b

	ldx	[%i0+56], %l7
	add	%o0, %o1, %o0		! 36b
	inc	64, %i0			! increment source address

	add	%o0, %i2, %i2		! accumulator
	ba	5b
	prefetch [%i0], #n_reads	! next cacheline
					! end of main loop
.looptidy:			! compute remaining partial sum - 8 clocks
	add	%l1, %o1, %o1
	add	%l2, %o2, %o2

	add	%l3, %o3, %o3
	add	%l4, %o4, %o4

	add	%l5, %o5, %o5
	add	%l6, %g2, %g2

	add	%l7, %g3, %g3		! 8 x 33b
	add	%o0, %o1, %o0

	add	%o2, %o3, %o1
	add	%o4, %o5, %o2

	add	%g2, %g3, %o3		! 4 x 34b
	add	%o0, %o1, %o0

	add	%o2, %o3, %o1		! 2 x 35b
	add	%o0, %i2, %i2		! accumulator

	add	%o1, %i2, %i2		! accumulator


.postamble:
	! postamble hword count is in %i1 (can be zero)
	! while at least 1 dword, do dwords. Max 7 iterations.
	andncc	%i1, 3, %g0		! more than 3 hwords?
.dotail_dw:
	bz,a,pn	%icc, .dotail_hw
	tst	%i1			! delay: any at all left?
8:
	ldx	[%i0], %l0		! tmp64 = *src++
	inc	8, %i0
	dec	4, %i1			! decrement count, 4 halfwords

					! stall for D-cache

	srlx	%l0, 32, %o0		! hi32
	and	%l0, %g1, %l0		! lo32

	add	%o0, %i2, %i2		! accumulator

	andncc	%i1, 3, %g0		! more than 3 hwords?
	bnz,pt	%icc, 8b
	add	%l0, %i2, %i2		! accumulator

	! while at least 1 hword, do hwords. Max 3 iterations.
	tst	%i1
.dotail_hw:
	bz,a	.fold
	srlx	%i2, 32, %o0		! delay: hi32
	lduh	[%i0], %l0		! tmp16 = *src++
1:
	inc	2, %i0
					! stall for D-cache

	add	%l0, %i2, %i2		! accumulator

	deccc	%i1			! decrement count
	bnz,a,pt %icc, 1b
	lduh	[%i0], %l0		! tmp16 = *src++

	! at this point the 64-bit accumulator
	! has the result that needs to be returned in 16-bits
	srlx	%i2, 32, %o0		! hi32
.fold:
	and	%i2, %g1, %o1		! lo32

	add	%o0, %o1, %o0		! 33b

	srlx	%o0, 16, %o1		! hi17
	and	%o0, %g4, %o0		! lo16

	add	%o1, %o0, %o0		! 18b

	srlx	%o0, 16, %o1		! hi2
	and	%o0, %g4, %o0		! lo16

	! hi2 + lo16 is 0x10000 when the 18b value was 0x1ffff, so one
	! more end-around carry is needed for a true 16 bit result.
	add	%o1, %o0, %o0		! 17b: at most 0x10000

	srlx	%o0, 16, %o1		! hi1
	and	%o0, %g4, %o0		! lo16

	add	%o1, %o0, %i0		! 16b result in %i0

	ret				! return
	restore


	SET_SIZE(ip_ocsum_long)		! 64-bit version

#endif	/* lint */