1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #pragma ident   "%Z%%M% %I%     %E% SMI"
  28 
  29 #include <sys/param.h>
  30 #include <sys/errno.h>
  31 #include <sys/asm_linkage.h>
  32 #include <sys/vtrace.h>
  33 #include <sys/machthread.h>
  34 #include <sys/machparam.h>
  35 
  36 #if defined(lint)
  37 #include <sys/types.h>
  38 #else   /* lint */
  39 #include "assym.h"
  40 #endif  /* lint */
  41 
  42 /*
  43  * Prefetch considerations
  44  * 
  45  * We prefetch one cacheline ahead.  This may not be enough on Serengeti
  46  * systems - see default_copyout() etc which prefetch 5 lines ahead.
  47  * On the other hand, we expect most of the source buffers to be
  48  * recently used enough to be cached.
  49  *
  50  * On US-I the prefetches are inoperative.  On US-II they preload the E$;
  51  * the mainloop unrolling and load-buffer should cover loads from E$.
  52  * The stores appear to be the slow point on US-II.
  53  * 
  54  * On US-IIICu the prefetch preloads the L2$ too, but there is no load
  55  * buffer so the loads will stall for D$ miss, L2$ hit.  The hardware
  56  * auto-prefetch is not activated by integer loads.  No solution
  57  * in sight for this, barring odd games with FP read, write, integer read.
  58  * 
  59  * US-IV (Panther) appears similar to US-IIICu, except that a strong
  60  * variant of prefetch is available which can take TLB traps.  We don't
  61  * use this.  The h/w prefetch stride can be set to 64, 128 or 192,
  62  * and they only reach to the L2$ (we don't use these either).
  63  * L2$ load-to-use latency is 15 cycles (best).
  64  */
  65 
  66 
  67 /*
  68  * ip_ocsum(address, halfword_count, sum)
  69  * Do a 16 bit one's complement sum of a given number of (16-bit)
  70  * halfwords. The halfword pointer must not be odd.
  71  *      %o0 address; %o1 count; %o2 sum accumulator; %o4 temp
  72  *      %g2 and %g3 used in main loop
  73  *
  74  * (from @(#)ocsum.s 1.3 89/02/24 SMI)
  75  *
  76  */
  77 
  78 #if defined(lint) 
  79 
  80 /* ARGSUSED */
     /*
      * Lint-only stand-in for ip_ocsum: this definition is compiled only
      * when `lint` is defined (the assembly implementation is on the #else
      * side of the same conditional).  It ignores its arguments and
      * returns 0.
      */
  81 unsigned int
  82 ip_ocsum(u_short *address, int halfword_count, unsigned int sum)
  83 { return (0); }
  84 
  85 #else   /* lint */
  86 
  87         ENTRY(ip_ocsum)
  88 
     /*
      * Reviewer summary of this entry point, taken from the code below:
      *   In:   %o0 = buffer address, %o1 = count of halfwords, %o2 = sum
      *   Out:  %o0 = accumulator after the three folding adds at .short_fold
      *   Uses: %o3-%o5, %g1, %g2, %g4, %g5 as scratch; returns with retl
      *         and never executes a save.
      *
      * Dispatch on the halfword count:
      *   >= 32  branch to ip_ocsum_long, with %g1/%g4/%g5 and the
      *          rounded-down address (%o5) already set up for it
      *   <  4   .tiny: add the halfwords one at a time
      *   else   one masked leading dword, whole dwords in .short_dw, then
      *          one masked trailing dword in .short_hw
      */
  89 /*
  90  * On ttcp transmits, called once per ocsum_copyin but with a small
  91  * block ( >99.9% ).  Could be the tx hdrs?  How many acks/seg are we rxing?
  92  * On ttcp receives, called more than once per ocsum_copyout. Rx hdrs
  93  * and tx acks?
  94  *
  95  * To do: telnet and nfs traffic
  96  *
  97  * On an NCA'd webserver about 10% of the calls are >64 bytes
  98  *      about 10% of those start on a 64byte boundary
  99  *      about 30% are >5*64 bytes.
 100  * The NCA numbers & proportions don't change with h/w cksum on.
 101  *
 102  * Tx hdrs are likely to be already in cache.
 103  * Whether Rx hdrs are cached depends on whether they were already inspected.
 104  */
 105 
 106         !
 107         ! Entry point for checksum-only.
 108         ! %o0 contains buffer address
 109         ! %o1 contains count of 16bit words
 110         ! %o2 contains sum
 111         !
 112         ! %o3 temporary
 113         ! %o4 temporary
 114         ! %g1 32bit mask
 115         ! %g4 16bit mask
 116         ! %g5 64bit mask (all 1s)
 117         !
 118         not     %g0, %g5        ! all 1's
 119         prefetch [%o0], #n_reads        ! first hword, dword, cacheline
 120 
 121         clruw   %g5, %g1        ! 32 1's at low end
 122         srl     %g5, 16, %g4    ! 16 1's at low end
 123 
 124         cmp     %o1, 32         ! at least a cacheline (64 bytes)?
 125         bge,pn %icc, ip_ocsum_long      ! yes, do the whole works
 126         andn    %o0, 7, %o5     ! delay: base src addr
 127 
 128 
 129         cmp     %o1, 4          ! < 4 halfwords?
 130         bl,pn   %icc, .tiny     ! < 4 halfwords, just do them
 131         inc     8, %o5          ! delay: next addr (no matter for .tiny)
 132 
 133         /* leading dword with 1-4 hwords: 9 clocks */
 134         /* Assumes ok to read the entire dword with the leading hwords */
 135 
 136         ldx     [%o5-8], %o3    ! NB base addr
 137         sub     %o5, %o0, %g2   ! byte count: 2/4/6/8
 138         mov     %o5, %o0
 139 
             /*
              * The leading-dword mask is built with two shifts by 8/16/24/32
              * rather than one by 16/32/48/64.  For a complete dword (byte
              * count 8) the two shifts by 32 leave %o5 = 0, so the andn
              * below keeps all 64 bits.
              */
 140         sll     %g2, 2, %g2     ! 8/16/24/32 for mask
 141 
 142         sllx    %g5, %g2, %o5
 143 
 144         sllx    %o5, %g2, %o5   ! mask: 16/32/48/64 0's at low end
 145 
 146         srl     %g2, 3, %g2     ! hw count
 147         andn    %o3, %o5, %o3   ! select hw's from src
 148 
 149         srlx    %o3, 32, %o4    ! hi32
             /* Join the dword loop at 9: with %o3 and %o4 already set up. */
 150         b       9f
 151         sub     %o1, %g2, %o1   ! delay: decr count, 1-4 halfwords
 152 
 153 .short_dw:                      ! max 7 iters of 4 clocks; 1 mispred of 4
 154         ldx     [%o0], %o3      ! tmp64 = *src++ (groups with the branch)
 155 
 156         inc     8, %o0          ! (D-cache load-use delay)
 157         dec     4, %o1          ! decrement count, 4 halfwords
 158 
 159         srlx    %o3, 32, %o4    ! hi32
 160 9:      and     %o3, %g1, %o3   ! lo32
 161 
 162         add     %o4, %o2, %o2   ! accumulator
 163         andncc  %o1, 3, %g0     ! more than 3 hwords left?
 164 
 165         bnz,pt %icc, .short_dw
 166         add     %o3, %o2, %o2   ! accumulator
 167 
 168 .short_hw:                      ! trailing dw: 0-3 hwords
 169         tst     %o1             ! 0 seems fairly common...
 170         bz,a    .short_fold
 171         srlx    %o2, 32, %o4    ! delay: hi32
 172                                 ! mispredict 4 + 7 clocks for 1-3
             /*
              * 1-3 halfwords remain.  Load the whole dword at %o0 and clear
              * the low-order bits beyond those halfwords; as with the
              * leading dword, all 8 bytes are read even though only part of
              * them is summed.
              */
 173         ldx     [%o0], %o3
 174         sll     %o1, 4, %o1     ! bitcount: 16/32/48
 175 
 176         srlx    %g5, %o1, %o5   ! mask: 16/32/48  0's at high end
 177 
 178         andn    %o3, %o5, %o3   ! select hw's from src
 179 
 180         srlx    %o3, 32, %o4    ! hi32
 181         and     %o3, %g1, %o3   ! lo32
 182 
 183         add     %o4, %o2, %o2   ! accumulator
 184 
 185         add     %o3, %o2, %o2   ! accumulator
 186 
 187         ! at this point the 64-bit accumulator
 188         ! has the result that needs to be returned in 16-bits
 189         srlx    %o2, 32, %o4    ! hi32
 190 .short_fold:
             /*
              * Every route into .short_fold arrives with %o4 = %o2 >> 32,
              * computed either just above or in the delay slot of the branch
              * that came here.
              */
 191         and     %o2, %g1, %o2   ! lo32
 192 
 193         add     %o4, %o2, %o2   ! 33b
 194 
 195         srlx    %o2, 16, %o3    ! hi17
 196         and     %o2, %g4, %o2   ! lo16
 197 
 198         add     %o3, %o2, %o2   ! 18b
 199 
 200         srlx    %o2, 16, %o3    ! hi2
 201         and     %o2, %g4, %o2   ! lo16
 202 
             /*
              * NOTE(review): the add in the retl delay slot is not folded
              * again.  If the 18-bit intermediate were exactly 0x1ffff it
              * would yield 0x10000 rather than a 16-bit value; confirm that
              * callers cannot reach that case or fold the result themselves.
              */
 203         retl                    ! return
 204         add     %o3, %o2, %o0   ! 16b result in %o0
 205 
 206 .tiny:                          ! almost never: less than 4 halfwords total.
 207         tst     %o1
 208         bz,a    .short_fold
 209 
 210         srlx    %o2, 32, %o4    ! delay: hi32
 211 
             /*
              * The first lduh is issued here.  Each later one sits in the
              * annulled delay slot of the loop branch, so it executes only
              * when another iteration follows.
              */
 212         lduh    [%o0], %o3      ! tmp16 = *src++
 213 1:      
 214         inc     2, %o0
 215                                 ! stall for D-cache
 216 
 217         add     %o3, %o2, %o2   ! accumulator
 218 
 219         deccc   %o1             ! decrement count
 220         bnz,a,pt %icc, 1b
 221         lduh    [%o0], %o3      ! tmp16 = *src++
 222 
 223         ! at this point the 64-bit accumulator
 224         ! has the result that needs to be returned in 16-bits
 225         b       .short_fold
 226         srlx    %o2, 32, %o4    ! hi32
 227 
 228         SET_SIZE(ip_ocsum)      ! 64-bit version
 229 
 230 
 231         ENTRY(ip_ocsum_long)    ! 64-bit, large blocks
     /*
      * Large-block path.  ip_ocsum branches here when the count is 32
      * halfwords or more, after loading %g1, %g4 and %g5 and computing the
      * rounded-down buffer address that appears here as %i5 once the save
      * has run.  None of those registers is initialised in this routine.
      *
      * Stages:
      *   - masked leading dword, then the .dw loop, until the source
      *     address is 64-byte aligned (skipped when it already is)
      *   - main loop (label 5): 64 bytes per pass
      *   - .looptidy: finish the sums of the final pass
      *   - .postamble: remaining whole dwords, then remaining halfwords
      *   - .fold: reduce the accumulator in %i2 into %i0
      */
 232         save    %sp, -SA(MINFRAME), %sp ! get another window
 233         !
 234         ! %i0 contains buffer address
 235         ! %i1 contains count of 16bit words
 236         ! %i2 contains sum
 237         ! %i4 contains the mainloop count
 238         ! %i5 comes in with the buffer address rounded down to the first dword
 239         !
 240         ! %g1 32bit mask
 241         ! %g4 16bit mask
 242         ! %g5 64bit mask (all 1s)
 243         ! %g6 fetch-ahead offset for Ecache
 244         !
 245         ! %l0-7,%o0-5,%g2-3 mainloop temporaries
 246         !
 247         !
             /*
              * NOTE(review): %g6 is listed above, but no instruction in this
              * routine reads or writes it.
              */
 248                                 ! 1 clock overhead
 249         btst    63, %i0         ! src 64-byte aligned?
 250         bz,a,pt %icc, .mainsection      ! aligned blocks are fairly common
 251         andncc  %i1, 31, %i4    ! at least 64 bytes for main loop?
 252 
 253 
 254         ! Leading dword, with 1-4 hwords: 9 clocks
 255         ! Assumes ok to read the entire dword with the leading bytes
 256         ldx     [%i5], %l0      ! NB base addr
 257         inc     8, %i5          ! next addr
 258 
 259         sub     %i5, %i0, %l2   ! byte count: 2/4/6/8
 260         mov     %i5, %i0
 261 
             /*
              * The mask is shifted twice by 8/16/24/32.  For a complete
              * leading dword the two shifts by 32 leave %l4 = 0, so the andn
              * keeps all 64 bits.
              */
 262         sll     %l2, 2, %l2     ! 8/16/24/32 for mask
 263 
 264         sllx    %g5, %l2, %l4
 265 
 266         sllx    %l4, %l2, %l4   ! mask: 16, 32, 48, 64 0's at lsb
 267 
 268         srl     %l2, 3, %l2     ! 1/2/3/4 for count
 269         andn    %l0, %l4, %l0   ! select hw's from src
 270 
 271         srlx    %l0, 32, %o0    ! hi32
             /* Join the .dw loop at 9: with %l0 and %o0 already set up. */
 272         b       9f
 273         sub     %i1, %l2, %i1   ! decr count, 1-4 halfwords
 274 
 275         ! Do dwords until source is 64-byte aligned, 0-6 iterations
 276         ! 4 clocks per + 4 for 1 mispred = 16 clocks avg
 277 .dw:    ldx     [%i0], %l0      ! tmp64 = *src++ (groups with the branch below)
 278 
 279         inc     8, %i0          ! (Dcache load-use delay)
 280         dec     4, %i1          ! decrement count, 4 halfwords
 281 
 282         srlx    %l0, 32, %o0    ! hi32
 283 9:      and     %l0, %g1, %l0   ! lo32
 284 
 285         add     %o0, %i2, %i2   ! accumulator
 286         btst    63, %i0         ! src 64-byte aligned?
 287 
 288         bnz,pt  %icc, .dw
 289         add     %l0, %i2, %i2   ! accumulator
 290 
 291 
 292         ! At this point source address is 64 byte aligned
 293         ! and we've dealt with 1-32 halfwords.
             /*
              * The bz at .mainsection tests the flags from
              * "andncc %i1, 31, %i4", issued either on the next line or in
              * the annulled delay slot of the aligned fast-path branch near
              * the top.  %i4 is the halfword count for the main loop (a
              * multiple of 32); the delay slot of the bz leaves the
              * remainder (0-31) in %i1 for the postamble.
              */
 294         andncc  %i1, 31, %i4    ! at least 64 bytes for main loop?
 295 .mainsection:                           ! total 18n + 21 clocks
 296         bz,pn   %icc, .postamble
 297         and     %i1, 31, %i1    ! count for postamble
 298 
             /*
              * Each pass of the loop splits and sums the eight dwords loaded
              * by the previous pass (or by this preload) while loading the
              * next eight.
              */
 299         ! preload for main loop - 9 clocks assuming D$ hits at 1 per
 300         ldx     [%i0+0], %l0
 301         ldx     [%i0+8], %l1
 302         ldx     [%i0+16], %l2   ! %l0 could be used here if Dcache hit
 303         ldx     [%i0+24], %l3   !  but US-II prefetch only loads Ecache
 304         ldx     [%i0+32], %l4   !  check on US-III: could mix preloads & splits?
 305         ldx     [%i0+40], %l5
 306         ldx     [%i0+48], %l6
 307         ldx     [%i0+56], %l7
 308         inc     64, %i0
 309         prefetch [%i0], #n_reads
 310 
 311         ! main loop. Read 64 bytes at a time - 18 clocks per iteration
 312 5:      !                                       plus 4 for the exit mispredict
 313         srlx    %l0, 32, %o0            ! hi32 to %o0
 314         and     %l0, %g1, %l0           ! lo32 to %l0
 315 
 316         srlx    %l1, 32, %o1            ! hi32 to %o1
 317         and     %l1, %g1, %l1           ! lo32 to %l1
 318 
 319         srlx    %l2, 32, %o2            ! hi32 to %o2
 320         and     %l2, %g1, %l2           ! lo32 to %l2
 321 
 322         srlx    %l3, 32, %o3            ! hi32 to %o3
 323         and     %l3, %g1, %l3           ! lo32 to %l3
 324 
 325         srlx    %l4, 32, %o4            ! hi32 to %o4
 326         and     %l4, %g1, %l4           ! lo32 to %l4
 327 
 328         srlx    %l5, 32, %o5            ! hi32 to %o5
 329         and     %l5, %g1, %l5           ! lo32 to %l5
 330 
 331         srlx    %l6, 32, %g2            ! hi32 to %g2
 332         and     %l6, %g1, %l6           ! lo32 to %l6
 333 
 334         srlx    %l7, 32, %g3            ! hi32 to %g3
 335         and     %l7, %g1, %l7           ! lo32 to %l7
 336                                 ! splits gave 16 off 32b vals
             /*
              * %i4 drops by 32 halfwords per pass.  The add in the delay slot
              * of the bz is not annulled, so it runs on both paths, which is
              * why .looptidy starts with %l1.  The next 64 bytes are loaded
              * only when the branch falls through (%i4 still non-zero).
              */
 337         deccc   32, %i4         ! mv early,avoid mispredicts? nohelp US-II.
 338         bz,pn   %icc, .looptidy ! count now zero?
 339         add     %l0, %o0, %o0   ! delay
 340 
 341         ldx     [%i0+0], %l0
 342         add     %l1, %o1, %o1   ! adds and loads
 343         add     %l2, %o2, %o2
 344 
 345         ldx     [%i0+8], %l1
 346         add     %l3, %o3, %o3
 347         add     %l4, %o4, %o4
 348 
 349         ldx     [%i0+16], %l2
 350         add     %l5, %o5, %o5
 351         add     %l6, %g2, %g2
 352 
 353         ldx     [%i0+24], %l3
 354         add     %l7, %g3, %g3           ! now 8 off 33b vals
 355         add     %o0, %o1, %o0
 356 
 357         ldx     [%i0+32], %l4
 358         add     %o2, %o3, %o1
 359         add     %o4, %o5, %o2
 360 
 361         ldx     [%i0+40], %l5
 362         add     %g2, %g3, %o3           ! now 4 off 34b vals
 363         add     %o0, %o1, %o0
 364 
 365         ldx     [%i0+48], %l6
 366         add     %o2, %o3, %o1           ! 2 off 35b
 367 
 368         ldx     [%i0+56], %l7
 369         add     %o0, %o1, %o0           ! 36b
 370         inc     64, %i0         ! increment source address
 371 
 372         add     %o0, %i2, %i2   ! accumulator
 373         ba      5b
 374         prefetch [%i0], #n_reads        ! next cacheline
 375                                 ! end of main loop
 376 .looptidy:      ! compute remaining partial sum - 8 clocks
 377         add     %l1, %o1, %o1
 378         add     %l2, %o2, %o2
 379 
 380         add     %l3, %o3, %o3
 381         add     %l4, %o4, %o4
 382 
 383         add     %l5, %o5, %o5
 384         add     %l6, %g2, %g2
 385 
 386         add     %l7, %g3, %g3           ! 8 x 33b
 387         add     %o0, %o1, %o0
 388 
 389         add     %o2, %o3, %o1
 390         add     %o4, %o5, %o2
 391 
 392         add     %g2, %g3, %o3           ! 4 x 34b
 393         add     %o0, %o1, %o0
 394 
 395         add     %o2, %o3, %o1           ! 2 x 35b
 396         add     %o0, %i2, %i2   ! accumulator
 397 
 398         add     %o1, %i2, %i2   ! accumulator
 399 
 400 
 401 .postamble:
 402         ! postamble hword count is in %i1 (can be zero)
 403         ! while at least 1 dword, do dwords.   Max 7 iterations.
 404         andncc  %i1, 3, %g0     ! more than 3 hwords?
             /*
              * "tst %i1" sits in an annulled delay slot: it executes only
              * when the branch to .dotail_hw is taken, and supplies the flags
              * tested there.  The fall-through route out of loop 8 sets them
              * with its own tst before reaching .dotail_hw.
              */
 405 .dotail_dw:
 406         bz,a,pn %icc, .dotail_hw
 407         tst     %i1             ! delay: any at all left?
 408 8:      
 409         ldx     [%i0], %l0      ! tmp64 = *src++
 410         inc     8, %i0
 411         dec     4, %i1          ! decrement count, 4 halfwords
 412 
 413                                 ! stall for D-cache
 414 
 415         srlx    %l0, 32, %o0    ! hi32
 416         and     %l0, %g1, %l0   ! lo32
 417 
 418         add     %o0, %i2, %i2   ! accumulator
 419 
 420         andncc  %i1, 3, %g0     ! more than 3 hwords?
 421         bnz,pt  %icc, 8b
 422         add     %l0, %i2, %i2   ! accumulator
 423 
 424         ! while at least 1 hword, do hwords.   Max 3 iterations.
 425         tst     %i1
 426 .dotail_hw:
 427         bz,a    .fold
 428         srlx    %i2, 32, %o0    ! delay: hi32
             /*
              * The first lduh is issued here.  Each later one sits in the
              * annulled delay slot of the loop branch, so it executes only
              * when another iteration follows.
              */
 429         lduh    [%i0], %l0      ! tmp16 = *src++
 430 1:      
 431         inc     2, %i0
 432                                 ! stall for D-cache
 433 
 434         add     %l0, %i2, %i2   ! accumulator
 435 
 436         deccc   %i1             ! decrement count
 437         bnz,a,pt %icc, 1b
 438         lduh    [%i0], %l0      ! tmp16 = *src++
 439 
 440         ! at this point the 64-bit accumulator
 441         ! has the result that needs to be returned in 16-bits
 442         srlx    %i2, 32, %o0    ! hi32
 443 .fold:
             /*
              * Both routes into .fold arrive with %o0 = %i2 >> 32, computed
              * either just above or in the delay slot of the branch at
              * .dotail_hw.
              */
 444         and     %i2, %g1, %o1   ! lo32
 445 
 446         add     %o0, %o1, %o0   ! 33b
 447 
 448         srlx    %o0, 16, %o1    ! hi17
 449         and     %o0, %g4, %o0   ! lo16
 450 
 451         add     %o1, %o0, %o0   ! 18b
 452 
 453         srlx    %o0, 16, %o1    ! hi2
 454         and     %o0, %g4, %o0   ! lo16
 455 
             /*
              * NOTE(review): this last add is not folded again.  If the
              * 18-bit intermediate were exactly 0x1ffff it would yield
              * 0x10000 rather than a 16-bit value; confirm that callers
              * cannot reach that case or fold the result themselves.
              * The result is written to %i0, which the caller sees as %o0
              * after the restore.
              */
 456         add     %o1, %o0, %i0   ! 16b result in %i0
 457 
 458         ret                     ! return
 459         restore
 460 
 461 
 462         SET_SIZE(ip_ocsum_long) ! 64-bit version
 463 
 464 #endif  /* lint */