1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #include <sys/param.h>
  28 #include <sys/errno.h>
  29 #include <sys/asm_linkage.h>
  30 #include <sys/vtrace.h>
  31 #include <sys/machthread.h>
  32 #include <sys/machparam.h>
  33 
  34 #include "assym.h"
  35 
  36 /*
  37  * Prefetch considerations
  38  * 
  39  * We prefetch one cacheline ahead.  This may not be enough on Serengeti
  40  * systems - see default_copyout() etc which prefetch 5 lines ahead.
  41  * On the other hand, we expect most of the source buffers to be
  42  * recently used enough to be cached.
  43  *
  44  * On US-I the prefetches are inoperative.  On US-II they preload the E$;
  45  * the mainloop unrolling and load-buffer should cover loads from E$.
  46  * The stores appear to be the slow point on US-II.
  47  * 
  48  * On US-IIICu the prefetch preloads the L2$ too, but there is no load
  49  * buffer so the loads will stall for D$ miss, L2$ hit.  The hardware
  50  * auto-prefetch is not activated by integer loads.  No solution
  51  * in sight for this, barring odd games with FP read, write, integer read.
  52  * 
  53  * US-IV+ (Panther) appears similar to US-IIICu, except that a strong
  54  * variant of prefetch is available which can take TLB traps.  We don't
  55  * use this.  The h/w prefetch stride can be set to 64, 128 or 192,
  56  * and they only reach to the L2$ (we don't use these either).
  57  * L2$ load-to-use latency is 15 cycles (best).
  58  */
  59 
  60 
  61 /*
  62  * ip_ocsum(address, halfword_count, sum)
  63  * Do a 16 bit one's complement sum of a given number of (16-bit)
  64  * halfwords. The halfword pointer must not be odd.
  65  *      %o0 address; %o1 count; %o2 sum accumulator; %o3-%o5 temps
  66  *      %g1/%g4/%g5 masks; %g2 temp; %g2, %g3 also used in the long main loop
  67  * The folded sum is returned in %o0; counts >= 32 go to ip_ocsum_long.
  68  * (from @(#)ocsum.s 1.3 89/02/24 SMI)
  69  *
  70  */
  71 
  72         ENTRY(ip_ocsum)
  73 
  74 /*
  75  * On ttcp transmits, called once per ocsum_copyin but with a small
  76  * block ( >99.9% ).  Could be the tx hdrs?  How many acks/seg are we rxing?
  77  * On ttcp receives, called more than once per ocsum_copyout.  Rx hdrs
  78  * and tx acks?
  79  *
  80  * To do: measure telnet and nfs traffic
  81  *
  82  * On an NCA'd webserver about 10% of the calls are >64 bytes
  83  *      about 10% of those start on a 64byte boundary
  84  *      about 30% are >5*64 bytes.
  85  * The NCA numbers & proportions don't change with h/w cksum on.
  86  *
  87  * Tx hdrs are likely to be already in cache.
  88  * Rx hdrs: depends on whether they have already been inspected.
  89  */
  90 
  91         !
  92         ! Entry point for checksum-only.
  93         ! %o0 contains buffer address (advanced as data is consumed)
  94         ! %o1 contains count of 16bit words still to be summed
  95         ! %o2 contains sum (used as a 64-bit accumulator until the final fold)
  96         ! %o5 dword-aligned source address; later reused for the select masks
  97         ! %o3 temporary (loaded data, then its lo32 half)
  98         ! %o4 temporary (hi32 half)
  99         ! %g1 32bit mask
 100         ! %g4 16bit mask
 101         ! %g5 64bit mask (all 1s)
 102         ! %g2 temporary (leading-dword byte count / shift count / hw count)
 103         not     %g0, %g5        ! %g5 = ~0: all 1's
 104         prefetch [%o0], #n_reads        ! line holding the first hword/dword
 105 
 106         clruw   %g5, %g1        ! %g1: 32 1's at low end
 107         srl     %g5, 16, %g4    ! %g4: 16 1's at low end
 108 
 109         cmp     %o1, 32         ! at least 32 hwords, i.e. a cacheline (64 bytes)?
 110         bge,pn %icc, ip_ocsum_long      ! yes, use the unrolled large-block code
 111         andn    %o0, 7, %o5     ! delay: base src addr (dword aligned); %i5 in ip_ocsum_long
 112 
 113 
 114         cmp     %o1, 4          ! < 4 halfwords?
 115         bl,pn   %icc, .tiny     ! < 4 halfwords, just do them one at a time
 116         inc     8, %o5          ! delay: next dword addr (no matter for .tiny)
 117 
 118         /* leading dword with 1-4 hwords: 9 clocks */
 119         /* Assumes ok to read the entire aligned dword holding the leading hwords */
 120 
 121         ldx     [%o5-8], %o3    ! NB load from the aligned base addr, not from %o0
 122         sub     %o5, %o0, %g2   ! byte count: 2/4/6/8
 123         mov     %o5, %o0        ! src = next dword boundary
 124 
 125         sll     %g2, 2, %g2     ! 8/16/24/32 for mask (half the bit count)
 126 
 127         sllx    %g5, %g2, %o5   ! shift twice by %g2: the total may be 64, too big for one sllx
 128 
 129         sllx    %o5, %g2, %o5   ! mask: 16/32/48/64 0's at low end
 130 
 131         srl     %g2, 3, %g2     ! hw count: 1/2/3/4
 132         andn    %o3, %o5, %o3   ! select hw's from src (keeps the low-order bits)
 133 
 134         srlx    %o3, 32, %o4    ! hi32
 135         b       9f              ! join the dword loop after its load/split
 136         sub     %o1, %g2, %o1   ! delay: decr count, 1-4 halfwords
 137 
 138 .short_dw:                      ! max 7 iters of 4 clocks; 1 mispred of 4
 139         ldx     [%o0], %o3      ! tmp64 = *src++ (groups with the branch)
 140 
 141         inc     8, %o0          ! (D-cache load-use delay)
 142         dec     4, %o1          ! decrement count, 4 halfwords
 143 
 144         srlx    %o3, 32, %o4    ! hi32
 145 9:      and     %o3, %g1, %o3   ! lo32
 146 
 147         add     %o4, %o2, %o2   ! accumulator += hi32
 148         andncc  %o1, 3, %g0     ! more than 3 hwords left?
 149 
 150         bnz,pt %icc, .short_dw
 151         add     %o3, %o2, %o2   ! delay: accumulator += lo32 (runs on both paths)
 152 
 153 .short_hw:                      ! trailing dw: 0-3 hwords
 154         tst     %o1             ! 0 seems fairly common...
 155         bz,a    .short_fold
 156         srlx    %o2, 32, %o4    ! delay: hi32 (annulled unless the branch is taken)
 157                                 ! mispredict 4 + 7 clocks for 1-3
 158         ldx     [%o0], %o3      ! whole dword holding the trailing hwords
 159         sll     %o1, 4, %o1     ! bitcount: 16/32/48
 160 
 161         srlx    %g5, %o1, %o5   ! mask: 16/32/48  0's at high end
 162 
 163         andn    %o3, %o5, %o3   ! select hw's from src (keeps the high-order bits)
 164 
 165         srlx    %o3, 32, %o4    ! hi32
 166         and     %o3, %g1, %o3   ! lo32
 167 
 168         add     %o4, %o2, %o2   ! accumulator += hi32
 169 
 170         add     %o3, %o2, %o2   ! accumulator += lo32
 171 
 172         ! at this point the 64-bit accumulator
 173         ! has the result that needs to be returned in 16-bits
 174         srlx    %o2, 32, %o4    ! hi32
 175 .short_fold:                    ! entered with %o4 = hi32 of the accumulator
 176         and     %o2, %g1, %o2   ! lo32
 177 
 178         add     %o4, %o2, %o2   ! 33b
 179 
 180         srlx    %o2, 16, %o3    ! hi17
 181         and     %o2, %g4, %o2   ! lo16
 182 
 183         add     %o3, %o2, %o2   ! 18b
 184 
 185         srlx    %o2, 16, %o3    ! hi2
 186         and     %o2, %g4, %o2   ! lo16
 187         ! NOTE(review): a carry out of the final add below is not folded again - confirm
 188         retl                    ! return (leaf routine, no register window)
 189         add     %o3, %o2, %o0   ! delay: 16b result in %o0
 190 
 191 .tiny:                          ! almost never: less than 4 halfwords total.
 192         tst     %o1
 193         bz,a    .short_fold     ! nothing to add: just fold the incoming sum
 194 
 195         srlx    %o2, 32, %o4    ! delay: hi32 (annulled unless the branch is taken)
 196 
 197         lduh    [%o0], %o3      ! tmp16 = *src++
 198 1:      
 199         inc     2, %o0
 200                                 ! stall for D-cache
 201 
 202         add     %o3, %o2, %o2   ! accumulator
 203 
 204         deccc   %o1             ! decrement count
 205         bnz,a,pt %icc, 1b
 206         lduh    [%o0], %o3      ! delay: tmp16 = *src++ (annulled on loop exit)
 207 
 208         ! at this point the 64-bit accumulator
 209         ! has the result that needs to be returned in 16-bits
 210         b       .short_fold
 211         srlx    %o2, 32, %o4    ! delay: hi32
 212 
 213         SET_SIZE(ip_ocsum)      ! 64-bit version
 214 
 215 
 216         ENTRY(ip_ocsum_long)    ! 64-bit, large blocks
 217         save    %sp, -SA(MINFRAME), %sp ! get another window
 218         ! Reached by branch from ip_ocsum when the count is 32 hwords or more.
 219         ! %i0 contains buffer address
 220         ! %i1 contains count of 16bit words
 221         ! %i2 contains sum
 222         ! %i4 contains the mainloop count
 223         ! %i5 comes in with the buffer address rounded down to the first dword
 224         !
 225         ! %g1 32bit mask
 226         ! %g4 16bit mask
 227         ! %g5 64bit mask (all 1s)
 228         ! %g6 listed as fetch-ahead offset for Ecache, but unused in the code below
 229         !
 230         ! %l0-7,%o0-5,%g2-3 mainloop temporaries
 231         ! %g1, %g4, %g5 and %i5 are not set here: they are expected to hold
 232         ! the values ip_ocsum computed before branching (its %o5 is our %i5).
 233                                 ! 1 clock overhead
 234         btst    63, %i0         ! src 64-byte aligned?
 235         bz,a,pt %icc, .mainsection      ! aligned blocks are fairly common
 236         andncc  %i1, 31, %i4    ! delay (only if taken): at least 64 bytes for main loop?
 237 
 238 
 239         ! Leading dword, with 1-4 hwords: 9 clocks
 240         ! Assumes ok to read the entire aligned dword holding the leading bytes
 241         ldx     [%i5], %l0      ! NB load from the aligned base addr, not from %i0
 242         inc     8, %i5          ! next addr
 243 
 244         sub     %i5, %i0, %l2   ! byte count: 2/4/6/8
 245         mov     %i5, %i0        ! src = next dword boundary
 246 
 247         sll     %l2, 2, %l2     ! 8/16/24/32 for mask (half the bit count)
 248 
 249         sllx    %g5, %l2, %l4   ! shift twice by %l2: the total may be 64, too big for one sllx
 250 
 251         sllx    %l4, %l2, %l4   ! mask: 16, 32, 48, 64 0's at lsb
 252 
 253         srl     %l2, 3, %l2     ! 1/2/3/4 for count
 254         andn    %l0, %l4, %l0   ! select hw's from src (keeps the low-order bits)
 255 
 256         srlx    %l0, 32, %o0    ! hi32
 257         b       9f              ! join the dword loop after its load/split
 258         sub     %i1, %l2, %i1   ! delay: decr count, 1-4 halfwords
 259 
 260         ! Do dwords until source is 64-byte aligned, 0-7 iterations
 261         ! 4 clocks per + 4 for 1 mispred = 16 clocks avg
 262 .dw:    ldx     [%i0], %l0      ! tmp64 = *src++ (groups with the branch below)
 263 
 264         inc     8, %i0          ! (Dcache load-use delay)
 265         dec     4, %i1          ! decrement count, 4 halfwords
 266 
 267         srlx    %l0, 32, %o0    ! hi32
 268 9:      and     %l0, %g1, %l0   ! lo32
 269 
 270         add     %o0, %i2, %i2   ! accumulator += hi32
 271         btst    63, %i0         ! src 64-byte aligned?
 272 
 273         bnz,pt  %icc, .dw
 274         add     %l0, %i2, %i2   ! delay: accumulator += lo32 (runs on both paths)
 275 
 276 
 277         ! At this point source address is 64 byte aligned
 278         ! and we've dealt with 1-31 halfwords.
 279         andncc  %i1, 31, %i4    ! at least 64 bytes for main loop?
 280 .mainsection:                           ! total 18n + 21 clocks
 281         bz,pn   %icc, .postamble        ! %i4 == 0: no full cacheline left
 282         and     %i1, 31, %i1    ! delay: count for postamble (runs on both paths)
 283 
 284         ! preload for main loop - 9 clocks assuming D$ hits at 1 per
 285         ldx     [%i0+0], %l0
 286         ldx     [%i0+8], %l1
 287         ldx     [%i0+16], %l2   ! %l0 could be used here if Dcache hit
 288         ldx     [%i0+24], %l3   !  but US-II prefetch only loads Ecache
 289         ldx     [%i0+32], %l4   !  check on US-III: could mix preloads & splits?
 290         ldx     [%i0+40], %l5
 291         ldx     [%i0+48], %l6
 292         ldx     [%i0+56], %l7
 293         inc     64, %i0         ! src now points at the next cacheline
 294         prefetch [%i0], #n_reads        ! and start fetching it
 295 
 296         ! main loop. Read 64 bytes at a time - 18 clocks per iteration
 297 5:      !                                       plus 4 for the exit mispredict
 298         srlx    %l0, 32, %o0            ! hi32 to %o0
 299         and     %l0, %g1, %l0           ! lo32 to %l0
 300 
 301         srlx    %l1, 32, %o1            ! hi32 to %o1
 302         and     %l1, %g1, %l1           ! lo32 to %l1
 303 
 304         srlx    %l2, 32, %o2            ! hi32 to %o2
 305         and     %l2, %g1, %l2           ! lo32 to %l2
 306 
 307         srlx    %l3, 32, %o3            ! hi32 to %o3
 308         and     %l3, %g1, %l3           ! lo32 to %l3
 309 
 310         srlx    %l4, 32, %o4            ! hi32 to %o4
 311         and     %l4, %g1, %l4           ! lo32 to %l4
 312 
 313         srlx    %l5, 32, %o5            ! hi32 to %o5
 314         and     %l5, %g1, %l5           ! lo32 to %l5
 315 
 316         srlx    %l6, 32, %g2            ! hi32 to %g2
 317         and     %l6, %g1, %l6           ! lo32 to %l6
 318 
 319         srlx    %l7, 32, %g3            ! hi32 to %g3
 320         and     %l7, %g1, %l7           ! lo32 to %l7
 321                                 ! splits gave 16 off 32b vals
 322         deccc   32, %i4         ! mainloop count -= 32 hw (mv early,avoid mispredicts? nohelp US-II.)
 323         bz,pn   %icc, .looptidy ! count now zero?  then no more loads
 324         add     %l0, %o0, %o0   ! delay: first pair sum, used on both paths
 325 
 326         ldx     [%i0+0], %l0    ! reload %l0-%l7 for the next iteration ...
 327         add     %l1, %o1, %o1   ! adds and loads
 328         add     %l2, %o2, %o2
 329 
 330         ldx     [%i0+8], %l1    ! ... interleaved with summing this one
 331         add     %l3, %o3, %o3
 332         add     %l4, %o4, %o4
 333 
 334         ldx     [%i0+16], %l2
 335         add     %l5, %o5, %o5
 336         add     %l6, %g2, %g2
 337 
 338         ldx     [%i0+24], %l3
 339         add     %l7, %g3, %g3           ! now 8 off 33b vals
 340         add     %o0, %o1, %o0
 341 
 342         ldx     [%i0+32], %l4
 343         add     %o2, %o3, %o1
 344         add     %o4, %o5, %o2
 345 
 346         ldx     [%i0+40], %l5
 347         add     %g2, %g3, %o3           ! now 4 off 34b vals
 348         add     %o0, %o1, %o0
 349 
 350         ldx     [%i0+48], %l6
 351         add     %o2, %o3, %o1           ! 2 off 35b
 352 
 353         ldx     [%i0+56], %l7
 354         add     %o0, %o1, %o0           ! 36b
 355         inc     64, %i0         ! increment source address
 356 
 357         add     %o0, %i2, %i2   ! accumulator
 358         ba      5b              ! back to the split stage
 359         prefetch [%i0], #n_reads        ! delay: next cacheline
 360                                 ! end of main loop
 361 .looptidy:      ! compute remaining partial sum - 8 clocks (same adds, no loads)
 362         add     %l1, %o1, %o1
 363         add     %l2, %o2, %o2
 364 
 365         add     %l3, %o3, %o3
 366         add     %l4, %o4, %o4
 367 
 368         add     %l5, %o5, %o5
 369         add     %l6, %g2, %g2
 370 
 371         add     %l7, %g3, %g3           ! 8 x 33b
 372         add     %o0, %o1, %o0
 373 
 374         add     %o2, %o3, %o1
 375         add     %o4, %o5, %o2
 376 
 377         add     %g2, %g3, %o3           ! 4 x 34b
 378         add     %o0, %o1, %o0
 379 
 380         add     %o2, %o3, %o1           ! 2 x 35b
 381         add     %o0, %i2, %i2   ! accumulator
 382 
 383         add     %o1, %i2, %i2   ! accumulator
 384 
 385 
 386 .postamble:
 387         ! postamble hword count is in %i1 (can be zero)
 388         ! while at least 1 dword, do dwords.   Max 7 iterations.
 389         andncc  %i1, 3, %g0     ! more than 3 hwords?
 390 .dotail_dw:
 391         bz,a,pn %icc, .dotail_hw
 392         tst     %i1             ! delay: any at all left? (annulled unless taken)
 393 8:      
 394         ldx     [%i0], %l0      ! tmp64 = *src++
 395         inc     8, %i0
 396         dec     4, %i1          ! decrement count, 4 halfwords
 397 
 398                                 ! stall for D-cache
 399 
 400         srlx    %l0, 32, %o0    ! hi32
 401         and     %l0, %g1, %l0   ! lo32
 402 
 403         add     %o0, %i2, %i2   ! accumulator += hi32
 404 
 405         andncc  %i1, 3, %g0     ! more than 3 hwords?
 406         bnz,pt  %icc, 8b
 407         add     %l0, %i2, %i2   ! delay: accumulator += lo32 (runs on both paths)
 408 
 409         ! while at least 1 hword, do hwords.   Max 3 iterations.
 410         tst     %i1
 411 .dotail_hw:
 412         bz,a    .fold
 413         srlx    %i2, 32, %o0    ! delay: hi32 (annulled unless the branch is taken)
 414         lduh    [%i0], %l0      ! tmp16 = *src++
 415 1:      
 416         inc     2, %i0
 417                                 ! stall for D-cache
 418 
 419         add     %l0, %i2, %i2   ! accumulator
 420 
 421         deccc   %i1             ! decrement count
 422         bnz,a,pt %icc, 1b
 423         lduh    [%i0], %l0      ! delay: tmp16 = *src++ (annulled on loop exit)
 424 
 425         ! at this point the 64-bit accumulator
 426         ! has the result that needs to be returned in 16-bits
 427         srlx    %i2, 32, %o0    ! hi32
 428 .fold:                          ! entered with %o0 = hi32 of the accumulator
 429         and     %i2, %g1, %o1   ! lo32
 430 
 431         add     %o0, %o1, %o0   ! 33b
 432 
 433         srlx    %o0, 16, %o1    ! hi17
 434         and     %o0, %g4, %o0   ! lo16
 435 
 436         add     %o1, %o0, %o0   ! 18b
 437 
 438         srlx    %o0, 16, %o1    ! hi2
 439         and     %o0, %g4, %o0   ! lo16
 440         ! NOTE(review): a carry out of the final add below is not folded again - confirm
 441         add     %o1, %o0, %i0   ! 16b result in %i0 (the caller's %o0 after restore)
 442 
 443         ret                     ! return
 444         restore                 ! delay: pop the register window
 445 
 446 
 447         SET_SIZE(ip_ocsum_long) ! 64-bit version
 448