2 Wdiff usr/src/uts/sun4/ml/ip_ocsum.s

Print this page

de-linting of .s files

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/sun4/ml/ip_ocsum.s
          +++ new/usr/src/uts/sun4/ml/ip_ocsum.s

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License, Version 1.0 only
   6    6   * (the "License").  You may not use this file except in compliance
   7    7   * with the License.
   8    8   *
   9    9   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10   10   * or http://www.opensolaris.org/os/licensing.
  11   11   * See the License for the specific language governing permissions
  12   12   * and limitations under the License.
  13   13   *
  14   14   * When distributing Covered Code, include this CDDL HEADER in each
  15   15   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16   16   * If applicable, add the following below this CDDL HEADER, with the

↓ open down ↓

16 lines elided

↑ open up ↑

  17   17   * fields enclosed by brackets "[]" replaced with your own identifying
  18   18   * information: Portions Copyright [yyyy] [name of copyright owner]
  19   19   *
  20   20   * CDDL HEADER END
  21   21   */
  22   22  /*
  23   23   * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  24   24   * Use is subject to license terms.
  25   25   */
  26   26  
  27      -#pragma ident   "%Z%%M% %I%     %E% SMI"
  28      -
  29   27  #include <sys/param.h>
  30   28  #include <sys/errno.h>
  31   29  #include <sys/asm_linkage.h>
  32   30  #include <sys/vtrace.h>
  33   31  #include <sys/machthread.h>
  34   32  #include <sys/machparam.h>
  35   33  
  36      -#if defined(lint)
  37      -#include <sys/types.h>
  38      -#else   /* lint */
  39   34  #include "assym.h"
  40      -#endif  /* lint */
  41   35  
  42   36  /*
  43   37   * Prefetch considerations
  44   38   * 
  45   39   * We prefetch one cacheline ahead.  This may not be enough on Serengeti
  46   40   * systems - see default_copyout() etc which prefetch 5 lines ahead.
  47   41   * On the other hand, we expect most of the source buffers to be
  48   42   * recently used enough to be cached.
  49   43   *
  50   44   * On US-I the prefetches are inoperative.  On US-II they preload the E$;

  51   45   * the mainloop unrolling and load-buffer should cover loads from E$.
  52   46   * The stores appear to be the slow point on US-II.
  53   47   * 
  54   48   * On US-IIICu the prefetch preloads the L2$ too, but there is no load
  55   49   * buffer so the loads will stall for D$ miss, L2$ hit.  The hardware
  56   50   * auto-prefetch is not activated by integer loads.  No solution
  57   51   * in sight for this, barring odd games with FP read, write, integer read.
  58   52   * 
  59   53   * US-IV (Panther) appears similar to US-IIICu, except that a strong
  60   54   * variant of prefetch is available which can take TLB traps.  We don't
  61   55   * use this.  The h/w prefetch stride can be set to 64, 128 or 192,
  62   56   * and they only reach to the L2$ (we don't use these either).
  63   57   * L2$ load-to-use latency is 15 cycles (best).
  64   58   */
  65   59  
  66   60  
  67   61  /*

↓ open down ↓

17 lines elided

↑ open up ↑

  68   62   * ip_ocsum(address, halfword_count, sum)
  69   63   * Do a 16 bit one's complement sum of a given number of (16-bit)
  70   64   * halfwords. The halfword pointer must not be odd.
  71   65   *      %o0 address; %o1 count; %o2 sum accumulator; %o4 temp
  72   66   *      %g2 and %g3 used in main loop
  73   67   *
  74   68   * (from @(#)ocsum.s 1.3 89/02/24 SMI)
  75   69   *
  76   70   */
  77   71  
  78      -#if defined(lint) 
  79      -
  80      -/* ARGSUSED */
  81      -unsigned int
  82      -ip_ocsum(u_short *address, int halfword_count, unsigned int sum)
  83      -{ return (0); }
  84      -
  85      -#else   /* lint */
  86      -
  87   72          ENTRY(ip_ocsum)
  88   73  
  89   74  /*
  90   75   * On ttcp transmits, called once per ocsum_copyin but with a small
  91   76   * block ( >99.9% ).  Could be the tx hdrs?  How many acks/seg are we rxing?
  92   77   * On ttcp receives, called more than once per ocsum_copyout. Rx hdrs
  93   78   * and tx acks?
  94   79   *
  95   80   * To do: telnet and nfs traffic
  96   81   *

  97   82   * On an NCA'd webserver about 10% of the calls are >64 bytes
  98   83   *      about 10% of those start on a 64byte boundary
  99   84   *      about 30% are >5*64 bytes.
 100   85   * The NCA numbers & proportions don't change with h/w cksum on.
 101   86   *
 102   87   * Tx hdrs are likely to be already in cache.
 103   88   * Rx hdrs: depends on whether they have already been inspected.
 104   89   */
 105   90  
 106   91          !
 107   92          ! Entry point for checksum-only.
 108   93          ! %o0 contains buffer address
 109   94          ! %o1 contains count of 16bit words
 110   95          ! %o2 contains sum
 111   96          !
 112   97          ! %o3 temporary
 113   98          ! %o4 temporary
 114   99          ! %g1 32bit mask
 115  100          ! %g4 16bit mask
 116  101          ! %g5 64bit mask (all 1s)
                   ! %o5 src address rounded down to a dword, later reused as a mask
                   ! %g2 leading byte count, then shift count, then hword count
 117  102          !
 118  103          not     %g0, %g5        ! all 1's
 119  104          prefetch [%o0], #n_reads        ! first hword, dword, cacheline
 120  105  
 121  106          clruw   %g5, %g1        ! 32 1's at low end
 122  107          srl     %g5, 16, %g4    ! 16 1's at low end
 123  108  
 124  109          cmp     %o1, 32         ! at least a cacheline (64 bytes)?
 125  110          bge,pn %icc, ip_ocsum_long      ! yes, do the whole works
 126  111          andn    %o0, 7, %o5     ! delay: base src addr
                   ! (branch is not annulled, so the delay slot runs on both
                   !  paths: ip_ocsum_long is entered with %o5 already set)
 127  112  
 128  113  
 129  114          cmp     %o1, 4          ! < 4 halfwords?
 130  115          bl,pn   %icc, .tiny     ! < 4 halfwords, just do them
 131  116          inc     8, %o5          ! delay: next addr (no matter for .tiny)
 132  117  
 133  118          /* leading dword with 1-4 hwords: 9 clocks */
 134  119          /* Assumes ok to read the entire dword with the leading hwords */
 135  120  
 136  121          ldx     [%o5-8], %o3    ! NB base addr
 137  122          sub     %o5, %o0, %g2   ! byte count: 2/4/6/8
 138  123          mov     %o5, %o0
 139  124  
 140  125          sll     %g2, 2, %g2     ! 8/16/24/32 for mask
 141  126  
 142  127          sllx    %g5, %g2, %o5
 143  128  
                   ! shifted twice by 8-32 so the total can reach 64; a single
                   ! sllx only honours a 6-bit count (SPARC V9) and cannot do that
 144  129          sllx    %o5, %g2, %o5   ! mask: 16/32/48/64 0's at low end
 145  130  
 146  131          srl     %g2, 3, %g2     ! hw count
 147  132          andn    %o3, %o5, %o3   ! select hw's from src
 148  133  
 149  134          srlx    %o3, 32, %o4    ! hi32
                   ! join the dword loop at 9:, after its load and its
                   ! pointer/count updates (already done above for this dword)
 150  135          b       9f
 151  136          sub     %o1, %g2, %o1   ! delay: decr count, 1-4 halfwords
 152  137  
 153  138  .short_dw:                      ! max 7 iters of 4 clocks; 1 mispred of 4
 154  139          ldx     [%o0], %o3      ! tmp64 = *src++ (groups with the branch)
 155  140  
 156  141          inc     8, %o0          ! (D-cache load-use delay)
 157  142          dec     4, %o1          ! decrement count, 4 halfwords
 158  143  
 159  144          srlx    %o3, 32, %o4    ! hi32
 160  145  9:      and     %o3, %g1, %o3   ! lo32
 161  146  
 162  147          add     %o4, %o2, %o2   ! accumulator
 163  148          andncc  %o1, 3, %g0     ! more than 3 hwords left?
 164  149  
 165  150          bnz,pt %icc, .short_dw
 166  151          add     %o3, %o2, %o2   ! accumulator
 167  152  
 168  153  .short_hw:                      ! trailing dw: 0-3 hwords
 169  154          tst     %o1             ! 0 seems fairly common...
 170  155          bz,a    .short_fold
 171  156          srlx    %o2, 32, %o4    ! delay: hi32
 172  157                                  ! mispredict 4 + 7 clocks for 1-3
 173  158          ldx     [%o0], %o3      ! whole dword is read; the mask below drops the low-order bits not wanted
 174  159          sll     %o1, 4, %o1     ! bitcount: 16/32/48
 175  160  
 176  161          srlx    %g5, %o1, %o5   ! mask: 16/32/48  0's at high end
 177  162  
 178  163          andn    %o3, %o5, %o3   ! select hw's from src
 179  164  
 180  165          srlx    %o3, 32, %o4    ! hi32
 181  166          and     %o3, %g1, %o3   ! lo32
 182  167  
 183  168          add     %o4, %o2, %o2   ! accumulator
 184  169  
 185  170          add     %o3, %o2, %o2   ! accumulator
 186  171  
 187  172          ! at this point the 64-bit accumulator
 188  173          ! has the result that needs to be returned in 16-bits
 189  174          srlx    %o2, 32, %o4    ! hi32
 190  175  .short_fold:
                   ! Every path arrives here with %o4 = high 32 bits of %o2.
                   ! Fold by adding the high part into the low part, three times.
 191  176          and     %o2, %g1, %o2   ! lo32
 192  177  
 193  178          add     %o4, %o2, %o2   ! 33b
 194  179  
 195  180          srlx    %o2, 16, %o3    ! hi17
 196  181          and     %o2, %g4, %o2   ! lo16
 197  182  
 198  183          add     %o3, %o2, %o2   ! 18b
 199  184  
 200  185          srlx    %o2, 16, %o3    ! hi2
 201  186          and     %o2, %g4, %o2   ! lo16
 202  187  
 203  188          retl                    ! return
 204  189          add     %o3, %o2, %o0   ! 16b result in %o0
 205  190  
 206  191  .tiny:                          ! almost never: less than 4 halfwords total.
                   ! %o0 is still the caller's address here; only %o5 was
                   ! modified before the branch to .tiny.
 207  192          tst     %o1
 208  193          bz,a    .short_fold
 209  194  
 210  195          srlx    %o2, 32, %o4    ! delay: hi32
 211  196  
 212  197          lduh    [%o0], %o3      ! tmp16 = *src++
 213  198  1:      
 214  199          inc     2, %o0
 215  200                                  ! stall for D-cache
 216  201  
 217  202          add     %o3, %o2, %o2   ! accumulator
 218  203  
 219  204          deccc   %o1             ! decrement count
 220  205          bnz,a,pt %icc, 1b       ! annulled: the load below runs only when looping
 221  206          lduh    [%o0], %o3      ! tmp16 = *src++
 222  207  
 223  208          ! at this point the 64-bit accumulator
 224  209          ! has the result that needs to be returned in 16-bits
 225  210          b       .short_fold
 226  211          srlx    %o2, 32, %o4    ! hi32
 227  212  
 228  213          SET_SIZE(ip_ocsum)      ! 64-bit version
 229  214  
 230  215  
 231  216          ENTRY(ip_ocsum_long)    ! 64-bit, large blocks
 232  217          save    %sp, -SA(MINFRAME), %sp ! get another window
 233  218          !
 234  219          ! %i0 contains buffer address
 235  220          ! %i1 contains count of 16bit words
 236  221          ! %i2 contains sum
 237  222          ! %i4 contains the mainloop count
 238  223          ! %i5 comes in with the buffer address rounded down to the first dword
 239  224          !
 240  225          ! %g1 32bit mask
 241  226          ! %g4 16bit mask
 242  227          ! %g5 64bit mask (all 1s)
 243  228          ! %g6 fetch-ahead offset for Ecache
                   ! NOTE(review): no instruction in this routine references %g6.
                   !
                   ! %g1, %g4, %g5 and %i5 are not initialised here; ip_ocsum
                   ! sets them (as %g1/%g4/%g5/%o5) before it branches to this entry.
 244  229          !
 245  230          ! %l0-7,%o0-5,%g2-3 mainloop temporaries
 246  231          !
 247  232          !
 248  233                                  ! 1 clock overhead
 249  234          btst    63, %i0         ! src 64-byte aligned?
 250  235          bz,a,pt %icc, .mainsection      ! aligned blocks are fairly common
 251  236          andncc  %i1, 31, %i4    ! at least 64 bytes for main loop?
                   ! (annulled: the delay slot runs only when the branch is taken;
                   !  the unaligned path repeats this andncc just before .mainsection)
 252  237  
 253  238  
 254  239          ! Leading dword, with 1-4 hwords: 9 clocks
 255  240          ! Assumes ok to read the entire dword with the leading bytes
 256  241          ldx     [%i5], %l0      ! NB base addr
 257  242          inc     8, %i5          ! next addr
 258  243  
 259  244          sub     %i5, %i0, %l2   ! byte count: 2/4/6/8
 260  245          mov     %i5, %i0
 261  246  
 262  247          sll     %l2, 2, %l2     ! 8/16/24/32 for mask
 263  248  
 264  249          sllx    %g5, %l2, %l4
 265  250  
 266  251          sllx    %l4, %l2, %l4   ! mask: 16, 32, 48, 64 0's at lsb
 267  252  
 268  253          srl     %l2, 3, %l2     ! 1/2/3/4 for count
 269  254          andn    %l0, %l4, %l0   ! select hw's from src
 270  255  
 271  256          srlx    %l0, 32, %o0    ! hi32
 272  257          b       9f              ! join the loop below after its load and updates
 273  258          sub     %i1, %l2, %i1   ! decr count, 1-4 halfwords
 274  259  
 275  260          ! Do dwords until source is 64-byte aligned, 0-7 iterations
                   ! (7 when the leading dword was the first dword of its 64-byte line)
 276  261          ! 4 clocks per + 4 for 1 mispred = 16 clocks avg
 277  262  .dw:    ldx     [%i0], %l0      ! tmp64 = *src++ (groups with the branch below)
 278  263  
 279  264          inc     8, %i0          ! (Dcache load-use delay)
 280  265          dec     4, %i1          ! decrement count, 4 halfwords
 281  266  
 282  267          srlx    %l0, 32, %o0    ! hi32
 283  268  9:      and     %l0, %g1, %l0   ! lo32
 284  269  
 285  270          add     %o0, %i2, %i2   ! accumulator
 286  271          btst    63, %i0         ! src 64-byte aligned?
 287  272  
 288  273          bnz,pt  %icc, .dw
 289  274          add     %l0, %i2, %i2   ! accumulator
 290  275  
 291  276  
 292  277          ! At this point source address is 64 byte aligned
 293  278          ! and we've dealt with 1-31 halfwords.
 294  279          andncc  %i1, 31, %i4    ! at least 64 bytes for main loop?
 295  280  .mainsection:                           ! total 18n + 21 clocks
                   ! %i4 = hword count rounded down to a multiple of 32 (main loop);
                   ! %i1 becomes the 0-31 hword remainder on both paths (delay slot
                   ! of a non-annulled branch).
 296  281          bz,pn   %icc, .postamble
 297  282          and     %i1, 31, %i1    ! count for postamble
 298  283  
 299  284          ! preload for main loop - 9 clocks assuming D$ hits at 1 per
 300  285          ldx     [%i0+0], %l0
 301  286          ldx     [%i0+8], %l1
 302  287          ldx     [%i0+16], %l2   ! %l0 could be used here if Dcache hit
 303  288          ldx     [%i0+24], %l3   !  but US-II prefetch only loads Ecache
 304  289          ldx     [%i0+32], %l4   !  check on US-III: could mix preloads & splits?
 305  290          ldx     [%i0+40], %l5
 306  291          ldx     [%i0+48], %l6
 307  292          ldx     [%i0+56], %l7
 308  293          inc     64, %i0
 309  294          prefetch [%i0], #n_reads
 310  295  
 311  296          ! main loop. Read 64 bytes at a time - 18 clocks per iteration
                   ! Each pass splits the eight dwords already held in %l0-%l7 into
                   ! 32-bit halves, then interleaves the adds with the loads of the
                   ! next 64 bytes.
 312  297  5:      !                                       plus 4 for the exit mispredict
 313  298          srlx    %l0, 32, %o0            ! hi32 to %o0
 314  299          and     %l0, %g1, %l0           ! lo32 to %l0
 315  300  
 316  301          srlx    %l1, 32, %o1            ! hi32 to %o1
 317  302          and     %l1, %g1, %l1           ! lo32 to %l1
 318  303  
 319  304          srlx    %l2, 32, %o2            ! hi32 to %o2
 320  305          and     %l2, %g1, %l2           ! lo32 to %l2
 321  306  
 322  307          srlx    %l3, 32, %o3            ! hi32 to %o3
 323  308          and     %l3, %g1, %l3           ! lo32 to %l3
 324  309  
 325  310          srlx    %l4, 32, %o4            ! hi32 to %o4
 326  311          and     %l4, %g1, %l4           ! lo32 to %l4
 327  312  
 328  313          srlx    %l5, 32, %o5            ! hi32 to %o5
 329  314          and     %l5, %g1, %l5           ! lo32 to %l5
 330  315  
 331  316          srlx    %l6, 32, %g2            ! hi32 to %g2
 332  317          and     %l6, %g1, %l6           ! lo32 to %l6
 333  318  
 334  319          srlx    %l7, 32, %g3            ! hi32 to %g3
 335  320          and     %l7, %g1, %l7           ! lo32 to %l7
 336  321                                  ! splits gave 16 off 32b vals
                   ! When %i4 reaches zero the last 64 bytes are already in
                   ! registers, so leave before issuing any further loads.
 337  322          deccc   32, %i4         ! mv early,avoid mispredicts? nohelp US-II.
 338  323          bz,pn   %icc, .looptidy ! count now zero?
 339  324          add     %l0, %o0, %o0   ! delay
 340  325  
 341  326          ldx     [%i0+0], %l0
 342  327          add     %l1, %o1, %o1   ! adds and loads
 343  328          add     %l2, %o2, %o2
 344  329  
 345  330          ldx     [%i0+8], %l1
 346  331          add     %l3, %o3, %o3
 347  332          add     %l4, %o4, %o4
 348  333  
 349  334          ldx     [%i0+16], %l2
 350  335          add     %l5, %o5, %o5
 351  336          add     %l6, %g2, %g2
 352  337  
 353  338          ldx     [%i0+24], %l3
 354  339          add     %l7, %g3, %g3           ! now 8 off 33b vals
 355  340          add     %o0, %o1, %o0
 356  341  
 357  342          ldx     [%i0+32], %l4
 358  343          add     %o2, %o3, %o1
 359  344          add     %o4, %o5, %o2
 360  345  
 361  346          ldx     [%i0+40], %l5
 362  347          add     %g2, %g3, %o3           ! now 4 off 34b vals
 363  348          add     %o0, %o1, %o0
 364  349  
 365  350          ldx     [%i0+48], %l6
 366  351          add     %o2, %o3, %o1           ! 2 off 35b
 367  352  
 368  353          ldx     [%i0+56], %l7
 369  354          add     %o0, %o1, %o0           ! 36b
 370  355          inc     64, %i0         ! increment source address
 371  356  
 372  357          add     %o0, %i2, %i2   ! accumulator
 373  358          ba      5b
 374  359          prefetch [%i0], #n_reads        ! next cacheline
 375  360                                  ! end of main loop
 376  361  .looptidy:      ! compute remaining partial sum - 8 clocks
                   ! %l0 has already been added into %o0 by the delay slot of the
                   ! branch that brought us here; the other seven pairs remain.
 377  362          add     %l1, %o1, %o1
 378  363          add     %l2, %o2, %o2
 379  364  
 380  365          add     %l3, %o3, %o3
 381  366          add     %l4, %o4, %o4
 382  367  
 383  368          add     %l5, %o5, %o5
 384  369          add     %l6, %g2, %g2
 385  370  
 386  371          add     %l7, %g3, %g3           ! 8 x 33b
 387  372          add     %o0, %o1, %o0
 388  373  
 389  374          add     %o2, %o3, %o1
 390  375          add     %o4, %o5, %o2
 391  376  
 392  377          add     %g2, %g3, %o3           ! 4 x 34b
 393  378          add     %o0, %o1, %o0
 394  379  
 395  380          add     %o2, %o3, %o1           ! 2 x 35b
 396  381          add     %o0, %i2, %i2   ! accumulator
 397  382  
 398  383          add     %o1, %i2, %i2   ! accumulator
 399  384  
 400  385  
 401  386  .postamble:
 402  387          ! postamble hword count is in %i1 (can be zero)
 403  388          ! while at least 1 dword, do dwords.   Max 7 iterations.
 404  389          andncc  %i1, 3, %g0     ! more than 3 hwords?
 405  390  .dotail_dw:
 406  391          bz,a,pn %icc, .dotail_hw
 407  392          tst     %i1             ! delay: any at all left?
 408  393  8:      
 409  394          ldx     [%i0], %l0      ! tmp64 = *src++
 410  395          inc     8, %i0
 411  396          dec     4, %i1          ! decrement count, 4 halfwords
 412  397  
 413  398                                  ! stall for D-cache
 414  399  
 415  400          srlx    %l0, 32, %o0    ! hi32
 416  401          and     %l0, %g1, %l0   ! lo32
 417  402  
 418  403          add     %o0, %i2, %i2   ! accumulator
 419  404  
 420  405          andncc  %i1, 3, %g0     ! more than 3 hwords?
 421  406          bnz,pt  %icc, 8b
 422  407          add     %l0, %i2, %i2   ! accumulator
 423  408  
 424  409          ! while at least 1 hword, do hwords.   Max 3 iterations.
 425  410          tst     %i1
 426  411  .dotail_hw:
 427  412          bz,a    .fold
 428  413          srlx    %i2, 32, %o0    ! delay: hi32
 429  414          lduh    [%i0], %l0      ! tmp16 = *src++
 430  415  1:      
 431  416          inc     2, %i0
 432  417                                  ! stall for D-cache
 433  418  
 434  419          add     %l0, %i2, %i2   ! accumulator
 435  420  
 436  421          deccc   %i1             ! decrement count
 437  422          bnz,a,pt %icc, 1b       ! annulled: the load below runs only when looping
 438  423          lduh    [%i0], %l0      ! tmp16 = *src++
 439  424  
 440  425          ! at this point the 64-bit accumulator
 441  426          ! has the result that needs to be returned in 16-bits
 442  427          srlx    %i2, 32, %o0    ! hi32
 443  428  .fold:
                   ! Both paths arrive with %o0 = high 32 bits of %i2.  The fold is
                   ! done in this window's %o0/%o1; the accumulator %i2 is only read.
 444  429          and     %i2, %g1, %o1   ! lo32
 445  430  
 446  431          add     %o0, %o1, %o0   ! 33b
 447  432  
 448  433          srlx    %o0, 16, %o1    ! hi17
 449  434          and     %o0, %g4, %o0   ! lo16
 450  435  
 451  436          add     %o1, %o0, %o0   ! 18b
 452  437  
 453  438          srlx    %o0, 16, %o1    ! hi2

↓ open down ↓

357 lines elided

↑ open up ↑

 454  439          and     %o0, %g4, %o0   ! lo16
 455  440  
 456  441          add     %o1, %o0, %i0   ! 16b result in %i0
                   ! (%i0 of this window is the caller's %o0 once restore executes)
 457  442  
 458  443          ret                     ! return
 459  444          restore
 460  445  
 461  446  
 462  447          SET_SIZE(ip_ocsum_long) ! 64-bit version
 463  448  
 464      -#endif  /* lint */

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX