1235 Use symbol capabilities to eliminate libc_hwcap*

          --- old/usr/src/lib/libc/i386_hwcap1/gen/memcpy.s
          +++ new/usr/src/lib/libc/capabilities/i386/common/memcpy.s
[ 50 lines elided ]
  51   51          ENTRY(memcpy)
  52   52          pushl   %esi
  53   53          pushl   %edi
  54   54  
  55   55          movl    8+4(%esp),%edi  / %edi = dest address
  56   56          movl    %edi, %eax      / save this
  57   57          movl    8+8(%esp),%esi  / %esi = source address
  58   58          movl    8+12(%esp),%ecx / %ecx = length of string
  59   59                                  / %edx scratch register
  60   60                                  / %eax scratch register
  61      -.memcpy_post:   
       61 +.memcpy_post:
  62   62          nop                     / this really helps, don't know why
  63   63                                  / note: cld is perf death on P4
  64   64          cmpl    $63,%ecx
  65   65          ja      .move_sse       / not worth doing sse for less
  66   66  
  67      -.movew: 
       67 +.movew:
  68   68          movl    %ecx,%edx       / save byte cnt
  69   69          shrl    $2,%ecx         / %ecx = number of words to move
  70   70          rep ; smovl             / move the words
  71   71  
  72   72  
  73   73          andl    $0x3,%edx       / %edx = number of bytes left to move
  74   74          jz      .Return         / %edx <= 3, so just unroll the loop
  75   75  
  76   76          movb    (%esi), %cl
  77   77          movb    %cl, (%edi)
[ 1 line elided ]
  79   79          jz      .Return
  80   80          movb    1(%esi), %cl
  81   81          movb    %cl, 1(%edi)
  82   82          decl    %edx
  83   83          jz      .Return
  84   84          movb    2(%esi), %cl
  85   85          movb    %cl, 2(%edi)
  86   86  
  87   87  .Return:
  88   88          popl    %edi            / restore register variables
  89      -        popl    %esi            
       89 +        popl    %esi
  90   90          ret
  91   91  
  92   92  .move_sse:
  93   93          /
  94   94          / time to 16 byte align destination
  95   95          /
  96   96          andl    $15, %eax
  97   97          jnz     .sse_unaligned  / jmp if dest is unaligned
  98   98  .sse:                           / dest is aligned, check source
  99   99          movl    %ecx, %edx      / get byte count
 100  100          shrl    $6, %edx        / number of 64 byte blocks to move
 101  101          testl   $15, %esi
 102  102          jnz     .sse_da         / go to slow loop if source is unaligned
 103  103          cmpl    $65535, %ecx
 104  104          ja      .sse_sa_nt_loop
 105      -        
      105 +
 106  106          /
 107  107          / use aligned load since we're lucky
 108  108          /
 109  109  .sse_sa_loop:
 110  110          prefetcht0 568(%esi)    / prefetch source & copy 64 byte at a time
 111  111          prefetcht0 568(%edi)    / prefetch source & copy 64 byte at a time
 112  112          movaps  0(%esi), %xmm0
 113      -        movaps  %xmm0, 0(%edi)   
      113 +        movaps  %xmm0, 0(%edi)
 114  114          movaps  16(%esi), %xmm1
 115  115          movaps  %xmm1, 16(%edi)
 116  116          movaps  32(%esi), %xmm2
 117      -        movaps  %xmm2, 32(%edi)  
      117 +        movaps  %xmm2, 32(%edi)
 118  118          movaps  48(%esi), %xmm3
 119  119          movaps  %xmm3, 48(%edi)
 120  120          addl    $64, %esi
 121  121          addl    $64, %edi
 122  122          decl    %edx
 123  123          jnz     .sse_sa_loop
 124      -        
      124 +
 125  125  .sse_cleanup:
 126  126          andl    $63, %ecx       / compute remaining bytes
 127  127          movl    8+4(%esp), %eax / setup return value
 128  128          jz      .Return
 129  129          jmp     .movew
 130      -        
      130 +
 131  131          /
 132  132          / use aligned load since we're lucky
 133  133          /
 134  134          .align 16
 135  135  .sse_sa_nt_loop:
 136  136          prefetchnta 16384(%esi) / prefetch source & copy 64 byte at a time
 137  137          movaps  (%esi), %xmm0
 138      -        movntps %xmm0, 0(%edi)   
      138 +        movntps %xmm0, 0(%edi)
 139  139          movaps  16(%esi), %xmm1
 140  140          movntps %xmm1, 16(%edi)
 141  141          movaps  32(%esi), %xmm2
 142      -        movntps %xmm2, 32(%edi)  
      142 +        movntps %xmm2, 32(%edi)
 143  143          movaps  48(%esi), %xmm3
 144  144          movntps %xmm3, 48(%edi)
 145  145          addl    $64, %esi
 146  146          addl    $64, %edi
 147  147          decl    %edx
 148  148          jnz     .sse_sa_nt_loop
 149  149  #if defined(_SSE2_INSN)
 150  150          mfence
 151  151  #elif defined(_SSE_INSN)
 152  152          sfence
[ 11 lines elided ]
 164  164          movl    %ecx, %edx      / saved count
 165  165          subl    %eax, %ecx      / subtract from byte count
 166  166          cmpl    $64, %ecx       / after aligning, will we still have 64 bytes?
 167  167          cmovb   %edx, %ecx      / if not, restore original byte count,
 168  168          cmovb   8+4(%esp), %eax / and restore return value,
 169  169          jb      .movew          / and do a non-SSE move.
 170  170          xchg    %ecx, %eax      / flip for copy
 171  171          rep ; smovb             / move the bytes
 172  172          xchg    %ecx, %eax      / flip back
 173  173          jmp     .sse
 174      -        
      174 +
 175  175          .align 16
 176  176  .sse_da:
 177  177          cmpl    $65535, %ecx
 178  178          jbe     .sse_da_loop
 179  179  
 180  180          /
 181  181          / use unaligned load since source doesn't line up
 182  182          /
 183  183  .sse_da_nt_loop:
 184  184          prefetchnta 16384(%esi) / prefetch source & copy 64 byte at a time
 185  185          movups  0(%esi), %xmm0
 186      -        movntps %xmm0, 0(%edi)   
      186 +        movntps %xmm0, 0(%edi)
 187  187          movups  16(%esi), %xmm1
 188  188          movntps %xmm1, 16(%edi)
 189  189          movups  32(%esi), %xmm2
 190      -        movntps %xmm2, 32(%edi)  
      190 +        movntps %xmm2, 32(%edi)
 191  191          movups  48(%esi), %xmm3
 192  192          movntps %xmm3, 48(%edi)
 193  193          addl    $64, %esi
 194  194          addl    $64, %edi
 195  195          decl    %edx
 196  196          jnz     .sse_da_nt_loop
 197  197  #if defined(_SSE2_INSN)
 198  198          mfence
 199  199  #elif defined(_SSE_INSN)
 200  200          sfence
[ 2 lines elided ]
 203  203  #endif
 204  204          jmp     .sse_cleanup
 205  205          /
 206  206          / use unaligned load since source doesn't line up
 207  207          /
 208  208          .align  16
 209  209  .sse_da_loop:
 210  210          prefetcht0 568(%esi)    / prefetch source & copy 64 byte at a time
 211  211          prefetcht0 568(%edi)
 212  212          movups  0(%esi), %xmm0
 213      -        movaps  %xmm0, 0(%edi)   
      213 +        movaps  %xmm0, 0(%edi)
 214  214          movups  16(%esi), %xmm1
 215  215          movaps  %xmm1, 16(%edi)
 216  216          movups  32(%esi), %xmm2
 217      -        movaps  %xmm2, 32(%edi)  
      217 +        movaps  %xmm2, 32(%edi)
 218  218          movups  48(%esi), %xmm3
 219  219          movaps  %xmm3, 48(%edi)
 220  220          addl    $64, %esi
 221  221          addl    $64, %edi
 222  222          decl    %edx
 223  223          jnz     .sse_da_loop
 224  224          jmp     .sse_cleanup
 225      -        
      225 +
 226  226          SET_SIZE(memcpy)
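
For context on what the memcpy above implements: copies of 63 bytes or fewer go through the rep smovl word loop at .movew; larger copies byte-align the destination to 16 bytes, run the unrolled 64-byte %xmm loops (movaps or movups loads depending on source alignment), and switch to non-temporal movntps stores followed by a fence once the count exceeds 64 KB. The C below is only an illustrative sketch of that dispatch logic using SSE2 intrinsics; names such as sketch_memcpy and copy64 are hypothetical and this is not part of the change under review.

        /*
         * Hypothetical sketch of the strategy used by the assembly above.
         * Thresholds (64 bytes, 64 KB) are taken from the code being reviewed;
         * everything else is simplified.
         */
        #include <stddef.h>
        #include <stdint.h>
        #include <string.h>
        #include <emmintrin.h>          /* SSE2 intrinsics */

        static void
        copy64(char *d, const char *s, size_t len, int nontemporal)
        {
                /* 64 bytes per iteration, mirroring the unrolled %xmm loops */
                while (len >= 64) {
                        __m128i a = _mm_loadu_si128((const __m128i *)(s + 0));
                        __m128i b = _mm_loadu_si128((const __m128i *)(s + 16));
                        __m128i c = _mm_loadu_si128((const __m128i *)(s + 32));
                        __m128i e = _mm_loadu_si128((const __m128i *)(s + 48));
                        if (nontemporal) {
                                /* movntps analogue: streaming stores to an
                                 * already 16-byte-aligned destination */
                                _mm_stream_si128((__m128i *)(d + 0), a);
                                _mm_stream_si128((__m128i *)(d + 16), b);
                                _mm_stream_si128((__m128i *)(d + 32), c);
                                _mm_stream_si128((__m128i *)(d + 48), e);
                        } else {
                                /* the assembly uses aligned movaps here because
                                 * it has already aligned %edi; the sketch keeps
                                 * the unaligned form for simplicity */
                                _mm_storeu_si128((__m128i *)(d + 0), a);
                                _mm_storeu_si128((__m128i *)(d + 16), b);
                                _mm_storeu_si128((__m128i *)(d + 32), c);
                                _mm_storeu_si128((__m128i *)(d + 48), e);
                        }
                        s += 64;
                        d += 64;
                        len -= 64;
                }
                if (nontemporal)
                        _mm_sfence();   /* the sfence/mfence after the nt loops */
                memcpy(d, s, len);      /* tail, like .movew in the assembly */
        }

        void *
        sketch_memcpy(void *dst, const void *src, size_t len)
        {
                char *d = dst;
                const char *s = src;

                if (len <= 63) {        /* "not worth doing sse for less" */
                        memcpy(d, s, len);
                        return (dst);
                }

                /* byte-copy until the destination is 16-byte aligned, as in
                 * .sse_unaligned, but only if at least 64 bytes remain after */
                size_t head = (16 - ((uintptr_t)d & 15)) & 15;
                if (head != 0 && len - head >= 64) {
                        memcpy(d, s, head);
                        d += head;
                        s += head;
                        len -= head;
                }

                /* above 64 KB the assembly switches to non-temporal stores */
                copy64(d, s, len, len > 65535);
                return (dst);
        }

The non-temporal path matters for very large copies because streaming stores avoid pulling the destination into the cache; the sfence/mfence afterwards is what orders those stores before the routine returns.
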
 227  227  
 228  228  
 229  229  / .CopyLeft handles the memmove case where we must perform the copy backwards,
 230  230  / because of overlap between src and dst. This is not particularly optimized.
 231  231  
 232  232  .CopyLeft:
 233  233          movl    $3,%eax                 / heavily used constant
 234  234          std                             / reverse direction bit (RtoL)
 235  235          cmpl    $12,%ecx                / if (size < 12)
[ 9 lines elided ]
 245  245  .BigCopyLeft:                           / } else {
 246  246          xchgl   %edx,%ecx
 247  247          movl    %ecx,%esi               / align source w/byte copy
 248  248          leal    -1(%edx,%edi),%edi
 249  249          andl    %eax,%ecx
 250  250          jz      .SkipAlignLeft
 251  251          addl    $1, %ecx                / we need to insure that future
 252  252          subl    %ecx,%edx               / copy is done on aligned boundary
 253  253          rep;    smovb
 254  254  .SkipAlignLeft:
 255      -        movl    %edx,%ecx       
      255 +        movl    %edx,%ecx
 256  256          subl    %eax,%esi
 257  257          shrl    $2,%ecx                 / do 4 byte copy RtoL
 258  258          subl    %eax,%edi
 259  259          rep;    smovl
 260  260          andl    %eax,%edx               / do 1 byte copy whats left
 261  261          jz      .CleanupReturnLeft
 262      -        movl    %edx,%ecx       
      262 +        movl    %edx,%ecx
 263  263          addl    %eax,%esi               / rep; smovl instruction will decrement
 264  264          addl    %eax,%edi               / %edi, %esi by four after each copy
 265  265                                          / adding 3 will restore pointers to byte
 266  266                                          / before last double word copied
 267  267                                          / which is where they are expected to
 268  268                                          / be for the single byte copy code
 269  269          rep;    smovb
 270  270  .CleanupReturnLeft:
 271  271          cld                             / reset direction flag to LtoR
 272  272          popl    %edi
 273  273          popl    %esi                    / restore registers
 274  274          movl    4(%esp),%eax            / set up return value
 275  275          ret                             / return(dba);
 276  276          SET_SIZE(memmove)
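
The .CopyLeft path is memmove's fallback for the case where the destination overlaps the source at a higher address, so a left-to-right copy would overwrite source bytes before they are read; the assembly sets the direction flag (std) and copies right to left with rep smovl/smovb. A minimal C sketch of that overlap rule (my_memmove is a hypothetical name, not the code under review):

        #include <stddef.h>

        void *
        my_memmove(void *dst, const void *src, size_t len)
        {
                char *d = dst;
                const char *s = src;

                if (d > s && d < s + len) {
                        /* dst overlaps the tail of src: copy backwards so each
                         * source byte is read before it is overwritten (the
                         * assembly aligns and uses rep smovl with DF set) */
                        while (len-- != 0)
                                d[len] = s[len];
                } else {
                        /* no harmful overlap: plain forward copy */
                        for (size_t i = 0; i < len; i++)
                                d[i] = s[i];
                }
                return (dst);
        }
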
    