Print this page
1235 Use symbol capabilities to eliminate libc_hwcap*

Split Close
Expand all
Collapse all
          --- old/usr/src/lib/libc/i386_hwcap1/gen/memset.s
          +++ new/usr/src/lib/libc/capabilities/i386/common/memset.s
↓ open down ↓ 35 lines elided ↑ open up ↑
  36   36          movl    12(%esp),%eax   / %al = byte to duplicate
  37   37          movl    16(%esp),%ecx   / %ecx = number of copies
  38   38  
  39   39          / For all basic blocks in this routine, maintain the following
  40   40          / entry conditions:     %eax each byte is set to desired byte.
  41   41          /                       NOTE: .byteset doesn't require this
  42   42          /                       %ecx contains # bytes to set
  43   43          /                       %edi contains address to set
  44   44  
  45   45          cld                     / make sure we go the right way...
  46      -        cmpl    $20,%ecx        / strings with fewer than 20 chars should be byte set
  47      -        jbe     .byteset        
       46 +        cmpl    $20,%ecx        / strings with fewer than 20 chars should be
       47 +                                / byte set
       48 +        jbe     .byteset
  48   49  
  49   50          andl    $0xff, %eax     / trim anything above low byte
  50   51          imul    $0x01010101, %eax       / extend low byte to each byte
  51      -        
       52 +
  52   53          cmpl    $256, %ecx      / smaller areas don't benefit from alignment
  53   54          jbe     .wordset
  54   55  
  55   56          cmpl    $511, %ecx      / areas smaller than this should be wordset
  56      -        jbe     .check_wordset  
       57 +        jbe     .check_wordset
  57   58  
  58   59          /
  59   60          / prep work for sse temporal and non-temporal
  60   61          /
  61   62  
  62   63          pushl   %ebx            / more registers are needed
  63   64          pushl   %esi            / for alignment work
  64   65  
  65   66          /
  66   67          / align address to 64 byte boundaries.
  67   68          /
  68   69  
  69   70          movl    %ecx, %ebx      / save byte count
  70   71          movl    %edi, %esi      / esi is scratch register
  71   72          andl    $63, %esi       / bytes to align to 64 byte align addr
  72      -        neg     %esi            / compute count of bytes 
       73 +        neg     %esi            / compute count of bytes
  73   74          addl    $64, %esi       / needed to align
  74   75          andl    $63, %esi       / to 64 byte align addr
  75   76          jz      .sse_aligned    / skip alignment if not needed
  76   77          subl    %esi, %ebx      / ebx contains remainder of bytes to set
  77   78          movl    %esi, %ecx      / alignment bytes
  78   79          shrl    $2,%ecx         / %ecx = number of words to set
  79   80          rep; sstol
  80   81          movl    %esi,%ecx
  81   82          andl    $3,%ecx         / %ecx = number of bytes left
  82   83          rep; sstob
  83   84          movl    %ebx, %ecx      / remainder to be set
  84   85  
  85   86  .sse_aligned:
  86      -        
       87 +
  87   88          shr     $6, %ecx        / number of 64 byte blocks to set
  88   89  
  89   90          /
  90   91          / load xmm0 with bytes to be set
  91   92          /
  92   93          subl    $16,%esp        / give ourselves some working room on the stack
  93   94          movl    %eax,(%esp)     / copy eax into each of 4 bytes
  94   95          movl    %eax,4(%esp)    / avoid pushl since it causes more interlocking
  95   96          movl    %eax,8(%esp)    /
  96   97          movl    %eax,12(%esp)   /
  97   98          movups  (%esp), %xmm0   / unaligned load from stack into xmm0
  98   99          addl    $16,%esp        / restore stack position
  99      -        
      100 +
 100  101          cmpl    $262143, %ebx   / blocks smaller than this allocate in the cache
 101  102          jbe     .sse_loop
 102  103          jmp     .sse_nt_loop    / branch across alignment nops
 103      -                
      104 +
 104  105          .align 16
 105  106  
 106      -.sse_nt_loop:   
      107 +.sse_nt_loop:
 107  108          movntps %xmm0, (%edi)   / block non-temporal store
 108  109          movntps %xmm0, 16(%edi) / use sse rather than sse2
 109  110          movntps %xmm0, 32(%edi) / so we work more places
 110  111          movntps %xmm0, 48(%edi) /
 111  112  
 112  113          addl    $64, %edi       / increment dest address
 113  114          dec     %ecx            / dec count of blocks
 114  115          jnz     .sse_nt_loop    / jump if not done
 115  116  
 116  117          andl    $63, %ebx       / remainder of bytes to copy
↓ open down ↓ 2 lines elided ↑ open up ↑
 119  120          popl    %ebx            /
 120  121  #if defined(_SSE2_INSN)
 121  122          mfence
 122  123  #elif defined(_SSE_INSN)
 123  124          sfence
 124  125  #else
 125  126  #error "Must have either SSE or SSE2"
 126  127  #endif
 127  128          cmpl    $20, %ecx       / compare and jump accordingly
 128  129          jbe     .byteset
 129      -        jmp     .wordset        
      130 +        jmp     .wordset
 130  131  
 131  132          .align 16
 132  133  .sse_loop:
 133  134          movaps %xmm0, (%edi)    / block copy w/ SSE
 134  135          movaps %xmm0, 16(%edi)
 135  136          movaps %xmm0, 32(%edi)
 136  137          movaps %xmm0, 48(%edi)
 137  138  
 138  139          addl    $64, %edi       / increment addr
 139  140          dec     %ecx            / dec count of blocks
 140  141          jnz     .sse_loop       / jump if not done
 141  142  
 142  143          andl    $63, %ebx       / remainder of bytes to copy
 143  144          movl    %ebx, %ecx      / in %ecx as normal
 144  145          popl    %esi            / restore stack config
 145  146          popl    %ebx            /
 146      -        cmpl    $20, %ecx       
      147 +        cmpl    $20, %ecx
 147  148          jbe     .byteset
 148  149          jmp     .wordset
 149  150  
 150  151  .check_wordset:
 151  152          movl    %edi, %edx      / save current store ptr
 152  153          andl    $7, %edi        / check alignment
 153  154          movl    %edx,%edi       / %edi = string address
 154      -        jz      .wordset        / all ok 
 155      -        
      155 +        jz      .wordset        / all ok
 156  156  
 157      -.align_wordset: 
      157 +
      158 +.align_wordset:
 158  159          pushl   %ebx            / more registers are needed
 159      -        pushl   %esi            
      160 +        pushl   %esi
 160  161  
 161  162          movl    %ecx, %ebx
 162  163          movl    %edi, %esi
 163  164          andl    $7, %esi
 164  165          neg     %esi
 165  166          addl    $8, %esi
 166  167          andl    $7, %esi
 167  168          subl    %esi, %ebx      / ebx contains remainder of bytes to copy
 168  169          movl    %esi, %ecx
 169      -        rep; sstob       
      170 +        rep; sstob
 170  171          movl    %ebx, %ecx
 171  172          popl    %esi            / restore stack config
 172  173          popl    %ebx            /
 173  174  
 174  175  .wordset:
 175  176          movl    %ecx, %edx      / save cont
 176  177          shrl    $2,%ecx         / %ecx = number of words to set
 177  178          rep; sstol
 178  179          movl    %edx,%ecx
 179  180          andl    $3,%ecx         / %ecx = number of bytes left
 180  181  
 181  182  .byteset:
 182  183          rep; sstob
 183  184          movl    8(%esp),%eax    / return string address
 184  185          popl    %edi            / restore register variable
 185  186          ret
 186  187          SET_SIZE(memset)
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX