1235 Use symbol capabilities to eliminate libc_hwcap*
--- old/usr/src/lib/libc/i386_hwcap1/gen/memcpy.s
+++ new/usr/src/lib/libc/capabilities/i386/common/memcpy.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 .file "memcpy.s"
28 28
29 29 #include <sys/asm_linkage.h>
30 30
31 31 ANSI_PRAGMA_WEAK(memmove,function)
32 32 ANSI_PRAGMA_WEAK(memcpy,function)
33 33
34 34 ENTRY(memmove)
35 35 movl 0+12(%esp),%ecx / get number of bytes to move
36 36 pushl %esi / save off %edi, %esi and move destination
37 37 pushl %edi
38 38 movl 8+ 4(%esp),%edi / destination buffer address
39 39 movl 8+ 8(%esp),%esi / source buffer address
40 40 movl %edi, %eax
41 41 testl %ecx,%ecx
42 42 jz .Return
43 43
44 44 cmpl %esi,%edi / if (source addr > dest addr)
45 45 leal -1(%esi,%ecx),%edx / %edx = src + size - 1
46 46 jbe .memcpy_post / jump if dst <= src
47 47 cmpl %edx,%edi
48 48 jbe .CopyLeft / jump if dst <= src + size - 1
49 49 jmp .memcpy_post
50 50
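
For reviewers, a minimal C sketch of the overlap test performed in ENTRY(memmove) above, assuming the same rule: a left-to-right copy is safe when the destination does not start inside the source range. The function name, variable names, and plain byte loops are illustrative only and are not part of this change.

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative sketch of the memmove dispatch above; not part of this webrev. */
    static void *
    sketch_memmove(void *dstv, const void *srcv, size_t n)
    {
            unsigned char *dst = dstv;
            const unsigned char *src = srcv;
            size_t i;

            if (n == 0)
                    return (dstv);
            if ((uintptr_t)dst <= (uintptr_t)src ||
                (uintptr_t)dst > (uintptr_t)src + n - 1) {
                    /* dst is at or below src, or past src + n - 1: .memcpy_post path */
                    for (i = 0; i < n; i++)
                            dst[i] = src[i];
            } else {
                    /* dst starts inside [src, src + n): .CopyLeft path, copy backwards */
                    for (i = n; i > 0; i--)
                            dst[i - 1] = src[i - 1];
            }
            return (dstv);
    }
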
51 51 ENTRY(memcpy)
52 52 pushl %esi
53 53 pushl %edi
54 54
55 55 movl 8+4(%esp),%edi / %edi = dest address
56 56 movl %edi, %eax / save this
57 57 movl 8+8(%esp),%esi / %esi = source address
58 58 movl 8+12(%esp),%ecx/ %ecx = length of string
59 59 / %edx scratch register
60 60 / %eax scratch register
61 -.memcpy_post:
61 +.memcpy_post:
62 62 nop / this really helps, don't know why
63 63 / note: cld is perf death on P4
64 64 cmpl $63,%ecx
65 65 ja .move_sse / not worth doing sse for less
66 66
67 -.movew:
67 +.movew:
68 68 movl %ecx,%edx / save byte cnt
69 69 shrl $2,%ecx / %ecx = number of words to move
70 70 rep ; smovl / move the words
71 71
72 72
73 73 andl $0x3,%edx / %edx = number of bytes left to move
74 74 jz .Return / %edx <= 3, so just unroll the loop
75 75
76 76 movb (%esi), %cl
77 77 movb %cl, (%edi)
78 78 decl %edx
79 79 jz .Return
80 80 movb 1(%esi), %cl
81 81 movb %cl, 1(%edi)
82 82 decl %edx
83 83 jz .Return
84 84 movb 2(%esi), %cl
85 85 movb %cl, 2(%edi)
86 86
87 87 .Return:
88 88 popl %edi / restore register variables
89 - popl %esi
89 + popl %esi
90 90 ret
91 91
92 92 .move_sse:
93 93 /
94 94 / time to 16 byte align destination
95 95 /
96 96 andl $15, %eax
97 97 jnz .sse_unaligned / jmp if dest is unaligned
98 98 .sse: / dest is aligned, check source
99 99 movl %ecx, %edx / get byte count
100 100 shrl $6, %edx / number of 64 byte blocks to move
101 101 testl $15, %esi
102 102 jnz .sse_da / go to slow loop if source is unaligned
103 103 cmpl $65535, %ecx
104 104 ja .sse_sa_nt_loop
105 -
105 +
106 106 /
107 107 / use aligned load since we're lucky
108 108 /
109 109 .sse_sa_loop:
110 110 prefetcht0 568(%esi) / prefetch source & copy 64 byte at a time
111 111 prefetcht0 568(%edi) / prefetch source & copy 64 byte at a time
112 112 movaps 0(%esi), %xmm0
113 - movaps %xmm0, 0(%edi)
113 + movaps %xmm0, 0(%edi)
114 114 movaps 16(%esi), %xmm1
115 115 movaps %xmm1, 16(%edi)
116 116 movaps 32(%esi), %xmm2
117 - movaps %xmm2, 32(%edi)
117 + movaps %xmm2, 32(%edi)
118 118 movaps 48(%esi), %xmm3
119 119 movaps %xmm3, 48(%edi)
120 120 addl $64, %esi
121 121 addl $64, %edi
122 122 decl %edx
123 123 jnz .sse_sa_loop
124 -
124 +
125 125 .sse_cleanup:
126 126 andl $63, %ecx / compute remaining bytes
127 127 movl 8+4(%esp), %eax / setup return value
128 128 jz .Return
129 129 jmp .movew
130 -
130 +
131 131 /
132 132 / use aligned load since we're lucky
133 133 /
134 134 .align 16
135 135 .sse_sa_nt_loop:
136 136 prefetchnta 16384(%esi) / prefetch source & copy 64 byte at a time
137 137 movaps (%esi), %xmm0
138 - movntps %xmm0, 0(%edi)
138 + movntps %xmm0, 0(%edi)
139 139 movaps 16(%esi), %xmm1
140 140 movntps %xmm1, 16(%edi)
141 141 movaps 32(%esi), %xmm2
142 - movntps %xmm2, 32(%edi)
142 + movntps %xmm2, 32(%edi)
143 143 movaps 48(%esi), %xmm3
144 144 movntps %xmm3, 48(%edi)
145 145 addl $64, %esi
146 146 addl $64, %edi
147 147 decl %edx
148 148 jnz .sse_sa_nt_loop
149 149 #if defined(_SSE2_INSN)
150 150 mfence
151 151 #elif defined(_SSE_INSN)
152 152 sfence
153 153 #else
154 154 #error "Must have either SSE or SSE2"
155 155 #endif
156 156 jmp .sse_cleanup
157 157
158 158 /
159 159 / Make certain that destination buffer becomes aligned
160 160 /
161 161 .sse_unaligned:
162 162 neg %eax / subtract from 16 and get destination
163 163 andl $15, %eax / aligned on a 16 byte boundary
164 164 movl %ecx, %edx / saved count
165 165 subl %eax, %ecx / subtract from byte count
166 166 cmpl $64, %ecx / after aligning, will we still have 64 bytes?
167 167 cmovb %edx, %ecx / if not, restore original byte count,
168 168 cmovb 8+4(%esp), %eax / and restore return value,
169 169 jb .movew / and do a non-SSE move.
170 170 xchg %ecx, %eax / flip for copy
171 171 rep ; smovb / move the bytes
172 172 xchg %ecx, %eax / flip back
173 173 jmp .sse
174 -
174 +
175 175 .align 16
176 176 .sse_da:
177 177 cmpl $65535, %ecx
178 178 jbe .sse_da_loop
179 179
180 180 /
181 181 / use unaligned load since source doesn't line up
182 182 /
183 183 .sse_da_nt_loop:
184 184 prefetchnta 16384(%esi) / prefetch source & copy 64 byte at a time
185 185 movups 0(%esi), %xmm0
186 - movntps %xmm0, 0(%edi)
186 + movntps %xmm0, 0(%edi)
187 187 movups 16(%esi), %xmm1
188 188 movntps %xmm1, 16(%edi)
189 189 movups 32(%esi), %xmm2
190 - movntps %xmm2, 32(%edi)
190 + movntps %xmm2, 32(%edi)
191 191 movups 48(%esi), %xmm3
192 192 movntps %xmm3, 48(%edi)
193 193 addl $64, %esi
194 194 addl $64, %edi
195 195 decl %edx
196 196 jnz .sse_da_nt_loop
197 197 #if defined(_SSE2_INSN)
198 198 mfence
199 199 #elif defined(_SSE_INSN)
200 200 sfence
201 201 #else
202 202 #error "Must have either SSE or SSE2"
203 203 #endif
204 204 jmp .sse_cleanup
205 205 /
206 206 / use unaligned load since source doesn't line up
207 207 /
208 208 .align 16
209 209 .sse_da_loop:
210 210 prefetcht0 568(%esi) / prefetch source & copy 64 byte at a time
211 211 prefetcht0 568(%edi)
212 212 movups 0(%esi), %xmm0
213 - movaps %xmm0, 0(%edi)
213 + movaps %xmm0, 0(%edi)
214 214 movups 16(%esi), %xmm1
215 215 movaps %xmm1, 16(%edi)
216 216 movups 32(%esi), %xmm2
217 - movaps %xmm2, 32(%edi)
217 + movaps %xmm2, 32(%edi)
218 218 movups 48(%esi), %xmm3
219 219 movaps %xmm3, 48(%edi)
220 220 addl $64, %esi
221 221 addl $64, %edi
222 222 decl %edx
223 223 jnz .sse_da_loop
224 224 jmp .sse_cleanup
225 -
225 +
226 226 SET_SIZE(memcpy)
227 227
228 228
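
For reviewers, the size thresholds that steer ENTRY(memcpy) above, restated as a hedged C sketch. The constants 63 and 65535 are taken directly from the cmpl instructions; the enum and helper name are hypothetical and not part of this change. Copies above 64 KB use the non-temporal movntps stores, which avoid polluting the cache and, being weakly ordered, need the mfence/sfence issued after each nt loop.

    #include <stddef.h>

    /* Hypothetical restatement of the copy-path selection in memcpy above. */
    enum copy_path {
            PATH_REP_MOVS,          /* .movew: rep smovl plus an unrolled byte tail */
            PATH_SSE_TEMPORAL,      /* .sse_sa_loop / .sse_da_loop: movaps/movups + prefetcht0 */
            PATH_SSE_NONTEMPORAL    /* .sse_sa_nt_loop / .sse_da_nt_loop: movntps + fence */
    };

    static enum copy_path
    sketch_pick_path(size_t n)
    {
            if (n <= 63)            /* cmpl $63,%ecx ; ja .move_sse */
                    return (PATH_REP_MOVS);
            if (n > 65535)          /* cmpl $65535,%ecx ; ja to the nt loop */
                    return (PATH_SSE_NONTEMPORAL);
            return (PATH_SSE_TEMPORAL);
    }
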
229 229 / .CopyLeft handles the memmove case where we must perform the copy backwards,
230 230 / because of overlap between src and dst. This is not particularly optimized.
231 231
232 232 .CopyLeft:
233 233 movl $3,%eax / heavily used constant
234 234 std / reverse direction bit (RtoL)
235 235 cmpl $12,%ecx / if (size < 12)
236 236 ja .BigCopyLeft / {
237 237 movl %edx,%esi / src = src + size - 1
238 238 leal -1(%ecx,%edi),%edi / dst = dst + size - 1
239 239 rep; smovb / do the byte copy
240 240 cld / reset direction flag to LtoR
241 241 popl %edi / }
242 242 popl %esi / restore registers
243 243 movl 4(%esp),%eax / set up return value
244 244 ret / return(dba);
245 245 .BigCopyLeft: / } else {
246 246 xchgl %edx,%ecx
247 247 movl %ecx,%esi / align source w/byte copy
248 248 leal -1(%edx,%edi),%edi
249 249 andl %eax,%ecx
250 250 jz .SkipAlignLeft
251 251 addl $1, %ecx / we need to insure that future
252 252 subl %ecx,%edx / copy is done on aligned boundary
253 253 rep; smovb
254 254 .SkipAlignLeft:
255 - movl %edx,%ecx
255 + movl %edx,%ecx
256 256 subl %eax,%esi
257 257 shrl $2,%ecx / do 4 byte copy RtoL
258 258 subl %eax,%edi
259 259 rep; smovl
260 260 andl %eax,%edx / do 1 byte copy whats left
261 261 jz .CleanupReturnLeft
262 - movl %edx,%ecx
262 + movl %edx,%ecx
263 263 addl %eax,%esi / rep; smovl instruction will decrement
264 264 addl %eax,%edi / %edi, %esi by four after each copy
265 265 / adding 3 will restore pointers to byte
266 266 / before last double word copied
267 267 / which is where they are expected to
268 268 / be for the single byte copy code
269 269 rep; smovb
270 270 .CleanupReturnLeft:
271 271 cld / reset direction flag to LtoR
272 272 popl %edi
273 273 popl %esi / restore registers
274 274 movl 4(%esp),%eax / set up return value
275 275 ret / return(dba);
276 276 SET_SIZE(memmove)
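
For reviewers, the destination-alignment arithmetic in the .sse_unaligned block of memcpy above, written out as a hedged C sketch; the helper name and out-parameter are illustrative only, and the 64-byte cutoff comes from the cmpl $64, %ecx test.

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Illustrative sketch of .sse_unaligned: how many leading bytes are copied
     * with rep smovb so the destination becomes 16-byte aligned, and when the
     * SSE path is abandoned because fewer than 64 bytes would remain.
     */
    static size_t
    sketch_dst_head_bytes(uintptr_t dst, size_t n, int *use_sse)
    {
            size_t head = (16 - (dst & 15)) & 15;   /* neg %eax ; andl $15, %eax */

            /* cmpl $64, %ecx ; jb .movew -- keep SSE only if >= 64 bytes remain */
            *use_sse = (n >= head && n - head >= 64);
            return (*use_sse ? head : 0);
    }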