/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"memcpy.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

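/ memmove(void *dst, const void *src, size_t nbytes)
/ Copies nbytes from src to dst, allowing for overlapping buffers.  If dst
/ is at or below src, or lies beyond the end of the source region, a forward
/ copy through the memcpy path is safe; otherwise control falls to .CopyLeft,
/ which copies backwards so overlapping bytes are read before they are
/ overwritten.  The original dst pointer is returned in %eax.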
	ENTRY(memmove)
	movl	0+12(%esp),%ecx	/ get number of bytes to move
	pushl	%esi		/ save callee-saved %esi and %edi
	pushl	%edi
	movl	8+4(%esp),%edi	/ destination buffer address
	movl	8+8(%esp),%esi	/ source buffer address
	movl	%edi, %eax
	testl	%ecx,%ecx
	jz	.Return

	cmpl	%esi,%edi	/ if (source addr > dest addr)
	leal	-1(%esi,%ecx),%edx	/ %edx = src + size - 1
	jbe	.memcpy_post	/ jump if dst <= src
	cmpl	%edx,%edi
	jbe	.CopyLeft	/ jump if dst <= src + size - 1
	jmp	.memcpy_post

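/ memcpy(void *dst, const void *src, size_t nbytes)
/ The arguments are at 4, 8 and 12(%esp) on entry; after the two pushes
/ below they are reached at 8+4, 8+8 and 8+12(%esp).  The destination
/ pointer is kept in %eax as the return value.  Copies of 64 bytes or more
/ take the SSE path at .move_sse; shorter copies fall through to the
/ rep/smovl code at .movew.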
	ENTRY(memcpy)
	pushl	%esi
	pushl	%edi

	movl	8+4(%esp),%edi	/ %edi = dest address
	movl	%edi, %eax	/ save this as the return value
	movl	8+8(%esp),%esi	/ %esi = source address
	movl	8+12(%esp),%ecx	/ %ecx = number of bytes to copy
				/ %edx scratch register
				/ %eax scratch register
.memcpy_post:
	nop			/ this really helps, don't know why
				/ note: cld is perf death on P4
	cmpl	$63,%ecx
	ja	.move_sse	/ not worth doing sse for less

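	/
	/ .movew copies the bulk of the data four bytes at a time with
	/ rep smovl, then finishes the 0-3 remaining bytes with an
	/ unrolled byte copy.  %eax already holds the return value here.
	/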
.movew:
	movl	%ecx,%edx	/ save byte cnt
	shrl	$2,%ecx		/ %ecx = number of words to move
	rep ; smovl		/ move the words


	andl	$0x3,%edx	/ %edx = number of bytes left to move
	jz	.Return		/ %edx <= 3, so just unroll the loop

	movb	(%esi), %cl
	movb	%cl, (%edi)
	decl	%edx
	jz	.Return
	movb	1(%esi), %cl
	movb	%cl, 1(%edi)
	decl	%edx
	jz	.Return
	movb	2(%esi), %cl
	movb	%cl, 2(%edi)

.Return:
	popl	%edi		/ restore register variables
	popl	%esi
	ret

.move_sse:
	/
	/ time to 16 byte align destination
	/
	andl	$15, %eax
	jnz	.sse_unaligned	/ jmp if dest is unaligned
.sse:				/ dest is aligned, check source
	movl	%ecx, %edx	/ get byte count
	shrl	$6, %edx	/ number of 64 byte blocks to move
	testl	$15, %esi
	jnz	.sse_da		/ go to slow loop if source is unaligned
	cmpl	$65535, %ecx
	ja	.sse_sa_nt_loop
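	/
	/ Copies of 64K bytes or more take the non-temporal (streaming)
	/ store loop at .sse_sa_nt_loop so a big copy does not flush
	/ useful data from the caches; smaller copies use the ordinary
	/ cached stores below.
	/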

	/
	/ use aligned load since we're lucky
	/
.sse_sa_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 bytes at a time
	prefetcht0 568(%edi)	/ prefetch destination & copy 64 bytes at a time
	movaps	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_loop

.sse_cleanup:
	andl	$63, %ecx	/ compute remaining bytes
	movl	8+4(%esp), %eax	/ setup return value
	jz	.Return
	jmp	.movew

	/
	/ use aligned loads with non-temporal stores since we're lucky
	/
	.align 16
.sse_sa_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 bytes at a time
	movaps	(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_nt_loop
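	/
	/ Non-temporal stores are weakly ordered, so fence before leaving
	/ to make sure the streamed data is globally visible.
	/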
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup

	/
	/ Make certain that destination buffer becomes aligned
	/
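	/ %eax holds (dest & 15) on entry; negating it and masking with 15
	/ gives the number of bytes needed to reach the next 16 byte
	/ boundary.  If fewer than 64 bytes would remain after aligning,
	/ fall back to the simple .movew copy instead.
	/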
.sse_unaligned:
	neg	%eax		/ subtract from 16 and get destination
	andl	$15, %eax	/ aligned on a 16 byte boundary
	movl	%ecx, %edx	/ saved count
	subl	%eax, %ecx	/ subtract from byte count
	cmpl	$64, %ecx	/ after aligning, will we still have 64 bytes?
	cmovb	%edx, %ecx	/ if not, restore original byte count,
	cmovb	8+4(%esp), %eax	/ and restore return value,
	jb	.movew		/ and do a non-SSE move.
	xchg	%ecx, %eax	/ flip for copy
	rep ; smovb		/ move the bytes
	xchg	%ecx, %eax	/ flip back
	jmp	.sse

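	/
	/ .sse_da: the destination is 16 byte aligned but the source is
	/ not.  Copies of 64K bytes or more stream with non-temporal
	/ stores; smaller ones use unaligned loads with cached stores.
	/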
	.align 16
.sse_da:
	cmpl	$65535, %ecx
	jbe	.sse_da_loop

	/
	/ use unaligned loads with non-temporal stores since the source
	/ doesn't line up
	/
.sse_da_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 bytes at a time
	movups	0(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup
	/
	/ use unaligned loads with cached stores since the source
	/ doesn't line up
	/
	.align	16
.sse_da_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 bytes at a time
	prefetcht0 568(%edi)
	movups	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_loop
	jmp	.sse_cleanup

	SET_SIZE(memcpy)


/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst. This is not particularly optimized.
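/ Small moves (12 bytes or less) are done with a single reverse rep smovb.
/ Larger moves first copy bytes until the source is 4 byte aligned, move the
/ bulk four bytes at a time right-to-left with rep smovl, then finish the
/ remaining bytes with rep smovb and clear the direction flag before return.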

.CopyLeft:
	movl	$3,%eax			/ heavily used constant
	std				/ reverse direction bit (RtoL)
	cmpl	$12,%ecx		/ if (size <= 12)
	ja	.BigCopyLeft		/ {
	movl	%edx,%esi		/     src = src + size - 1
	leal	-1(%ecx,%edi),%edi	/     dst = dst + size - 1
	rep;	smovb			/    do the byte copy
	cld				/    reset direction flag to LtoR
	popl	%edi			/  }
	popl	%esi			/  restore registers
	movl	4(%esp),%eax		/  set up return value
	ret				/  return(dba);
.BigCopyLeft:				/ } else {
	xchgl	%edx,%ecx
	movl	%ecx,%esi		/ align source w/byte copy
	leal	-1(%edx,%edi),%edi
	andl	%eax,%ecx
	jz	.SkipAlignLeft
	addl	$1, %ecx		/ we need to ensure that future
	subl	%ecx,%edx		/ copy is done on aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movl	%edx,%ecx
	subl	%eax,%esi
	shrl	$2,%ecx			/ do 4 byte copy RtoL
	subl	%eax,%edi
	rep;	smovl
	andl	%eax,%edx		/ do 1 byte copy of what's left
	jz	.CleanupReturnLeft
	movl	%edx,%ecx
	addl	%eax,%esi		/ rep; smovl instruction will decrement
	addl	%eax,%edi		/ %edi, %esi by four after each copy
					/ adding 3 will restore pointers to byte
					/ before last double word copied
					/ which is where they are expected to
					/ be for the single byte copy code
	rep;	smovb
.CleanupReturnLeft:
	cld				/ reset direction flag to LtoR
	popl	%edi
	popl	%esi			/ restore registers
	movl	4(%esp),%eax		/ set up return value
	ret				/ return(dba);
	SET_SIZE(memmove)