1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27         .file   "memset.s"
  28 
  29 #include <sys/asm_linkage.h>
  30 
  31         ANSI_PRAGMA_WEAK(memset,function)
  32 
  33         ENTRY(memset)
  34         pushl   %edi            / save register variable
  35         movl    8(%esp),%edi    / %edi = string address
  36         movl    12(%esp),%eax   / %al = byte to duplicate
  37         movl    16(%esp),%ecx   / %ecx = number of copies
  38 
  39         / For all basic blocks in this routine, maintain the following
  40         / entry conditions:     %eax each byte is set to desired byte.
  41         /                       NOTE: .byteset doesn't require this
  42         /                       %ecx contains # bytes to set
  43         /                       %edi contain address to set
  44 
  45         cld                     / make sure we go the right way...
  46         cmpl    $20,%ecx        / strings with fewer than 20 chars should be byte set
  47         jbe     .byteset        
  48 
  49         andl    $0xff, %eax     / trim anything above low byte
  50         imul    $0x01010101, %eax       / extend low byte to each byte
  51         
  52         cmpl    $256, %ecx      / smaller areas don't benefit from alignment
  53         jbe     .wordset
  54 
  55         cmpl    $511, %ecx      / areas smaller than this should be wordset
  56         jbe     .check_wordset  
  57 
  58         /
  59         / prep work for sse temporal and non-temporal
  60         /
  61 
  62         pushl   %ebx            / more registers are needed
  63         pushl   %esi            / for alignment work
  64 
  65         /
  66         / align address to 64 byte boundaries.
  67         /
  68 
  69         movl    %ecx, %ebx      / save byte count
  70         movl    %edi, %esi      / esi is scratch register
  71         andl    $63, %esi       / bytes to align to 64 byte align addr
  72         neg     %esi            / compute count of bytes 
  73         addl    $64, %esi       / needed to align
  74         andl    $63, %esi       / to 64 byte align addr
  75         jz      .sse_aligned    / skip alignment if not needed
  76         subl    %esi, %ebx      / ebx contains remainder of bytes to set
  77         movl    %esi, %ecx      / alignment bytes
  78         shrl    $2,%ecx         / %ecx = number of words to set
  79         rep; sstol
  80         movl    %esi,%ecx
  81         andl    $3,%ecx         / %ecx = number of bytes left
  82         rep; sstob
  83         movl    %ebx, %ecx      / remainder to be set
  84 
  85 .sse_aligned:
  86         
  87         shr     $6, %ecx        / number of 64 byte blocks to set
  88 
  89         /
  90         / load xmm0 with bytes to be set
  91         /
  92         subl    $16,%esp        / give ourselves some working room on the stack
  93         movl    %eax,(%esp)     / copy eax into each of 4 bytes
  94         movl    %eax,4(%esp)    / avoid pushl since it causes more interlocking
  95         movl    %eax,8(%esp)    /
  96         movl    %eax,12(%esp)   /
  97         movups  (%esp), %xmm0   / unaligned load from stack into xmm0
  98         addl    $16,%esp        / restore stack position
  99         
 100         cmpl    $262143, %ebx   / blocks smaller than this allocate in the cache
 101         jbe     .sse_loop
 102         jmp     .sse_nt_loop    / branch across alignment nops
 103                 
 104         .align 16
 105 
 106 .sse_nt_loop:   
 107         movntps %xmm0, (%edi)   / block non-temporal store
 108         movntps %xmm0, 16(%edi) / use sse rather than sse2
 109         movntps %xmm0, 32(%edi) / so we work more places
 110         movntps %xmm0, 48(%edi) /
 111 
 112         addl    $64, %edi       / increment dest address
 113         dec     %ecx            / dec count of blocks
 114         jnz     .sse_nt_loop    / jump if not done
 115 
 116         andl    $63, %ebx       / remainder of bytes to copy
 117         movl    %ebx, %ecx      / ecx contains remainer of bytes to set
 118         popl    %esi            / restore stack config
 119         popl    %ebx            /
 120 #if defined(_SSE2_INSN)
 121         mfence
 122 #elif defined(_SSE_INSN)
 123         sfence
 124 #else
 125 #error "Must have either SSE or SSE2"
 126 #endif
 127         cmpl    $20, %ecx       / compare and jump accordingly
 128         jbe     .byteset
 129         jmp     .wordset        
 130 
 131         .align 16
 132 .sse_loop:
 133         movaps %xmm0, (%edi)    / block copy w/ SSE
 134         movaps %xmm0, 16(%edi)
 135         movaps %xmm0, 32(%edi)
 136         movaps %xmm0, 48(%edi)
 137 
 138         addl    $64, %edi       / increment addr
 139         dec     %ecx            / dec count of blocks
 140         jnz     .sse_loop       / jump if not done
 141 
 142         andl    $63, %ebx       / remainder of bytes to copy
 143         movl    %ebx, %ecx      / in %ecx as normal
 144         popl    %esi            / restore stack config
 145         popl    %ebx            /
 146         cmpl    $20, %ecx       
 147         jbe     .byteset
 148         jmp     .wordset
 149 
 150 .check_wordset:
 151         movl    %edi, %edx      / save current store ptr
 152         andl    $7, %edi        / check alignment
 153         movl    %edx,%edi       / %edi = string address
 154         jz      .wordset        / all ok 
 155         
 156 
 157 .align_wordset: 
 158         pushl   %ebx            / more registers are needed
 159         pushl   %esi            
 160 
 161         movl    %ecx, %ebx
 162         movl    %edi, %esi
 163         andl    $7, %esi
 164         neg     %esi
 165         addl    $8, %esi
 166         andl    $7, %esi
 167         subl    %esi, %ebx      / ebx contains remainder of bytes to copy
 168         movl    %esi, %ecx
 169         rep; sstob       
 170         movl    %ebx, %ecx
 171         popl    %esi            / restore stack config
 172         popl    %ebx            /
 173 
 174 .wordset:
 175         movl    %ecx, %edx      / save cont
 176         shrl    $2,%ecx         / %ecx = number of words to set
 177         rep; sstol
 178         movl    %edx,%ecx
 179         andl    $3,%ecx         / %ecx = number of bytes left
 180 
 181 .byteset:
 182         rep; sstob
 183         movl    8(%esp),%eax    / return string address
 184         popl    %edi            / restore register variable
 185         ret
 186         SET_SIZE(memset)