1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27         .file   "memset.s"
  28 
  29 #include <sys/asm_linkage.h>
  30 
  31         ANSI_PRAGMA_WEAK(memset,function)
  32 
     /*
      * memset - fill a run of bytes with one byte value (32-bit x86, all
      * arguments read from the stack).
      *
      * NOTE(review): ANSI_PRAGMA_WEAK, ENTRY and SET_SIZE are macros that are
      * not defined in this file (presumably they come from
      * <sys/asm_linkage.h>); their exact expansion should be confirmed there.
      * The name suggests this is the C library memset(dst, c, n) - confirm.
      *
      * Arguments, as addressed after the initial "pushl %edi":
      *    8(%esp)   destination address            -> %edi
      *   12(%esp)   fill value, only the low byte
      *              ends up being stored           -> %eax
      *   16(%esp)   number of bytes to set         -> %ecx
      *
      * Return value: the original destination address, reloaded from
      * 8(%esp) into %eax just before returning.
      *
      * Register use: %edi is saved on entry and restored on exit; %ebx and
      * %esi are pushed and popped around the two paths that use them.
      * %eax, %ecx, %edx, the flags and (for counts of 512 or more) %xmm0 are
      * overwritten.  The direction flag is cleared by "cld" and is left
      * clear on return.
      *
      * Strategy, selected by the byte count with unsigned compares:
      *   count <= 20     byte stores only                      (.byteset)
      *   21  .. 256      4-byte stores, then 0-3 byte stores   (.wordset)
      *   257 .. 511      byte stores up to an 8-byte boundary
      *                   if needed, then .wordset        (.check_wordset)
      *   count >= 512    stores up to a 64-byte boundary, then 64-byte
      *                   blocks through %xmm0.  If more than 262143 bytes
      *                   remain after alignment the blocks are written
      *                   with movntps (.sse_nt_loop) and a fence follows;
      *                   otherwise with movaps (.sse_loop).  The last
      *                   0-63 bytes go through .byteset or .wordset.
      *
      * NOTE(review): "sstol" and "sstob" are assumed to be this assembler's
      * spellings of the x86 string-store instructions (stosl / stosb), i.e.
      * store %eax / %al at (%edi) and advance %edi; with the "rep" prefix
      * the store repeats %ecx times.  Confirm against the assembler manual.
      */
  33         ENTRY(memset)
  34         pushl   %edi            / save register variable
  35         movl    8(%esp),%edi    / %edi = string address
  36         movl    12(%esp),%eax   / %al = byte to duplicate
  37         movl    16(%esp),%ecx   / %ecx = number of copies
  38 
  39         / For all basic blocks in this routine, maintain the following
  40         / entry conditions:     %eax each byte is set to desired byte.
  41         /                       NOTE: .byteset doesn't require this
  42         /                       %ecx contains # bytes to set
  43         /                       %edi contain address to set
  44 
  45         cld                     / make sure we go the right way...
  46         cmpl    $20,%ecx        / strings with fewer than 20 chars should be
  47                                 / byte set
  48         jbe     .byteset
             /*
              * jbe is an unsigned "below or equal", so a count of exactly 20
              * also takes the byte path.  On that path %eax has not been
              * replicated yet; only its low byte is used there.
              */
  49 
  50         andl    $0xff, %eax     / trim anything above low byte
  51         imul    $0x01010101, %eax       / extend low byte to each byte
  52 
  53         cmpl    $256, %ecx      / smaller areas don't benefit from alignment
  54         jbe     .wordset
  55 
  56         cmpl    $511, %ecx      / areas smaller than this should be wordset
  57         jbe     .check_wordset
  58 
  59         /
  60         / prep work for sse temporal and non-temporal
  61         /
  62 
  63         pushl   %ebx            / more registers are needed
  64         pushl   %esi            / for alignment work
  65 
  66         /
  67         / align address to 64 byte boundaries.
  68         /
  69 
             /*
              * %esi = (64 - (%edi & 63)) & 63, the number of bytes from %edi
              * up to the next 64-byte boundary (0 when already aligned).
              * The jz below tests the result of the final andl.  When it is
              * taken, %ebx and %ecx both still hold the full byte count.
              * Otherwise %esi bytes are stored here (as %esi / 4 four-byte
              * stores followed by %esi % 4 byte stores) and %ebx / %ecx are
              * left holding the count that remains after alignment.
              */
  70         movl    %ecx, %ebx      / save byte count
  71         movl    %edi, %esi      / esi is scratch register
  72         andl    $63, %esi       / bytes to align to 64 byte align addr
  73         neg     %esi            / compute count of bytes
  74         addl    $64, %esi       / needed to align
  75         andl    $63, %esi       / to 64 byte align addr
  76         jz      .sse_aligned    / skip alignment if not needed
  77         subl    %esi, %ebx      / ebx contains remainder of bytes to set
  78         movl    %esi, %ecx      / alignment bytes
  79         shrl    $2,%ecx         / %ecx = number of words to set
  80         rep; sstol
  81         movl    %esi,%ecx
  82         andl    $3,%ecx         / %ecx = number of bytes left
  83         rep; sstob
  84         movl    %ebx, %ecx      / remainder to be set
  85 
             /*
              * Here %edi is 64-byte aligned and %ebx == %ecx == bytes still to
              * set.  This path is only entered with a count of at least 512
              * and at most 63 bytes were used for alignment, so at least 449
              * bytes remain and the block count computed below is never
              * zero.  Both block loops decrement before testing and rely on
              * that.  %ebx keeps the byte count so the sub-64-byte tail can
              * be derived after the loop.
              */
  86 .sse_aligned:
  87 
  88         shr     $6, %ecx        / number of 64 byte blocks to set
  89 
  90         /
  91         / load xmm0 with bytes to be set
  92         /
  93         subl    $16,%esp        / give ourselves some working room on the stack
  94         movl    %eax,(%esp)     / copy eax into each of 4 bytes
  95         movl    %eax,4(%esp)    / avoid pushl since it causes more interlocking
  96         movl    %eax,8(%esp)    /
  97         movl    %eax,12(%esp)   /
  98         movups  (%esp), %xmm0   / unaligned load from stack into xmm0
  99         addl    $16,%esp        / restore stack position
 100 
             /*
              * Choose the store flavour from the remaining byte count in
              * %ebx: 262143 (256K - 1) bytes or fewer use the ordinary
              * movaps loop, anything larger uses the non-temporal loop.
              */
 101         cmpl    $262143, %ebx   / blocks smaller than this allocate in the cache
 102         jbe     .sse_loop
 103         jmp     .sse_nt_loop    / branch across alignment nops
 104 
 105         .align 16
 106 
             /*
              * Non-temporal loop: four 16-byte movntps stores per iteration,
              * %ecx iterations.  Afterwards %ecx = %ebx & 63 (the tail byte
              * count) and %ebx / %esi are restored, so only the saved %edi
              * remains on the stack when control reaches .byteset/.wordset.
              */
 107 .sse_nt_loop:
 108         movntps %xmm0, (%edi)   / block non-temporal store
 109         movntps %xmm0, 16(%edi) / use sse rather than sse2
 110         movntps %xmm0, 32(%edi) / so we work more places
 111         movntps %xmm0, 48(%edi) /
 112 
 113         addl    $64, %edi       / increment dest address
 114         dec     %ecx            / dec count of blocks
 115         jnz     .sse_nt_loop    / jump if not done
 116 
 117         andl    $63, %ebx       / remainder of bytes to copy
 118         movl    %ebx, %ecx      / ecx contains remainer of bytes to set
 119         popl    %esi            / restore stack config
 120         popl    %ebx            /
             /*
              * Fence after the non-temporal stores: mfence when the build
              * defines _SSE2_INSN, otherwise sfence when it defines
              * _SSE_INSN; with neither defined the build stops at #error.
              * NOTE(review): presumably the fence is there to order the
              * weakly-ordered movntps stores ahead of whatever follows -
              * confirm against the processor manual.
              */
 121 #if defined(_SSE2_INSN)
 122         mfence
 123 #elif defined(_SSE_INSN)
 124         sfence
 125 #else
 126 #error "Must have either SSE or SSE2"
 127 #endif
 128         cmpl    $20, %ecx       / compare and jump accordingly
 129         jbe     .byteset
 130         jmp     .wordset
 131 
             /*
              * Ordinary (cached) loop: same shape as the non-temporal loop
              * but using movaps, and with no fence afterwards.  %edi is
              * 64-byte aligned on entry and advances by 64 per iteration.
              * On exit %ecx = %ebx & 63 and %ebx / %esi are restored.
              */
 132         .align 16
 133 .sse_loop:
 134         movaps %xmm0, (%edi)    / block copy w/ SSE
 135         movaps %xmm0, 16(%edi)
 136         movaps %xmm0, 32(%edi)
 137         movaps %xmm0, 48(%edi)
 138 
 139         addl    $64, %edi       / increment addr
 140         dec     %ecx            / dec count of blocks
 141         jnz     .sse_loop       / jump if not done
 142 
 143         andl    $63, %ebx       / remainder of bytes to copy
 144         movl    %ebx, %ecx      / in %ecx as normal
 145         popl    %esi            / restore stack config
 146         popl    %ebx            /
 147         cmpl    $20, %ecx
 148         jbe     .byteset
 149         jmp     .wordset
 150 
             /*
              * Reached for counts 257..511.  The andl sets the zero flag
              * from (%edi & 7) but also destroys %edi, so %edi is put back
              * from %edx; movl does not change the flags, which is why the
              * jz after it still reflects the alignment test.
              * %edx is used only as a temporary here.
              */
 151 .check_wordset:
 152         movl    %edi, %edx      / save current store ptr
 153         andl    $7, %edi        / check alignment
 154         movl    %edx,%edi       / %edi = string address
 155         jz      .wordset        / all ok
 156 
 157 
             /*
              * %esi = (8 - (%edi & 7)) & 7 byte stores bring %edi up to an
              * 8-byte boundary; %ebx carries the remaining count across the
              * stores and is copied back into %ecx.  %ebx and %esi are
              * restored before falling through into .wordset.
              */
 158 .align_wordset:
 159         pushl   %ebx            / more registers are needed
 160         pushl   %esi
 161 
 162         movl    %ecx, %ebx
 163         movl    %edi, %esi
 164         andl    $7, %esi
 165         neg     %esi
 166         addl    $8, %esi
 167         andl    $7, %esi
 168         subl    %esi, %ebx      / ebx contains remainder of bytes to copy
 169         movl    %esi, %ecx
 170         rep; sstob
 171         movl    %ebx, %ecx
 172         popl    %esi            / restore stack config
 173         popl    %ebx            /
 174 
             /*
              * Entry: %ecx = bytes to set, %eax = fill byte replicated into
              * all four bytes, %edi = destination.  Stores %ecx / 4 four-byte
              * units, then leaves %ecx = count & 3 for the byte stores in
              * .byteset.  %edx holds the count across the rep.
              */
 175 .wordset:
 176         movl    %ecx, %edx      / save cont
 177         shrl    $2,%ecx         / %ecx = number of words to set
 178         rep; sstol
 179         movl    %edx,%ecx
 180         andl    $3,%ecx         / %ecx = number of bytes left
 181 
             /*
              * Common exit.  Entry: %ecx = bytes to set (may be 0, in which
              * case the rep stores nothing), low byte of %eax = fill byte.
              * Every path has already popped %ebx / %esi, so the saved %edi
              * is the only extra word on the stack and 8(%esp) is still the
              * caller's destination argument.
              */
 182 .byteset:
 183         rep; sstob
 184         movl    8(%esp),%eax    / return string address
 185         popl    %edi            / restore register variable
 186         ret
 187         SET_SIZE(memset)