/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"memset.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memset,function)

/*
 * memset (32-bit x86, stack-passed arguments)
 *
 * On entry (before the push of %edi):
 *	 4(%esp)	destination address
 *	 8(%esp)	fill value; only the low byte is stored
 *	12(%esp)	number of bytes to store
 * Returns the destination address in %eax.
 *
 * Strategy, selected by byte count:
 *	   0 ..  20	rep byte stores
 *	  21 .. 256	rep dword stores plus 0-3 trailing bytes
 *	 257 .. 511	as above, after byte stores up to an 8 byte boundary
 *	 512 and up	stores up to a 64 byte boundary, then 64 byte blocks
 *			from %xmm0; movaps while the post-alignment count
 *			is at most 262143, movntps followed by a fence
 *			above that; the sub-64-byte tail goes back through
 *			the dword or byte fill
 *
 * Register roles in every basic block:
 *	%eax	fill byte replicated into all four bytes
 *		(the byte fill reads %al only, so the short path skips this)
 *	%ecx	bytes (or units, inside a rep/loop) still to be stored
 *	%edi	next address to store to
 *
 * %ebx and %esi are saved and restored around the paths that use them;
 * %edx and %xmm0 are overwritten without being saved.
 */
	ENTRY(memset)
	pushl	%edi			/ preserve caller value; args shift by 4
	movl	8(%esp), %edi		/ %edi = destination
	movl	12(%esp), %eax		/ %al  = fill byte
	movl	16(%esp), %ecx		/ %ecx = byte count

	cld				/ string stores must ascend

	cmpl	$20, %ecx
	jbe	.fill_bytes		/ 20 or fewer: byte fill is enough

	andl	$0xff, %eax		/ discard everything above the fill byte
	imull	$0x01010101, %eax, %eax	/ replicate it into all four bytes

	cmpl	$256, %ecx
	jbe	.fill_dwords		/ 256 or fewer: skip alignment work

	cmpl	$511, %ecx
	jbe	.dword_check_align	/ 511 or fewer: aligned dword fill

	/
	/ SSE path.  Two extra registers are needed:
	/	%ebx	bytes outstanding once the head is aligned
	/	%esi	number of head bytes in front of the 64 byte boundary
	/
	pushl	%ebx
	pushl	%esi

	movl	%ecx, %ebx
	movl	%edi, %esi
	negl	%esi
	andl	$63, %esi		/ %esi = (-dest) mod 64; ZF set if aligned
	jz	.sse_head_done

	subl	%esi, %ebx		/ head bytes come off the outstanding count
	movl	%esi, %ecx
	shrl	$2, %ecx		/ whole dwords in the head
	rep; sstol
	movl	%esi, %ecx
	andl	$3, %ecx		/ leftover head bytes
	rep; sstob
	movl	%ebx, %ecx		/ back to the invariant: %ecx = bytes left

.sse_head_done:
	shrl	$6, %ecx		/ %ecx = count of 64 byte blocks

	/
	/ Build the 16 byte pattern in a stack scratch area and load it.
	/ Plain stores are used instead of four pushes, and the load is
	/ unaligned because %esp alignment is not known here.
	/
	subl	$16, %esp
	movl	%eax, (%esp)
	movl	%eax, 4(%esp)
	movl	%eax, 8(%esp)
	movl	%eax, 12(%esp)
	movups	(%esp), %xmm0
	addl	$16, %esp

	cmpl	$262143, %ebx		/ at or below this, use ordinary stores
	jbe	.sse_cached_loop
	jmp	.sse_stream_loop	/ hop over the alignment padding

	.align	16
.sse_stream_loop:
	movntps	%xmm0, (%edi)		/ non-temporal stores; SSE (not SSE2)
	movntps	%xmm0, 16(%edi)		/ encodings are used
	movntps	%xmm0, 32(%edi)
	movntps	%xmm0, 48(%edi)

	addl	$64, %edi
	decl	%ecx
	jnz	.sse_stream_loop

	/ fence the non-temporal stores before finishing the tail
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_tail

	.align	16
.sse_cached_loop:
	movaps	%xmm0, (%edi)		/ aligned stores, 64 bytes per pass
	movaps	%xmm0, 16(%edi)
	movaps	%xmm0, 32(%edi)
	movaps	%xmm0, 48(%edi)

	addl	$64, %edi
	decl	%ecx
	jnz	.sse_cached_loop

.sse_tail:
	andl	$63, %ebx		/ bytes that did not fill a whole block
	movl	%ebx, %ecx
	popl	%esi			/ undo the SSE path pushes
	popl	%ebx
	cmpl	$20, %ecx
	jbe	.fill_bytes
	jmp	.fill_dwords

.dword_check_align:
	testl	$7, %edi		/ already on an 8 byte boundary?
	jz	.fill_dwords

.dword_align_head:
	pushl	%ebx
	pushl	%esi

	movl	%ecx, %ebx		/ %ebx = total bytes requested
	movl	%edi, %esi
	negl	%esi
	andl	$7, %esi		/ %esi = (-dest) mod 8, here 1..7
	subl	%esi, %ebx		/ %ebx = bytes left after the head
	movl	%esi, %ecx
	rep; sstob			/ byte stores up to the boundary
	movl	%ebx, %ecx
	popl	%esi
	popl	%ebx

.fill_dwords:
	movl	%ecx, %edx		/ remember the full byte count
	shrl	$2, %ecx		/ dwords to store
	rep; sstol
	movl	%edx, %ecx
	andl	$3, %ecx		/ 0..3 bytes remain; fall through

.fill_bytes:
	rep; sstob
	movl	8(%esp), %eax		/ return value = original destination
	popl	%edi
	ret
	SET_SIZE(memset)