/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"memset.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memset,function)

	ENTRY(memset)
	pushl	%edi			/ save register variable
	movl	8(%esp),%edi		/ %edi = destination address
	movl	12(%esp),%eax		/ %al = byte to duplicate
	movl	16(%esp),%ecx		/ %ecx = number of bytes to set

	/ For all basic blocks in this routine, maintain the following
	/ entry conditions:
	/	%eax - each byte set to the desired fill byte
	/	       (NOTE: .byteset doesn't require this)
	/	%ecx - number of bytes to set
	/	%edi - destination address

	cld				/ make sure we go the right way...
	cmpl	$20,%ecx		/ sets of 20 bytes or fewer are done bytewise
	jbe	.byteset

	andl	$0xff, %eax		/ trim anything above low byte
	imul	$0x01010101, %eax	/ extend low byte to each byte

	cmpl	$256, %ecx		/ smaller areas don't benefit from alignment
	jbe	.wordset

	cmpl	$511, %ecx		/ areas up to this size are aligned and word set
	jbe	.check_wordset

	/
	/ prep work for sse temporal and non-temporal
	/

	pushl	%ebx			/ more registers are needed
	pushl	%esi			/ for alignment work

	/
	/ align address to 64 byte boundaries.
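	/ the neg/add/and sequence below computes (64 - (addr & 63)) & 63,
	/ the number of bytes needed to reach the next 64 byte boundary;
	/ a result of zero means the destination is already aligned.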
	/

	movl	%ecx, %ebx		/ save byte count
	movl	%edi, %esi		/ esi is scratch register
	andl	$63, %esi		/ bytes to align to 64 byte align addr
	neg	%esi			/ compute count of bytes
	addl	$64, %esi		/ needed to align
	andl	$63, %esi		/ to 64 byte align addr
	jz	.sse_aligned		/ skip alignment if not needed
	subl	%esi, %ebx		/ ebx contains remainder of bytes to set
	movl	%esi, %ecx		/ alignment bytes
	shrl	$2,%ecx			/ %ecx = number of words to set
	rep; sstol
	movl	%esi,%ecx
	andl	$3,%ecx			/ %ecx = number of bytes left
	rep; sstob
	movl	%ebx, %ecx		/ remainder to be set

.sse_aligned:

	shr	$6, %ecx		/ number of 64 byte blocks to set

	/
	/ load xmm0 with bytes to be set
	/
	subl	$16,%esp		/ give ourselves some working room on the stack
	movl	%eax,(%esp)		/ copy %eax into each of the 4 dwords
	movl	%eax,4(%esp)		/ avoid pushl since it causes more interlocking
	movl	%eax,8(%esp)		/
	movl	%eax,12(%esp)		/
	movups	(%esp), %xmm0		/ unaligned load from stack into xmm0
	addl	$16,%esp		/ restore stack position

	cmpl	$262143, %ebx		/ sets this size or smaller can allocate in the cache
	jbe	.sse_loop
	jmp	.sse_nt_loop		/ branch across alignment nops

	.align	16

.sse_nt_loop:
	movntps	%xmm0, (%edi)		/ block non-temporal store
	movntps	%xmm0, 16(%edi)		/ use sse rather than sse2
	movntps	%xmm0, 32(%edi)		/ so we work more places
	movntps	%xmm0, 48(%edi)		/

	addl	$64, %edi		/ increment dest address
	dec	%ecx			/ dec count of blocks
	jnz	.sse_nt_loop		/ jump if not done

	andl	$63, %ebx		/ remainder of bytes to set
	movl	%ebx, %ecx		/ ecx contains remainder of bytes to set
	popl	%esi			/ restore stack config
	popl	%ebx			/
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	cmpl	$20, %ecx		/ compare and jump accordingly
	jbe	.byteset
	jmp	.wordset

	.align	16
.sse_loop:
	movaps	%xmm0, (%edi)		/ block set w/ SSE
	movaps	%xmm0, 16(%edi)
	movaps	%xmm0, 32(%edi)
	movaps	%xmm0, 48(%edi)

	addl	$64, %edi		/ increment addr
	dec	%ecx			/ dec count of blocks
	jnz	.sse_loop		/ jump if not done

	andl	$63, %ebx		/ remainder of bytes to set
	movl	%ebx, %ecx		/ in %ecx as normal
	popl	%esi			/ restore stack config
	popl	%ebx			/
	cmpl	$20, %ecx
	jbe	.byteset
	jmp	.wordset

.check_wordset:
	movl	%edi, %edx		/ save current store ptr
	andl	$7, %edi		/ check alignment
	movl	%edx,%edi		/ restore %edi (mov leaves the andl flags intact)
	jz	.wordset		/ already 8 byte aligned

.align_wordset:
	pushl	%ebx			/ more registers are needed
	pushl	%esi

	movl	%ecx, %ebx
	movl	%edi, %esi
	andl	$7, %esi
	neg	%esi
	addl	$8, %esi
	andl	$7, %esi
	subl	%esi, %ebx		/ ebx contains remainder of bytes to set
	movl	%esi, %ecx
	rep; sstob
	movl	%ebx, %ecx
	popl	%esi			/ restore stack config
	popl	%ebx			/

.wordset:
	movl	%ecx, %edx		/ save count
	shrl	$2,%ecx			/ %ecx = number of words to set
	rep; sstol
	movl	%edx,%ecx
	andl	$3,%ecx			/ %ecx = number of bytes left

.byteset:
	rep; sstob
	movl	8(%esp),%eax		/ return destination address
	popl	%edi			/ restore register variable
	ret
	SET_SIZE(memset)