1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #pragma ident "%Z%%M% %I% %E% SMI"
28
29 #include <sys/param.h>
30 #include <sys/errno.h>
31 #include <sys/asm_linkage.h>
32 #include <sys/vtrace.h>
33 #include <sys/machthread.h>
34 #include <sys/machparam.h>
35
36 #if defined(lint)
37 #include <sys/types.h>
38 #else /* lint */
39 #include "assym.h"
40 #endif /* lint */
41
42 /*
43 * Prefetch considerations
44 *
45 * We prefetch one cacheline ahead. This may not be enough on Serengeti
46 * systems - see default_copyout() etc which prefetch 5 lines ahead.
47 * On the other hand, we expect most of the source buffers to be
48 * recently used enough to be cached.
49 *
50 * On US-I the prefetches are inoperative. On US-II they preload the E$;
51 * the mainloop unrolling and load-buffer should cover loads from E$.
52 * The stores appear to be the slow point on US-II.
53 *
54 * On US-IIICu the prefetch preloads the L2$ too, but there is no load
55 * buffer so the loads will stall for D$ miss, L2$ hit. The hardware
56 * auto-prefetch is not activated by integer loads. No solution
57 * in sight for this, barring odd games with FP read, write, integer read.
58 *
59 * US-IV (Panther) appears similar to US-IIICu, except that a strong
60 * variant of prefetch is available which can take TLB traps. We don't
61 * use this. The h/w prefetch stride can be set to 64, 128 or 192,
62 * and they only reach to the L2$ (we don't use these either).
63 * L2$ load-to-use latency is 15 cycles (best).
64 */
65
66
67 /*
68 * ip_ocsum(address, halfword_count, sum)
69 * Add the given number of (16-bit) halfwords to sum and return the total
70 * folded to a 16 bit one's complement sum. The address must not be odd.
71 * %o0 address; %o1 count; %o2 sum accumulator; %o4 temp
72 * %g2 and %g3 used in main loop
73 *
74 * (from @(#)ocsum.s 1.3 89/02/24 SMI)
75 *
76 */
77
78 #if defined(lint)
79
80 /* ARGSUSED */
81 unsigned int
82 ip_ocsum(u_short *address, int halfword_count, unsigned int sum)
83 { return (0); } /* lint-only stub; the assembly in the #else branch is the implementation */
84
85 #else /* lint */
86
87 ENTRY(ip_ocsum)
88 ! Sums blocks shorter than 32 halfwords here; longer blocks branch to ip_ocsum_long.
89 /*
90 * On ttcp transmits, called once per ocsum_copyin but with a small
91 * block ( >99.9% ). Could be the tx hdrs? How many acks/seg are we rxing?
92 * On ttcp receives, called more than once per ocsum_copyout. Rx hdrs
93 * and tx acks?
94 *
95 * To do: telnet and nfs traffic
96 *
97 * On an NCA'd webserver about 10% of the calls are >64 bytes
98 * about 10% of those start on a 64byte boundary
99 * about 30% are >5*64 bytes.
100 * The NCA numbers & proportions don't change with h/w cksum on.
101 *
102 * Tx hdrs are likely to be already in cache.
103 * Rx hdrs depend on whether they have already been inspected.
104 */
105
106 !
107 ! Entry point for checksum-only.
108 ! %o0 contains buffer address
109 ! %o1 contains count of 16bit words
110 ! %o2 contains sum
111 !
112 ! %o3 temporary
113 ! %o4 temporary
114 ! %g1 32bit mask
115 ! %g4 16bit mask
116 ! %g5 64bit mask (all 1s)
117 ! %o5 src rounded down to a dword, later the select mask; %g2 leading byte/hw count
118 not %g0, %g5 ! all 1's
119 prefetch [%o0], #n_reads ! first hword, dword, cacheline
120
121 clruw %g5, %g1 ! 32 1's at low end
122 srl %g5, 16, %g4 ! 16 1's at low end
123
124 cmp %o1, 32 ! at least a cacheline (64 bytes)?
125 bge,pn %icc, ip_ocsum_long ! yes, do the whole works
126 andn %o0, 7, %o5 ! delay: base src addr (ip_ocsum_long reads it as %i5)
127
128
129 cmp %o1, 4 ! < 4 halfwords?
130 bl,pn %icc, .tiny ! < 4 halfwords, just do them
131 inc 8, %o5 ! delay: next addr (no matter for .tiny)
132
133 /* leading dword with 1-4 hwords: 9 clocks */
134 /* Assumes ok to read the entire dword with the leading hwords */
135
136 ldx [%o5-8], %o3 ! NB base addr
137 sub %o5, %o0, %g2 ! byte count: 2/4/6/8
138 mov %o5, %o0 ! src now at the next dword boundary
139
140 sll %g2, 2, %g2 ! 8/16/24/32 for mask
141
142 sllx %g5, %g2, %o5 ! first of two equal shifts (total can reach 64)
143
144 sllx %o5, %g2, %o5 ! mask: 16/32/48/64 0's at low end
145
146 srl %g2, 3, %g2 ! hw count
147 andn %o3, %o5, %o3 ! select hw's from src
148
149 srlx %o3, 32, %o4 ! hi32
150 b 9f ! join the dword loop at its lo32 split
151 sub %o1, %g2, %o1 ! delay: decr count, 1-4 halfwords
152
153 .short_dw: ! max 7 iters of 4 clocks; 1 mispred of 4
154 ldx [%o0], %o3 ! tmp64 = *src++ (groups with the branch)
155
156 inc 8, %o0 ! (D-cache load-use delay)
157 dec 4, %o1 ! decrement count, 4 halfwords
158
159 srlx %o3, 32, %o4 ! hi32
160 9: and %o3, %g1, %o3 ! lo32
161
162 add %o4, %o2, %o2 ! accumulator
163 andncc %o1, 3, %g0 ! more than 3 hwords left?
164
165 bnz,pt %icc, .short_dw ! yes: sum another whole dword
166 add %o3, %o2, %o2 ! accumulator
167
168 .short_hw: ! trailing dw: 0-3 hwords
169 tst %o1 ! 0 seems fairly common...
170 bz,a .short_fold ! nothing left: go fold the accumulator
171 srlx %o2, 32, %o4 ! delay: hi32
172 ! mispredict 4 + 7 clocks for 1-3
173 ldx [%o0], %o3 ! whole dword; the wanted hwords are at its high end
174 sll %o1, 4, %o1 ! bitcount: 16/32/48
175
176 srlx %g5, %o1, %o5 ! mask: 16/32/48 0's at high end
177
178 andn %o3, %o5, %o3 ! select hw's from src
179
180 srlx %o3, 32, %o4 ! hi32
181 and %o3, %g1, %o3 ! lo32
182
183 add %o4, %o2, %o2 ! accumulator
184
185 add %o3, %o2, %o2 ! accumulator
186
187 ! at this point the 64-bit accumulator
188 ! has the result that needs to be returned in 16-bits
189 srlx %o2, 32, %o4 ! hi32
190 .short_fold: ! expects hi32 of the accumulator already in %o4
191 and %o2, %g1, %o2 ! lo32
192
193 add %o4, %o2, %o2 ! 33b
194
195 srlx %o2, 16, %o3 ! hi17
196 and %o2, %g4, %o2 ! lo16
197
198 add %o3, %o2, %o2 ! 18b
199
200 srlx %o2, 16, %o3 ! hi2
201 and %o2, %g4, %o2 ! lo16
202
203 retl ! return
204 add %o3, %o2, %o0 ! 16b result in %o0
205
206 .tiny: ! almost never: less than 4 halfwords total.
207 tst %o1
208 bz,a .short_fold ! zero halfwords: just fold the incoming sum
209
210 srlx %o2, 32, %o4 ! delay: hi32
211
212 lduh [%o0], %o3 ! tmp16 = *src++
213 1:
214 inc 2, %o0
215 ! stall for D-cache
216
217 add %o3, %o2, %o2 ! accumulator
218
219 deccc %o1 ! decrement count
220 bnz,a,pt %icc, 1b
221 lduh [%o0], %o3 ! tmp16 = *src++
222
223 ! at this point the 64-bit accumulator
224 ! has the result that needs to be returned in 16-bits
225 b .short_fold
226 srlx %o2, 32, %o4 ! hi32
227
228 SET_SIZE(ip_ocsum) ! 64-bit version
229
230
231 ENTRY(ip_ocsum_long) ! 64-bit, large blocks
232 save %sp, -SA(MINFRAME), %sp ! get another window
233 !
234 ! %i0 contains buffer address
235 ! %i1 contains count of 16bit words
236 ! %i2 contains sum
237 ! %i4 contains the mainloop count
238 ! %i5 comes in with the buffer address rounded down to the first dword
239 !
240 ! %g1 32bit mask
241 ! %g4 16bit mask
242 ! %g5 64bit mask (all 1s)
243 ! %g6 fetch-ahead offset for Ecache (listed here, but not referenced below)
244 !
245 ! %l0-7,%o0-5,%g2-3 mainloop temporaries
246 !
247 ! The masks in %g1, %g4, %g5 are the ones ip_ocsum set up before branching here.
248 ! 1 clock overhead
249 btst 63, %i0 ! src 64-byte aligned?
250 bz,a,pt %icc, .mainsection ! aligned blocks are fairly common
251 andncc %i1, 31, %i4 ! at least 64 bytes for main loop?
252
253
254 ! Leading dword, with 1-4 hwords: 9 clocks
255 ! Assumes ok to read the entire dword with the leading bytes
256 ldx [%i5], %l0 ! NB base addr
257 inc 8, %i5 ! next addr
258
259 sub %i5, %i0, %l2 ! byte count: 2/4/6/8
260 mov %i5, %i0 ! src now at the next dword boundary
261
262 sll %l2, 2, %l2 ! 8/16/24/32 for mask
263
264 sllx %g5, %l2, %l4 ! first of two equal shifts (total can reach 64)
265
266 sllx %l4, %l2, %l4 ! mask: 16, 32, 48, 64 0's at lsb
267
268 srl %l2, 3, %l2 ! 1/2/3/4 for count
269 andn %l0, %l4, %l0 ! select hw's from src
270
271 srlx %l0, 32, %o0 ! hi32
272 b 9f ! join the dword loop at its lo32 split
273 sub %i1, %l2, %i1 ! decr count, 1-4 halfwords
274
275 ! Do dwords until source is 64-byte aligned, 0-7 iterations
276 ! 4 clocks per + 4 for 1 mispred = 16 clocks avg
277 .dw: ldx [%i0], %l0 ! tmp64 = *src++ (groups with the branch below)
278
279 inc 8, %i0 ! (Dcache load-use delay)
280 dec 4, %i1 ! decrement count, 4 halfwords
281
282 srlx %l0, 32, %o0 ! hi32
283 9: and %l0, %g1, %l0 ! lo32
284
285 add %o0, %i2, %i2 ! accumulator
286 btst 63, %i0 ! src 64-byte aligned?
287
288 bnz,pt %icc, .dw
289 add %l0, %i2, %i2 ! accumulator
290
291
292 ! At this point source address is 64 byte aligned
293 ! and 1-31 halfwords have been consumed.
294 andncc %i1, 31, %i4 ! at least 64 bytes for main loop?
295 .mainsection: ! total 18n + 21 clocks
296 bz,pn %icc, .postamble
297 and %i1, 31, %i1 ! count for postamble
298
299 ! preload for main loop - 9 clocks assuming D$ hits at 1 per
300 ldx [%i0+0], %l0
301 ldx [%i0+8], %l1
302 ldx [%i0+16], %l2 ! %l0 could be used here if Dcache hit
303 ldx [%i0+24], %l3 ! but US-II prefetch only loads Ecache
304 ldx [%i0+32], %l4 ! check on US-III: could mix preloads & splits?
305 ldx [%i0+40], %l5
306 ldx [%i0+48], %l6
307 ldx [%i0+56], %l7
308 inc 64, %i0 ! src now past the preloaded 64 bytes
309 prefetch [%i0], #n_reads
310
311 ! main loop. Read 64 bytes at a time - 18 clocks per iteration
312 5: ! plus 4 for the exit mispredict
313 srlx %l0, 32, %o0 ! hi32 to %o0
314 and %l0, %g1, %l0 ! lo32 to %l0
315
316 srlx %l1, 32, %o1 ! hi32 to %o1
317 and %l1, %g1, %l1 ! lo32 to %l1
318
319 srlx %l2, 32, %o2 ! hi32 to %o2
320 and %l2, %g1, %l2 ! lo32 to %l2
321
322 srlx %l3, 32, %o3 ! hi32 to %o3
323 and %l3, %g1, %l3 ! lo32 to %l3
324
325 srlx %l4, 32, %o4 ! hi32 to %o4
326 and %l4, %g1, %l4 ! lo32 to %l4
327
328 srlx %l5, 32, %o5 ! hi32 to %o5
329 and %l5, %g1, %l5 ! lo32 to %l5
330
331 srlx %l6, 32, %g2 ! hi32 to %g2
332 and %l6, %g1, %l6 ! lo32 to %l6
333
334 srlx %l7, 32, %g3 ! hi32 to %g3
335 and %l7, %g1, %l7 ! lo32 to %l7
336 ! splits gave 16 off 32b vals
337 deccc 32, %i4 ! mv early,avoid mispredicts? nohelp US-II.
338 bz,pn %icc, .looptidy ! count now zero?
339 add %l0, %o0, %o0 ! delay: first pair sum, wanted on both paths
340
341 ldx [%i0+0], %l0
342 add %l1, %o1, %o1 ! adds and loads
343 add %l2, %o2, %o2
344
345 ldx [%i0+8], %l1
346 add %l3, %o3, %o3
347 add %l4, %o4, %o4
348
349 ldx [%i0+16], %l2
350 add %l5, %o5, %o5
351 add %l6, %g2, %g2
352
353 ldx [%i0+24], %l3
354 add %l7, %g3, %g3 ! now 8 off 33b vals
355 add %o0, %o1, %o0
356
357 ldx [%i0+32], %l4
358 add %o2, %o3, %o1
359 add %o4, %o5, %o2
360
361 ldx [%i0+40], %l5
362 add %g2, %g3, %o3 ! now 4 off 34b vals
363 add %o0, %o1, %o0
364
365 ldx [%i0+48], %l6
366 add %o2, %o3, %o1 ! 2 off 35b
367
368 ldx [%i0+56], %l7
369 add %o0, %o1, %o0 ! 36b
370 inc 64, %i0 ! increment source address
371
372 add %o0, %i2, %i2 ! accumulator
373 ba 5b
374 prefetch [%i0], #n_reads ! next cacheline
375 ! end of main loop
376 .looptidy: ! compute remaining partial sum - 8 clocks
377 add %l1, %o1, %o1
378 add %l2, %o2, %o2
379
380 add %l3, %o3, %o3
381 add %l4, %o4, %o4
382
383 add %l5, %o5, %o5
384 add %l6, %g2, %g2
385
386 add %l7, %g3, %g3 ! 8 x 33b
387 add %o0, %o1, %o0
388
389 add %o2, %o3, %o1
390 add %o4, %o5, %o2
391
392 add %g2, %g3, %o3 ! 4 x 34b
393 add %o0, %o1, %o0
394
395 add %o2, %o3, %o1 ! 2 x 35b
396 add %o0, %i2, %i2 ! accumulator
397
398 add %o1, %i2, %i2 ! accumulator
399
400
401 .postamble:
402 ! postamble hword count is in %i1 (can be zero)
403 ! while at least 1 dword, do dwords. Max 7 iterations.
404 andncc %i1, 3, %g0 ! more than 3 hwords?
405 .dotail_dw:
406 bz,a,pn %icc, .dotail_hw
407 tst %i1 ! delay: any at all left?
408 8:
409 ldx [%i0], %l0 ! tmp64 = *src++
410 inc 8, %i0
411 dec 4, %i1 ! decrement count, 4 halfwords
412
413 ! stall for D-cache
414
415 srlx %l0, 32, %o0 ! hi32
416 and %l0, %g1, %l0 ! lo32
417
418 add %o0, %i2, %i2 ! accumulator
419
420 andncc %i1, 3, %g0 ! more than 3 hwords?
421 bnz,pt %icc, 8b
422 add %l0, %i2, %i2 ! accumulator
423
424 ! while at least 1 hword, do hwords. Max 3 iterations.
425 tst %i1
426 .dotail_hw:
427 bz,a .fold ! nothing left: go fold the accumulator
428 srlx %i2, 32, %o0 ! delay: hi32
429 lduh [%i0], %l0 ! tmp16 = *src++
430 1:
431 inc 2, %i0
432 ! stall for D-cache
433
434 add %l0, %i2, %i2 ! accumulator
435
436 deccc %i1 ! decrement count
437 bnz,a,pt %icc, 1b
438 lduh [%i0], %l0 ! tmp16 = *src++
439
440 ! at this point the 64-bit accumulator
441 ! has the result that needs to be returned in 16-bits
442 srlx %i2, 32, %o0 ! hi32
443 .fold: ! expects hi32 of the accumulator already in %o0
444 and %i2, %g1, %o1 ! lo32
445
446 add %o0, %o1, %o0 ! 33b
447
448 srlx %o0, 16, %o1 ! hi17
449 and %o0, %g4, %o0 ! lo16
450
451 add %o1, %o0, %o0 ! 18b
452
453 srlx %o0, 16, %o1 ! hi2
454 and %o0, %g4, %o0 ! lo16
455
456 add %o1, %o0, %i0 ! 16b result in %i0
457
458 ret ! return
459 restore
460
461
462 SET_SIZE(ip_ocsum_long) ! 64-bit version
463
464 #endif /* lint */