Print this page
de-linting of .s files
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/sun4/ml/ip_ocsum.s
+++ new/usr/src/uts/sun4/ml/ip_ocsum.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License, Version 1.0 only
6 6 * (the "License"). You may not use this file except in compliance
7 7 * with the License.
8 8 *
9 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 10 * or http://www.opensolaris.org/os/licensing.
11 11 * See the License for the specific language governing permissions
12 12 * and limitations under the License.
13 13 *
14 14 * When distributing Covered Code, include this CDDL HEADER in each
15 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 16 * If applicable, add the following below this CDDL HEADER, with the
↓ open down ↓ |
16 lines elided |
↑ open up ↑ |
17 17 * fields enclosed by brackets "[]" replaced with your own identifying
18 18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 19 *
20 20 * CDDL HEADER END
21 21 */
22 22 /*
23 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 -#pragma ident "%Z%%M% %I% %E% SMI"
28 -
29 27 #include <sys/param.h>
30 28 #include <sys/errno.h>
31 29 #include <sys/asm_linkage.h>
32 30 #include <sys/vtrace.h>
33 31 #include <sys/machthread.h>
34 32 #include <sys/machparam.h>
35 33
36 -#if defined(lint)
37 -#include <sys/types.h>
38 -#else /* lint */
39 34 #include "assym.h"
40 -#endif /* lint */
41 35
42 36 /*
43 37 * Prefetch considerations
44 38 *
45 39 * We prefetch one cacheline ahead. This may not be enough on Serengeti
46 40 * systems - see default_copyout() etc which prefetch 5 lines ahead.
47 41 * On the other hand, we expect most of the source buffers to be
48 42 * recently used enough to be cached.
49 43 *
50 44 * On US-I the prefetches are inoperative. On US-II they preload the E$;
51 45 * the mainloop unrolling and load-buffer should cover loads from E$.
52 46 * The stores appear to be the slow point on US-II.
53 47 *
54 48 * On US-IIICu the prefetch preloads the L2$ too, but there is no load
55 49 * buffer so the loads will stall for D$ miss, L2$ hit. The hardware
56 50 * auto-prefetch is not activated by integer loads. No solution
57 51 * in sight for this, barring odd games with FP read, write, integer read.
58 52 *
59 53 * US-IV (Panther) appears similar to US-IIICu, except that a strong
60 54 * variant of prefetch is available which can take TLB traps. We don't
61 55 * use this. The h/w prefetch stride can be set to 64, 128 or 192,
62 56 * and they only reach to the L2$ (we don't use these either).
63 57 * L2$ load-to-use latency is 15 cycles (best).
64 58 */
65 59
66 60
67 61 /*
↓ open down ↓ |
17 lines elided |
↑ open up ↑ |
68 62 * ip_ocsum(address, halfword_count, sum)
69 63 * Do a 16 bit one's complement sum of a given number of (16-bit)
70 64 * halfwords. The halfword pointer must not be odd.
71 65 * %o0 address; %o1 count; %o2 sum accumulator; %o3-%o5 temps
72 66 * %g1, %g4 and %g5 hold masks; %g2 and %g3 used in main loop
73 67 *
74 68 * (from @(#)ocsum.s 1.3 89/02/24 SMI)
75 69 *
76 70 */
77 71
78 -#if defined(lint)
79 -
80 -/* ARGSUSED */
81 -unsigned int
82 -ip_ocsum(u_short *address, int halfword_count, unsigned int sum)
83 -{ return (0); }
84 -
85 -#else /* lint */
86 -
87 72 ENTRY(ip_ocsum)
88 73
89 74 /*
90 75 * On ttcp transmits, called once per ocsum_copyin but with a small
91 76 * block ( >99.9% ). Could be the tx hdrs? How many acks/seg are we rxing?
92 77 * On ttcp receives, called more than once per ocsum_copyout. Rx hdrs
93 78 * and tx acks?
94 79 *
95 80 * To do: telnet and nfs traffic
96 81 *
97 82 * On an NCA'd webserver about 10% of the calls are >64 bytes
98 83 * about 10% of those start on a 64byte boundary
99 84 * about 30% are >5*64 bytes.
100 85 * The NCA numbers & proportions don't change with h/w cksum on.
101 86 *
102 87 * Tx hdrs are likely to be already in cache.
103 88 * Whether Rx hdrs are cached depends on whether they were already inspected.
104 89 */
105 90
106 91 !
107 92 ! Entry point for checksum-only; counts of 32+ hwords go to ip_ocsum_long.
108 93 ! %o0 contains buffer address
109 94 ! %o1 contains count of 16bit words
110 95 ! %o2 contains sum
111 96 !
112 97 ! %o3 temporary
113 98 ! %o4, %o5, %g2 temporaries
114 99 ! %g1 32bit mask
115 100 ! %g4 16bit mask
116 101 ! %g5 64bit mask (all 1s)
117 102 !
118 103 not %g0, %g5 ! all 1's
119 104 prefetch [%o0], #n_reads ! first hword, dword, cacheline
120 105
121 106 clruw %g5, %g1 ! 32 1's at low end
122 107 srl %g5, 16, %g4 ! 16 1's at low end
123 108
124 109 cmp %o1, 32 ! at least a cacheline (64 bytes)?
125 110 bge,pn %icc, ip_ocsum_long ! yes, do the whole works
126 111 andn %o0, 7, %o5 ! delay: src addr rounded down to a dword
127 112
128 113
129 114 cmp %o1, 4 ! < 4 halfwords?
130 115 bl,pn %icc, .tiny ! < 4 halfwords, just do them
131 116 inc 8, %o5 ! delay: next dword addr (not used by .tiny)
132 117
133 118 /* leading dword with 1-4 hwords: 9 clocks */
134 119 /* Assumes ok to read the entire dword with the leading hwords */
135 120
136 121 ldx [%o5-8], %o3 ! NB base addr (dword containing src)
137 122 sub %o5, %o0, %g2 ! byte count: 2/4/6/8
138 123 mov %o5, %o0 ! src = next dword addr
139 124
140 125 sll %g2, 2, %g2 ! 8/16/24/32 for mask (half the bit count)
141 126
142 127 sllx %g5, %g2, %o5 ! first of two shifts by %g2
143 128
144 129 sllx %o5, %g2, %o5 ! mask: 16/32/48/64 0's at low end
145 130
146 131 srl %g2, 3, %g2 ! hw count: 1/2/3/4
147 132 andn %o3, %o5, %o3 ! select hw's from src (keep the low end)
148 133
149 134 srlx %o3, 32, %o4 ! hi32
150 135 b 9f ! join the dword loop past its load
151 136 sub %o1, %g2, %o1 ! delay: decr count, 1-4 halfwords
152 137
153 138 .short_dw: ! max 7 iters of 4 clocks; 1 mispred of 4
154 139 ldx [%o0], %o3 ! tmp64 = *src++ (groups with the branch)
155 140
156 141 inc 8, %o0 ! (D-cache load-use delay)
157 142 dec 4, %o1 ! decrement count, 4 halfwords
158 143
159 144 srlx %o3, 32, %o4 ! hi32
160 145 9: and %o3, %g1, %o3 ! lo32
161 146
162 147 add %o4, %o2, %o2 ! accumulator
163 148 andncc %o1, 3, %g0 ! more than 3 hwords left?
164 149
165 150 bnz,pt %icc, .short_dw
166 151 add %o3, %o2, %o2 ! delay: accumulator
167 152
168 153 .short_hw: ! trailing dw: 0-3 hwords
169 154 tst %o1 ! 0 seems fairly common...
170 155 bz,a .short_fold
171 156 srlx %o2, 32, %o4 ! delay: hi32
172 157 ! mispredict 4 + 7 clocks for 1-3
173 158 ldx [%o0], %o3
174 159 sll %o1, 4, %o1 ! bitcount: 16/32/48
175 160
176 161 srlx %g5, %o1, %o5 ! mask: 16/32/48 0's at high end
177 162
178 163 andn %o3, %o5, %o3 ! select hw's from src (keep the high end)
179 164
180 165 srlx %o3, 32, %o4 ! hi32
181 166 and %o3, %g1, %o3 ! lo32
182 167
183 168 add %o4, %o2, %o2 ! accumulator
184 169
185 170 add %o3, %o2, %o2 ! accumulator
186 171
187 172 ! at this point the 64-bit accumulator
188 173 ! has the result that needs to be returned in 16-bits
189 174 srlx %o2, 32, %o4 ! hi32
190 175 .short_fold:
191 176 and %o2, %g1, %o2 ! lo32
192 177
193 178 add %o4, %o2, %o2 ! 33b
194 179
195 180 srlx %o2, 16, %o3 ! hi17
196 181 and %o2, %g4, %o2 ! lo16
197 182
198 183 add %o3, %o2, %o2 ! 18b
199 184
200 185 srlx %o2, 16, %o3 ! hi2
201 186 and %o2, %g4, %o2 ! lo16
202 187
203 188 retl ! return
204 189 add %o3, %o2, %o0 ! delay: 16b result in %o0
205 190
206 191 .tiny: ! almost never: less than 4 halfwords total.
207 192 tst %o1
208 193 bz,a .short_fold
209 194
210 195 srlx %o2, 32, %o4 ! delay: hi32
211 196
212 197 lduh [%o0], %o3 ! tmp16 = *src++
213 198 1:
214 199 inc 2, %o0
215 200 ! stall for D-cache
216 201
217 202 add %o3, %o2, %o2 ! accumulator
218 203
219 204 deccc %o1 ! decrement count
220 205 bnz,a,pt %icc, 1b
221 206 lduh [%o0], %o3 ! delay: tmp16 = *src++
222 207
223 208 ! at this point the 64-bit accumulator
224 209 ! has the result that needs to be returned in 16-bits
225 210 b .short_fold
226 211 srlx %o2, 32, %o4 ! delay: hi32
227 212
228 213 SET_SIZE(ip_ocsum) ! 64-bit version
229 214
230 215
231 216 ENTRY(ip_ocsum_long) ! 64-bit, large blocks (branched to from ip_ocsum)
232 217 save %sp, -SA(MINFRAME), %sp ! get another window
233 218 !
234 219 ! %i0 contains buffer address
235 220 ! %i1 contains count of 16bit words
236 221 ! %i2 contains sum
237 222 ! %i4 contains the mainloop count
238 223 ! %i5 comes in with the buffer address rounded down to the first dword
239 224 !
240 225 ! %g1 32bit mask (set up in ip_ocsum before the branch here)
241 226 ! %g4 16bit mask (likewise)
242 227 ! %g5 64bit mask (all 1s) (likewise)
243 228 ! %g6 fetch-ahead offset for Ecache (NOTE: no instruction below uses %g6)
244 229 !
245 230 ! %l0-7,%o0-5,%g2-3 mainloop temporaries
246 231 !
247 232 !
248 233 ! 1 clock overhead
249 234 btst 63, %i0 ! src 64-byte aligned?
250 235 bz,a,pt %icc, .mainsection ! aligned blocks are fairly common
251 236 andncc %i1, 31, %i4 ! delay (only if taken): at least 64 bytes for main loop?
252 237
253 238
254 239 ! Leading dword, with 1-4 hwords: 9 clocks
255 240 ! Assumes ok to read the entire dword with the leading bytes
256 241 ldx [%i5], %l0 ! NB base addr
257 242 inc 8, %i5 ! next addr
258 243
259 244 sub %i5, %i0, %l2 ! byte count: 2/4/6/8
260 245 mov %i5, %i0 ! src = next dword addr
261 246
262 247 sll %l2, 2, %l2 ! 8/16/24/32 for mask
263 248
264 249 sllx %g5, %l2, %l4 ! first of two shifts by %l2
265 250
266 251 sllx %l4, %l2, %l4 ! mask: 16, 32, 48, 64 0's at lsb
267 252
268 253 srl %l2, 3, %l2 ! 1/2/3/4 for count
269 254 andn %l0, %l4, %l0 ! select hw's from src
270 255
271 256 srlx %l0, 32, %o0 ! hi32
272 257 b 9f ! join the dword loop past its load
273 258 sub %i1, %l2, %i1 ! delay: decr count, 1-4 halfwords
274 259
275 260 ! Do dwords until source is 64-byte aligned, 0-7 iterations
276 261 ! 4 clocks per + 4 for 1 mispred = 16 clocks avg
277 262 .dw: ldx [%i0], %l0 ! tmp64 = *src++ (groups with the branch below)
278 263
279 264 inc 8, %i0 ! (Dcache load-use delay)
280 265 dec 4, %i1 ! decrement count, 4 halfwords
281 266
282 267 srlx %l0, 32, %o0 ! hi32
283 268 9: and %l0, %g1, %l0 ! lo32
284 269
285 270 add %o0, %i2, %i2 ! accumulator
286 271 btst 63, %i0 ! src 64-byte aligned?
287 272
288 273 bnz,pt %icc, .dw
289 274 add %l0, %i2, %i2 ! delay: accumulator
290 275
291 276
292 277 ! At this point source address is 64 byte aligned
293 278 ! and we've dealt with 1-31 halfwords.
294 279 andncc %i1, 31, %i4 ! at least 64 bytes for main loop?
295 280 .mainsection: ! total 18n + 21 clocks
296 281 bz,pn %icc, .postamble
297 282 and %i1, 31, %i1 ! delay: count for postamble
298 283
299 284 ! preload for main loop - 9 clocks assuming D$ hits at 1 per
300 285 ldx [%i0+0], %l0
301 286 ldx [%i0+8], %l1
302 287 ldx [%i0+16], %l2 ! %l0 could be used here if Dcache hit
303 288 ldx [%i0+24], %l3 ! but US-II prefetch only loads Ecache
304 289 ldx [%i0+32], %l4 ! check on US-III: could mix preloads & splits?
305 290 ldx [%i0+40], %l5
306 291 ldx [%i0+48], %l6
307 292 ldx [%i0+56], %l7
308 293 inc 64, %i0
309 294 prefetch [%i0], #n_reads
310 295
311 296 ! main loop. Read 64 bytes at a time - 18 clocks per iteration
312 297 5: ! plus 4 for the exit mispredict
313 298 srlx %l0, 32, %o0 ! hi32 to %o0
314 299 and %l0, %g1, %l0 ! lo32 to %l0
315 300
316 301 srlx %l1, 32, %o1 ! hi32 to %o1
317 302 and %l1, %g1, %l1 ! lo32 to %l1
318 303
319 304 srlx %l2, 32, %o2 ! hi32 to %o2
320 305 and %l2, %g1, %l2 ! lo32 to %l2
321 306
322 307 srlx %l3, 32, %o3 ! hi32 to %o3
323 308 and %l3, %g1, %l3 ! lo32 to %l3
324 309
325 310 srlx %l4, 32, %o4 ! hi32 to %o4
326 311 and %l4, %g1, %l4 ! lo32 to %l4
327 312
328 313 srlx %l5, 32, %o5 ! hi32 to %o5
329 314 and %l5, %g1, %l5 ! lo32 to %l5
330 315
331 316 srlx %l6, 32, %g2 ! hi32 to %g2
332 317 and %l6, %g1, %l6 ! lo32 to %l6
333 318
334 319 srlx %l7, 32, %g3 ! hi32 to %g3
335 320 and %l7, %g1, %l7 ! lo32 to %l7
336 321 ! splits gave 16 off 32b vals
337 322 deccc 32, %i4 ! mv early,avoid mispredicts? nohelp US-II.
338 323 bz,pn %icc, .looptidy ! count now zero?
339 324 add %l0, %o0, %o0 ! delay: first pair summed on both paths
340 325
341 326 ldx [%i0+0], %l0
342 327 add %l1, %o1, %o1 ! adds and loads
343 328 add %l2, %o2, %o2
344 329
345 330 ldx [%i0+8], %l1
346 331 add %l3, %o3, %o3
347 332 add %l4, %o4, %o4
348 333
349 334 ldx [%i0+16], %l2
350 335 add %l5, %o5, %o5
351 336 add %l6, %g2, %g2
352 337
353 338 ldx [%i0+24], %l3
354 339 add %l7, %g3, %g3 ! now 8 off 33b vals
355 340 add %o0, %o1, %o0
356 341
357 342 ldx [%i0+32], %l4
358 343 add %o2, %o3, %o1
359 344 add %o4, %o5, %o2
360 345
361 346 ldx [%i0+40], %l5
362 347 add %g2, %g3, %o3 ! now 4 off 34b vals
363 348 add %o0, %o1, %o0
364 349
365 350 ldx [%i0+48], %l6
366 351 add %o2, %o3, %o1 ! 2 off 35b
367 352
368 353 ldx [%i0+56], %l7
369 354 add %o0, %o1, %o0 ! 36b
370 355 inc 64, %i0 ! increment source address
371 356
372 357 add %o0, %i2, %i2 ! accumulator
373 358 ba 5b ! next iteration
374 359 prefetch [%i0], #n_reads ! delay: next cacheline
375 360 ! end of main loop
376 361 .looptidy: ! compute remaining partial sum - 8 clocks
377 362 add %l1, %o1, %o1
378 363 add %l2, %o2, %o2
379 364
380 365 add %l3, %o3, %o3
381 366 add %l4, %o4, %o4
382 367
383 368 add %l5, %o5, %o5
384 369 add %l6, %g2, %g2
385 370
386 371 add %l7, %g3, %g3 ! 8 x 33b
387 372 add %o0, %o1, %o0
388 373
389 374 add %o2, %o3, %o1
390 375 add %o4, %o5, %o2
391 376
392 377 add %g2, %g3, %o3 ! 4 x 34b
393 378 add %o0, %o1, %o0
394 379
395 380 add %o2, %o3, %o1 ! 2 x 35b
396 381 add %o0, %i2, %i2 ! accumulator
397 382
398 383 add %o1, %i2, %i2 ! accumulator
399 384
400 385
401 386 .postamble:
402 387 ! postamble hword count is in %i1 (can be zero)
403 388 ! while at least 1 dword, do dwords. Max 7 iterations.
404 389 andncc %i1, 3, %g0 ! more than 3 hwords?
405 390 .dotail_dw:
406 391 bz,a,pn %icc, .dotail_hw
407 392 tst %i1 ! delay: any at all left?
408 393 8:
409 394 ldx [%i0], %l0 ! tmp64 = *src++
410 395 inc 8, %i0
411 396 dec 4, %i1 ! decrement count, 4 halfwords
412 397
413 398 ! stall for D-cache
414 399
415 400 srlx %l0, 32, %o0 ! hi32
416 401 and %l0, %g1, %l0 ! lo32
417 402
418 403 add %o0, %i2, %i2 ! accumulator
419 404
420 405 andncc %i1, 3, %g0 ! more than 3 hwords?
421 406 bnz,pt %icc, 8b
422 407 add %l0, %i2, %i2 ! delay: accumulator
423 408
424 409 ! while at least 1 hword, do hwords. Max 3 iterations.
425 410 tst %i1
426 411 .dotail_hw:
427 412 bz,a .fold
428 413 srlx %i2, 32, %o0 ! delay: hi32
429 414 lduh [%i0], %l0 ! tmp16 = *src++
430 415 1:
431 416 inc 2, %i0
432 417 ! stall for D-cache
433 418
434 419 add %l0, %i2, %i2 ! accumulator
435 420
436 421 deccc %i1 ! decrement count
437 422 bnz,a,pt %icc, 1b
438 423 lduh [%i0], %l0 ! delay: tmp16 = *src++
439 424
440 425 ! at this point the 64-bit accumulator
441 426 ! has the result that needs to be returned in 16-bits
442 427 srlx %i2, 32, %o0 ! hi32
443 428 .fold:
444 429 and %i2, %g1, %o1 ! lo32
445 430
446 431 add %o0, %o1, %o0 ! 33b
447 432
448 433 srlx %o0, 16, %o1 ! hi17
449 434 and %o0, %g4, %o0 ! lo16
450 435
451 436 add %o1, %o0, %o0 ! 18b
452 437
453 438 srlx %o0, 16, %o1 ! hi2
↓ open down ↓ |
357 lines elided |
↑ open up ↑ |
454 439 and %o0, %g4, %o0 ! lo16
455 440
456 441 add %o1, %o0, %i0 ! 16b result in %i0 (caller's %o0 after restore)
457 442
458 443 ret ! return
459 444 restore
460 445
461 446
462 447 SET_SIZE(ip_ocsum_long) ! 64-bit version
463 448
464 -#endif /* lint */
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX