7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #pragma ident "%Z%%M% %I% %E% SMI"
28
29 #include <sys/param.h>
30 #include <sys/errno.h>
31 #include <sys/asm_linkage.h>
32 #include <sys/vtrace.h>
33 #include <sys/machthread.h>
34 #include <sys/machparam.h>
35
36 #if defined(lint)
37 #include <sys/types.h>
38 #else /* lint */
39 #include "assym.h"
40 #endif /* lint */
41
42 /*
43 * Prefetch considerations
44 *
45 * We prefetch one cacheline ahead. This may not be enough on Serengeti
46 * systems - see default_copyout() etc which prefetch 5 lines ahead.
47 * On the other hand, we expect most of the source buffers to be
48 * recently used enough to be cached.
49 *
50 * On US-I the prefetches are inoperative. On US-II they preload the E$;
51 * the mainloop unrolling and load-buffer should cover loads from E$.
52 * The stores appear to be the slow point on US-II.
53 *
54 * On US-IIICu the prefetch preloads the L2$ too, but there is no load
55 * buffer so the loads will stall for D$ miss, L2$ hit. The hardware
56 * auto-prefetch is not activated by integer loads. No solution
57 * in sight for this, barring odd games with FP read, write, integer read.
58 *
59 * US-IV (Panther) appears similar to US-IIICu, except that a strong
60 * variant of prefetch is available which can take TLB traps. We don't
61 * use this. The h/w prefetch stride can be set to 64, 128 or 192,
62 * and they only reach to the L2$ (we don't use these either).
63 * L2$ load-to-use latency is 15 cycles (best).
64 */
65
66
67 /*
68 * ip_ocsum(address, halfword_count, sum)
69 * Do a 16 bit one's complement sum of a given number of (16-bit)
70 * halfwords. The halfword pointer must not be odd.
71 * %o0 address; %o1 count; %o2 sum accumulator; %o4 temp
72 * %g2 and %g3 used in main loop
73 *
74 * (from @(#)ocsum.s 1.3 89/02/24 SMI)
75 *
76 */
77
78 #if defined(lint)
79
80 /* ARGSUSED */
81 unsigned int
82 ip_ocsum(u_short *address, int halfword_count, unsigned int sum)
83 { return (0); }	/* lint-only placeholder; the real routine is the assembly in the #else branch */
84
85 #else /* lint */
86
87 ENTRY(ip_ocsum)
88
89 /*
90 * On ttcp transmits, called once per ocsum_copyin but with a small
91 * block ( >99.9% ). Could be the tx hdrs? How many acks/seg are we rxing?
92 * On ttcp receives, called more than once per ocsum_copyout. Rx hdrs
93 * and tx acks?
94 *
95 * To do: telnet and nfs traffic
96 *
97 * On an NCA'd webserver about 10% of the calls are >64 bytes
98 * about 10% of those start on a 64byte boundary
99 * about 30% are >5*64 bytes.
100 * The NCA numbers & proportions don't change with h/w cksum on.
101 *
102 * Tx hdrs are likely to be already in cache.
103 * For Rx hdrs it depends on whether they have already been inspected.
104 */
105
106 !
444 and %i2, %g1, %o1 ! lo32
445
446 add %o0, %o1, %o0 ! 33b
447
448 srlx %o0, 16, %o1 ! hi17
449 and %o0, %g4, %o0 ! lo16
450
451 add %o1, %o0, %o0 ! 18b
452
453 srlx %o0, 16, %o1 ! hi2
454 and %o0, %g4, %o0 ! lo16
455
456 add %o1, %o0, %i0 ! 16b result in %i0
457
458 ret ! return
459 restore
460
461
462 SET_SIZE(ip_ocsum_long) ! 64-bit version
463
464 #endif /* lint */
|
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #include <sys/param.h>
28 #include <sys/errno.h>
29 #include <sys/asm_linkage.h>
30 #include <sys/vtrace.h>
31 #include <sys/machthread.h>
32 #include <sys/machparam.h>
33
34 #include "assym.h"
35
36 /*
37 * Prefetch considerations
38 *
39 * We prefetch one cacheline ahead. This may not be enough on Serengeti
40 * systems - see default_copyout() etc which prefetch 5 lines ahead.
41 * On the other hand, we expect most of the source buffers to be
42 * recently used enough to be cached.
43 *
44 * On US-I the prefetches are inoperative. On US-II they preload the E$;
45 * the mainloop unrolling and load-buffer should cover loads from E$.
46 * The stores appear to be the slow point on US-II.
47 *
48 * On US-IIICu the prefetch preloads the L2$ too, but there is no load
49 * buffer so the loads will stall for D$ miss, L2$ hit. The hardware
50 * auto-prefetch is not activated by integer loads. No solution
51 * in sight for this, barring odd games with FP read, write, integer read.
52 *
53 * US-IV (Panther) appears similar to US-IIICu, except that a strong
54 * variant of prefetch is available which can take TLB traps. We don't
55 * use this. The h/w prefetch stride can be set to 64, 128 or 192,
56 * and they only reach to the L2$ (we don't use these either).
57 * L2$ load-to-use latency is 15 cycles (best).
58 */
59
60
61 /*
62 * ip_ocsum(address, halfword_count, sum)
63 * Do a 16 bit one's complement sum of a given number of (16-bit)
64 * halfwords. The halfword pointer must not be odd.
65 * %o0 address; %o1 count; %o2 sum accumulator; %o4 temp
66 * %g2 and %g3 used in main loop
67 *
68 * (from @(#)ocsum.s 1.3 89/02/24 SMI)
69 *
70 */
71
72 ENTRY(ip_ocsum)
73
74 /*
75 * On ttcp transmits, called once per ocsum_copyin but with a small
76 * block ( >99.9% ). Could be the tx hdrs? How many acks/seg are we rxing?
77 * On ttcp receives, called more than once per ocsum_copyout. Rx hdrs
78 * and tx acks?
79 *
80 * To do: telnet and nfs traffic
81 *
82 * On an NCA'd webserver about 10% of the calls are >64 bytes
83 * about 10% of those start on a 64byte boundary
84 * about 30% are >5*64 bytes.
85 * The NCA numbers & proportions don't change with h/w cksum on.
86 *
87 * Tx hdrs are likely to be already in cache.
88 * For Rx hdrs it depends on whether they have already been inspected.
89 */
90
91 !
429 and %i2, %g1, %o1 ! lo32
430
431 add %o0, %o1, %o0 ! 33b
432
433 srlx %o0, 16, %o1 ! hi17
434 and %o0, %g4, %o0 ! lo16
435
436 add %o1, %o0, %o0 ! 18b
437
438 srlx %o0, 16, %o1 ! hi2
439 and %o0, %g4, %o0 ! lo16
440
441 add %o1, %o0, %i0 ! 16b result in %i0
442
443 ret ! return
444 restore
445
446
447 SET_SIZE(ip_ocsum_long) ! 64-bit version
448
|