108 * --------------------
109 *
110 * There are two interesting components to checksum performance: cached and
111 * uncached performance. With cached data, fletcher-2 is about four times
112 * faster than fletcher-4. With uncached data, the performance difference is
113 * negligible, since the cost of a cache fill dominates the processing time.
114 * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
115 * efficient pass over the data.
116 *
117 * In normal operation, the data which is being checksummed is in a buffer
118 * which has been filled either by:
119 *
120 * 1. a compression step, which will be mostly cached, or
121 * 2. a bcopy() or copyin(), which will be uncached (because the
122 * copy is cache-bypassing).
123 *
124 * For both cached and uncached data, both fletcher checksums are much faster
125 * than sha-256, and slower than 'off', which doesn't touch the data at all.
126 */
127
128 #include <sys/types.h>
129 #include <sys/sysmacros.h>
130 #include <sys/byteorder.h>
131 #include <sys/zio.h>
132 #include <sys/spa.h>
133
134 void
135 fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
136 {
137 const uint64_t *ip = buf;
138 const uint64_t *ipend = ip + (size / sizeof (uint64_t));
139 uint64_t a0, b0, a1, b1;
140
141 for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
142 a0 += ip[0];
143 a1 += ip[1];
144 b0 += a0;
145 b1 += a1;
146 }
147
|
108 * --------------------
109 *
110 * There are two interesting components to checksum performance: cached and
111 * uncached performance. With cached data, fletcher-2 is about four times
112 * faster than fletcher-4. With uncached data, the performance difference is
113 * negligible, since the cost of a cache fill dominates the processing time.
114 * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
115 * efficient pass over the data.
116 *
117 * In normal operation, the data which is being checksummed is in a buffer
118 * which has been filled either by:
119 *
120 * 1. a compression step, which will be mostly cached, or
121 * 2. a bcopy() or copyin(), which will be uncached (because the
122 * copy is cache-bypassing).
123 *
124 * For both cached and uncached data, both fletcher checksums are much faster
125 * than sha-256, and slower than 'off', which doesn't touch the data at all.
126 */
127
128 /*
129 * TODO: vectorize these functions
130 * All of these functions are written so that each iteration of the loop
131 * depends on the running sums produced by the previous iteration. In the
132 * fletcher_4 functions, each statement of the loop body also depends on the
133 * statement before it. These loop-carried dependencies prevent the compiler
134 * from vectorizing the code to take advantage of SIMD extensions (unless GCC
135 * is far smarter than I think). It would be easy to rewrite the loops to be
136 * amenable to autovectorization.
137 */
138
139 #include <sys/types.h>
140 #include <sys/sysmacros.h>
141 #include <sys/byteorder.h>
142 #include <sys/zio.h>
143 #include <sys/spa.h>
144
145 void
146 fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
147 {
148 const uint64_t *ip = buf;
149 const uint64_t *ipend = ip + (size / sizeof (uint64_t));
150 uint64_t a0, b0, a1, b1;
151
152 for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
153 a0 += ip[0];
154 a1 += ip[1];
155 b0 += a0;
156 b1 += a1;
157 }
158
|