Print this page
3741 zfs needs better comments
Submitted by:   Will Andrews <willa@spectralogic.com>
Submitted by:   Justin Gibbs <justing@spectralogic.com>
Submitted by:   Alan Somers <alans@spectralogic.com>
Reviewed by:    Matthew Ahrens <mahrens@delphix.com>


 108  * --------------------
 109  *
 110  * There are two interesting components to checksum performance: cached and
 111  * uncached performance.  With cached data, fletcher-2 is about four times
 112  * faster than fletcher-4.  With uncached data, the performance difference is
 113  * negligible, since the cost of a cache fill dominates the processing time.
 114  * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
 115  * efficient pass over the data.
 116  *
 117  * In normal operation, the data which is being checksummed is in a buffer
 118  * which has been filled either by:
 119  *
 120  *      1. a compression step, which will be mostly cached, or
 121  *      2. a bcopy() or copyin(), which will be uncached (because the
 122  *         copy is cache-bypassing).
 123  *
 124  * For both cached and uncached data, both fletcher checksums are much faster
 125  * than sha-256, and slower than 'off', which doesn't touch the data at all.
 126  */
 127 











 128 #include <sys/types.h>
 129 #include <sys/sysmacros.h>
 130 #include <sys/byteorder.h>
 131 #include <sys/zio.h>
 132 #include <sys/spa.h>
 133 
 134 void
 135 fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
 136 {
 137         const uint64_t *ip = buf;
 138         const uint64_t *ipend = ip + (size / sizeof (uint64_t));
 139         uint64_t a0, b0, a1, b1;
 140 
 141         for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
 142                 a0 += ip[0];
 143                 a1 += ip[1];
 144                 b0 += a0;
 145                 b1 += a1;
 146         }
 147 




 108  * --------------------
 109  *
 110  * There are two interesting components to checksum performance: cached and
 111  * uncached performance.  With cached data, fletcher-2 is about four times
 112  * faster than fletcher-4.  With uncached data, the performance difference is
 113  * negligible, since the cost of a cache fill dominates the processing time.
 114  * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
 115  * efficient pass over the data.
 116  *
 117  * In normal operation, the data which is being checksummed is in a buffer
 118  * which has been filled either by:
 119  *
 120  *      1. a compression step, which will be mostly cached, or
 121  *      2. a bcopy() or copyin(), which will be uncached (because the
 122  *         copy is cache-bypassing).
 123  *
 124  * For both cached and uncached data, both fletcher checksums are much faster
 125  * than sha-256, and slower than 'off', which doesn't touch the data at all.
 126  */
 127 
 128 /*
 129  * TODO: vectorize these functions.
 130  * All of these functions are written so that each iteration of the loop
 131  * depends on the values computed in the previous iteration.  Also, in the
 132  * fletcher_4 functions, each statement of the loop body depends on the
 133  * previous statement.  These dependencies prevent the compiler from
 134  * vectorizing the code to take advantage of SIMD extensions (unless GCC is
 135  * far smarter than I think).  It would be easy to rewrite the loops to be
 136  * amenable to autovectorization.
 137  */
 138 
 139 #include <sys/types.h>
 140 #include <sys/sysmacros.h>
 141 #include <sys/byteorder.h>
 142 #include <sys/zio.h>
 143 #include <sys/spa.h>
 144 
 145 void
 146 fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
 147 {
 148         const uint64_t *ip = buf;
 149         const uint64_t *ipend = ip + (size / sizeof (uint64_t));
 150         uint64_t a0, b0, a1, b1;
 151 
 152         for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
 153                 a0 += ip[0];
 154                 a1 += ip[1];
 155                 b0 += a0;
 156                 b1 += a1;
 157         }
 158