illumos-3741 Wdiff usr/src/common/zfs/zfs_fletcher.c

Print this page

3741 zfs needs better comments
Submitted by:   Will Andrews <willa@spectralogic.com>
Submitted by:   Justin Gibbs <justing@spectralogic.com>
Submitted by:   Alan Somers <alans@spectralogic.com>
Reviewed by:    Matthew Ahrens <mahrens@delphix.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/common/zfs/zfs_fletcher.c
          +++ new/usr/src/common/zfs/zfs_fletcher.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
  26   26  /*
  27   27   * Fletcher Checksums
  28   28   * ------------------
  29   29   *
  30   30   * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
  31   31   * recurrence relations:
  32   32   *
  33   33   *      a  = a    + f
  34   34   *       i    i-1    i-1
  35   35   *
  36   36   *      b  = b    + a
  37   37   *       i    i-1    i
  38   38   *
  39   39   *      c  = c    + b           (fletcher-4 only)
  40   40   *       i    i-1    i
  41   41   *
  42   42   *      d  = d    + c           (fletcher-4 only)
  43   43   *       i    i-1    i
  44   44   *
  45   45   * Where
  46   46   *      a_0 = b_0 = c_0 = d_0 = 0
  47   47   * and
  48   48   *      f_0 .. f_(n-1) are the input data.
  49   49   *
  50   50   * Using standard techniques, these translate into the following series:
  51   51   *
  52   52   *           __n_                            __n_
  53   53   *           \   |                           \   |
  54   54   *      a  =  >     f                   b  =  >     i * f
  55   55   *       n   /___|   n - i               n   /___|       n - i
  56   56   *           i = 1                           i = 1
  57   57   *
  58   58   *
  59   59   *           __n_                            __n_
  60   60   *           \   |  i*(i+1)                  \   |  i*(i+1)*(i+2)
  61   61   *      c  =  >     ------- f           d  =  >     ------------- f
  62   62   *       n   /___|     2     n - i       n   /___|        6        n - i
  63   63   *           i = 1                           i = 1
  64   64   *
  65   65   * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
  66   66   * Since the additions are done mod (2^64), errors in the high bits may not
  67   67   * be noticed.  For this reason, fletcher-2 is deprecated.
  68   68   *
  69   69   * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
  70   70   * A conservative estimate of how big the buffer can get before we overflow
  71   71   * can be made using f_i = 0xffffffff for all i:
  72   72   *
  73   73   * % bc
  74   74   *  f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
  75   75   * 2264
  76   76   *  quit
  77   77   * %
  78   78   *
  79   79   * So blocks of up to 2k will not overflow.  Our largest block size is
  80   80   * 128k, which has 32k 4-byte words, so we can compute the largest possible
  81   81   * accumulators, then divide by 2^64 to figure the max amount of overflow:
  82   82   *
  83   83   * % bc
  84   84   *  a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
  85   85   *  a/2^64;b/2^64;c/2^64;d/2^64
  86   86   * 0
  87   87   * 0
  88   88   * 1365
  89   89   * 11186858
  90   90   *  quit
  91   91   * %
  92   92   *
  93   93   * So a and b cannot overflow.  To make sure each bit of input has some
  94   94   * effect on the contents of c and d, we can look at what the factors of
  95   95   * the coefficients in the equations for c_n and d_n are.  The number of 2s
  96   96   * in the factors determines the lowest set bit in the multiplier.  Running
  97   97   * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
  98   98   * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15.  So while some data may overflow
  99   99   * the 64-bit accumulators, every bit of every f_i affects every accumulator,
 100  100   * even for 128k blocks.
 101  101   *
 102  102   * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
 103  103   * we could do our calculations mod (2^32 - 1) by adding in the carries
 104  104   * periodically, and store the number of carries in the top 32-bits.
 105  105   *
 106  106   * --------------------
 107  107   * Checksum Performance
 108  108   * --------------------
 109  109   *
 110  110   * There are two interesting components to checksum performance: cached and
 111  111   * uncached performance.  With cached data, fletcher-2 is about four times
 112  112   * faster than fletcher-4.  With uncached data, the performance difference is
 113  113   * negligible, since the cost of a cache fill dominates the processing time.
 114  114   * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
 115  115   * efficient pass over the data.
 116  116   *
 117  117   * In normal operation, the data which is being checksummed is in a buffer

↓ open down ↓

117 lines elided

↑ open up ↑

 118  118   * which has been filled either by:
 119  119   *
 120  120   *      1. a compression step, which will be mostly cached, or
 121  121   *      2. a bcopy() or copyin(), which will be uncached (because the
 122  122   *         copy is cache-bypassing).
 123  123   *
 124  124   * For both cached and uncached data, both fletcher checksums are much faster
 125  125   * than sha-256, and slower than 'off', which doesn't touch the data at all.
 126  126   */
 127  127  
      128 +/*
      129 + * TODO: vectorize these functions
      130 + * All of these functions are written so that each iteration of the loop
      131 + * depends on the value of the previous iteration.  Also, in the fletcher_4
      132 + * functions, each statement of the loop body depends on the previous
      133 + * statement.  These dependencies prevent the compiler from vectorizing the
      134 + * code to take advantage of SIMD extensions (unless GCC is far smarter than I
      135 + * think).  It would be easy to rewrite the loops to be amenable to
      136 + * autovectorization.
      137 + */
      138 +
 128  139  #include <sys/types.h>
 129  140  #include <sys/sysmacros.h>
 130  141  #include <sys/byteorder.h>
 131  142  #include <sys/zio.h>
 132  143  #include <sys/spa.h>
 133  144  
 134  145  void
 135  146  fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
 136  147  {
                   /*
                    * Fletcher-2 checksum of buf, reading the data as-is (no byte
                    * swapping).  buf is treated as size / sizeof (uint64_t) 64-bit
                    * words; any trailing bytes beyond a whole word are ignored.
                    * Even-indexed words feed the (a0, b0) accumulator pair and
                    * odd-indexed words feed (a1, b1).  All sums are uint64_t, so
                    * they wrap mod 2^64.
                    *
                    * NOTE(review): each pass consumes two words and reads ip[1]
                    * unconditionally, so an odd word count would read one word
                    * past ipend.  Presumably callers always pass a size that is a
                    * multiple of 2 * sizeof (uint64_t) -- confirm.
                    */
 137  148          const uint64_t *ip = buf;

 138  149          const uint64_t *ipend = ip + (size / sizeof (uint64_t));
 139  150          uint64_t a0, b0, a1, b1;
 140  151  
 141  152          for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
 142  153                  a0 += ip[0];
 143  154                  a1 += ip[1];
 144  155                  b0 += a0;
 145  156                  b1 += a1;
 146  157          }
 147  158  
                   /*
                    * Note the argument order: both first-order sums, then both
                    * second-order sums.  ZIO_SET_CHECKSUM is defined outside this
                    * file; presumably it stores these into zcp's four words.
                    */
 148  159          ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
 149  160  }
 150  161  
 151  162  void
 152  163  fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
 153  164  {
                   /*
                    * Same computation as fletcher_2_native(), except that every
                    * 64-bit word is passed through BSWAP_64() before it is added.
                    * BSWAP_64 is defined outside this file; presumably it reverses
                    * the byte order so that data written with the opposite
                    * endianness yields the same checksum -- confirm.
                    *
                    * NOTE(review): as in the native variant, ip[1] is read on every
                    * pass, so the word count is assumed to be even.
                    */
 154  165          const uint64_t *ip = buf;
 155  166          const uint64_t *ipend = ip + (size / sizeof (uint64_t));
 156  167          uint64_t a0, b0, a1, b1;
 157  168  
 158  169          for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
 159  170                  a0 += BSWAP_64(ip[0]);
 160  171                  a1 += BSWAP_64(ip[1]);
 161  172                  b0 += a0;
 162  173                  b1 += a1;
 163  174          }
 164  175  
                   /* Same a0, a1, b0, b1 ordering as fletcher_2_native(). */
 165  176          ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
 166  177  }
 167  178  
 168  179  void
 169  180  fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
 170  181  {
                   /*
                    * Fletcher-4 checksum of buf, reading the data as-is (no byte
                    * swapping).  buf is treated as size / sizeof (uint32_t) 32-bit
                    * words; any trailing bytes beyond a whole word are ignored.
                    * Four 64-bit accumulators start at zero and are chained: a sums
                    * the input words, b sums the running a, c sums the running b,
                    * and d sums the running c.  The sums wrap mod 2^64.
                    */
 171  182          const uint32_t *ip = buf;
 172  183          const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 173  184          uint64_t a, b, c, d;
 174  185  
 175  186          for (a = b = c = d = 0; ip < ipend; ip++) {
                           /* Order matters: each sum uses the one just updated. */
 176  187                  a += ip[0];
 177  188                  b += a;
 178  189                  c += b;
 179  190                  d += c;
 180  191          }
 181  192  
 182  193          ZIO_SET_CHECKSUM(zcp, a, b, c, d);
 183  194  }
 184  195  
 185  196  void
 186  197  fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
 187  198  {
                   /*
                    * Same computation as fletcher_4_native(), except that every
                    * 32-bit word is passed through BSWAP_32() before it is added.
                    * BSWAP_32 is defined outside this file; presumably it reverses
                    * the byte order so that data written with the opposite
                    * endianness yields the same checksum -- confirm.
                    */
 188  199          const uint32_t *ip = buf;
 189  200          const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 190  201          uint64_t a, b, c, d;
 191  202  
 192  203          for (a = b = c = d = 0; ip < ipend; ip++) {
                           /* Order matters: each sum uses the one just updated. */
 193  204                  a += BSWAP_32(ip[0]);
 194  205                  b += a;
 195  206                  c += b;
 196  207                  d += c;
 197  208          }
 198  209  
 199  210          ZIO_SET_CHECKSUM(zcp, a, b, c, d);
 200  211  }
 201  212  
 202  213  void
 203  214  fletcher_4_incremental_native(const void *buf, uint64_t size,
 204  215      zio_cksum_t *zcp)
 205  216  {
                   /*
                    * Continue a fletcher-4 checksum across another buffer.  Unlike
                    * fletcher_4_native(), the accumulators are seeded from
                    * zcp->zc_word[0..3] rather than zero, so *zcp is both an input
                    * and an output and must be initialized by the caller before
                    * the first call (presumably to all zeros for a fresh checksum
                    * -- confirm against callers).
                    *
                    * buf is treated as size / sizeof (uint32_t) 32-bit words, read
                    * as-is; trailing bytes beyond a whole word are ignored.
                    */
 206  217          const uint32_t *ip = buf;
 207  218          const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 208  219          uint64_t a, b, c, d;
 209  220  
                   /* Resume from the running checksum passed in by the caller. */
 210  221          a = zcp->zc_word[0];
 211  222          b = zcp->zc_word[1];
 212  223          c = zcp->zc_word[2];
 213  224          d = zcp->zc_word[3];
 214  225  
 215  226          for (; ip < ipend; ip++) {
 216  227                  a += ip[0];
 217  228                  b += a;
 218  229                  c += b;
 219  230                  d += c;
 220  231          }
 221  232  
                   /* Hand the updated running checksum back through zcp. */
 222  233          ZIO_SET_CHECKSUM(zcp, a, b, c, d);
 223  234  }
 224  235  
 225  236  void
 226  237  fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
 227  238      zio_cksum_t *zcp)
 228  239  {
                   /*
                    * Byte-swapping counterpart of fletcher_4_incremental_native():
                    * the accumulators are seeded from zcp->zc_word[0..3], every
                    * 32-bit input word is passed through BSWAP_32() before it is
                    * added, and the result is handed back through zcp.  Only the
                    * input words are swapped; the seed values read from zcp are
                    * used unchanged.  *zcp must be initialized by the caller
                    * before the first call.
                    */
 229  240          const uint32_t *ip = buf;
 230  241          const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 231  242          uint64_t a, b, c, d;
 232  243  
                   /* Resume from the running checksum passed in by the caller. */
 233  244          a = zcp->zc_word[0];
 234  245          b = zcp->zc_word[1];
 235  246          c = zcp->zc_word[2];
 236  247          d = zcp->zc_word[3];
 237  248  
 238  249          for (; ip < ipend; ip++) {
 239  250                  a += BSWAP_32(ip[0]);
 240  251                  b += a;
 241  252                  c += b;
 242  253                  d += c;
 243  254          }
 244  255  
                   /* Hand the updated running checksum back through zcp. */
 245  256          ZIO_SET_CHECKSUM(zcp, a, b, c, d);
 246  257  }

↓ open down ↓

109 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX