Print this page
3742 zfs comments need cleaner, more consistent style
Submitted by: Will Andrews <willa@spectralogic.com>
Submitted by: Alan Somers <alans@spectralogic.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Eric Schrock <eric.schrock@delphix.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ new/usr/src/uts/common/fs/zfs/vdev_raidz.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 25 */
26 26
27 27 #include <sys/zfs_context.h>
28 28 #include <sys/spa.h>
29 29 #include <sys/vdev_impl.h>
30 30 #include <sys/zio.h>
31 31 #include <sys/zio_checksum.h>
32 32 #include <sys/fs/zfs.h>
33 33 #include <sys/fm/fs/zfs.h>
34 34
35 35 /*
36 36 * Virtual device vector for RAID-Z.
37 37 *
38 38 * This vdev supports single, double, and triple parity. For single parity,
39 39 * we use a simple XOR of all the data columns. For double or triple parity,
40 40 * we use a special case of Reed-Solomon coding. This extends the
41 41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 44 * former is also based. The latter is designed to provide higher performance
45 45 * for writes.
46 46 *
47 47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 48 * amended six years later identifying a critical flaw that invalidates its
49 49 * claims. Nevertheless, the technique can be adapted to work for up to
50 50 * triple parity. For additional parity, the amendment "Note: Correction to
51 51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 52 * is viable, but the additional complexity means that write performance will
↓ open down ↓ |
52 lines elided |
↑ open up ↑ |
53 53 * suffer.
54 54 *
55 55 * All of the methods above operate on a Galois field, defined over the
56 56 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
57 57 * can be expressed with a single byte. Briefly, the operations on the
58 58 * field are defined as follows:
59 59 *
60 60 * o addition (+) is represented by a bitwise XOR
61 61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 62 * o multiplication of A by 2 is defined by the following bitwise expression:
63 + *
63 64 * (A * 2)_7 = A_6
64 65 * (A * 2)_6 = A_5
65 66 * (A * 2)_5 = A_4
66 67 * (A * 2)_4 = A_3 + A_7
67 68 * (A * 2)_3 = A_2 + A_7
68 69 * (A * 2)_2 = A_1 + A_7
69 70 * (A * 2)_1 = A_0
70 71 * (A * 2)_0 = A_7
71 72 *
72 73 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73 74 * As an aside, this multiplication is derived from the error correcting
74 75 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
75 76 *
76 77 * Observe that any number in the field (except for 0) can be expressed as a
77 78 * power of 2 -- a generator for the field. We store a table of the powers of
78 79 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79 80 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80 81 * than field addition). The inverse of a field element A (A^-1) is therefore
81 82 * A ^ (255 - 1) = A^254.
82 83 *
83 84 * The up-to-three parity columns, P, Q, R over several data columns,
84 85 * D_0, ... D_n-1, can be expressed by field operations:
85 86 *
86 87 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
87 88 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88 89 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89 90 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90 91 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
91 92 *
92 93 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
93 94 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94 95 * independent coefficients. (There are no additional coefficients that have
95 96 * this property which is why the uncorrected Plank method breaks down.)
96 97 *
97 98 * See the reconstruction code below for how P, Q and R can used individually
98 99 * or in concert to recover missing data columns.
99 100 */
100 101
/*
 * One column of a RAID-Z I/O: the portion of the logical block that lands
 * on a single child vdev (either a parity column or a data column).
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
↓ open down ↓ |
38 lines elided |
↑ open up ↑ |
111 112
/*
 * The geometry of a RAID-Z I/O: how the logical block maps onto per-child
 * columns, plus the bookkeeping needed for checksum-ereport handling.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
128 129
/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single byte (a GF(2^8) element) by 2 or 4, reducing by the
 * field's primitive polynomial (0x11d) whenever the high bit is set.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
135 136
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * The macros are wrapped in do { } while (0) so that each expands to a
 * single statement and remains safe in unbraced if/else bodies; the 0x1d
 * replication carries an explicit ULL suffix so the constant is 64 bits
 * wide on every platform.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
	do { \
		(mask) = (x) & 0x8080808080808080ULL; \
		(mask) = ((mask) << 1) - ((mask) >> 7); \
		(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
		    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
	} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
	do { \
		VDEV_RAIDZ_64MUL_2((x), mask); \
		VDEV_RAIDZ_64MUL_2((x), mask); \
	} while (0)
155 156
/*
 * When nonzero, force reconstruction to use the general purpose method
 * rather than the specialized P/Q/R code paths.
 */
int vdev_raidz_default_to_general;
160 161
/*
 * Powers of 2 in the Galois field defined above.  These values were
 * computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs of 2 in the Galois field defined above.  This is the inverse of
 * vdev_raidz_pow2[] for nonzero elements (log of 0 is undefined).
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
233 232
234 233 static void vdev_raidz_generate_parity(raidz_map_t *rm);
235 234
236 235 /*
237 236 * Multiply a given number by 2 raised to the given power.
238 237 */
239 238 static uint8_t
240 239 vdev_raidz_exp2(uint_t a, int exp)
241 240 {
242 241 if (a == 0)
243 242 return (0);
244 243
245 244 ASSERT(exp >= 0);
246 245 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
247 246
248 247 exp += vdev_raidz_log2[a];
249 248 if (exp > 255)
250 249 exp -= 255;
251 250
252 251 return (vdev_raidz_pow2[exp]);
253 252 }
254 253
255 254 static void
256 255 vdev_raidz_map_free(raidz_map_t *rm)
257 256 {
258 257 int c;
259 258 size_t size;
260 259
261 260 for (c = 0; c < rm->rm_firstdatacol; c++) {
262 261 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
263 262
264 263 if (rm->rm_col[c].rc_gdata != NULL)
265 264 zio_buf_free(rm->rm_col[c].rc_gdata,
266 265 rm->rm_col[c].rc_size);
267 266 }
268 267
269 268 size = 0;
270 269 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
271 270 size += rm->rm_col[c].rc_size;
272 271
273 272 if (rm->rm_datacopy != NULL)
274 273 zio_buf_free(rm->rm_datacopy, size);
275 274
276 275 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
277 276 }
278 277
279 278 static void
280 279 vdev_raidz_map_free_vsd(zio_t *zio)
281 280 {
282 281 raidz_map_t *rm = zio->io_vsd;
283 282
284 283 ASSERT0(rm->rm_freed);
285 284 rm->rm_freed = 1;
286 285
287 286 if (rm->rm_reports == 0)
288 287 vdev_raidz_map_free(rm);
289 288 }
290 289
291 290 /*ARGSUSED*/
292 291 static void
293 292 vdev_raidz_cksum_free(void *arg, size_t ignored)
294 293 {
295 294 raidz_map_t *rm = arg;
296 295
297 296 ASSERT3U(rm->rm_reports, >, 0);
298 297
299 298 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
300 299 vdev_raidz_map_free(rm);
301 300 }
302 301
/*
 * Finish a pending checksum ereport for column c: compare the data we
 * actually read (saved in rc_data/rm_datacopy) against the known-good
 * data for that column, and emit or drop the ereport accordingly.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;	/* column under report */
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		/* No good data available: finish without a comparison. */
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/* repoint the data columns at the saved copy */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}
375 374
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;	/* column being reported */
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	/* Only the first report for this map needs to copy the data. */
	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	/* Copy each data column aside and repoint it at the copy. */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}
428 427
/* zio vsd callbacks used for every RAID-Z I/O's raidz_map_t. */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
433 432
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size */
	uint64_t s = zio->io_size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/* Lay out each column's device index, offset, and size. */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrapped around: next row on the child vdevs. */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get freshly allocated buffers... */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	/* ...while data columns point into the zio's own buffer. */
	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	return (rm);
}
582 581
583 582 static void
584 583 vdev_raidz_generate_parity_p(raidz_map_t *rm)
585 584 {
586 585 uint64_t *p, *src, pcount, ccount, i;
587 586 int c;
588 587
589 588 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
590 589
591 590 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
592 591 src = rm->rm_col[c].rc_data;
593 592 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
594 593 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
595 594
596 595 if (c == rm->rm_firstdatacol) {
597 596 ASSERT(ccount == pcount);
598 597 for (i = 0; i < ccount; i++, src++, p++) {
599 598 *p = *src;
600 599 }
601 600 } else {
602 601 ASSERT(ccount <= pcount);
603 602 for (i = 0; i < ccount; i++, src++, p++) {
604 603 *p ^= *src;
605 604 }
606 605 }
607 606 }
608 607 }
609 608
/*
 * Compute the P (XOR) and Q (Reed-Solomon) parity columns from the data
 * columns, treating short columns as though they were zero-padded.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* The first column initializes P and Q directly. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
661 660
/*
 * Compute the P, Q, and R parity columns from the data columns, treating
 * short columns as though they were zero-padded.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* The first column initializes P, Q, and R directly. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
722 721
723 722 /*
724 723 * Generate RAID parity in the first virtual columns according to the number of
725 724 * parity columns available.
726 725 */
727 726 static void
728 727 vdev_raidz_generate_parity(raidz_map_t *rm)
729 728 {
730 729 switch (rm->rm_firstdatacol) {
731 730 case 1:
732 731 vdev_raidz_generate_parity_p(rm);
733 732 break;
734 733 case 2:
735 734 vdev_raidz_generate_parity_pq(rm);
736 735 break;
737 736 case 3:
738 737 vdev_raidz_generate_parity_pqr(rm);
739 738 break;
740 739 default:
741 740 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
742 741 }
743 742 }
744 743
745 744 static int
746 745 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
747 746 {
748 747 uint64_t *dst, *src, xcount, ccount, count, i;
749 748 int x = tgts[0];
750 749 int c;
751 750
752 751 ASSERT(ntgts == 1);
753 752 ASSERT(x >= rm->rm_firstdatacol);
754 753 ASSERT(x < rm->rm_cols);
755 754
756 755 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
757 756 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
758 757 ASSERT(xcount > 0);
759 758
760 759 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
761 760 dst = rm->rm_col[x].rc_data;
762 761 for (i = 0; i < xcount; i++, dst++, src++) {
763 762 *dst = *src;
764 763 }
765 764
766 765 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
767 766 src = rm->rm_col[c].rc_data;
768 767 dst = rm->rm_col[x].rc_data;
769 768
770 769 if (c == x)
771 770 continue;
772 771
773 772 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
774 773 count = MIN(ccount, xcount);
775 774
776 775 for (i = 0; i < count; i++, dst++, src++) {
777 776 *dst ^= *src;
778 777 }
779 778 }
780 779
781 780 return (1 << VDEV_RAIDZ_P);
782 781 }
783 782
/*
 * Reconstruct a single missing data column (tgts[0]) using the Q
 * (Reed-Solomon) parity column.  Returns the bitmask of parity columns
 * used.
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	/*
	 * Accumulate Q as if the missing column (and any short tail) were
	 * zeros, building the result directly in the target column.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/*
	 * XOR in the stored Q, then divide out the missing column's
	 * coefficient (2^(ndevs - 1 - x)) one Galois-field byte at a time.
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
841 840
/*
 * Reconstruct two missing data columns, x and y (x < y), using both the
 * P and Q parity columns.  Returns the bitmask of parity columns used:
 * (1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q).
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* GF(2^8) coefficients A and B from the derivation above. */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	/* tmp is the log of the multiplicative inverse of (a ^ 1). */
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		/* Column y may be shorter than column x. */
		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
932 931
933 932 /* BEGIN CSTYLED */
934 933 /*
935 934 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
937 936 * the contents of the data and parity disks. This can be expressed with
938 937 * vectors for the original data (D) and the actual data (d) and parity (p)
939 938 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
940 939 *
941 940 * __ __ __ __
942 941 * | | __ __ | p_0 |
943 942 * | V | | D_0 | | p_m-1 |
944 943 * | | x | : | = | d_0 |
945 944 * | I | | D_n-1 | | : |
946 945 * | | ~~ ~~ | d_n-1 |
947 946 * ~~ ~~ ~~ ~~
948 947 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
951 950 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
952 951 * computation as well as linear separability.
953 952 *
954 953 * __ __ __ __
955 954 * | 1 .. 1 1 1 | | p_0 |
956 955 * | 2^n-1 .. 4 2 1 | __ __ | : |
957 956 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
958 957 * | 1 .. 0 0 0 | | D_1 | | d_0 |
959 958 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
960 959 * | : : : : | | : | | d_2 |
961 960 * | 0 .. 1 0 0 | | D_n-1 | | : |
962 961 * | 0 .. 0 1 0 | ~~ ~~ | : |
963 962 * | 0 .. 0 0 1 | | d_n-1 |
964 963 * ~~ ~~ ~~ ~~
965 964 *
966 965 * Note that I, V, d, and p are known. To compute D, we must invert the
967 966 * matrix and use the known data and parity values to reconstruct the unknown
968 967 * data values. We begin by removing the rows in V|I and d|p that correspond
969 968 * to failed or missing columns; we then make V|I square (n x n) and d|p
970 969 * sized n by removing rows corresponding to unused parity from the bottom up
971 970 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
972 971 * using Gauss-Jordan elimination. In the example below we use m=3 parity
973 972 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
974 973 * __ __
975 974 * | 1 1 1 1 1 1 1 1 |
976 975 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
977 976 * | 19 205 116 29 64 16 4 1 | / /
978 977 * | 1 0 0 0 0 0 0 0 | / /
979 978 * | 0 1 0 0 0 0 0 0 | <--' /
980 979 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
981 980 * | 0 0 0 1 0 0 0 0 |
982 981 * | 0 0 0 0 1 0 0 0 |
983 982 * | 0 0 0 0 0 1 0 0 |
984 983 * | 0 0 0 0 0 0 1 0 |
985 984 * | 0 0 0 0 0 0 0 1 |
986 985 * ~~ ~~
987 986 * __ __
988 987 * | 1 1 1 1 1 1 1 1 |
989 988 * | 128 64 32 16 8 4 2 1 |
990 989 * | 19 205 116 29 64 16 4 1 |
991 990 * | 1 0 0 0 0 0 0 0 |
992 991 * | 0 1 0 0 0 0 0 0 |
993 992 * (V|I)' = | 0 0 1 0 0 0 0 0 |
994 993 * | 0 0 0 1 0 0 0 0 |
995 994 * | 0 0 0 0 1 0 0 0 |
996 995 * | 0 0 0 0 0 1 0 0 |
997 996 * | 0 0 0 0 0 0 1 0 |
998 997 * | 0 0 0 0 0 0 0 1 |
999 998 * ~~ ~~
1000 999 *
1001 1000 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1002 1001 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1003 1002 * matrix is not singular.
1004 1003 * __ __
1005 1004 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1006 1005 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1007 1006 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1008 1007 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1009 1008 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1010 1009 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1011 1010 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1012 1011 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1013 1012 * ~~ ~~
1014 1013 * __ __
1015 1014 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1016 1015 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1017 1016 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1018 1017 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1019 1018 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1020 1019 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1021 1020 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1022 1021 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1023 1022 * ~~ ~~
1024 1023 * __ __
1025 1024 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1026 1025 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1027 1026 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1028 1027 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1029 1028 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1030 1029 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1031 1030 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1032 1031 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1033 1032 * ~~ ~~
1034 1033 * __ __
1035 1034 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1036 1035 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1037 1036 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1038 1037 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1039 1038 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1040 1039 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1041 1040 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1042 1041 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1043 1042 * ~~ ~~
1044 1043 * __ __
1045 1044 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1046 1045 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1047 1046 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1048 1047 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1049 1048 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1050 1049 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1051 1050 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1052 1051 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1053 1052 * ~~ ~~
1054 1053 * __ __
1055 1054 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1056 1055 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1057 1056 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1058 1057 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1059 1058 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1060 1059 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1061 1060 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1062 1061 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1063 1062 * ~~ ~~
1064 1063 * __ __
1065 1064 * | 0 0 1 0 0 0 0 0 |
1066 1065 * | 167 100 5 41 159 169 217 208 |
1067 1066 * | 166 100 4 40 158 168 216 209 |
1068 1067 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1069 1068 * | 0 0 0 0 1 0 0 0 |
1070 1069 * | 0 0 0 0 0 1 0 0 |
1071 1070 * | 0 0 0 0 0 0 1 0 |
1072 1071 * | 0 0 0 0 0 0 0 1 |
1073 1072 * ~~ ~~
1074 1073 *
1075 1074 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1076 1075 * of the missing data.
1077 1076 *
1078 1077 * As is apparent from the example above, the only non-trivial rows in the
1079 1078 * inverse matrix correspond to the data disks that we're trying to
1080 1079 * reconstruct. Indeed, those are the only rows we need as the others would
1081 1080 * only be useful for reconstructing data known or assumed to be valid. For
1082 1081 * that reason, we only build the coefficients in the rows that correspond to
1083 1082 * targeted columns.
1084 1083 */
1085 1084 /* END CSTYLED */
1086 1085
/*
 * Populate the rows of the reconstruction matrix corresponding to the
 * parity columns in use.  Row i holds the Vandermonde coefficients for
 * parity column map[i] (0 = P, 1 = Q, 2 = R): entry j is
 * 2^(map[i] * (n - 1 - j)) in GF(2^8).
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/*
		 * Start the exponent at map[i] * n, reduced mod 255 (the
		 * multiplicative order of the field generator) so it stays
		 * a valid index into vdev_raidz_pow2.
		 */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			/* Step the exponent down by map[i] per column. */
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
1116 1115
/*
 * Invert the submatrix corresponding to the missing data columns using
 * Gauss-Jordan elimination over GF(2^8).  On entry, rows[] contains the
 * coefficient rows built by vdev_raidz_matrix_init(); on exit, invrows[]
 * holds the rows of the inverse needed for reconstruction and rows[] has
 * been reduced to (part of) the identity matrix, which the final loop
 * verifies.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			/*
			 * In GF(2^8) addition is XOR, so subtracting the
			 * scaled row i is XORing with row i multiplied by
			 * rows[ii][missing[i]] (multiplication done by
			 * adding logs).
			 */
			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1210 1209
/*
 * Reconstruct the missing data columns by multiplying the surviving
 * columns (in the order given by used[]) through the relevant rows of
 * the inverted matrix, accumulating into the missing columns' buffers.
 * Multiplication is performed in log space over GF(2^8).
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/*
	 * Precompute the logs of the inverse-matrix entries so the inner
	 * loop can multiply by adding logarithms mod 255.
	 */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		/*
		 * NOTE(review): missing[0] indexes rm_col without the
		 * rm_firstdatacol offset that the loop above applies --
		 * confirm this assertion checks the intended column.
		 */
		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/* log2[0] is undefined; only update for nonzero. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* Multiply: add logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first source column initializes the
				 * destination; later columns are XORed
				 * (GF(2^8) addition) into it.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1286 1285
/*
 * Reconstruct any combination of missing columns with the general matrix
 * method: build the relevant rows of the dispersal matrix, invert them,
 * and multiply the surviving columns through the inverse (see the block
 * comment above).  Returns the bitmask of parity columns used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Allocate one contiguous chunk and carve it into the matrix rows,
	 * the inverse rows, and the used-column list.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The first used[] entries are the parity columns we rely on. */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * The remainder are the surviving data columns (i continues from
	 * the loop above).
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1392 1391
/*
 * Reconstruct the columns listed in t[] (which must be sorted), plus any
 * columns already marked in error, using the cheapest applicable routine.
 * Returns the bitmask of parity columns used.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/*
	 * Merge the caller's targets with any columns that have recorded
	 * errors, tracking how many parity and data columns are bad and
	 * which parity columns remain usable.
	 */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/*
	 * tgts[] is in column order, so the bad parity columns come first;
	 * dt points at the data-column portion of the list.
	 */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	/* Fall back to the general matrix-based reconstruction. */
	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1471 1470
/*
 * Open a RAID-Z vdev: validate the parity configuration, open all child
 * vdevs, and derive this vdev's asize, max_asize, and ashift from them.
 * Fails only when more children fail to open than parity can cover.
 */
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	/* Need at least nparity + 1 children for a valid raidz config. */
	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		/*
		 * Clamp to the smallest child size (the -1/+1 presumably
		 * keeps the MIN well-defined for the caller-initialized
		 * value -- confirm against caller); largest ashift wins.
		 */
		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}
1516 1515
1517 1516 static void
1518 1517 vdev_raidz_close(vdev_t *vd)
1519 1518 {
1520 1519 int c;
1521 1520
1522 1521 for (c = 0; c < vd->vdev_children; c++)
1523 1522 vdev_close(vd->vdev_child[c]);
1524 1523 }
1525 1524
1526 1525 static uint64_t
1527 1526 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1528 1527 {
1529 1528 uint64_t asize;
1530 1529 uint64_t ashift = vd->vdev_top->vdev_ashift;
1531 1530 uint64_t cols = vd->vdev_children;
1532 1531 uint64_t nparity = vd->vdev_nparity;
1533 1532
1534 1533 asize = ((psize - 1) >> ashift) + 1;
1535 1534 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1536 1535 asize = roundup(asize, nparity + 1) << ashift;
1537 1536
1538 1537 return (asize);
1539 1538 }
1540 1539
1541 1540 static void
1542 1541 vdev_raidz_child_done(zio_t *zio)
1543 1542 {
1544 1543 raidz_col_t *rc = zio->io_private;
1545 1544
1546 1545 rc->rc_error = zio->io_error;
1547 1546 rc->rc_tried = 1;
1548 1547 rc->rc_skipped = 0;
1549 1548 }
1550 1549
1551 1550 /*
1552 1551 * Start an IO operation on a RAIDZ VDev
1553 1552 *
1554 1553 * Outline:
1555 1554 * - For write operations:
1556 1555 * 1. Generate the parity data
1557 1556 * 2. Create child zio write operations to each column's vdev, for both
1558 1557 * data and parity.
1559 1558 * 3. If the column skips any sectors for padding, create optional dummy
1560 1559 * write zio children for those areas to improve aggregation continuity.
1561 1560 * - For read operations:
1562 1561 * 1. Create child zio read operations to each data column's vdev to read
1563 1562 * the range of data required for zio.
1564 1563 * 2. If this is a scrub or resilver operation, or if any of the data
1565 1564 * vdevs have had errors, then create zio read operations to the parity
1566 1565 * columns' VDevs as well.
1567 1566 */
static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/* Lay the zio's data out across the columns of the RAID-Z stripe. */
	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Issue a child write for every column, data and parity. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			/* Device is unreadable; mark the column missing. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			/* Device may be missing this txg's data: stale. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Read all data columns; read parity columns too when data
		 * is already known missing or this is a scrub/resilver.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
1654 1653
1655 1654
/*
 * Report a checksum error for a child of a RAID-Z device.
 *
 * bad_data is the saved contents of the failing column and is passed
 * along to the ereport.  Speculative I/Os neither bump the error counter
 * nor post an ereport.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		/* No expected checksum; note whether it was injected. */
		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
		    &zbc);
	}
}
1680 1679
1681 1680 /*
1682 1681 * We keep track of whether or not there were any injected errors, so that
1683 1682 * any ereports we generate can note it.
1684 1683 */
1685 1684 static int
1686 1685 raidz_checksum_verify(zio_t *zio)
1687 1686 {
1688 1687 zio_bad_cksum_t zbc;
1689 1688 raidz_map_t *rm = zio->io_vsd;
1690 1689
1691 1690 int ret = zio_checksum_error(zio, &zbc);
1692 1691 if (ret != 0 && zbc.zbc_injected != 0)
1693 1692 rm->rm_ecksuminjected = 1;
1694 1693
1695 1694 return (ret);
1696 1695 }
1697 1696
/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches
 * the data we read. If it doesn't, we fire off a checksum error. Returns
 * the number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/* Save a copy of each parity column that was read successfully. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	/* Regenerate parity from the data columns in place. */
	vdev_raidz_generate_parity(rm);

	/* Compare regenerated parity against what was read from disk. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}
1735 1734
/*
 * Keep statistics on all the ways that we used parity to correct data;
 * the array is indexed by the code returned by the reconstruction
 * routines (a bitmask of the parity columns used).
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1740 1739
1741 1740 static int
1742 1741 vdev_raidz_worst_error(raidz_map_t *rm)
1743 1742 {
1744 1743 int error = 0;
1745 1744
1746 1745 for (int c = 0; c < rm->rm_cols; c++)
1747 1746 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1748 1747
1749 1748 return (error);
1750 1749 }
1751 1750
1752 1751 /*
1753 1752 * Iterate over all combinations of bad data and attempt a reconstruction.
1754 1753 * Note that the algorithm below is non-optimal because it doesn't take into
1755 1754 * account how reconstruction is actually performed. For example, with
1756 1755 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1757 1756 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1758 1757 * cases we'd only use parity information in column 0.
1759 1758 */
1760 1759 static int
1761 1760 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1762 1761 {
1763 1762 raidz_map_t *rm = zio->io_vsd;
1764 1763 raidz_col_t *rc;
1765 1764 void *orig[VDEV_RAIDZ_MAXPARITY];
1766 1765 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1767 1766 int *tgts = &tstore[1];
1768 1767 int current, next, i, c, n;
1769 1768 int code, ret = 0;
1770 1769
1771 1770 ASSERT(total_errors < rm->rm_firstdatacol);
1772 1771
1773 1772 /*
1774 1773 * This simplifies one edge condition.
1775 1774 */
1776 1775 tgts[-1] = -1;
1777 1776
1778 1777 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1779 1778 /*
1780 1779 * Initialize the targets array by finding the first n columns
1781 1780 * that contain no error.
1782 1781 *
1783 1782 * If there were no data errors, we need to ensure that we're
1784 1783 * always explicitly attempting to reconstruct at least one
1785 1784 * data column. To do this, we simply push the highest target
1786 1785 * up into the data columns.
1787 1786 */
1788 1787 for (c = 0, i = 0; i < n; i++) {
1789 1788 if (i == n - 1 && data_errors == 0 &&
1790 1789 c < rm->rm_firstdatacol) {
1791 1790 c = rm->rm_firstdatacol;
1792 1791 }
1793 1792
1794 1793 while (rm->rm_col[c].rc_error != 0) {
1795 1794 c++;
1796 1795 ASSERT3S(c, <, rm->rm_cols);
1797 1796 }
1798 1797
1799 1798 tgts[i] = c++;
1800 1799 }
1801 1800
1802 1801 /*
1803 1802 * Setting tgts[n] simplifies the other edge condition.
1804 1803 */
1805 1804 tgts[n] = rm->rm_cols;
1806 1805
1807 1806 /*
1808 1807 * These buffers were allocated in previous iterations.
1809 1808 */
1810 1809 for (i = 0; i < n - 1; i++) {
1811 1810 ASSERT(orig[i] != NULL);
1812 1811 }
1813 1812
1814 1813 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1815 1814
1816 1815 current = 0;
1817 1816 next = tgts[current];
1818 1817
1819 1818 while (current != n) {
1820 1819 tgts[current] = next;
1821 1820 current = 0;
1822 1821
1823 1822 /*
1824 1823 * Save off the original data that we're going to
1825 1824 * attempt to reconstruct.
1826 1825 */
1827 1826 for (i = 0; i < n; i++) {
1828 1827 ASSERT(orig[i] != NULL);
1829 1828 c = tgts[i];
1830 1829 ASSERT3S(c, >=, 0);
1831 1830 ASSERT3S(c, <, rm->rm_cols);
1832 1831 rc = &rm->rm_col[c];
1833 1832 bcopy(rc->rc_data, orig[i], rc->rc_size);
1834 1833 }
1835 1834
1836 1835 /*
1837 1836 * Attempt a reconstruction and exit the outer loop on
1838 1837 * success.
1839 1838 */
1840 1839 code = vdev_raidz_reconstruct(rm, tgts, n);
1841 1840 if (raidz_checksum_verify(zio) == 0) {
1842 1841 atomic_inc_64(&raidz_corrected[code]);
1843 1842
1844 1843 for (i = 0; i < n; i++) {
1845 1844 c = tgts[i];
1846 1845 rc = &rm->rm_col[c];
1847 1846 ASSERT(rc->rc_error == 0);
1848 1847 if (rc->rc_tried)
1849 1848 raidz_checksum_error(zio, rc,
1850 1849 orig[i]);
1851 1850 rc->rc_error = SET_ERROR(ECKSUM);
1852 1851 }
1853 1852
1854 1853 ret = code;
1855 1854 goto done;
1856 1855 }
1857 1856
1858 1857 /*
1859 1858 * Restore the original data.
1860 1859 */
1861 1860 for (i = 0; i < n; i++) {
1862 1861 c = tgts[i];
1863 1862 rc = &rm->rm_col[c];
1864 1863 bcopy(orig[i], rc->rc_data, rc->rc_size);
1865 1864 }
1866 1865
1867 1866 do {
1868 1867 /*
1869 1868 * Find the next valid column after the current
				 * position.
1871 1870 */
1872 1871 for (next = tgts[current] + 1;
1873 1872 next < rm->rm_cols &&
1874 1873 rm->rm_col[next].rc_error != 0; next++)
1875 1874 continue;
1876 1875
1877 1876 ASSERT(next <= tgts[current + 1]);
1878 1877
1879 1878 /*
1880 1879 * If that spot is available, we're done here.
1881 1880 */
1882 1881 if (next != tgts[current + 1])
1883 1882 break;
1884 1883
1885 1884 /*
1886 1885 * Otherwise, find the next valid column after
1887 1886 * the previous position.
1888 1887 */
1889 1888 for (c = tgts[current - 1] + 1;
1890 1889 rm->rm_col[c].rc_error != 0; c++)
1891 1890 continue;
1892 1891
1893 1892 tgts[current] = c;
1894 1893 current++;
1895 1894
1896 1895 } while (current != n);
1897 1896 }
1898 1897 }
1899 1898 n--;
1900 1899 done:
1901 1900 for (i = 0; i < n; i++) {
1902 1901 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1903 1902 }
1904 1903
1905 1904 return (ret);
1906 1905 }
1907 1906
/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;	/* errors on columns not marked skipped */
	int parity_errors = 0;		/* errors on parity columns */
	int parity_untried = 0;		/* parity columns that were never read */
	int data_errors = 0;		/* errors on data columns */
	int total_errors = 0;		/* parity_errors + data_errors */
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];	/* columns targeted for reconstruction */
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Tally the number and kind of errors reported by the child I/Os,
	 * and note any parity columns that were never attempted.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	/*
	 * Find the first untried column; if one exists, reissue reads for it
	 * and every untried column after it, then return and wait for those
	 * child I/Os to call back into this routine.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
2192 2191
2193 2192 static void
2194 2193 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2195 2194 {
2196 2195 if (faulted > vd->vdev_nparity)
2197 2196 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2198 2197 VDEV_AUX_NO_REPLICAS);
2199 2198 else if (degraded + faulted != 0)
2200 2199 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2201 2200 else
2202 2201 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2203 2202 }
2204 2203
/*
 * Operations vector registered for RAID-Z vdevs.  Members are positional
 * per vdev_ops_t (declared in vdev_impl.h); the two NULL entries are
 * optional hooks that RAID-Z does not implement -- NOTE(review): confirm
 * their names against the vdev_ops_t declaration.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
↓ open down ↓ |
2008 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX