1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/spa.h>
  29 #include <sys/vdev_impl.h>
  30 #include <sys/zio.h>
  31 #include <sys/zio_checksum.h>
  32 #include <sys/fs/zfs.h>
  33 #include <sys/fm/fs/zfs.h>
  34 
  35 /*
  36  * Virtual device vector for RAID-Z.
  37  *
  38  * This vdev supports single, double, and triple parity. For single parity,
  39  * we use a simple XOR of all the data columns. For double or triple parity,
  40  * we use a special case of Reed-Solomon coding. This extends the
  41  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  42  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  43  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  44  * former is also based. The latter is designed to provide higher performance
  45  * for writes.
  46  *
  47  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  48  * amended six years later identifying a critical flaw that invalidates its
  49  * claims. Nevertheless, the technique can be adapted to work for up to
  50  * triple parity. For additional parity, the amendment "Note: Correction to
  51  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  52  * is viable, but the additional complexity means that write performance will
  53  * suffer.
  54  *
  55  * All of the methods above operate on a Galois field, defined over the
 * polynomials over the integers mod 2. In our case we choose N=8 for
 * GF(2^8) so that all elements can be expressed with a single byte.
  57  * can be expressed with a single byte. Briefly, the operations on the
  58  * field are defined as follows:
  59  *
  60  *   o addition (+) is represented by a bitwise XOR
  61  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  62  *   o multiplication of A by 2 is defined by the following bitwise expression:
  63  *      (A * 2)_7 = A_6
  64  *      (A * 2)_6 = A_5
  65  *      (A * 2)_5 = A_4
  66  *      (A * 2)_4 = A_3 + A_7
  67  *      (A * 2)_3 = A_2 + A_7
  68  *      (A * 2)_2 = A_1 + A_7
  69  *      (A * 2)_1 = A_0
  70  *      (A * 2)_0 = A_7
  71  *
  72  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  73  * As an aside, this multiplication is derived from the error correcting
  74  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  75  *
  76  * Observe that any number in the field (except for 0) can be expressed as a
  77  * power of 2 -- a generator for the field. We store a table of the powers of
  78  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  79  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  80  * than field addition). The inverse of a field element A (A^-1) is therefore
  81  * A ^ (255 - 1) = A^254.
  82  *
  83  * The up-to-three parity columns, P, Q, R over several data columns,
  84  * D_0, ... D_n-1, can be expressed by field operations:
  85  *
  86  *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
  87  *      Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  88  *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  89  *      R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
  90  *        = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  91  *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
  93  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
  94  * independent coefficients. (There are no additional coefficients that have
  95  * this property which is why the uncorrected Plank method breaks down.)
  96  *
 * See the reconstruction code below for how P, Q and R can be used individually
  98  * or in concert to recover missing data columns.
  99  */
 100 
 101 typedef struct raidz_col {
 102         uint64_t rc_devidx;             /* child device index for I/O */
 103         uint64_t rc_offset;             /* device offset */
 104         uint64_t rc_size;               /* I/O size */
 105         void *rc_data;                  /* I/O data */
 106         void *rc_gdata;                 /* used to store the "good" version */
 107         int rc_error;                   /* I/O error for this device */
 108         uint8_t rc_tried;               /* Did we attempt this I/O column? */
 109         uint8_t rc_skipped;             /* Did we skip this I/O column? */
 110 } raidz_col_t;
 111 
 112 typedef struct raidz_map {
 113         uint64_t rm_cols;               /* Regular column count */
 114         uint64_t rm_scols;              /* Count including skipped columns */
 115         uint64_t rm_bigcols;            /* Number of oversized columns */
 116         uint64_t rm_asize;              /* Actual total I/O size */
 117         uint64_t rm_missingdata;        /* Count of missing data devices */
 118         uint64_t rm_missingparity;      /* Count of missing parity devices */
 119         uint64_t rm_firstdatacol;       /* First data column/parity count */
 120         uint64_t rm_nskip;              /* Skipped sectors for padding */
 121         uint64_t rm_skipstart;  /* Column index of padding start */
 122         void *rm_datacopy;              /* rm_asize-buffer of copied data */
 123         uintptr_t rm_reports;           /* # of referencing checksum reports */
 124         uint8_t rm_freed;               /* map no longer has referencing ZIO */
 125         uint8_t rm_ecksuminjected;      /* checksum error was injected */
 126         raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
 127 } raidz_map_t;
 128 
 129 #define VDEV_RAIDZ_P            0
 130 #define VDEV_RAIDZ_Q            1
 131 #define VDEV_RAIDZ_R            2
 132 
 133 #define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 134 #define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 135 
 136 /*
 137  * We provide a mechanism to perform the field multiplication operation on a
 138  * 64-bit value all at once rather than a byte at a time. This works by
 139  * creating a mask from the top bit in each byte and using that to
 140  * conditionally apply the XOR of 0x1d.
 141  */
 142 #define VDEV_RAIDZ_64MUL_2(x, mask) \
 143 { \
 144         (mask) = (x) & 0x8080808080808080ULL; \
 145         (mask) = ((mask) << 1) - ((mask) >> 7); \
 146         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 147             ((mask) & 0x1d1d1d1d1d1d1d1d); \
 148 }
 149 
 150 #define VDEV_RAIDZ_64MUL_4(x, mask) \
 151 { \
 152         VDEV_RAIDZ_64MUL_2((x), mask); \
 153         VDEV_RAIDZ_64MUL_2((x), mask); \
 154 }
 155 
 156 /*
 157  * Force reconstruction to use the general purpose method.
 158  */
 159 int vdev_raidz_default_to_general;
 160 
 161 /*
 162  * These two tables represent powers and logs of 2 in the Galois field defined
 163  * above. These values were computed by repeatedly multiplying by 2 as above.
 164  */
 165 static const uint8_t vdev_raidz_pow2[256] = {
 166         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
 167         0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
 168         0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
 169         0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
 170         0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
 171         0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
 172         0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
 173         0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
 174         0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
 175         0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
 176         0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
 177         0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
 178         0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
 179         0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
 180         0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
 181         0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
 182         0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
 183         0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
 184         0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
 185         0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
 186         0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
 187         0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
 188         0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
 189         0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
 190         0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
 191         0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
 192         0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
 193         0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
 194         0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
 195         0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
 196         0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
 197         0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
 198 };
 199 static const uint8_t vdev_raidz_log2[256] = {
 200         0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
 201         0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
 202         0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
 203         0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
 204         0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
 205         0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
 206         0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
 207         0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
 208         0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
 209         0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
 210         0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
 211         0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
 212         0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
 213         0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
 214         0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
 215         0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
 216         0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
 217         0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
 218         0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
 219         0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
 220         0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
 221         0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
 222         0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
 223         0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
 224         0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
 225         0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
 226         0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
 227         0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
 228         0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
 229         0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
 230         0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
 231         0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
 232 };
 233 
 234 static void vdev_raidz_generate_parity(raidz_map_t *rm);
 235 
 236 /*
 237  * Multiply a given number by 2 raised to the given power.
 238  */
 239 static uint8_t
 240 vdev_raidz_exp2(uint_t a, int exp)
 241 {
 242         if (a == 0)
 243                 return (0);
 244 
 245         ASSERT(exp >= 0);
 246         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
 247 
 248         exp += vdev_raidz_log2[a];
 249         if (exp > 255)
 250                 exp -= 255;
 251 
 252         return (vdev_raidz_pow2[exp]);
 253 }
 254 
 255 static void
 256 vdev_raidz_map_free(raidz_map_t *rm)
 257 {
 258         int c;
 259         size_t size;
 260 
 261         for (c = 0; c < rm->rm_firstdatacol; c++) {
 262                 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
 263 
 264                 if (rm->rm_col[c].rc_gdata != NULL)
 265                         zio_buf_free(rm->rm_col[c].rc_gdata,
 266                             rm->rm_col[c].rc_size);
 267         }
 268 
 269         size = 0;
 270         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 271                 size += rm->rm_col[c].rc_size;
 272 
 273         if (rm->rm_datacopy != NULL)
 274                 zio_buf_free(rm->rm_datacopy, size);
 275 
 276         kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 277 }
 278 
 279 static void
 280 vdev_raidz_map_free_vsd(zio_t *zio)
 281 {
 282         raidz_map_t *rm = zio->io_vsd;
 283 
 284         ASSERT0(rm->rm_freed);
 285         rm->rm_freed = 1;
 286 
 287         if (rm->rm_reports == 0)
 288                 vdev_raidz_map_free(rm);
 289 }
 290 
 291 /*ARGSUSED*/
 292 static void
 293 vdev_raidz_cksum_free(void *arg, size_t ignored)
 294 {
 295         raidz_map_t *rm = arg;
 296 
 297         ASSERT3U(rm->rm_reports, >, 0);
 298 
 299         if (--rm->rm_reports == 0 && rm->rm_freed != 0)
 300                 vdev_raidz_map_free(rm);
 301 }
 302 
 303 static void
 304 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
 305 {
 306         raidz_map_t *rm = zcr->zcr_cbdata;
 307         size_t c = zcr->zcr_cbinfo;
 308         size_t x;
 309 
 310         const char *good = NULL;
 311         const char *bad = rm->rm_col[c].rc_data;
 312 
 313         if (good_data == NULL) {
 314                 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
 315                 return;
 316         }
 317 
 318         if (c < rm->rm_firstdatacol) {
 319                 /*
 320                  * The first time through, calculate the parity blocks for
 321                  * the good data (this relies on the fact that the good
 322                  * data never changes for a given logical ZIO)
 323                  */
 324                 if (rm->rm_col[0].rc_gdata == NULL) {
 325                         char *bad_parity[VDEV_RAIDZ_MAXPARITY];
 326                         char *buf;
 327 
 328                         /*
 329                          * Set up the rm_col[]s to generate the parity for
 330                          * good_data, first saving the parity bufs and
 331                          * replacing them with buffers to hold the result.
 332                          */
 333                         for (x = 0; x < rm->rm_firstdatacol; x++) {
 334                                 bad_parity[x] = rm->rm_col[x].rc_data;
 335                                 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
 336                                     zio_buf_alloc(rm->rm_col[x].rc_size);
 337                         }
 338 
 339                         /* fill in the data columns from good_data */
 340                         buf = (char *)good_data;
 341                         for (; x < rm->rm_cols; x++) {
 342                                 rm->rm_col[x].rc_data = buf;
 343                                 buf += rm->rm_col[x].rc_size;
 344                         }
 345 
 346                         /*
 347                          * Construct the parity from the good data.
 348                          */
 349                         vdev_raidz_generate_parity(rm);
 350 
 351                         /* restore everything back to its original state */
 352                         for (x = 0; x < rm->rm_firstdatacol; x++)
 353                                 rm->rm_col[x].rc_data = bad_parity[x];
 354 
 355                         buf = rm->rm_datacopy;
 356                         for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
 357                                 rm->rm_col[x].rc_data = buf;
 358                                 buf += rm->rm_col[x].rc_size;
 359                         }
 360                 }
 361 
 362                 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
 363                 good = rm->rm_col[c].rc_gdata;
 364         } else {
 365                 /* adjust good_data to point at the start of our column */
 366                 good = good_data;
 367 
 368                 for (x = rm->rm_firstdatacol; x < c; x++)
 369                         good += rm->rm_col[x].rc_size;
 370         }
 371 
 372         /* we drop the ereport if it ends up that the data was good */
 373         zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
 374 }
 375 
 376 /*
 377  * Invoked indirectly by zfs_ereport_start_checksum(), called
 378  * below when our read operation fails completely.  The main point
 379  * is to keep a copy of everything we read from disk, so that at
 380  * vdev_raidz_cksum_finish() time we can compare it with the good data.
 381  */
 382 static void
 383 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
 384 {
 385         size_t c = (size_t)(uintptr_t)arg;
 386         caddr_t buf;
 387 
 388         raidz_map_t *rm = zio->io_vsd;
 389         size_t size;
 390 
 391         /* set up the report and bump the refcount  */
 392         zcr->zcr_cbdata = rm;
 393         zcr->zcr_cbinfo = c;
 394         zcr->zcr_finish = vdev_raidz_cksum_finish;
 395         zcr->zcr_free = vdev_raidz_cksum_free;
 396 
 397         rm->rm_reports++;
 398         ASSERT3U(rm->rm_reports, >, 0);
 399 
 400         if (rm->rm_datacopy != NULL)
 401                 return;
 402 
 403         /*
 404          * It's the first time we're called for this raidz_map_t, so we need
 405          * to copy the data aside; there's no guarantee that our zio's buffer
 406          * won't be re-used for something else.
 407          *
 408          * Our parity data is already in separate buffers, so there's no need
 409          * to copy them.
 410          */
 411 
 412         size = 0;
 413         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 414                 size += rm->rm_col[c].rc_size;
 415 
 416         buf = rm->rm_datacopy = zio_buf_alloc(size);
 417 
 418         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 419                 raidz_col_t *col = &rm->rm_col[c];
 420 
 421                 bcopy(col->rc_data, buf, col->rc_size);
 422                 col->rc_data = buf;
 423 
 424                 buf += col->rc_size;
 425         }
 426         ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
 427 }
 428 
 429 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 430         vdev_raidz_map_free_vsd,
 431         vdev_raidz_cksum_report
 432 };
 433 
 434 /*
 435  * Divides the IO evenly across all child vdevs; usually, dcols is
 436  * the number of children in the target vdev.
 437  */
 438 static raidz_map_t *
 439 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
 440     uint64_t nparity)
 441 {
 442         raidz_map_t *rm;
 443         /* The starting RAIDZ (parent) vdev sector of the block. */
 444         uint64_t b = zio->io_offset >> unit_shift;
 445         /* The zio's size in units of the vdev's minimum sector size */
 446         uint64_t s = zio->io_size >> unit_shift;
 447         /* The first column for this stripe. */
 448         uint64_t f = b % dcols;
 449         /* The starting byte offset on each child vdev. */
 450         uint64_t o = (b / dcols) << unit_shift;
 451         uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 452 
 453         /*
 454          * "Quotient": The number of data sectors for this stripe on all but
 455          * the "big column" child vdevs that also contain "remainder" data.
 456          */
 457         q = s / (dcols - nparity);
 458 
 459         /*
 460          * "Remainder": The number of partial stripe data sectors in this I/O.
 461          * This will add a sector to some, but not all, child vdevs.
 462          */
 463         r = s - q * (dcols - nparity);
 464 
 465         /* The number of "big columns" - those which contain remainder data. */
 466         bc = (r == 0 ? 0 : r + nparity);
 467 
 468         /*
 469          * The total number of data and parity sectors associated with
 470          * this I/O.
 471          */
 472         tot = s + nparity * (q + (r == 0 ? 0 : 1));
 473 
 474         /* acols: The columns that will be accessed. */
 475         /* scols: The columns that will be accessed or skipped. */
 476         if (q == 0) {
 477                 /* Our I/O request doesn't span all child vdevs. */
 478                 acols = bc;
 479                 scols = MIN(dcols, roundup(bc, nparity + 1));
 480         } else {
 481                 acols = dcols;
 482                 scols = dcols;
 483         }
 484 
 485         ASSERT3U(acols, <=, scols);
 486 
 487         rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
 488 
 489         rm->rm_cols = acols;
 490         rm->rm_scols = scols;
 491         rm->rm_bigcols = bc;
 492         rm->rm_skipstart = bc;
 493         rm->rm_missingdata = 0;
 494         rm->rm_missingparity = 0;
 495         rm->rm_firstdatacol = nparity;
 496         rm->rm_datacopy = NULL;
 497         rm->rm_reports = 0;
 498         rm->rm_freed = 0;
 499         rm->rm_ecksuminjected = 0;
 500 
 501         asize = 0;
 502 
 503         for (c = 0; c < scols; c++) {
 504                 col = f + c;
 505                 coff = o;
 506                 if (col >= dcols) {
 507                         col -= dcols;
 508                         coff += 1ULL << unit_shift;
 509                 }
 510                 rm->rm_col[c].rc_devidx = col;
 511                 rm->rm_col[c].rc_offset = coff;
 512                 rm->rm_col[c].rc_data = NULL;
 513                 rm->rm_col[c].rc_gdata = NULL;
 514                 rm->rm_col[c].rc_error = 0;
 515                 rm->rm_col[c].rc_tried = 0;
 516                 rm->rm_col[c].rc_skipped = 0;
 517 
 518                 if (c >= acols)
 519                         rm->rm_col[c].rc_size = 0;
 520                 else if (c < bc)
 521                         rm->rm_col[c].rc_size = (q + 1) << unit_shift;
 522                 else
 523                         rm->rm_col[c].rc_size = q << unit_shift;
 524 
 525                 asize += rm->rm_col[c].rc_size;
 526         }
 527 
 528         ASSERT3U(asize, ==, tot << unit_shift);
 529         rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
 530         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 531         ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
 532         ASSERT3U(rm->rm_nskip, <=, nparity);
 533 
 534         for (c = 0; c < rm->rm_firstdatacol; c++)
 535                 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
 536 
 537         rm->rm_col[c].rc_data = zio->io_data;
 538 
 539         for (c = c + 1; c < acols; c++)
 540                 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
 541                     rm->rm_col[c - 1].rc_size;
 542 
 543         /*
 544          * If all data stored spans all columns, there's a danger that parity
 545          * will always be on the same device and, since parity isn't read
 546          * during normal operation, that that device's I/O bandwidth won't be
 547          * used effectively. We therefore switch the parity every 1MB.
 548          *
 549          * ... at least that was, ostensibly, the theory. As a practical
 550          * matter unless we juggle the parity between all devices evenly, we
 551          * won't see any benefit. Further, occasional writes that aren't a
 552          * multiple of the LCM of the number of children and the minimum
 553          * stripe width are sufficient to avoid pessimal behavior.
 554          * Unfortunately, this decision created an implicit on-disk format
 555          * requirement that we need to support for all eternity, but only
 556          * for single-parity RAID-Z.
 557          *
 558          * If we intend to skip a sector in the zeroth column for padding
 559          * we must make sure to note this swap. We will never intend to
 560          * skip the first column since at least one data and one parity
 561          * column must appear in each row.
 562          */
 563         ASSERT(rm->rm_cols >= 2);
 564         ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
 565 
 566         if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
 567                 devidx = rm->rm_col[0].rc_devidx;
 568                 o = rm->rm_col[0].rc_offset;
 569                 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
 570                 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
 571                 rm->rm_col[1].rc_devidx = devidx;
 572                 rm->rm_col[1].rc_offset = o;
 573 
 574                 if (rm->rm_skipstart == 0)
 575                         rm->rm_skipstart = 1;
 576         }
 577 
 578         zio->io_vsd = rm;
 579         zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 580         return (rm);
 581 }
 582 
 583 static void
 584 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 585 {
 586         uint64_t *p, *src, pcount, ccount, i;
 587         int c;
 588 
 589         pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 590 
 591         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 592                 src = rm->rm_col[c].rc_data;
 593                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 594                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 595 
 596                 if (c == rm->rm_firstdatacol) {
 597                         ASSERT(ccount == pcount);
 598                         for (i = 0; i < ccount; i++, src++, p++) {
 599                                 *p = *src;
 600                         }
 601                 } else {
 602                         ASSERT(ccount <= pcount);
 603                         for (i = 0; i < ccount; i++, src++, p++) {
 604                                 *p ^= *src;
 605                         }
 606                 }
 607         }
 608 }
 609 
 610 static void
 611 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 612 {
 613         uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
 614         int c;
 615 
 616         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 617         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 618             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 619 
 620         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 621                 src = rm->rm_col[c].rc_data;
 622                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 623                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 624 
 625                 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 626 
 627                 if (c == rm->rm_firstdatacol) {
 628                         ASSERT(ccnt == pcnt || ccnt == 0);
 629                         for (i = 0; i < ccnt; i++, src++, p++, q++) {
 630                                 *p = *src;
 631                                 *q = *src;
 632                         }
 633                         for (; i < pcnt; i++, src++, p++, q++) {
 634                                 *p = 0;
 635                                 *q = 0;
 636                         }
 637                 } else {
 638                         ASSERT(ccnt <= pcnt);
 639 
 640                         /*
 641                          * Apply the algorithm described above by multiplying
 642                          * the previous result and adding in the new value.
 643                          */
 644                         for (i = 0; i < ccnt; i++, src++, p++, q++) {
 645                                 *p ^= *src;
 646 
 647                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 648                                 *q ^= *src;
 649                         }
 650 
 651                         /*
 652                          * Treat short columns as though they are full of 0s.
 653                          * Note that there's therefore nothing needed for P.
 654                          */
 655                         for (; i < pcnt; i++, q++) {
 656                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 657                         }
 658                 }
 659         }
 660 }
 661 
 662 static void
 663 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
 664 {
 665         uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
 666         int c;
 667 
 668         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 669         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 670             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 671         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 672             rm->rm_col[VDEV_RAIDZ_R].rc_size);
 673 
 674         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 675                 src = rm->rm_col[c].rc_data;
 676                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 677                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 678                 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
 679 
 680                 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 681 
 682                 if (c == rm->rm_firstdatacol) {
 683                         ASSERT(ccnt == pcnt || ccnt == 0);
 684                         for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
 685                                 *p = *src;
 686                                 *q = *src;
 687                                 *r = *src;
 688                         }
 689                         for (; i < pcnt; i++, src++, p++, q++, r++) {
 690                                 *p = 0;
 691                                 *q = 0;
 692                                 *r = 0;
 693                         }
 694                 } else {
 695                         ASSERT(ccnt <= pcnt);
 696 
 697                         /*
 698                          * Apply the algorithm described above by multiplying
 699                          * the previous result and adding in the new value.
 700                          */
 701                         for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
 702                                 *p ^= *src;
 703 
 704                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 705                                 *q ^= *src;
 706 
 707                                 VDEV_RAIDZ_64MUL_4(*r, mask);
 708                                 *r ^= *src;
 709                         }
 710 
 711                         /*
 712                          * Treat short columns as though they are full of 0s.
 713                          * Note that there's therefore nothing needed for P.
 714                          */
 715                         for (; i < pcnt; i++, q++, r++) {
 716                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 717                                 VDEV_RAIDZ_64MUL_4(*r, mask);
 718                         }
 719                 }
 720         }
 721 }
 722 
 723 /*
 724  * Generate RAID parity in the first virtual columns according to the number of
 725  * parity columns available.
 726  */
 727 static void
 728 vdev_raidz_generate_parity(raidz_map_t *rm)
 729 {
 730         switch (rm->rm_firstdatacol) {
 731         case 1:
 732                 vdev_raidz_generate_parity_p(rm);
 733                 break;
 734         case 2:
 735                 vdev_raidz_generate_parity_pq(rm);
 736                 break;
 737         case 3:
 738                 vdev_raidz_generate_parity_pqr(rm);
 739                 break;
 740         default:
 741                 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 742         }
 743 }
 744 
 745 static int
 746 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 747 {
 748         uint64_t *dst, *src, xcount, ccount, count, i;
 749         int x = tgts[0];
 750         int c;
 751 
 752         ASSERT(ntgts == 1);
 753         ASSERT(x >= rm->rm_firstdatacol);
 754         ASSERT(x < rm->rm_cols);
 755 
 756         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
 757         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
 758         ASSERT(xcount > 0);
 759 
 760         src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 761         dst = rm->rm_col[x].rc_data;
 762         for (i = 0; i < xcount; i++, dst++, src++) {
 763                 *dst = *src;
 764         }
 765 
 766         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 767                 src = rm->rm_col[c].rc_data;
 768                 dst = rm->rm_col[x].rc_data;
 769 
 770                 if (c == x)
 771                         continue;
 772 
 773                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 774                 count = MIN(ccount, xcount);
 775 
 776                 for (i = 0; i < count; i++, dst++, src++) {
 777                         *dst ^= *src;
 778                 }
 779         }
 780 
 781         return (1 << VDEV_RAIDZ_P);
 782 }
 783 
/*
 * Reconstruct a single missing data column (tgts[0]) using only the Q
 * (second) parity column.  The Q contribution of the surviving columns is
 * rebuilt with the same Horner-style recurrence used at generation time,
 * then the stored Q is XORed in and the result divided by the target
 * column's coefficient.  Returns a bitmask indicating Q parity was used.
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
        uint64_t *dst, *src, xcount, ccount, count, mask, i;
        uint8_t *b;
        int x = tgts[0];
        int c, j, exp;

        ASSERT(ntgts == 1);

        xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
        ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

        /*
         * Accumulate, into the target column, the Q-style sum of the
         * surviving data columns: multiply the running value by 2 in
         * GF(2^8) (eight bytes at a time via VDEV_RAIDZ_64MUL_2) and XOR
         * in each column.  The target column itself contributes zeros
         * (its ccount is forced to 0).
         */
        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                src = rm->rm_col[c].rc_data;
                dst = rm->rm_col[x].rc_data;

                if (c == x)
                        ccount = 0;
                else
                        ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

                count = MIN(ccount, xcount);

                if (c == rm->rm_firstdatacol) {
                        /* First column initializes the accumulator. */
                        for (i = 0; i < count; i++, dst++, src++) {
                                *dst = *src;
                        }
                        /* Treat any short remainder as zeros. */
                        for (; i < xcount; i++, dst++) {
                                *dst = 0;
                        }

                } else {
                        for (i = 0; i < count; i++, dst++, src++) {
                                VDEV_RAIDZ_64MUL_2(*dst, mask);
                                *dst ^= *src;
                        }

                        /* Short columns still advance the multiplier. */
                        for (; i < xcount; i++, dst++) {
                                VDEV_RAIDZ_64MUL_2(*dst, mask);
                        }
                }
        }

        /*
         * XOR in the stored Q parity, leaving 2^(ncols - 1 - x) * D_x in
         * the target, then multiply byte-by-byte by the coefficient's
         * inverse (exponent 255 - log2 of the coefficient) to recover D_x.
         */
        src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
        dst = rm->rm_col[x].rc_data;
        exp = 255 - (rm->rm_cols - 1 - x);

        for (i = 0; i < xcount; i++, dst++, src++) {
                *dst ^= *src;
                for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
                        *b = vdev_raidz_exp2(*b, exp);
                }
        }

        return (1 << VDEV_RAIDZ_Q);
}
 841 
/*
 * Reconstruct two missing data columns, x = tgts[0] and y = tgts[1]
 * (x < y), using both the P and Q parity columns.  The algebra is derived
 * in the comment midway through this function.  Returns a bitmask
 * indicating both P and Q parity were used.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
        uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
        void *pdata, *qdata;
        uint64_t xsize, ysize, i;
        int x = tgts[0];
        int y = tgts[1];

        ASSERT(ntgts == 2);
        ASSERT(x < y);
        ASSERT(x >= rm->rm_firstdatacol);
        ASSERT(y < rm->rm_cols);

        /* Column x precedes y, so it must be at least as long. */
        ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

        /*
         * Move the parity data aside -- we're going to compute parity as
         * though columns x and y were full of zeros -- Pxy and Qxy. We want to
         * reuse the parity generation mechanism without trashing the actual
         * parity so we make those columns appear to be full of zeros by
         * setting their lengths to zero.
         */
        pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
        qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
        xsize = rm->rm_col[x].rc_size;
        ysize = rm->rm_col[y].rc_size;

        /* Scratch buffers receive the partial parity Pxy and Qxy. */
        rm->rm_col[VDEV_RAIDZ_P].rc_data =
            zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
        rm->rm_col[VDEV_RAIDZ_Q].rc_data =
            zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
        rm->rm_col[x].rc_size = 0;
        rm->rm_col[y].rc_size = 0;

        vdev_raidz_generate_parity_pq(rm);

        rm->rm_col[x].rc_size = xsize;
        rm->rm_col[y].rc_size = ysize;

        p = pdata;
        q = qdata;
        pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
        qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
        xd = rm->rm_col[x].rc_data;
        yd = rm->rm_col[y].rc_data;

        /*
         * We now have:
         *      Pxy = P + D_x + D_y
         *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
         *
         * We can then solve for D_x:
         *      D_x = A * (P + Pxy) + B * (Q + Qxy)
         * where
         *      A = 2^(x - y) * (2^(x - y) + 1)^-1
         *      B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
         *
         * With D_x in hand, we can easily solve for D_y:
         *      D_y = P + Pxy + D_x
         */

        /* A and B as GF(2^8) elements; aexp/bexp hold their discrete logs. */
        a = vdev_raidz_pow2[255 + x - y];
        b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
        /* tmp is the log of (2^(x - y) + 1)^-1, i.e. of (a ^ 1) inverted. */
        tmp = 255 - vdev_raidz_log2[a ^ 1];

        aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
        bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

        for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
                *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
                    vdev_raidz_exp2(*q ^ *qxy, bexp);

                /* Column y may be shorter than column x. */
                if (i < ysize)
                        *yd = *p ^ *pxy ^ *xd;
        }

        zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
            rm->rm_col[VDEV_RAIDZ_P].rc_size);
        zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
            rm->rm_col[VDEV_RAIDZ_Q].rc_size);

        /*
         * Restore the saved parity data.
         */
        rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
        rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

        return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
 932 
 933 /* BEGIN CSTYLED */
 934 /*
 935  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 937  * the contents of the data and parity disks. This can be expressed with
 938  * vectors for the original data (D) and the actual data (d) and parity (p)
 939  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 940  *
 941  *            __   __                     __     __
 942  *            |     |         __     __   |  p_0  |
 943  *            |  V  |         |  D_0  |   | p_m-1 |
 944  *            |     |    x    |   :   | = |  d_0  |
 945  *            |  I  |         | D_n-1 |   |   :   |
 946  *            |     |         ~~     ~~   | d_n-1 |
 947  *            ~~   ~~                     ~~     ~~
 948  *
 949  * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 951  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 952  * computation as well as linear separability.
 953  *
 954  *      __               __               __     __
 955  *      |   1   ..  1 1 1 |               |  p_0  |
 956  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 957  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 958  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 959  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 960  *      |   :       : : : |   |   :   |   |  d_2  |
 961  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 962  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 963  *      |   0   ..  0 0 1 |               | d_n-1 |
 964  *      ~~               ~~               ~~     ~~
 965  *
 966  * Note that I, V, d, and p are known. To compute D, we must invert the
 967  * matrix and use the known data and parity values to reconstruct the unknown
 968  * data values. We begin by removing the rows in V|I and d|p that correspond
 969  * to failed or missing columns; we then make V|I square (n x n) and d|p
 970  * sized n by removing rows corresponding to unused parity from the bottom up
 971  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 972  * using Gauss-Jordan elimination. In the example below we use m=3 parity
 973  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 974  *           __                               __
 975  *           |  1   1   1   1   1   1   1   1  |
 976  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 977  *           |  19 205 116  29  64  16  4   1  |      / /
 978  *           |  1   0   0   0   0   0   0   0  |     / /
 979  *           |  0   1   0   0   0   0   0   0  | <--' /
 980  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 981  *           |  0   0   0   1   0   0   0   0  |
 982  *           |  0   0   0   0   1   0   0   0  |
 983  *           |  0   0   0   0   0   1   0   0  |
 984  *           |  0   0   0   0   0   0   1   0  |
 985  *           |  0   0   0   0   0   0   0   1  |
 986  *           ~~                               ~~
 987  *           __                               __
 988  *           |  1   1   1   1   1   1   1   1  |
 989  *           | 128  64  32  16  8   4   2   1  |
 990  *           |  19 205 116  29  64  16  4   1  |
 991  *           |  1   0   0   0   0   0   0   0  |
 992  *           |  0   1   0   0   0   0   0   0  |
 993  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
 994  *           |  0   0   0   1   0   0   0   0  |
 995  *           |  0   0   0   0   1   0   0   0  |
 996  *           |  0   0   0   0   0   1   0   0  |
 997  *           |  0   0   0   0   0   0   1   0  |
 998  *           |  0   0   0   0   0   0   0   1  |
 999  *           ~~                               ~~
1000  *
1001  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1002  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1003  * matrix is not singular.
1004  * __                                                                 __
1005  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1006  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1007  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1008  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1009  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1010  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1011  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1012  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1013  * ~~                                                                 ~~
1014  * __                                                                 __
1015  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1016  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1017  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1018  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1019  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1020  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1021  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1022  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1023  * ~~                                                                 ~~
1024  * __                                                                 __
1025  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1026  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1027  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1028  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1029  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1030  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1031  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1032  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1033  * ~~                                                                 ~~
1034  * __                                                                 __
1035  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1036  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1037  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1038  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1039  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1040  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1041  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1042  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1043  * ~~                                                                 ~~
1044  * __                                                                 __
1045  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1046  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1047  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1048  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1049  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1050  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1051  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1052  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1053  * ~~                                                                 ~~
1054  * __                                                                 __
1055  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1056  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1057  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1058  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1059  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1060  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1061  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1062  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1063  * ~~                                                                 ~~
1064  *                   __                               __
1065  *                   |  0   0   1   0   0   0   0   0  |
1066  *                   | 167 100  5   41 159 169 217 208 |
1067  *                   | 166 100  4   40 158 168 216 209 |
1068  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1069  *                   |  0   0   0   0   1   0   0   0  |
1070  *                   |  0   0   0   0   0   1   0   0  |
1071  *                   |  0   0   0   0   0   0   1   0  |
1072  *                   |  0   0   0   0   0   0   0   1  |
1073  *                   ~~                               ~~
1074  *
1075  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1076  * of the missing data.
1077  *
1078  * As is apparent from the example above, the only non-trivial rows in the
1079  * inverse matrix correspond to the data disks that we're trying to
1080  * reconstruct. Indeed, those are the only rows we need as the others would
1081  * only be useful for reconstructing data known or assumed to be valid. For
1082  * that reason, we only build the coefficients in the rows that correspond to
1083  * targeted columns.
1084  */
1085 /* END CSTYLED */
1086 
1087 static void
1088 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1089     uint8_t **rows)
1090 {
1091         int i, j;
1092         int pow;
1093 
1094         ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1095 
1096         /*
1097          * Fill in the missing rows of interest.
1098          */
1099         for (i = 0; i < nmap; i++) {
1100                 ASSERT3S(0, <=, map[i]);
1101                 ASSERT3S(map[i], <=, 2);
1102 
1103                 pow = map[i] * n;
1104                 if (pow > 255)
1105                         pow -= 255;
1106                 ASSERT(pow <= 255);
1107 
1108                 for (j = 0; j < n; j++) {
1109                         pow -= map[i];
1110                         if (pow < 0)
1111                                 pow += 255;
1112                         rows[i][j] = vdev_raidz_pow2[pow];
1113                 }
1114         }
1115 }
1116 
/*
 * Invert the n x n submatrix of (V|I) selected by the 'used' column list
 * via Gauss-Jordan elimination.  Only the nmissing rows corresponding to
 * the missing data columns ('missing') are computed; the resulting rows of
 * the inverse are left in 'invrows'.  'rows' is consumed in the process
 * (reduced to identity rows, which the final loop verifies).
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
        int i, j, ii, jj;
        uint8_t log;

        /*
         * Assert that the first nmissing entries from the array of used
         * columns correspond to parity columns and that subsequent entries
         * correspond to data columns.
         */
        for (i = 0; i < nmissing; i++) {
                ASSERT3S(used[i], <, rm->rm_firstdatacol);
        }
        for (; i < n; i++) {
                ASSERT3S(used[i], >=, rm->rm_firstdatacol);
        }

        /*
         * First initialize the storage where we'll compute the inverse rows.
         */
        for (i = 0; i < nmissing; i++) {
                for (j = 0; j < n; j++) {
                        invrows[i][j] = (i == j) ? 1 : 0;
                }
        }

        /*
         * Subtract all trivial rows from the rows of consequence.
         * (In GF(2^8), addition and subtraction are both XOR.)
         */
        for (i = 0; i < nmissing; i++) {
                for (j = nmissing; j < n; j++) {
                        ASSERT3U(used[j], >=, rm->rm_firstdatacol);
                        jj = used[j] - rm->rm_firstdatacol;
                        ASSERT3S(jj, <, n);
                        invrows[i][j] = rows[i][jj];
                        rows[i][jj] = 0;
                }
        }

        /*
         * For each of the rows of interest, we must normalize it and subtract
         * a multiple of it from the other rows.
         */
        for (i = 0; i < nmissing; i++) {
                for (j = 0; j < missing[i]; j++) {
                        ASSERT0(rows[i][j]);
                }
                ASSERT3U(rows[i][missing[i]], !=, 0);

                /*
                 * Compute the inverse of the first element and multiply each
                 * element in the row by that value.
                 */
                log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

                for (j = 0; j < n; j++) {
                        rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
                        invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
                }

                /*
                 * Eliminate this row's pivot column from every other row of
                 * interest by XORing in the appropriate multiple of it.
                 */
                for (ii = 0; ii < nmissing; ii++) {
                        if (i == ii)
                                continue;

                        ASSERT3U(rows[ii][missing[i]], !=, 0);

                        log = vdev_raidz_log2[rows[ii][missing[i]]];

                        for (j = 0; j < n; j++) {
                                rows[ii][j] ^=
                                    vdev_raidz_exp2(rows[i][j], log);
                                invrows[ii][j] ^=
                                    vdev_raidz_exp2(invrows[i][j], log);
                        }
                }
        }

        /*
         * Verify that the data that is left in the rows are properly part of
         * an identity matrix.
         */
        for (i = 0; i < nmissing; i++) {
                for (j = 0; j < n; j++) {
                        if (j == missing[i]) {
                                ASSERT3U(rows[i][j], ==, 1);
                        } else {
                                ASSERT0(rows[i][j]);
                        }
                }
        }
}
1210 
/*
 * Rebuild the missing data columns as (V|I)'^-1 x (d|p)': for each
 * surviving column listed in 'used', scale its bytes by the matching
 * inverse-matrix coefficients and accumulate (XOR) the products into each
 * missing column.  GF(2^8) multiplication is done with log/exp tables;
 * the inverse rows are converted to their logs once up front.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
        int i, j, x, cc, c;
        uint8_t *src;
        uint64_t ccount;
        uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
        uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
        uint8_t log = 0;
        uint8_t val;
        int ll;
        uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
        uint8_t *p, *pp;
        size_t psize;

        /* One row of logs (n entries) per missing column. */
        psize = sizeof (invlog[0][0]) * n * nmissing;
        p = kmem_alloc(psize, KM_SLEEP);

        for (pp = p, i = 0; i < nmissing; i++) {
                invlog[i] = pp;
                pp += n;
        }

        /* Precompute the logs of the inverse-matrix coefficients. */
        for (i = 0; i < nmissing; i++) {
                for (j = 0; j < n; j++) {
                        ASSERT3U(invrows[i][j], !=, 0);
                        invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
                }
        }

        for (i = 0; i < n; i++) {
                c = used[i];
                ASSERT3U(c, <, rm->rm_cols);

                src = rm->rm_col[c].rc_data;
                ccount = rm->rm_col[c].rc_size;
                /* Cache destination pointers/sizes for the missing columns. */
                for (j = 0; j < nmissing; j++) {
                        cc = missing[j] + rm->rm_firstdatacol;
                        ASSERT3U(cc, >=, rm->rm_firstdatacol);
                        ASSERT3U(cc, <, rm->rm_cols);
                        ASSERT3U(cc, !=, c);

                        dst[j] = rm->rm_col[cc].rc_data;
                        dcount[j] = rm->rm_col[cc].rc_size;
                }

                /*
                 * NOTE(review): missing[0] is an index relative to
                 * rm_firstdatacol everywhere else in this function, but here
                 * it indexes rm_col[] directly -- confirm this is intended.
                 */
                ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

                for (x = 0; x < ccount; x++, src++) {
                        /* A zero source byte has no log; it's handled below. */
                        if (*src != 0)
                                log = vdev_raidz_log2[*src];

                        for (cc = 0; cc < nmissing; cc++) {
                                if (x >= dcount[cc])
                                        continue;

                                if (*src == 0) {
                                        val = 0;
                                } else {
                                        /* Multiply: add logs mod 255. */
                                        if ((ll = log + invlog[cc][i]) >= 255)
                                                ll -= 255;
                                        val = vdev_raidz_pow2[ll];
                                }

                                /* First column initializes; rest accumulate. */
                                if (i == 0)
                                        dst[cc][x] = val;
                                else
                                        dst[cc][x] ^= val;
                        }
                }
        }

        kmem_free(p, psize);
}
1286 
/*
 * Reconstruct an arbitrary set of missing columns (tgts, ntgts) using the
 * matrix-inversion method described in the block comment above.  Returns a
 * bitmask identifying which parity columns were used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
        int n, i, c, t, tt;
        int nmissing_rows;
        int missing_rows[VDEV_RAIDZ_MAXPARITY];
        int parity_map[VDEV_RAIDZ_MAXPARITY];

        uint8_t *p, *pp;
        size_t psize;

        uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
        uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
        uint8_t *used;

        int code = 0;


        /* n is the number of data columns. */
        n = rm->rm_cols - rm->rm_firstdatacol;

        /*
         * Figure out which data columns are missing.
         */
        nmissing_rows = 0;
        for (t = 0; t < ntgts; t++) {
                if (tgts[t] >= rm->rm_firstdatacol) {
                        /* Record the index relative to the first data col. */
                        missing_rows[nmissing_rows++] =
                            tgts[t] - rm->rm_firstdatacol;
                }
        }

        /*
         * Figure out which parity columns to use to help generate the missing
         * data columns.
         */
        for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
                ASSERT(tt < ntgts);
                ASSERT(c < rm->rm_firstdatacol);

                /*
                 * Skip any targeted parity columns.
                 */
                if (c == tgts[tt]) {
                        tt++;
                        continue;
                }

                code |= 1 << c;

                parity_map[i] = c;
                i++;
        }

        ASSERT(code != 0);
        ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

        /*
         * One allocation holds the 'rows' and 'invrows' matrices (n bytes
         * per row, interleaved) followed by the 'used' column list.
         */
        psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
            nmissing_rows * n + sizeof (used[0]) * n;
        p = kmem_alloc(psize, KM_SLEEP);

        for (pp = p, i = 0; i < nmissing_rows; i++) {
                rows[i] = pp;
                pp += n;
                invrows[i] = pp;
                pp += n;
        }
        used = pp;

        /* The first nmissing_rows entries of 'used' are parity columns. */
        for (i = 0; i < nmissing_rows; i++) {
                used[i] = parity_map[i];
        }

        /* The remainder are the surviving data columns; i carries over. */
        for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                if (tt < nmissing_rows &&
                    c == missing_rows[tt] + rm->rm_firstdatacol) {
                        tt++;
                        continue;
                }

                ASSERT3S(i, <, n);
                used[i] = c;
                i++;
        }

        /*
         * Initialize the interesting rows of the matrix.
         */
        vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

        /*
         * Invert the matrix.
         */
        vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
            invrows, used);

        /*
         * Reconstruct the missing data using the generated matrix.
         */
        vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
            invrows, used);

        kmem_free(p, psize);

        return (code);
}
1392 
/*
 * Reconstruct the data in the given (sorted) target columns 't', along
 * with any other columns already marked with errors.  Dispatches to the
 * optimized P, Q, or P+Q routines when they apply, otherwise falls back to
 * the general matrix-based reconstruction.  Returns a bitmask of the
 * parity columns used.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
        int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
        int ntgts;
        int i, c;
        int code;
        int nbadparity, nbaddata;
        int parity_valid[VDEV_RAIDZ_MAXPARITY];

        /*
         * The tgts list must already be sorted.
         */
        for (i = 1; i < nt; i++) {
                ASSERT(t[i] > t[i - 1]);
        }

        /*
         * Walk every column, building the complete target list (explicit
         * targets plus columns with recorded errors) and counting how many
         * parity and data columns are bad.
         */
        nbadparity = rm->rm_firstdatacol;
        nbaddata = rm->rm_cols - nbadparity;
        ntgts = 0;
        for (i = 0, c = 0; c < rm->rm_cols; c++) {
                if (c < rm->rm_firstdatacol)
                        parity_valid[c] = B_FALSE;

                if (i < nt && c == t[i]) {
                        tgts[ntgts++] = c;
                        i++;
                } else if (rm->rm_col[c].rc_error != 0) {
                        tgts[ntgts++] = c;
                } else if (c >= rm->rm_firstdatacol) {
                        nbaddata--;
                } else {
                        parity_valid[c] = B_TRUE;
                        nbadparity--;
                }
        }

        ASSERT(ntgts >= nt);
        ASSERT(nbaddata >= 0);
        ASSERT(nbaddata + nbadparity == ntgts);

        /* Data targets follow the parity targets in the sorted tgts array. */
        dt = &tgts[nbadparity];

        /*
         * See if we can use any of our optimized reconstruction routines.
         */
        if (!vdev_raidz_default_to_general) {
                switch (nbaddata) {
                case 1:
                        if (parity_valid[VDEV_RAIDZ_P])
                                return (vdev_raidz_reconstruct_p(rm, dt, 1));

                        ASSERT(rm->rm_firstdatacol > 1);

                        if (parity_valid[VDEV_RAIDZ_Q])
                                return (vdev_raidz_reconstruct_q(rm, dt, 1));

                        ASSERT(rm->rm_firstdatacol > 2);
                        break;

                case 2:
                        ASSERT(rm->rm_firstdatacol > 1);

                        if (parity_valid[VDEV_RAIDZ_P] &&
                            parity_valid[VDEV_RAIDZ_Q])
                                return (vdev_raidz_reconstruct_pq(rm, dt, 2));

                        ASSERT(rm->rm_firstdatacol > 2);

                        break;
                }
        }

        code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
        ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
        ASSERT(code > 0);
        return (code);
}
1471 
1472 static int
1473 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1474     uint64_t *ashift)
1475 {
1476         vdev_t *cvd;
1477         uint64_t nparity = vd->vdev_nparity;
1478         int c;
1479         int lasterror = 0;
1480         int numerrors = 0;
1481 
1482         ASSERT(nparity > 0);
1483 
1484         if (nparity > VDEV_RAIDZ_MAXPARITY ||
1485             vd->vdev_children < nparity + 1) {
1486                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1487                 return (SET_ERROR(EINVAL));
1488         }
1489 
1490         vdev_open_children(vd);
1491 
1492         for (c = 0; c < vd->vdev_children; c++) {
1493                 cvd = vd->vdev_child[c];
1494 
1495                 if (cvd->vdev_open_error != 0) {
1496                         lasterror = cvd->vdev_open_error;
1497                         numerrors++;
1498                         continue;
1499                 }
1500 
1501                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1502                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1503                 *ashift = MAX(*ashift, cvd->vdev_ashift);
1504         }
1505 
1506         *asize *= vd->vdev_children;
1507         *max_asize *= vd->vdev_children;
1508 
1509         if (numerrors > nparity) {
1510                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1511                 return (lasterror);
1512         }
1513 
1514         return (0);
1515 }
1516 
1517 static void
1518 vdev_raidz_close(vdev_t *vd)
1519 {
1520         int c;
1521 
1522         for (c = 0; c < vd->vdev_children; c++)
1523                 vdev_close(vd->vdev_child[c]);
1524 }
1525 
1526 static uint64_t
1527 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1528 {
1529         uint64_t asize;
1530         uint64_t ashift = vd->vdev_top->vdev_ashift;
1531         uint64_t cols = vd->vdev_children;
1532         uint64_t nparity = vd->vdev_nparity;
1533 
1534         asize = ((psize - 1) >> ashift) + 1;
1535         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1536         asize = roundup(asize, nparity + 1) << ashift;
1537 
1538         return (asize);
1539 }
1540 
1541 static void
1542 vdev_raidz_child_done(zio_t *zio)
1543 {
1544         raidz_col_t *rc = zio->io_private;
1545 
1546         rc->rc_error = zio->io_error;
1547         rc->rc_tried = 1;
1548         rc->rc_skipped = 0;
1549 }
1550 
1551 /*
1552  * Start an IO operation on a RAIDZ VDev
1553  *
1554  * Outline:
1555  * - For write operations:
1556  *   1. Generate the parity data
1557  *   2. Create child zio write operations to each column's vdev, for both
1558  *      data and parity.
1559  *   3. If the column skips any sectors for padding, create optional dummy
1560  *      write zio children for those areas to improve aggregation continuity.
1561  * - For read operations:
1562  *   1. Create child zio read operations to each data column's vdev to read
1563  *      the range of data required for zio.
1564  *   2. If this is a scrub or resilver operation, or if any of the data
1565  *      vdevs have had errors, then create zio read operations to the parity
1566  *      columns' VDevs as well.
1567  */
static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/* Lay the logical I/O out across the physical columns. */
	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Issue a child write for every column, parity included. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.  These carry no data (NODATA) and
		 * may be dropped (OPTIONAL); c wraps from rm_scols to 0.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			/* Device is gone: note the miss, don't issue I/O. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			/* Device missed this txg; treat its data as stale. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Read all data columns; read parity only if some data is
		 * already known missing or this is a scrub/resilver.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
1654 
1655 
1656 /*
1657  * Report a checksum error for a child of a RAID-Z device.
1658  */
1659 static void
1660 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1661 {
1662         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1663 
1664         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1665                 zio_bad_cksum_t zbc;
1666                 raidz_map_t *rm = zio->io_vsd;
1667 
1668                 mutex_enter(&vd->vdev_stat_lock);
1669                 vd->vdev_stat.vs_checksum_errors++;
1670                 mutex_exit(&vd->vdev_stat_lock);
1671 
1672                 zbc.zbc_has_cksum = 0;
1673                 zbc.zbc_injected = rm->rm_ecksuminjected;
1674 
1675                 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1676                     rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1677                     &zbc);
1678         }
1679 }
1680 
1681 /*
1682  * We keep track of whether or not there were any injected errors, so that
1683  * any ereports we generate can note it.
1684  */
1685 static int
1686 raidz_checksum_verify(zio_t *zio)
1687 {
1688         zio_bad_cksum_t zbc;
1689         raidz_map_t *rm = zio->io_vsd;
1690 
1691         int ret = zio_checksum_error(zio, &zbc);
1692         if (ret != 0 && zbc.zbc_injected != 0)
1693                 rm->rm_ecksuminjected = 1;
1694 
1695         return (ret);
1696 }
1697 
1698 /*
1699  * Generate the parity from the data columns. If we tried and were able to
1700  * read the parity without error, verify that the generated parity matches the
1701  * data we read. If it doesn't, we fire off a checksum error. Return the
1702  * number such failures.
1703  */
1704 static int
1705 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1706 {
1707         void *orig[VDEV_RAIDZ_MAXPARITY];
1708         int c, ret = 0;
1709         raidz_col_t *rc;
1710 
1711         for (c = 0; c < rm->rm_firstdatacol; c++) {
1712                 rc = &rm->rm_col[c];
1713                 if (!rc->rc_tried || rc->rc_error != 0)
1714                         continue;
1715                 orig[c] = zio_buf_alloc(rc->rc_size);
1716                 bcopy(rc->rc_data, orig[c], rc->rc_size);
1717         }
1718 
1719         vdev_raidz_generate_parity(rm);
1720 
1721         for (c = 0; c < rm->rm_firstdatacol; c++) {
1722                 rc = &rm->rm_col[c];
1723                 if (!rc->rc_tried || rc->rc_error != 0)
1724                         continue;
1725                 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1726                         raidz_checksum_error(zio, rc, orig[c]);
1727                         rc->rc_error = SET_ERROR(ECKSUM);
1728                         ret++;
1729                 }
1730                 zio_buf_free(orig[c], rc->rc_size);
1731         }
1732 
1733         return (ret);
1734 }
1735 
1736 /*
1737  * Keep statistics on all the ways that we used parity to correct data.
1738  */
1739 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1740 
1741 static int
1742 vdev_raidz_worst_error(raidz_map_t *rm)
1743 {
1744         int error = 0;
1745 
1746         for (int c = 0; c < rm->rm_cols; c++)
1747                 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1748 
1749         return (error);
1750 }
1751 
1752 /*
1753  * Iterate over all combinations of bad data and attempt a reconstruction.
1754  * Note that the algorithm below is non-optimal because it doesn't take into
1755  * account how reconstruction is actually performed. For example, with
1756  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1757  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1758  * cases we'd only use parity information in column 0.
1759  */
1760 static int
1761 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1762 {
1763         raidz_map_t *rm = zio->io_vsd;
1764         raidz_col_t *rc;
1765         void *orig[VDEV_RAIDZ_MAXPARITY];
1766         int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1767         int *tgts = &tstore[1];
1768         int current, next, i, c, n;
1769         int code, ret = 0;
1770 
1771         ASSERT(total_errors < rm->rm_firstdatacol);
1772 
1773         /*
1774          * This simplifies one edge condition.
1775          */
1776         tgts[-1] = -1;
1777 
1778         for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1779                 /*
1780                  * Initialize the targets array by finding the first n columns
1781                  * that contain no error.
1782                  *
1783                  * If there were no data errors, we need to ensure that we're
1784                  * always explicitly attempting to reconstruct at least one
1785                  * data column. To do this, we simply push the highest target
1786                  * up into the data columns.
1787                  */
1788                 for (c = 0, i = 0; i < n; i++) {
1789                         if (i == n - 1 && data_errors == 0 &&
1790                             c < rm->rm_firstdatacol) {
1791                                 c = rm->rm_firstdatacol;
1792                         }
1793 
1794                         while (rm->rm_col[c].rc_error != 0) {
1795                                 c++;
1796                                 ASSERT3S(c, <, rm->rm_cols);
1797                         }
1798 
1799                         tgts[i] = c++;
1800                 }
1801 
1802                 /*
1803                  * Setting tgts[n] simplifies the other edge condition.
1804                  */
1805                 tgts[n] = rm->rm_cols;
1806 
1807                 /*
1808                  * These buffers were allocated in previous iterations.
1809                  */
1810                 for (i = 0; i < n - 1; i++) {
1811                         ASSERT(orig[i] != NULL);
1812                 }
1813 
1814                 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1815 
1816                 current = 0;
1817                 next = tgts[current];
1818 
1819                 while (current != n) {
1820                         tgts[current] = next;
1821                         current = 0;
1822 
1823                         /*
1824                          * Save off the original data that we're going to
1825                          * attempt to reconstruct.
1826                          */
1827                         for (i = 0; i < n; i++) {
1828                                 ASSERT(orig[i] != NULL);
1829                                 c = tgts[i];
1830                                 ASSERT3S(c, >=, 0);
1831                                 ASSERT3S(c, <, rm->rm_cols);
1832                                 rc = &rm->rm_col[c];
1833                                 bcopy(rc->rc_data, orig[i], rc->rc_size);
1834                         }
1835 
1836                         /*
1837                          * Attempt a reconstruction and exit the outer loop on
1838                          * success.
1839                          */
1840                         code = vdev_raidz_reconstruct(rm, tgts, n);
1841                         if (raidz_checksum_verify(zio) == 0) {
1842                                 atomic_inc_64(&raidz_corrected[code]);
1843 
1844                                 for (i = 0; i < n; i++) {
1845                                         c = tgts[i];
1846                                         rc = &rm->rm_col[c];
1847                                         ASSERT(rc->rc_error == 0);
1848                                         if (rc->rc_tried)
1849                                                 raidz_checksum_error(zio, rc,
1850                                                     orig[i]);
1851                                         rc->rc_error = SET_ERROR(ECKSUM);
1852                                 }
1853 
1854                                 ret = code;
1855                                 goto done;
1856                         }
1857 
1858                         /*
1859                          * Restore the original data.
1860                          */
1861                         for (i = 0; i < n; i++) {
1862                                 c = tgts[i];
1863                                 rc = &rm->rm_col[c];
1864                                 bcopy(orig[i], rc->rc_data, rc->rc_size);
1865                         }
1866 
1867                         do {
1868                                 /*
1869                                  * Find the next valid column after the current
1870                                  * position..
1871                                  */
1872                                 for (next = tgts[current] + 1;
1873                                     next < rm->rm_cols &&
1874                                     rm->rm_col[next].rc_error != 0; next++)
1875                                         continue;
1876 
1877                                 ASSERT(next <= tgts[current + 1]);
1878 
1879                                 /*
1880                                  * If that spot is available, we're done here.
1881                                  */
1882                                 if (next != tgts[current + 1])
1883                                         break;
1884 
1885                                 /*
1886                                  * Otherwise, find the next valid column after
1887                                  * the previous position.
1888                                  */
1889                                 for (c = tgts[current - 1] + 1;
1890                                     rm->rm_col[c].rc_error != 0; c++)
1891                                         continue;
1892 
1893                                 tgts[current] = c;
1894                                 current++;
1895 
1896                         } while (current != n);
1897                 }
1898         }
1899         n--;
1900 done:
1901         for (i = 0; i < n; i++) {
1902                 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1903         }
1904 
1905         return (ret);
1906 }
1907 
1908 /*
1909  * Complete an IO operation on a RAIDZ VDev
1910  *
1911  * Outline:
1912  * - For write operations:
1913  *   1. Check for errors on the child IOs.
1914  *   2. Return, setting an error code if too few child VDevs were written
1915  *      to reconstruct the data later.  Note that partial writes are
1916  *      considered successful if they can be reconstructed at all.
1917  * - For read operations:
1918  *   1. Check for errors on the child IOs.
1919  *   2. If data errors occurred:
1920  *      a. Try to reassemble the data from the parity available.
1921  *      b. If we haven't yet read the parity drives, read them now.
1922  *      c. If all parity drives have been read but the data still doesn't
1923  *         reassemble with a correct checksum, then try combinatorial
1924  *         reconstruction.
1925  *      d. If that doesn't work, return an error.
1926  *   3. If there were unexpected errors or this is a resilver operation,
1927  *      rewrite the vdevs that had errors.
1928  */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/* Tally the outcome of each column's child I/O. */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * Phase 1.
	 *
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * Phase 2.
	 *
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		/* Re-enter the pipeline and issue reads for every untried column. */
		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * Phase 3.
	 *
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
2192 
2193 static void
2194 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2195 {
2196         if (faulted > vd->vdev_nparity)
2197                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2198                     VDEV_AUX_NO_REPLICAS);
2199         else if (degraded + faulted != 0)
2200                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2201         else
2202                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2203 }
2204 
/* Operations vector registering RAID-Z with the generic vdev layer. */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,		/* open */
	vdev_raidz_close,		/* close */
	vdev_raidz_asize,		/* psize-to-asize conversion */
	vdev_raidz_io_start,		/* I/O start */
	vdev_raidz_io_done,		/* I/O completion */
	vdev_raidz_state_change,	/* child state change */
	/*
	 * NOTE(review): the two unused callback slots below -- presumably
	 * hold/rele; confirm slot names against vdev_ops_t in vdev_impl.h.
	 */
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};