1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/spa.h>
  29 #include <sys/vdev_impl.h>
  30 #include <sys/zio.h>
  31 #include <sys/zio_checksum.h>
  32 #include <sys/fs/zfs.h>
  33 #include <sys/fm/fs/zfs.h>
  34 
  35 /*
  36  * Virtual device vector for RAID-Z.
  37  *
  38  * This vdev supports single, double, and triple parity. For single parity,
  39  * we use a simple XOR of all the data columns. For double or triple parity,
  40  * we use a special case of Reed-Solomon coding. This extends the
  41  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  42  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  43  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  44  * former is also based. The latter is designed to provide higher performance
  45  * for writes.
  46  *
  47  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  48  * amended six years later identifying a critical flaw that invalidates its
  49  * claims. Nevertheless, the technique can be adapted to work for up to
  50  * triple parity. For additional parity, the amendment "Note: Correction to
  51  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  52  * is viable, but the additional complexity means that write performance will
  53  * suffer.
  54  *
  55  * All of the methods above operate on a Galois field, defined over the
  56  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
  57  * can be expressed with a single byte. Briefly, the operations on the
  58  * field are defined as follows:
  59  *
  60  *   o addition (+) is represented by a bitwise XOR
  61  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  62  *   o multiplication of A by 2 is defined by the following bitwise expression:
  63  *      (A * 2)_7 = A_6
  64  *      (A * 2)_6 = A_5
  65  *      (A * 2)_5 = A_4
  66  *      (A * 2)_4 = A_3 + A_7
  67  *      (A * 2)_3 = A_2 + A_7
  68  *      (A * 2)_2 = A_1 + A_7
  69  *      (A * 2)_1 = A_0
  70  *      (A * 2)_0 = A_7
  71  *
  72  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  73  * As an aside, this multiplication is derived from the error correcting
  74  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  75  *
  76  * Observe that any number in the field (except for 0) can be expressed as a
  77  * power of 2 -- a generator for the field. We store a table of the powers of
  78  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  79  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  80  * than field addition). The inverse of a field element A (A^-1) is therefore
  81  * A ^ (255 - 1) = A^254.
  82  *
  83  * The up-to-three parity columns, P, Q, R over several data columns,
  84  * D_0, ... D_n-1, can be expressed by field operations:
  85  *
  86  *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
  87  *      Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  88  *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  89  *      R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
  90  *        = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  91  *
  92  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
  93  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
  94  * independent coefficients. (There are no additional coefficients that have
  95  * this property which is why the uncorrected Plank method breaks down.)
  96  *
  97  * See the reconstruction code below for how P, Q and R can used individually
  98  * or in concert to recover missing data columns.
  99  */
 100 
 101 typedef struct raidz_col {
 102         uint64_t rc_devidx;             /* child device index for I/O */
 103         uint64_t rc_offset;             /* device offset */
 104         uint64_t rc_size;               /* I/O size */
 105         void *rc_data;                  /* I/O data */
 106         void *rc_gdata;                 /* used to store the "good" version */
 107         int rc_error;                   /* I/O error for this device */
 108         uint8_t rc_tried;               /* Did we attempt this I/O column? */
 109         uint8_t rc_skipped;             /* Did we skip this I/O column? */
 110 } raidz_col_t;
 111 
 112 typedef struct raidz_map {
 113         uint64_t rm_cols;               /* Regular column count */
 114         uint64_t rm_scols;              /* Count including skipped columns */
 115         uint64_t rm_bigcols;            /* Number of oversized columns */
 116         uint64_t rm_asize;              /* Actual total I/O size */
 117         uint64_t rm_missingdata;        /* Count of missing data devices */
 118         uint64_t rm_missingparity;      /* Count of missing parity devices */
 119         uint64_t rm_firstdatacol;       /* First data column/parity count */
 120         uint64_t rm_nskip;              /* Skipped sectors for padding */
 121         uint64_t rm_skipstart;  /* Column index of padding start */
 122         void *rm_datacopy;              /* rm_asize-buffer of copied data */
 123         uintptr_t rm_reports;           /* # of referencing checksum reports */
 124         uint8_t rm_freed;               /* map no longer has referencing ZIO */
 125         uint8_t rm_ecksuminjected;      /* checksum error was injected */
 126         raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
 127 } raidz_map_t;
 128 
 129 #define VDEV_RAIDZ_P            0
 130 #define VDEV_RAIDZ_Q            1
 131 #define VDEV_RAIDZ_R            2
 132 
 133 #define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 134 #define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 135 
 136 /*
 137  * We provide a mechanism to perform the field multiplication operation on a
 138  * 64-bit value all at once rather than a byte at a time. This works by
 139  * creating a mask from the top bit in each byte and using that to
 140  * conditionally apply the XOR of 0x1d.
 141  */
 142 #define VDEV_RAIDZ_64MUL_2(x, mask) \
 143 { \
 144         (mask) = (x) & 0x8080808080808080ULL; \
 145         (mask) = ((mask) << 1) - ((mask) >> 7); \
 146         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 147             ((mask) & 0x1d1d1d1d1d1d1d1d); \
 148 }
 149 
 150 #define VDEV_RAIDZ_64MUL_4(x, mask) \
 151 { \
 152         VDEV_RAIDZ_64MUL_2((x), mask); \
 153         VDEV_RAIDZ_64MUL_2((x), mask); \
 154 }
 155 
 156 /*
 157  * Force reconstruction to use the general purpose method.
 158  */
 159 int vdev_raidz_default_to_general;
 160 
 161 /*
 162  * These two tables represent powers and logs of 2 in the Galois field defined
 163  * above. These values were computed by repeatedly multiplying by 2 as above.
 164  */
 165 static const uint8_t vdev_raidz_pow2[256] = {
 166         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
 167         0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
 168         0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
 169         0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
 170         0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
 171         0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
 172         0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
 173         0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
 174         0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
 175         0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
 176         0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
 177         0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
 178         0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
 179         0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
 180         0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
 181         0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
 182         0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
 183         0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
 184         0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
 185         0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
 186         0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
 187         0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
 188         0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
 189         0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
 190         0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
 191         0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
 192         0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
 193         0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
 194         0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
 195         0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
 196         0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
 197         0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
 198 };
 199 static const uint8_t vdev_raidz_log2[256] = {
 200         0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
 201         0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
 202         0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
 203         0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
 204         0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
 205         0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
 206         0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
 207         0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
 208         0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
 209         0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
 210         0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
 211         0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
 212         0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
 213         0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
 214         0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
 215         0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
 216         0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
 217         0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
 218         0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
 219         0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
 220         0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
 221         0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
 222         0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
 223         0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
 224         0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
 225         0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
 226         0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
 227         0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
 228         0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
 229         0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
 230         0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
 231         0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
 232 };
 233 
 234 static void vdev_raidz_generate_parity(raidz_map_t *rm);
 235 
 236 /*
 237  * Multiply a given number by 2 raised to the given power.
 238  */
 239 static uint8_t
 240 vdev_raidz_exp2(uint_t a, int exp)
 241 {
 242         if (a == 0)
 243                 return (0);
 244 
 245         ASSERT(exp >= 0);
 246         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
 247 
 248         exp += vdev_raidz_log2[a];
 249         if (exp > 255)
 250                 exp -= 255;
 251 
 252         return (vdev_raidz_pow2[exp]);
 253 }
 254 
 255 static void
 256 vdev_raidz_map_free(raidz_map_t *rm)
 257 {
 258         int c;
 259         size_t size;
 260 
 261         for (c = 0; c < rm->rm_firstdatacol; c++) {
 262                 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
 263 
 264                 if (rm->rm_col[c].rc_gdata != NULL)
 265                         zio_buf_free(rm->rm_col[c].rc_gdata,
 266                             rm->rm_col[c].rc_size);
 267         }
 268 
 269         size = 0;
 270         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 271                 size += rm->rm_col[c].rc_size;
 272 
 273         if (rm->rm_datacopy != NULL)
 274                 zio_buf_free(rm->rm_datacopy, size);
 275 
 276         kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 277 }
 278 
 279 static void
 280 vdev_raidz_map_free_vsd(zio_t *zio)
 281 {
 282         raidz_map_t *rm = zio->io_vsd;
 283 
 284         ASSERT0(rm->rm_freed);
 285         rm->rm_freed = 1;
 286 
 287         if (rm->rm_reports == 0)
 288                 vdev_raidz_map_free(rm);
 289 }
 290 
 291 /*ARGSUSED*/
 292 static void
 293 vdev_raidz_cksum_free(void *arg, size_t ignored)
 294 {
 295         raidz_map_t *rm = arg;
 296 
 297         ASSERT3U(rm->rm_reports, >, 0);
 298 
 299         if (--rm->rm_reports == 0 && rm->rm_freed != 0)
 300                 vdev_raidz_map_free(rm);
 301 }
 302 
 303 static void
 304 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
 305 {
 306         raidz_map_t *rm = zcr->zcr_cbdata;
 307         size_t c = zcr->zcr_cbinfo;
 308         size_t x;
 309 
 310         const char *good = NULL;
 311         const char *bad = rm->rm_col[c].rc_data;
 312 
 313         if (good_data == NULL) {
 314                 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
 315                 return;
 316         }
 317 
 318         if (c < rm->rm_firstdatacol) {
 319                 /*
 320                  * The first time through, calculate the parity blocks for
 321                  * the good data (this relies on the fact that the good
 322                  * data never changes for a given logical ZIO)
 323                  */
 324                 if (rm->rm_col[0].rc_gdata == NULL) {
 325                         char *bad_parity[VDEV_RAIDZ_MAXPARITY];
 326                         char *buf;
 327 
 328                         /*
 329                          * Set up the rm_col[]s to generate the parity for
 330                          * good_data, first saving the parity bufs and
 331                          * replacing them with buffers to hold the result.
 332                          */
 333                         for (x = 0; x < rm->rm_firstdatacol; x++) {
 334                                 bad_parity[x] = rm->rm_col[x].rc_data;
 335                                 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
 336                                     zio_buf_alloc(rm->rm_col[x].rc_size);
 337                         }
 338 
 339                         /* fill in the data columns from good_data */
 340                         buf = (char *)good_data;
 341                         for (; x < rm->rm_cols; x++) {
 342                                 rm->rm_col[x].rc_data = buf;
 343                                 buf += rm->rm_col[x].rc_size;
 344                         }
 345 
 346                         /*
 347                          * Construct the parity from the good data.
 348                          */
 349                         vdev_raidz_generate_parity(rm);
 350 
 351                         /* restore everything back to its original state */
 352                         for (x = 0; x < rm->rm_firstdatacol; x++)
 353                                 rm->rm_col[x].rc_data = bad_parity[x];
 354 
 355                         buf = rm->rm_datacopy;
 356                         for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
 357                                 rm->rm_col[x].rc_data = buf;
 358                                 buf += rm->rm_col[x].rc_size;
 359                         }
 360                 }
 361 
 362                 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
 363                 good = rm->rm_col[c].rc_gdata;
 364         } else {
 365                 /* adjust good_data to point at the start of our column */
 366                 good = good_data;
 367 
 368                 for (x = rm->rm_firstdatacol; x < c; x++)
 369                         good += rm->rm_col[x].rc_size;
 370         }
 371 
 372         /* we drop the ereport if it ends up that the data was good */
 373         zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
 374 }
 375 
 376 /*
 377  * Invoked indirectly by zfs_ereport_start_checksum(), called
 378  * below when our read operation fails completely.  The main point
 379  * is to keep a copy of everything we read from disk, so that at
 380  * vdev_raidz_cksum_finish() time we can compare it with the good data.
 381  */
 382 static void
 383 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
 384 {
 385         size_t c = (size_t)(uintptr_t)arg;
 386         caddr_t buf;
 387 
 388         raidz_map_t *rm = zio->io_vsd;
 389         size_t size;
 390 
 391         /* set up the report and bump the refcount  */
 392         zcr->zcr_cbdata = rm;
 393         zcr->zcr_cbinfo = c;
 394         zcr->zcr_finish = vdev_raidz_cksum_finish;
 395         zcr->zcr_free = vdev_raidz_cksum_free;
 396 
 397         rm->rm_reports++;
 398         ASSERT3U(rm->rm_reports, >, 0);
 399 
 400         if (rm->rm_datacopy != NULL)
 401                 return;
 402 
 403         /*
 404          * It's the first time we're called for this raidz_map_t, so we need
 405          * to copy the data aside; there's no guarantee that our zio's buffer
 406          * won't be re-used for something else.
 407          *
 408          * Our parity data is already in separate buffers, so there's no need
 409          * to copy them.
 410          */
 411 
 412         size = 0;
 413         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 414                 size += rm->rm_col[c].rc_size;
 415 
 416         buf = rm->rm_datacopy = zio_buf_alloc(size);
 417 
 418         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 419                 raidz_col_t *col = &rm->rm_col[c];
 420 
 421                 bcopy(col->rc_data, buf, col->rc_size);
 422                 col->rc_data = buf;
 423 
 424                 buf += col->rc_size;
 425         }
 426         ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
 427 }
 428 
 429 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 430         vdev_raidz_map_free_vsd,
 431         vdev_raidz_cksum_report
 432 };
 433 
 434 static raidz_map_t *
 435 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
 436     uint64_t nparity)
 437 {
 438         raidz_map_t *rm;
 439         uint64_t b = zio->io_offset >> unit_shift;
 440         uint64_t s = zio->io_size >> unit_shift;
 441         uint64_t f = b % dcols;
 442         uint64_t o = (b / dcols) << unit_shift;
 443         uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 444 
 445         q = s / (dcols - nparity);
 446         r = s - q * (dcols - nparity);
 447         bc = (r == 0 ? 0 : r + nparity);
 448         tot = s + nparity * (q + (r == 0 ? 0 : 1));
 449 
 450         if (q == 0) {
 451                 acols = bc;
 452                 scols = MIN(dcols, roundup(bc, nparity + 1));
 453         } else {
 454                 acols = dcols;
 455                 scols = dcols;
 456         }
 457 
 458         ASSERT3U(acols, <=, scols);
 459 
 460         rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
 461 
 462         rm->rm_cols = acols;
 463         rm->rm_scols = scols;
 464         rm->rm_bigcols = bc;
 465         rm->rm_skipstart = bc;
 466         rm->rm_missingdata = 0;
 467         rm->rm_missingparity = 0;
 468         rm->rm_firstdatacol = nparity;
 469         rm->rm_datacopy = NULL;
 470         rm->rm_reports = 0;
 471         rm->rm_freed = 0;
 472         rm->rm_ecksuminjected = 0;
 473 
 474         asize = 0;
 475 
 476         for (c = 0; c < scols; c++) {
 477                 col = f + c;
 478                 coff = o;
 479                 if (col >= dcols) {
 480                         col -= dcols;
 481                         coff += 1ULL << unit_shift;
 482                 }
 483                 rm->rm_col[c].rc_devidx = col;
 484                 rm->rm_col[c].rc_offset = coff;
 485                 rm->rm_col[c].rc_data = NULL;
 486                 rm->rm_col[c].rc_gdata = NULL;
 487                 rm->rm_col[c].rc_error = 0;
 488                 rm->rm_col[c].rc_tried = 0;
 489                 rm->rm_col[c].rc_skipped = 0;
 490 
 491                 if (c >= acols)
 492                         rm->rm_col[c].rc_size = 0;
 493                 else if (c < bc)
 494                         rm->rm_col[c].rc_size = (q + 1) << unit_shift;
 495                 else
 496                         rm->rm_col[c].rc_size = q << unit_shift;
 497 
 498                 asize += rm->rm_col[c].rc_size;
 499         }
 500 
 501         ASSERT3U(asize, ==, tot << unit_shift);
 502         rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
 503         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 504         ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
 505         ASSERT3U(rm->rm_nskip, <=, nparity);
 506 
 507         for (c = 0; c < rm->rm_firstdatacol; c++)
 508                 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
 509 
 510         rm->rm_col[c].rc_data = zio->io_data;
 511 
 512         for (c = c + 1; c < acols; c++)
 513                 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
 514                     rm->rm_col[c - 1].rc_size;
 515 
 516         /*
 517          * If all data stored spans all columns, there's a danger that parity
 518          * will always be on the same device and, since parity isn't read
 519          * during normal operation, that that device's I/O bandwidth won't be
 520          * used effectively. We therefore switch the parity every 1MB.
 521          *
 522          * ... at least that was, ostensibly, the theory. As a practical
 523          * matter unless we juggle the parity between all devices evenly, we
 524          * won't see any benefit. Further, occasional writes that aren't a
 525          * multiple of the LCM of the number of children and the minimum
 526          * stripe width are sufficient to avoid pessimal behavior.
 527          * Unfortunately, this decision created an implicit on-disk format
 528          * requirement that we need to support for all eternity, but only
 529          * for single-parity RAID-Z.
 530          *
 531          * If we intend to skip a sector in the zeroth column for padding
 532          * we must make sure to note this swap. We will never intend to
 533          * skip the first column since at least one data and one parity
 534          * column must appear in each row.
 535          */
 536         ASSERT(rm->rm_cols >= 2);
 537         ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
 538 
 539         if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
 540                 devidx = rm->rm_col[0].rc_devidx;
 541                 o = rm->rm_col[0].rc_offset;
 542                 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
 543                 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
 544                 rm->rm_col[1].rc_devidx = devidx;
 545                 rm->rm_col[1].rc_offset = o;
 546 
 547                 if (rm->rm_skipstart == 0)
 548                         rm->rm_skipstart = 1;
 549         }
 550 
 551         zio->io_vsd = rm;
 552         zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 553         return (rm);
 554 }
 555 
 556 static void
 557 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 558 {
 559         uint64_t *p, *src, pcount, ccount, i;
 560         int c;
 561 
 562         pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 563 
 564         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 565                 src = rm->rm_col[c].rc_data;
 566                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 567                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 568 
 569                 if (c == rm->rm_firstdatacol) {
 570                         ASSERT(ccount == pcount);
 571                         for (i = 0; i < ccount; i++, src++, p++) {
 572                                 *p = *src;
 573                         }
 574                 } else {
 575                         ASSERT(ccount <= pcount);
 576                         for (i = 0; i < ccount; i++, src++, p++) {
 577                                 *p ^= *src;
 578                         }
 579                 }
 580         }
 581 }
 582 
 583 static void
 584 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 585 {
 586         uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
 587         int c;
 588 
 589         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 590         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 591             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 592 
 593         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 594                 src = rm->rm_col[c].rc_data;
 595                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 596                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 597 
 598                 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 599 
 600                 if (c == rm->rm_firstdatacol) {
 601                         ASSERT(ccnt == pcnt || ccnt == 0);
 602                         for (i = 0; i < ccnt; i++, src++, p++, q++) {
 603                                 *p = *src;
 604                                 *q = *src;
 605                         }
 606                         for (; i < pcnt; i++, src++, p++, q++) {
 607                                 *p = 0;
 608                                 *q = 0;
 609                         }
 610                 } else {
 611                         ASSERT(ccnt <= pcnt);
 612 
 613                         /*
 614                          * Apply the algorithm described above by multiplying
 615                          * the previous result and adding in the new value.
 616                          */
 617                         for (i = 0; i < ccnt; i++, src++, p++, q++) {
 618                                 *p ^= *src;
 619 
 620                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 621                                 *q ^= *src;
 622                         }
 623 
 624                         /*
 625                          * Treat short columns as though they are full of 0s.
 626                          * Note that there's therefore nothing needed for P.
 627                          */
 628                         for (; i < pcnt; i++, q++) {
 629                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 630                         }
 631                 }
 632         }
 633 }
 634 
 635 static void
 636 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
 637 {
 638         uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
 639         int c;
 640 
 641         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 642         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 643             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 644         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 645             rm->rm_col[VDEV_RAIDZ_R].rc_size);
 646 
 647         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 648                 src = rm->rm_col[c].rc_data;
 649                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 650                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 651                 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
 652 
 653                 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 654 
 655                 if (c == rm->rm_firstdatacol) {
 656                         ASSERT(ccnt == pcnt || ccnt == 0);
 657                         for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
 658                                 *p = *src;
 659                                 *q = *src;
 660                                 *r = *src;
 661                         }
 662                         for (; i < pcnt; i++, src++, p++, q++, r++) {
 663                                 *p = 0;
 664                                 *q = 0;
 665                                 *r = 0;
 666                         }
 667                 } else {
 668                         ASSERT(ccnt <= pcnt);
 669 
 670                         /*
 671                          * Apply the algorithm described above by multiplying
 672                          * the previous result and adding in the new value.
 673                          */
 674                         for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
 675                                 *p ^= *src;
 676 
 677                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 678                                 *q ^= *src;
 679 
 680                                 VDEV_RAIDZ_64MUL_4(*r, mask);
 681                                 *r ^= *src;
 682                         }
 683 
 684                         /*
 685                          * Treat short columns as though they are full of 0s.
 686                          * Note that there's therefore nothing needed for P.
 687                          */
 688                         for (; i < pcnt; i++, q++, r++) {
 689                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 690                                 VDEV_RAIDZ_64MUL_4(*r, mask);
 691                         }
 692                 }
 693         }
 694 }
 695 
 696 /*
 697  * Generate RAID parity in the first virtual columns according to the number of
 698  * parity columns available.
 699  */
 700 static void
 701 vdev_raidz_generate_parity(raidz_map_t *rm)
 702 {
 703         switch (rm->rm_firstdatacol) {
 704         case 1:
 705                 vdev_raidz_generate_parity_p(rm);
 706                 break;
 707         case 2:
 708                 vdev_raidz_generate_parity_pq(rm);
 709                 break;
 710         case 3:
 711                 vdev_raidz_generate_parity_pqr(rm);
 712                 break;
 713         default:
 714                 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 715         }
 716 }
 717 
 718 static int
 719 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 720 {
 721         uint64_t *dst, *src, xcount, ccount, count, i;
 722         int x = tgts[0];
 723         int c;
 724 
 725         ASSERT(ntgts == 1);
 726         ASSERT(x >= rm->rm_firstdatacol);
 727         ASSERT(x < rm->rm_cols);
 728 
 729         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
 730         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
 731         ASSERT(xcount > 0);
 732 
 733         src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 734         dst = rm->rm_col[x].rc_data;
 735         for (i = 0; i < xcount; i++, dst++, src++) {
 736                 *dst = *src;
 737         }
 738 
 739         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 740                 src = rm->rm_col[c].rc_data;
 741                 dst = rm->rm_col[x].rc_data;
 742 
 743                 if (c == x)
 744                         continue;
 745 
 746                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 747                 count = MIN(ccount, xcount);
 748 
 749                 for (i = 0; i < count; i++, dst++, src++) {
 750                         *dst ^= *src;
 751                 }
 752         }
 753 
 754         return (1 << VDEV_RAIDZ_P);
 755 }
 756 
 757 static int
 758 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
 759 {
 760         uint64_t *dst, *src, xcount, ccount, count, mask, i;
 761         uint8_t *b;
 762         int x = tgts[0];
 763         int c, j, exp;
 764 
 765         ASSERT(ntgts == 1);
 766 
 767         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
 768         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
 769 
 770         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 771                 src = rm->rm_col[c].rc_data;
 772                 dst = rm->rm_col[x].rc_data;
 773 
 774                 if (c == x)
 775                         ccount = 0;
 776                 else
 777                         ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 778 
 779                 count = MIN(ccount, xcount);
 780 
 781                 if (c == rm->rm_firstdatacol) {
 782                         for (i = 0; i < count; i++, dst++, src++) {
 783                                 *dst = *src;
 784                         }
 785                         for (; i < xcount; i++, dst++) {
 786                                 *dst = 0;
 787                         }
 788 
 789                 } else {
 790                         for (i = 0; i < count; i++, dst++, src++) {
 791                                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 792                                 *dst ^= *src;
 793                         }
 794 
 795                         for (; i < xcount; i++, dst++) {
 796                                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 797                         }
 798                 }
 799         }
 800 
 801         src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 802         dst = rm->rm_col[x].rc_data;
 803         exp = 255 - (rm->rm_cols - 1 - x);
 804 
 805         for (i = 0; i < xcount; i++, dst++, src++) {
 806                 *dst ^= *src;
 807                 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
 808                         *b = vdev_raidz_exp2(*b, exp);
 809                 }
 810         }
 811 
 812         return (1 << VDEV_RAIDZ_Q);
 813 }
 814 
 815 static int
 816 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
 817 {
 818         uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
 819         void *pdata, *qdata;
 820         uint64_t xsize, ysize, i;
 821         int x = tgts[0];
 822         int y = tgts[1];
 823 
 824         ASSERT(ntgts == 2);
 825         ASSERT(x < y);
 826         ASSERT(x >= rm->rm_firstdatacol);
 827         ASSERT(y < rm->rm_cols);
 828 
 829         ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
 830 
 831         /*
 832          * Move the parity data aside -- we're going to compute parity as
 833          * though columns x and y were full of zeros -- Pxy and Qxy. We want to
 834          * reuse the parity generation mechanism without trashing the actual
 835          * parity so we make those columns appear to be full of zeros by
 836          * setting their lengths to zero.
 837          */
 838         pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 839         qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 840         xsize = rm->rm_col[x].rc_size;
 841         ysize = rm->rm_col[y].rc_size;
 842 
 843         rm->rm_col[VDEV_RAIDZ_P].rc_data =
 844             zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
 845         rm->rm_col[VDEV_RAIDZ_Q].rc_data =
 846             zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 847         rm->rm_col[x].rc_size = 0;
 848         rm->rm_col[y].rc_size = 0;
 849 
 850         vdev_raidz_generate_parity_pq(rm);
 851 
 852         rm->rm_col[x].rc_size = xsize;
 853         rm->rm_col[y].rc_size = ysize;
 854 
 855         p = pdata;
 856         q = qdata;
 857         pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 858         qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 859         xd = rm->rm_col[x].rc_data;
 860         yd = rm->rm_col[y].rc_data;
 861 
 862         /*
 863          * We now have:
 864          *      Pxy = P + D_x + D_y
 865          *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
 866          *
 867          * We can then solve for D_x:
 868          *      D_x = A * (P + Pxy) + B * (Q + Qxy)
 869          * where
 870          *      A = 2^(x - y) * (2^(x - y) + 1)^-1
 871          *      B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
 872          *
 873          * With D_x in hand, we can easily solve for D_y:
 874          *      D_y = P + Pxy + D_x
 875          */
 876 
 877         a = vdev_raidz_pow2[255 + x - y];
 878         b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
 879         tmp = 255 - vdev_raidz_log2[a ^ 1];
 880 
 881         aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
 882         bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
 883 
 884         for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
 885                 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
 886                     vdev_raidz_exp2(*q ^ *qxy, bexp);
 887 
 888                 if (i < ysize)
 889                         *yd = *p ^ *pxy ^ *xd;
 890         }
 891 
 892         zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
 893             rm->rm_col[VDEV_RAIDZ_P].rc_size);
 894         zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
 895             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 896 
 897         /*
 898          * Restore the saved parity data.
 899          */
 900         rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
 901         rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
 902 
 903         return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
 904 }
 905 
 906 /* BEGIN CSTYLED */
 907 /*
 908  * In the general case of reconstruction, we must solve the system of linear
 909  * equations defined by the coeffecients used to generate parity as well as
 910  * the contents of the data and parity disks. This can be expressed with
 911  * vectors for the original data (D) and the actual data (d) and parity (p)
 912  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 913  *
 914  *            __   __                     __     __
 915  *            |     |         __     __   |  p_0  |
 916  *            |  V  |         |  D_0  |   | p_m-1 |
 917  *            |     |    x    |   :   | = |  d_0  |
 918  *            |  I  |         | D_n-1 |   |   :   |
 919  *            |     |         ~~     ~~   | d_n-1 |
 920  *            ~~   ~~                     ~~     ~~
 921  *
 922  * I is simply a square identity matrix of size n, and V is a vandermonde
 923  * matrix defined by the coeffecients we chose for the various parity columns
 924  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 925  * computation as well as linear separability.
 926  *
 927  *      __               __               __     __
 928  *      |   1   ..  1 1 1 |               |  p_0  |
 929  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 930  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 931  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 932  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 933  *      |   :       : : : |   |   :   |   |  d_2  |
 934  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 935  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 936  *      |   0   ..  0 0 1 |               | d_n-1 |
 937  *      ~~               ~~               ~~     ~~
 938  *
 939  * Note that I, V, d, and p are known. To compute D, we must invert the
 940  * matrix and use the known data and parity values to reconstruct the unknown
 941  * data values. We begin by removing the rows in V|I and d|p that correspond
 942  * to failed or missing columns; we then make V|I square (n x n) and d|p
 943  * sized n by removing rows corresponding to unused parity from the bottom up
 944  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 945  * using Gauss-Jordan elimination. In the example below we use m=3 parity
 946  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 947  *           __                               __
 948  *           |  1   1   1   1   1   1   1   1  |
 949  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 950  *           |  19 205 116  29  64  16  4   1  |      / /
 951  *           |  1   0   0   0   0   0   0   0  |     / /
 952  *           |  0   1   0   0   0   0   0   0  | <--' /
 953  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 954  *           |  0   0   0   1   0   0   0   0  |
 955  *           |  0   0   0   0   1   0   0   0  |
 956  *           |  0   0   0   0   0   1   0   0  |
 957  *           |  0   0   0   0   0   0   1   0  |
 958  *           |  0   0   0   0   0   0   0   1  |
 959  *           ~~                               ~~
 960  *           __                               __
 961  *           |  1   1   1   1   1   1   1   1  |
 962  *           | 128  64  32  16  8   4   2   1  |
 963  *           |  19 205 116  29  64  16  4   1  |
 964  *           |  1   0   0   0   0   0   0   0  |
 965  *           |  0   1   0   0   0   0   0   0  |
 966  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
 967  *           |  0   0   0   1   0   0   0   0  |
 968  *           |  0   0   0   0   1   0   0   0  |
 969  *           |  0   0   0   0   0   1   0   0  |
 970  *           |  0   0   0   0   0   0   1   0  |
 971  *           |  0   0   0   0   0   0   0   1  |
 972  *           ~~                               ~~
 973  *
 974  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 975  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 976  * matrix is not singular.
 977  * __                                                                 __
 978  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 979  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 980  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 981  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 982  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 983  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 984  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 985  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 986  * ~~                                                                 ~~
 987  * __                                                                 __
 988  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 989  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 990  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 991  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 992  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 993  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 994  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 995  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 996  * ~~                                                                 ~~
 997  * __                                                                 __
 998  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 999  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1000  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1001  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1002  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1003  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1004  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1005  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1006  * ~~                                                                 ~~
1007  * __                                                                 __
1008  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1009  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1010  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1011  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1012  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1013  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1014  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1015  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1016  * ~~                                                                 ~~
1017  * __                                                                 __
1018  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1019  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1020  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1021  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1022  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1023  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1024  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1025  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1026  * ~~                                                                 ~~
1027  * __                                                                 __
1028  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1029  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1030  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1031  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1032  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1033  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1034  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1035  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1036  * ~~                                                                 ~~
1037  *                   __                               __
1038  *                   |  0   0   1   0   0   0   0   0  |
1039  *                   | 167 100  5   41 159 169 217 208 |
1040  *                   | 166 100  4   40 158 168 216 209 |
1041  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1042  *                   |  0   0   0   0   1   0   0   0  |
1043  *                   |  0   0   0   0   0   1   0   0  |
1044  *                   |  0   0   0   0   0   0   1   0  |
1045  *                   |  0   0   0   0   0   0   0   1  |
1046  *                   ~~                               ~~
1047  *
1048  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1049  * of the missing data.
1050  *
1051  * As is apparent from the example above, the only non-trivial rows in the
1052  * inverse matrix correspond to the data disks that we're trying to
1053  * reconstruct. Indeed, those are the only rows we need as the others would
1054  * only be useful for reconstructing data known or assumed to be valid. For
1055  * that reason, we only build the coefficients in the rows that correspond to
1056  * targeted columns.
1057  */
1058 /* END CSTYLED */
1059 
1060 static void
1061 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1062     uint8_t **rows)
1063 {
1064         int i, j;
1065         int pow;
1066 
1067         ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1068 
1069         /*
1070          * Fill in the missing rows of interest.
1071          */
1072         for (i = 0; i < nmap; i++) {
1073                 ASSERT3S(0, <=, map[i]);
1074                 ASSERT3S(map[i], <=, 2);
1075 
1076                 pow = map[i] * n;
1077                 if (pow > 255)
1078                         pow -= 255;
1079                 ASSERT(pow <= 255);
1080 
1081                 for (j = 0; j < n; j++) {
1082                         pow -= map[i];
1083                         if (pow < 0)
1084                                 pow += 255;
1085                         rows[i][j] = vdev_raidz_pow2[pow];
1086                 }
1087         }
1088 }
1089 
1090 static void
1091 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1092     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1093 {
1094         int i, j, ii, jj;
1095         uint8_t log;
1096 
1097         /*
1098          * Assert that the first nmissing entries from the array of used
1099          * columns correspond to parity columns and that subsequent entries
1100          * correspond to data columns.
1101          */
1102         for (i = 0; i < nmissing; i++) {
1103                 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1104         }
1105         for (; i < n; i++) {
1106                 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1107         }
1108 
1109         /*
1110          * First initialize the storage where we'll compute the inverse rows.
1111          */
1112         for (i = 0; i < nmissing; i++) {
1113                 for (j = 0; j < n; j++) {
1114                         invrows[i][j] = (i == j) ? 1 : 0;
1115                 }
1116         }
1117 
1118         /*
1119          * Subtract all trivial rows from the rows of consequence.
1120          */
1121         for (i = 0; i < nmissing; i++) {
1122                 for (j = nmissing; j < n; j++) {
1123                         ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1124                         jj = used[j] - rm->rm_firstdatacol;
1125                         ASSERT3S(jj, <, n);
1126                         invrows[i][j] = rows[i][jj];
1127                         rows[i][jj] = 0;
1128                 }
1129         }
1130 
1131         /*
1132          * For each of the rows of interest, we must normalize it and subtract
1133          * a multiple of it from the other rows.
1134          */
1135         for (i = 0; i < nmissing; i++) {
1136                 for (j = 0; j < missing[i]; j++) {
1137                         ASSERT0(rows[i][j]);
1138                 }
1139                 ASSERT3U(rows[i][missing[i]], !=, 0);
1140 
1141                 /*
1142                  * Compute the inverse of the first element and multiply each
1143                  * element in the row by that value.
1144                  */
1145                 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1146 
1147                 for (j = 0; j < n; j++) {
1148                         rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1149                         invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1150                 }
1151 
1152                 for (ii = 0; ii < nmissing; ii++) {
1153                         if (i == ii)
1154                                 continue;
1155 
1156                         ASSERT3U(rows[ii][missing[i]], !=, 0);
1157 
1158                         log = vdev_raidz_log2[rows[ii][missing[i]]];
1159 
1160                         for (j = 0; j < n; j++) {
1161                                 rows[ii][j] ^=
1162                                     vdev_raidz_exp2(rows[i][j], log);
1163                                 invrows[ii][j] ^=
1164                                     vdev_raidz_exp2(invrows[i][j], log);
1165                         }
1166                 }
1167         }
1168 
1169         /*
1170          * Verify that the data that is left in the rows are properly part of
1171          * an identity matrix.
1172          */
1173         for (i = 0; i < nmissing; i++) {
1174                 for (j = 0; j < n; j++) {
1175                         if (j == missing[i]) {
1176                                 ASSERT3U(rows[i][j], ==, 1);
1177                         } else {
1178                                 ASSERT0(rows[i][j]);
1179                         }
1180                 }
1181         }
1182 }
1183 
1184 static void
1185 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1186     int *missing, uint8_t **invrows, const uint8_t *used)
1187 {
1188         int i, j, x, cc, c;
1189         uint8_t *src;
1190         uint64_t ccount;
1191         uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1192         uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1193         uint8_t log = 0;
1194         uint8_t val;
1195         int ll;
1196         uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1197         uint8_t *p, *pp;
1198         size_t psize;
1199 
1200         psize = sizeof (invlog[0][0]) * n * nmissing;
1201         p = kmem_alloc(psize, KM_SLEEP);
1202 
1203         for (pp = p, i = 0; i < nmissing; i++) {
1204                 invlog[i] = pp;
1205                 pp += n;
1206         }
1207 
1208         for (i = 0; i < nmissing; i++) {
1209                 for (j = 0; j < n; j++) {
1210                         ASSERT3U(invrows[i][j], !=, 0);
1211                         invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1212                 }
1213         }
1214 
1215         for (i = 0; i < n; i++) {
1216                 c = used[i];
1217                 ASSERT3U(c, <, rm->rm_cols);
1218 
1219                 src = rm->rm_col[c].rc_data;
1220                 ccount = rm->rm_col[c].rc_size;
1221                 for (j = 0; j < nmissing; j++) {
1222                         cc = missing[j] + rm->rm_firstdatacol;
1223                         ASSERT3U(cc, >=, rm->rm_firstdatacol);
1224                         ASSERT3U(cc, <, rm->rm_cols);
1225                         ASSERT3U(cc, !=, c);
1226 
1227                         dst[j] = rm->rm_col[cc].rc_data;
1228                         dcount[j] = rm->rm_col[cc].rc_size;
1229                 }
1230 
1231                 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1232 
1233                 for (x = 0; x < ccount; x++, src++) {
1234                         if (*src != 0)
1235                                 log = vdev_raidz_log2[*src];
1236 
1237                         for (cc = 0; cc < nmissing; cc++) {
1238                                 if (x >= dcount[cc])
1239                                         continue;
1240 
1241                                 if (*src == 0) {
1242                                         val = 0;
1243                                 } else {
1244                                         if ((ll = log + invlog[cc][i]) >= 255)
1245                                                 ll -= 255;
1246                                         val = vdev_raidz_pow2[ll];
1247                                 }
1248 
1249                                 if (i == 0)
1250                                         dst[cc][x] = val;
1251                                 else
1252                                         dst[cc][x] ^= val;
1253                         }
1254                 }
1255         }
1256 
1257         kmem_free(p, psize);
1258 }
1259 
1260 static int
1261 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1262 {
1263         int n, i, c, t, tt;
1264         int nmissing_rows;
1265         int missing_rows[VDEV_RAIDZ_MAXPARITY];
1266         int parity_map[VDEV_RAIDZ_MAXPARITY];
1267 
1268         uint8_t *p, *pp;
1269         size_t psize;
1270 
1271         uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1272         uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1273         uint8_t *used;
1274 
1275         int code = 0;
1276 
1277 
1278         n = rm->rm_cols - rm->rm_firstdatacol;
1279 
1280         /*
1281          * Figure out which data columns are missing.
1282          */
1283         nmissing_rows = 0;
1284         for (t = 0; t < ntgts; t++) {
1285                 if (tgts[t] >= rm->rm_firstdatacol) {
1286                         missing_rows[nmissing_rows++] =
1287                             tgts[t] - rm->rm_firstdatacol;
1288                 }
1289         }
1290 
1291         /*
1292          * Figure out which parity columns to use to help generate the missing
1293          * data columns.
1294          */
1295         for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1296                 ASSERT(tt < ntgts);
1297                 ASSERT(c < rm->rm_firstdatacol);
1298 
1299                 /*
1300                  * Skip any targeted parity columns.
1301                  */
1302                 if (c == tgts[tt]) {
1303                         tt++;
1304                         continue;
1305                 }
1306 
1307                 code |= 1 << c;
1308 
1309                 parity_map[i] = c;
1310                 i++;
1311         }
1312 
1313         ASSERT(code != 0);
1314         ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1315 
1316         psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1317             nmissing_rows * n + sizeof (used[0]) * n;
1318         p = kmem_alloc(psize, KM_SLEEP);
1319 
1320         for (pp = p, i = 0; i < nmissing_rows; i++) {
1321                 rows[i] = pp;
1322                 pp += n;
1323                 invrows[i] = pp;
1324                 pp += n;
1325         }
1326         used = pp;
1327 
1328         for (i = 0; i < nmissing_rows; i++) {
1329                 used[i] = parity_map[i];
1330         }
1331 
1332         for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1333                 if (tt < nmissing_rows &&
1334                     c == missing_rows[tt] + rm->rm_firstdatacol) {
1335                         tt++;
1336                         continue;
1337                 }
1338 
1339                 ASSERT3S(i, <, n);
1340                 used[i] = c;
1341                 i++;
1342         }
1343 
1344         /*
1345          * Initialize the interesting rows of the matrix.
1346          */
1347         vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1348 
1349         /*
1350          * Invert the matrix.
1351          */
1352         vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1353             invrows, used);
1354 
1355         /*
1356          * Reconstruct the missing data using the generated matrix.
1357          */
1358         vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1359             invrows, used);
1360 
1361         kmem_free(p, psize);
1362 
1363         return (code);
1364 }
1365 
1366 static int
1367 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1368 {
1369         int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1370         int ntgts;
1371         int i, c;
1372         int code;
1373         int nbadparity, nbaddata;
1374         int parity_valid[VDEV_RAIDZ_MAXPARITY];
1375 
1376         /*
1377          * The tgts list must already be sorted.
1378          */
1379         for (i = 1; i < nt; i++) {
1380                 ASSERT(t[i] > t[i - 1]);
1381         }
1382 
1383         nbadparity = rm->rm_firstdatacol;
1384         nbaddata = rm->rm_cols - nbadparity;
1385         ntgts = 0;
1386         for (i = 0, c = 0; c < rm->rm_cols; c++) {
1387                 if (c < rm->rm_firstdatacol)
1388                         parity_valid[c] = B_FALSE;
1389 
1390                 if (i < nt && c == t[i]) {
1391                         tgts[ntgts++] = c;
1392                         i++;
1393                 } else if (rm->rm_col[c].rc_error != 0) {
1394                         tgts[ntgts++] = c;
1395                 } else if (c >= rm->rm_firstdatacol) {
1396                         nbaddata--;
1397                 } else {
1398                         parity_valid[c] = B_TRUE;
1399                         nbadparity--;
1400                 }
1401         }
1402 
1403         ASSERT(ntgts >= nt);
1404         ASSERT(nbaddata >= 0);
1405         ASSERT(nbaddata + nbadparity == ntgts);
1406 
1407         dt = &tgts[nbadparity];
1408 
1409         /*
1410          * See if we can use any of our optimized reconstruction routines.
1411          */
1412         if (!vdev_raidz_default_to_general) {
1413                 switch (nbaddata) {
1414                 case 1:
1415                         if (parity_valid[VDEV_RAIDZ_P])
1416                                 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1417 
1418                         ASSERT(rm->rm_firstdatacol > 1);
1419 
1420                         if (parity_valid[VDEV_RAIDZ_Q])
1421                                 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1422 
1423                         ASSERT(rm->rm_firstdatacol > 2);
1424                         break;
1425 
1426                 case 2:
1427                         ASSERT(rm->rm_firstdatacol > 1);
1428 
1429                         if (parity_valid[VDEV_RAIDZ_P] &&
1430                             parity_valid[VDEV_RAIDZ_Q])
1431                                 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1432 
1433                         ASSERT(rm->rm_firstdatacol > 2);
1434 
1435                         break;
1436                 }
1437         }
1438 
1439         code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1440         ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1441         ASSERT(code > 0);
1442         return (code);
1443 }
1444 
1445 static int
1446 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1447     uint64_t *ashift)
1448 {
1449         vdev_t *cvd;
1450         uint64_t nparity = vd->vdev_nparity;
1451         int c;
1452         int lasterror = 0;
1453         int numerrors = 0;
1454 
1455         ASSERT(nparity > 0);
1456 
1457         if (nparity > VDEV_RAIDZ_MAXPARITY ||
1458             vd->vdev_children < nparity + 1) {
1459                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1460                 return (SET_ERROR(EINVAL));
1461         }
1462 
1463         vdev_open_children(vd);
1464 
1465         for (c = 0; c < vd->vdev_children; c++) {
1466                 cvd = vd->vdev_child[c];
1467 
1468                 if (cvd->vdev_open_error != 0) {
1469                         lasterror = cvd->vdev_open_error;
1470                         numerrors++;
1471                         continue;
1472                 }
1473 
1474                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1475                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1476                 *ashift = MAX(*ashift, cvd->vdev_ashift);
1477         }
1478 
1479         *asize *= vd->vdev_children;
1480         *max_asize *= vd->vdev_children;
1481 
1482         if (numerrors > nparity) {
1483                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1484                 return (lasterror);
1485         }
1486 
1487         return (0);
1488 }
1489 
1490 static void
1491 vdev_raidz_close(vdev_t *vd)
1492 {
1493         int c;
1494 
1495         for (c = 0; c < vd->vdev_children; c++)
1496                 vdev_close(vd->vdev_child[c]);
1497 }
1498 
1499 static uint64_t
1500 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1501 {
1502         uint64_t asize;
1503         uint64_t ashift = vd->vdev_top->vdev_ashift;
1504         uint64_t cols = vd->vdev_children;
1505         uint64_t nparity = vd->vdev_nparity;
1506 
1507         asize = ((psize - 1) >> ashift) + 1;
1508         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1509         asize = roundup(asize, nparity + 1) << ashift;
1510 
1511         return (asize);
1512 }
1513 
1514 static void
1515 vdev_raidz_child_done(zio_t *zio)
1516 {
1517         raidz_col_t *rc = zio->io_private;
1518 
1519         rc->rc_error = zio->io_error;
1520         rc->rc_tried = 1;
1521         rc->rc_skipped = 0;
1522 }
1523 
1524 static int
1525 vdev_raidz_io_start(zio_t *zio)
1526 {
1527         vdev_t *vd = zio->io_vd;
1528         vdev_t *tvd = vd->vdev_top;
1529         vdev_t *cvd;
1530         raidz_map_t *rm;
1531         raidz_col_t *rc;
1532         int c, i;
1533 
1534         rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1535             vd->vdev_nparity);
1536 
1537         ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1538 
1539         if (zio->io_type == ZIO_TYPE_WRITE) {
1540                 vdev_raidz_generate_parity(rm);
1541 
1542                 for (c = 0; c < rm->rm_cols; c++) {
1543                         rc = &rm->rm_col[c];
1544                         cvd = vd->vdev_child[rc->rc_devidx];
1545                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1546                             rc->rc_offset, rc->rc_data, rc->rc_size,
1547                             zio->io_type, zio->io_priority, 0,
1548                             vdev_raidz_child_done, rc));
1549                 }
1550 
1551                 /*
1552                  * Generate optional I/Os for any skipped sectors to improve
1553                  * aggregation contiguity.
1554                  */
1555                 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1556                         ASSERT(c <= rm->rm_scols);
1557                         if (c == rm->rm_scols)
1558                                 c = 0;
1559                         rc = &rm->rm_col[c];
1560                         cvd = vd->vdev_child[rc->rc_devidx];
1561                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1562                             rc->rc_offset + rc->rc_size, NULL,
1563                             1 << tvd->vdev_ashift,
1564                             zio->io_type, zio->io_priority,
1565                             ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1566                 }
1567 
1568                 return (ZIO_PIPELINE_CONTINUE);
1569         }
1570 
1571         ASSERT(zio->io_type == ZIO_TYPE_READ);
1572 
1573         /*
1574          * Iterate over the columns in reverse order so that we hit the parity
1575          * last -- any errors along the way will force us to read the parity.
1576          */
1577         for (c = rm->rm_cols - 1; c >= 0; c--) {
1578                 rc = &rm->rm_col[c];
1579                 cvd = vd->vdev_child[rc->rc_devidx];
1580                 if (!vdev_readable(cvd)) {
1581                         if (c >= rm->rm_firstdatacol)
1582                                 rm->rm_missingdata++;
1583                         else
1584                                 rm->rm_missingparity++;
1585                         rc->rc_error = SET_ERROR(ENXIO);
1586                         rc->rc_tried = 1;    /* don't even try */
1587                         rc->rc_skipped = 1;
1588                         continue;
1589                 }
1590                 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1591                         if (c >= rm->rm_firstdatacol)
1592                                 rm->rm_missingdata++;
1593                         else
1594                                 rm->rm_missingparity++;
1595                         rc->rc_error = SET_ERROR(ESTALE);
1596                         rc->rc_skipped = 1;
1597                         continue;
1598                 }
1599                 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1600                     (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1601                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1602                             rc->rc_offset, rc->rc_data, rc->rc_size,
1603                             zio->io_type, zio->io_priority, 0,
1604                             vdev_raidz_child_done, rc));
1605                 }
1606         }
1607 
1608         return (ZIO_PIPELINE_CONTINUE);
1609 }
1610 
1611 
1612 /*
1613  * Report a checksum error for a child of a RAID-Z device.
1614  */
1615 static void
1616 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1617 {
1618         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1619 
1620         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1621                 zio_bad_cksum_t zbc;
1622                 raidz_map_t *rm = zio->io_vsd;
1623 
1624                 mutex_enter(&vd->vdev_stat_lock);
1625                 vd->vdev_stat.vs_checksum_errors++;
1626                 mutex_exit(&vd->vdev_stat_lock);
1627 
1628                 zbc.zbc_has_cksum = 0;
1629                 zbc.zbc_injected = rm->rm_ecksuminjected;
1630 
1631                 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1632                     rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1633                     &zbc);
1634         }
1635 }
1636 
1637 /*
1638  * We keep track of whether or not there were any injected errors, so that
1639  * any ereports we generate can note it.
1640  */
1641 static int
1642 raidz_checksum_verify(zio_t *zio)
1643 {
1644         zio_bad_cksum_t zbc;
1645         raidz_map_t *rm = zio->io_vsd;
1646 
1647         int ret = zio_checksum_error(zio, &zbc);
1648         if (ret != 0 && zbc.zbc_injected != 0)
1649                 rm->rm_ecksuminjected = 1;
1650 
1651         return (ret);
1652 }
1653 
1654 /*
1655  * Generate the parity from the data columns. If we tried and were able to
1656  * read the parity without error, verify that the generated parity matches the
1657  * data we read. If it doesn't, we fire off a checksum error. Return the
1658  * number such failures.
1659  */
1660 static int
1661 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1662 {
1663         void *orig[VDEV_RAIDZ_MAXPARITY];
1664         int c, ret = 0;
1665         raidz_col_t *rc;
1666 
1667         for (c = 0; c < rm->rm_firstdatacol; c++) {
1668                 rc = &rm->rm_col[c];
1669                 if (!rc->rc_tried || rc->rc_error != 0)
1670                         continue;
1671                 orig[c] = zio_buf_alloc(rc->rc_size);
1672                 bcopy(rc->rc_data, orig[c], rc->rc_size);
1673         }
1674 
1675         vdev_raidz_generate_parity(rm);
1676 
1677         for (c = 0; c < rm->rm_firstdatacol; c++) {
1678                 rc = &rm->rm_col[c];
1679                 if (!rc->rc_tried || rc->rc_error != 0)
1680                         continue;
1681                 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1682                         raidz_checksum_error(zio, rc, orig[c]);
1683                         rc->rc_error = SET_ERROR(ECKSUM);
1684                         ret++;
1685                 }
1686                 zio_buf_free(orig[c], rc->rc_size);
1687         }
1688 
1689         return (ret);
1690 }
1691 
1692 /*
1693  * Keep statistics on all the ways that we used parity to correct data.
1694  */
1695 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1696 
1697 static int
1698 vdev_raidz_worst_error(raidz_map_t *rm)
1699 {
1700         int error = 0;
1701 
1702         for (int c = 0; c < rm->rm_cols; c++)
1703                 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1704 
1705         return (error);
1706 }
1707 
1708 /*
1709  * Iterate over all combinations of bad data and attempt a reconstruction.
1710  * Note that the algorithm below is non-optimal because it doesn't take into
1711  * account how reconstruction is actually performed. For example, with
1712  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1713  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1714  * cases we'd only use parity information in column 0.
1715  */
1716 static int
1717 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1718 {
1719         raidz_map_t *rm = zio->io_vsd;
1720         raidz_col_t *rc;
1721         void *orig[VDEV_RAIDZ_MAXPARITY];
1722         int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1723         int *tgts = &tstore[1];
1724         int current, next, i, c, n;
1725         int code, ret = 0;
1726 
1727         ASSERT(total_errors < rm->rm_firstdatacol);
1728 
1729         /*
1730          * This simplifies one edge condition.
1731          */
1732         tgts[-1] = -1;
1733 
1734         for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1735                 /*
1736                  * Initialize the targets array by finding the first n columns
1737                  * that contain no error.
1738                  *
1739                  * If there were no data errors, we need to ensure that we're
1740                  * always explicitly attempting to reconstruct at least one
1741                  * data column. To do this, we simply push the highest target
1742                  * up into the data columns.
1743                  */
1744                 for (c = 0, i = 0; i < n; i++) {
1745                         if (i == n - 1 && data_errors == 0 &&
1746                             c < rm->rm_firstdatacol) {
1747                                 c = rm->rm_firstdatacol;
1748                         }
1749 
1750                         while (rm->rm_col[c].rc_error != 0) {
1751                                 c++;
1752                                 ASSERT3S(c, <, rm->rm_cols);
1753                         }
1754 
1755                         tgts[i] = c++;
1756                 }
1757 
1758                 /*
1759                  * Setting tgts[n] simplifies the other edge condition.
1760                  */
1761                 tgts[n] = rm->rm_cols;
1762 
1763                 /*
1764                  * These buffers were allocated in previous iterations.
1765                  */
1766                 for (i = 0; i < n - 1; i++) {
1767                         ASSERT(orig[i] != NULL);
1768                 }
1769 
1770                 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1771 
1772                 current = 0;
1773                 next = tgts[current];
1774 
1775                 while (current != n) {
1776                         tgts[current] = next;
1777                         current = 0;
1778 
1779                         /*
1780                          * Save off the original data that we're going to
1781                          * attempt to reconstruct.
1782                          */
1783                         for (i = 0; i < n; i++) {
1784                                 ASSERT(orig[i] != NULL);
1785                                 c = tgts[i];
1786                                 ASSERT3S(c, >=, 0);
1787                                 ASSERT3S(c, <, rm->rm_cols);
1788                                 rc = &rm->rm_col[c];
1789                                 bcopy(rc->rc_data, orig[i], rc->rc_size);
1790                         }
1791 
1792                         /*
1793                          * Attempt a reconstruction and exit the outer loop on
1794                          * success.
1795                          */
1796                         code = vdev_raidz_reconstruct(rm, tgts, n);
1797                         if (raidz_checksum_verify(zio) == 0) {
1798                                 atomic_inc_64(&raidz_corrected[code]);
1799 
1800                                 for (i = 0; i < n; i++) {
1801                                         c = tgts[i];
1802                                         rc = &rm->rm_col[c];
1803                                         ASSERT(rc->rc_error == 0);
1804                                         if (rc->rc_tried)
1805                                                 raidz_checksum_error(zio, rc,
1806                                                     orig[i]);
1807                                         rc->rc_error = SET_ERROR(ECKSUM);
1808                                 }
1809 
1810                                 ret = code;
1811                                 goto done;
1812                         }
1813 
1814                         /*
1815                          * Restore the original data.
1816                          */
1817                         for (i = 0; i < n; i++) {
1818                                 c = tgts[i];
1819                                 rc = &rm->rm_col[c];
1820                                 bcopy(orig[i], rc->rc_data, rc->rc_size);
1821                         }
1822 
1823                         do {
1824                                 /*
1825                                  * Find the next valid column after the current
1826                                  * position..
1827                                  */
1828                                 for (next = tgts[current] + 1;
1829                                     next < rm->rm_cols &&
1830                                     rm->rm_col[next].rc_error != 0; next++)
1831                                         continue;
1832 
1833                                 ASSERT(next <= tgts[current + 1]);
1834 
1835                                 /*
1836                                  * If that spot is available, we're done here.
1837                                  */
1838                                 if (next != tgts[current + 1])
1839                                         break;
1840 
1841                                 /*
1842                                  * Otherwise, find the next valid column after
1843                                  * the previous position.
1844                                  */
1845                                 for (c = tgts[current - 1] + 1;
1846                                     rm->rm_col[c].rc_error != 0; c++)
1847                                         continue;
1848 
1849                                 tgts[current] = c;
1850                                 current++;
1851 
1852                         } while (current != n);
1853                 }
1854         }
1855         n--;
1856 done:
1857         for (i = 0; i < n; i++) {
1858                 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1859         }
1860 
1861         return (ret);
1862 }
1863 
1864 static void
1865 vdev_raidz_io_done(zio_t *zio)
1866 {
1867         vdev_t *vd = zio->io_vd;
1868         vdev_t *cvd;
1869         raidz_map_t *rm = zio->io_vsd;
1870         raidz_col_t *rc;
1871         int unexpected_errors = 0;
1872         int parity_errors = 0;
1873         int parity_untried = 0;
1874         int data_errors = 0;
1875         int total_errors = 0;
1876         int n, c;
1877         int tgts[VDEV_RAIDZ_MAXPARITY];
1878         int code;
1879 
1880         ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
1881 
1882         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1883         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1884 
1885         for (c = 0; c < rm->rm_cols; c++) {
1886                 rc = &rm->rm_col[c];
1887 
1888                 if (rc->rc_error) {
1889                         ASSERT(rc->rc_error != ECKSUM);      /* child has no bp */
1890 
1891                         if (c < rm->rm_firstdatacol)
1892                                 parity_errors++;
1893                         else
1894                                 data_errors++;
1895 
1896                         if (!rc->rc_skipped)
1897                                 unexpected_errors++;
1898 
1899                         total_errors++;
1900                 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1901                         parity_untried++;
1902                 }
1903         }
1904 
1905         if (zio->io_type == ZIO_TYPE_WRITE) {
1906                 /*
1907                  * XXX -- for now, treat partial writes as a success.
1908                  * (If we couldn't write enough columns to reconstruct
1909                  * the data, the I/O failed.  Otherwise, good enough.)
1910                  *
1911                  * Now that we support write reallocation, it would be better
1912                  * to treat partial failure as real failure unless there are
1913                  * no non-degraded top-level vdevs left, and not update DTLs
1914                  * if we intend to reallocate.
1915                  */
1916                 /* XXPOLICY */
1917                 if (total_errors > rm->rm_firstdatacol)
1918                         zio->io_error = vdev_raidz_worst_error(rm);
1919 
1920                 return;
1921         }
1922 
1923         ASSERT(zio->io_type == ZIO_TYPE_READ);
1924         /*
1925          * There are three potential phases for a read:
1926          *      1. produce valid data from the columns read
1927          *      2. read all disks and try again
1928          *      3. perform combinatorial reconstruction
1929          *
1930          * Each phase is progressively both more expensive and less likely to
1931          * occur. If we encounter more errors than we can repair or all phases
1932          * fail, we have no choice but to return an error.
1933          */
1934 
1935         /*
1936          * If the number of errors we saw was correctable -- less than or equal
1937          * to the number of parity disks read -- attempt to produce data that
1938          * has a valid checksum. Naturally, this case applies in the absence of
1939          * any errors.
1940          */
1941         if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1942                 if (data_errors == 0) {
1943                         if (raidz_checksum_verify(zio) == 0) {
1944                                 /*
1945                                  * If we read parity information (unnecessarily
1946                                  * as it happens since no reconstruction was
1947                                  * needed) regenerate and verify the parity.
1948                                  * We also regenerate parity when resilvering
1949                                  * so we can write it out to the failed device
1950                                  * later.
1951                                  */
1952                                 if (parity_errors + parity_untried <
1953                                     rm->rm_firstdatacol ||
1954                                     (zio->io_flags & ZIO_FLAG_RESILVER)) {
1955                                         n = raidz_parity_verify(zio, rm);
1956                                         unexpected_errors += n;
1957                                         ASSERT(parity_errors + n <=
1958                                             rm->rm_firstdatacol);
1959                                 }
1960                                 goto done;
1961                         }
1962                 } else {
1963                         /*
1964                          * We either attempt to read all the parity columns or
1965                          * none of them. If we didn't try to read parity, we
1966                          * wouldn't be here in the correctable case. There must
1967                          * also have been fewer parity errors than parity
1968                          * columns or, again, we wouldn't be in this code path.
1969                          */
1970                         ASSERT(parity_untried == 0);
1971                         ASSERT(parity_errors < rm->rm_firstdatacol);
1972 
1973                         /*
1974                          * Identify the data columns that reported an error.
1975                          */
1976                         n = 0;
1977                         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1978                                 rc = &rm->rm_col[c];
1979                                 if (rc->rc_error != 0) {
1980                                         ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1981                                         tgts[n++] = c;
1982                                 }
1983                         }
1984 
1985                         ASSERT(rm->rm_firstdatacol >= n);
1986 
1987                         code = vdev_raidz_reconstruct(rm, tgts, n);
1988 
1989                         if (raidz_checksum_verify(zio) == 0) {
1990                                 atomic_inc_64(&raidz_corrected[code]);
1991 
1992                                 /*
1993                                  * If we read more parity disks than were used
1994                                  * for reconstruction, confirm that the other
1995                                  * parity disks produced correct data. This
1996                                  * routine is suboptimal in that it regenerates
1997                                  * the parity that we already used in addition
1998                                  * to the parity that we're attempting to
1999                                  * verify, but this should be a relatively
2000                                  * uncommon case, and can be optimized if it
2001                                  * becomes a problem. Note that we regenerate
2002                                  * parity when resilvering so we can write it
2003                                  * out to failed devices later.
2004                                  */
2005                                 if (parity_errors < rm->rm_firstdatacol - n ||
2006                                     (zio->io_flags & ZIO_FLAG_RESILVER)) {
2007                                         n = raidz_parity_verify(zio, rm);
2008                                         unexpected_errors += n;
2009                                         ASSERT(parity_errors + n <=
2010                                             rm->rm_firstdatacol);
2011                                 }
2012 
2013                                 goto done;
2014                         }
2015                 }
2016         }
2017 
2018         /*
2019          * This isn't a typical situation -- either we got a read error or
2020          * a child silently returned bad data. Read every block so we can
2021          * try again with as much data and parity as we can track down. If
2022          * we've already been through once before, all children will be marked
2023          * as tried so we'll proceed to combinatorial reconstruction.
2024          */
2025         unexpected_errors = 1;
2026         rm->rm_missingdata = 0;
2027         rm->rm_missingparity = 0;
2028 
2029         for (c = 0; c < rm->rm_cols; c++) {
2030                 if (rm->rm_col[c].rc_tried)
2031                         continue;
2032 
2033                 zio_vdev_io_redone(zio);
2034                 do {
2035                         rc = &rm->rm_col[c];
2036                         if (rc->rc_tried)
2037                                 continue;
2038                         zio_nowait(zio_vdev_child_io(zio, NULL,
2039                             vd->vdev_child[rc->rc_devidx],
2040                             rc->rc_offset, rc->rc_data, rc->rc_size,
2041                             zio->io_type, zio->io_priority, 0,
2042                             vdev_raidz_child_done, rc));
2043                 } while (++c < rm->rm_cols);
2044 
2045                 return;
2046         }
2047 
2048         /*
2049          * At this point we've attempted to reconstruct the data given the
2050          * errors we detected, and we've attempted to read all columns. There
2051          * must, therefore, be one or more additional problems -- silent errors
2052          * resulting in invalid data rather than explicit I/O errors resulting
2053          * in absent data. We check if there is enough additional data to
2054          * possibly reconstruct the data and then perform combinatorial
2055          * reconstruction over all possible combinations. If that fails,
2056          * we're cooked.
2057          */
2058         if (total_errors > rm->rm_firstdatacol) {
2059                 zio->io_error = vdev_raidz_worst_error(rm);
2060 
2061         } else if (total_errors < rm->rm_firstdatacol &&
2062             (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2063                 /*
2064                  * If we didn't use all the available parity for the
2065                  * combinatorial reconstruction, verify that the remaining
2066                  * parity is correct.
2067                  */
2068                 if (code != (1 << rm->rm_firstdatacol) - 1)
2069                         (void) raidz_parity_verify(zio, rm);
2070         } else {
2071                 /*
2072                  * We're here because either:
2073                  *
2074                  *      total_errors == rm_first_datacol, or
2075                  *      vdev_raidz_combrec() failed
2076                  *
2077                  * In either case, there is enough bad data to prevent
2078                  * reconstruction.
2079                  *
2080                  * Start checksum ereports for all children which haven't
2081                  * failed, and the IO wasn't speculative.
2082                  */
2083                 zio->io_error = SET_ERROR(ECKSUM);
2084 
2085                 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2086                         for (c = 0; c < rm->rm_cols; c++) {
2087                                 rc = &rm->rm_col[c];
2088                                 if (rc->rc_error == 0) {
2089                                         zio_bad_cksum_t zbc;
2090                                         zbc.zbc_has_cksum = 0;
2091                                         zbc.zbc_injected =
2092                                             rm->rm_ecksuminjected;
2093 
2094                                         zfs_ereport_start_checksum(
2095                                             zio->io_spa,
2096                                             vd->vdev_child[rc->rc_devidx],
2097                                             zio, rc->rc_offset, rc->rc_size,
2098                                             (void *)(uintptr_t)c, &zbc);
2099                                 }
2100                         }
2101                 }
2102         }
2103 
2104 done:
2105         zio_checksum_verified(zio);
2106 
2107         if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2108             (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2109                 /*
2110                  * Use the good data we have in hand to repair damaged children.
2111                  */
2112                 for (c = 0; c < rm->rm_cols; c++) {
2113                         rc = &rm->rm_col[c];
2114                         cvd = vd->vdev_child[rc->rc_devidx];
2115 
2116                         if (rc->rc_error == 0)
2117                                 continue;
2118 
2119                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2120                             rc->rc_offset, rc->rc_data, rc->rc_size,
2121                             ZIO_TYPE_WRITE, zio->io_priority,
2122                             ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2123                             ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2124                 }
2125         }
2126 }
2127 
2128 static void
2129 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2130 {
2131         if (faulted > vd->vdev_nparity)
2132                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2133                     VDEV_AUX_NO_REPLICAS);
2134         else if (degraded + faulted != 0)
2135                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2136         else
2137                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2138 }
2139 
2140 vdev_ops_t vdev_raidz_ops = {
2141         vdev_raidz_open,
2142         vdev_raidz_close,
2143         vdev_raidz_asize,
2144         vdev_raidz_io_start,
2145         vdev_raidz_io_done,
2146         vdev_raidz_state_change,
2147         NULL,
2148         NULL,
2149         VDEV_TYPE_RAIDZ,        /* name of this vdev type */
2150         B_FALSE                 /* not a leaf vdev */
2151 };