1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/spa.h>
  29 #include <sys/vdev_impl.h>
  30 #include <sys/zio.h>
  31 #include <sys/zio_checksum.h>
  32 #include <sys/fs/zfs.h>
  33 #include <sys/fm/fs/zfs.h>
  34 
  35 /*
  36  * Virtual device vector for RAID-Z.
  37  *
  38  * This vdev supports single, double, and triple parity. For single parity,
  39  * we use a simple XOR of all the data columns. For double or triple parity,
  40  * we use a special case of Reed-Solomon coding. This extends the
  41  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  42  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  43  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  44  * former is also based. The latter is designed to provide higher performance
  45  * for writes.
  46  *
  47  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  48  * amended six years later identifying a critical flaw that invalidates its
  49  * claims. Nevertheless, the technique can be adapted to work for up to
  50  * triple parity. For additional parity, the amendment "Note: Correction to
  51  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  52  * is viable, but the additional complexity means that write performance will
  53  * suffer.
  54  *
  55  * All of the methods above operate on a Galois field, defined over the
  56  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
  57  * can be expressed with a single byte. Briefly, the operations on the
  58  * field are defined as follows:
  59  *
  60  *   o addition (+) is represented by a bitwise XOR
  61  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  62  *   o multiplication of A by 2 is defined by the following bitwise expression:
  63  *
  64  *      (A * 2)_7 = A_6
  65  *      (A * 2)_6 = A_5
  66  *      (A * 2)_5 = A_4
  67  *      (A * 2)_4 = A_3 + A_7
  68  *      (A * 2)_3 = A_2 + A_7
  69  *      (A * 2)_2 = A_1 + A_7
  70  *      (A * 2)_1 = A_0
  71  *      (A * 2)_0 = A_7
  72  *
  73  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  74  * As an aside, this multiplication is derived from the error correcting
  75  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  76  *
  77  * Observe that any number in the field (except for 0) can be expressed as a
  78  * power of 2 -- a generator for the field. We store a table of the powers of
  79  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  80  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  81  * than field addition). The inverse of a field element A (A^-1) is therefore
  82  * A ^ (255 - 1) = A^254.
  83  *
  84  * The up-to-three parity columns, P, Q, R over several data columns,
  85  * D_0, ... D_n-1, can be expressed by field operations:
  86  *
  87  *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
  88  *      Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  89  *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  90  *      R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
  91  *        = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  92  *
  93  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
  94  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
  95  * independent coefficients. (There are no additional coefficients that have
  96  * this property which is why the uncorrected Plank method breaks down.)
  97  *
  98  * See the reconstruction code below for how P, Q and R can used individually
  99  * or in concert to recover missing data columns.
 100  */
 101 
 102 typedef struct raidz_col {
 103         uint64_t rc_devidx;             /* child device index for I/O */
 104         uint64_t rc_offset;             /* device offset */
 105         uint64_t rc_size;               /* I/O size */
 106         void *rc_data;                  /* I/O data */
 107         void *rc_gdata;                 /* used to store the "good" version */
 108         int rc_error;                   /* I/O error for this device */
 109         uint8_t rc_tried;               /* Did we attempt this I/O column? */
 110         uint8_t rc_skipped;             /* Did we skip this I/O column? */
 111 } raidz_col_t;
 112 
 113 typedef struct raidz_map {
 114         uint64_t rm_cols;               /* Regular column count */
 115         uint64_t rm_scols;              /* Count including skipped columns */
 116         uint64_t rm_bigcols;            /* Number of oversized columns */
 117         uint64_t rm_asize;              /* Actual total I/O size */
 118         uint64_t rm_missingdata;        /* Count of missing data devices */
 119         uint64_t rm_missingparity;      /* Count of missing parity devices */
 120         uint64_t rm_firstdatacol;       /* First data column/parity count */
 121         uint64_t rm_nskip;              /* Skipped sectors for padding */
 122         uint64_t rm_skipstart;          /* Column index of padding start */
 123         void *rm_datacopy;              /* rm_asize-buffer of copied data */
 124         uintptr_t rm_reports;           /* # of referencing checksum reports */
 125         uint8_t rm_freed;               /* map no longer has referencing ZIO */
 126         uint8_t rm_ecksuminjected;      /* checksum error was injected */
 127         raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
 128 } raidz_map_t;
 129 
 130 #define VDEV_RAIDZ_P            0
 131 #define VDEV_RAIDZ_Q            1
 132 #define VDEV_RAIDZ_R            2
 133 
 134 #define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 135 #define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 136 
 137 /*
 138  * We provide a mechanism to perform the field multiplication operation on a
 139  * 64-bit value all at once rather than a byte at a time. This works by
 140  * creating a mask from the top bit in each byte and using that to
 141  * conditionally apply the XOR of 0x1d.
 142  */
 143 #define VDEV_RAIDZ_64MUL_2(x, mask) \
 144 { \
 145         (mask) = (x) & 0x8080808080808080ULL; \
 146         (mask) = ((mask) << 1) - ((mask) >> 7); \
 147         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 148             ((mask) & 0x1d1d1d1d1d1d1d1d); \
 149 }
 150 
 151 #define VDEV_RAIDZ_64MUL_4(x, mask) \
 152 { \
 153         VDEV_RAIDZ_64MUL_2((x), mask); \
 154         VDEV_RAIDZ_64MUL_2((x), mask); \
 155 }
 156 
 157 /*
 158  * Force reconstruction to use the general purpose method.
 159  */
 160 int vdev_raidz_default_to_general;
 161 
 162 /* Powers of 2 in the Galois field defined above. */
 163 static const uint8_t vdev_raidz_pow2[256] = {
 164         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
 165         0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
 166         0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
 167         0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
 168         0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
 169         0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
 170         0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
 171         0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
 172         0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
 173         0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
 174         0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
 175         0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
 176         0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
 177         0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
 178         0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
 179         0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
 180         0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
 181         0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
 182         0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
 183         0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
 184         0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
 185         0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
 186         0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
 187         0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
 188         0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
 189         0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
 190         0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
 191         0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
 192         0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
 193         0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
 194         0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
 195         0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
 196 };
 197 /* Logs of 2 in the Galois field defined above. */
 198 static const uint8_t vdev_raidz_log2[256] = {
 199         0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
 200         0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
 201         0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
 202         0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
 203         0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
 204         0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
 205         0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
 206         0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
 207         0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
 208         0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
 209         0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
 210         0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
 211         0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
 212         0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
 213         0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
 214         0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
 215         0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
 216         0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
 217         0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
 218         0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
 219         0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
 220         0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
 221         0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
 222         0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
 223         0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
 224         0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
 225         0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
 226         0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
 227         0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
 228         0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
 229         0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
 230         0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
 231 };
 232 
 233 static void vdev_raidz_generate_parity(raidz_map_t *rm);
 234 
 235 /*
 236  * Multiply a given number by 2 raised to the given power.
 237  */
 238 static uint8_t
 239 vdev_raidz_exp2(uint_t a, int exp)
 240 {
 241         if (a == 0)
 242                 return (0);
 243 
 244         ASSERT(exp >= 0);
 245         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
 246 
 247         exp += vdev_raidz_log2[a];
 248         if (exp > 255)
 249                 exp -= 255;
 250 
 251         return (vdev_raidz_pow2[exp]);
 252 }
 253 
 254 static void
 255 vdev_raidz_map_free(raidz_map_t *rm)
 256 {
 257         int c;
 258         size_t size;
 259 
 260         for (c = 0; c < rm->rm_firstdatacol; c++) {
 261                 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
 262 
 263                 if (rm->rm_col[c].rc_gdata != NULL)
 264                         zio_buf_free(rm->rm_col[c].rc_gdata,
 265                             rm->rm_col[c].rc_size);
 266         }
 267 
 268         size = 0;
 269         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 270                 size += rm->rm_col[c].rc_size;
 271 
 272         if (rm->rm_datacopy != NULL)
 273                 zio_buf_free(rm->rm_datacopy, size);
 274 
 275         kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 276 }
 277 
 278 static void
 279 vdev_raidz_map_free_vsd(zio_t *zio)
 280 {
 281         raidz_map_t *rm = zio->io_vsd;
 282 
 283         ASSERT0(rm->rm_freed);
 284         rm->rm_freed = 1;
 285 
 286         if (rm->rm_reports == 0)
 287                 vdev_raidz_map_free(rm);
 288 }
 289 
 290 /*ARGSUSED*/
 291 static void
 292 vdev_raidz_cksum_free(void *arg, size_t ignored)
 293 {
 294         raidz_map_t *rm = arg;
 295 
 296         ASSERT3U(rm->rm_reports, >, 0);
 297 
 298         if (--rm->rm_reports == 0 && rm->rm_freed != 0)
 299                 vdev_raidz_map_free(rm);
 300 }
 301 
 302 static void
 303 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
 304 {
 305         raidz_map_t *rm = zcr->zcr_cbdata;
 306         size_t c = zcr->zcr_cbinfo;
 307         size_t x;
 308 
 309         const char *good = NULL;
 310         const char *bad = rm->rm_col[c].rc_data;
 311 
 312         if (good_data == NULL) {
 313                 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
 314                 return;
 315         }
 316 
 317         if (c < rm->rm_firstdatacol) {
 318                 /*
 319                  * The first time through, calculate the parity blocks for
 320                  * the good data (this relies on the fact that the good
 321                  * data never changes for a given logical ZIO)
 322                  */
 323                 if (rm->rm_col[0].rc_gdata == NULL) {
 324                         char *bad_parity[VDEV_RAIDZ_MAXPARITY];
 325                         char *buf;
 326 
 327                         /*
 328                          * Set up the rm_col[]s to generate the parity for
 329                          * good_data, first saving the parity bufs and
 330                          * replacing them with buffers to hold the result.
 331                          */
 332                         for (x = 0; x < rm->rm_firstdatacol; x++) {
 333                                 bad_parity[x] = rm->rm_col[x].rc_data;
 334                                 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
 335                                     zio_buf_alloc(rm->rm_col[x].rc_size);
 336                         }
 337 
 338                         /* fill in the data columns from good_data */
 339                         buf = (char *)good_data;
 340                         for (; x < rm->rm_cols; x++) {
 341                                 rm->rm_col[x].rc_data = buf;
 342                                 buf += rm->rm_col[x].rc_size;
 343                         }
 344 
 345                         /*
 346                          * Construct the parity from the good data.
 347                          */
 348                         vdev_raidz_generate_parity(rm);
 349 
 350                         /* restore everything back to its original state */
 351                         for (x = 0; x < rm->rm_firstdatacol; x++)
 352                                 rm->rm_col[x].rc_data = bad_parity[x];
 353 
 354                         buf = rm->rm_datacopy;
 355                         for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
 356                                 rm->rm_col[x].rc_data = buf;
 357                                 buf += rm->rm_col[x].rc_size;
 358                         }
 359                 }
 360 
 361                 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
 362                 good = rm->rm_col[c].rc_gdata;
 363         } else {
 364                 /* adjust good_data to point at the start of our column */
 365                 good = good_data;
 366 
 367                 for (x = rm->rm_firstdatacol; x < c; x++)
 368                         good += rm->rm_col[x].rc_size;
 369         }
 370 
 371         /* we drop the ereport if it ends up that the data was good */
 372         zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
 373 }
 374 
 375 /*
 376  * Invoked indirectly by zfs_ereport_start_checksum(), called
 377  * below when our read operation fails completely.  The main point
 378  * is to keep a copy of everything we read from disk, so that at
 379  * vdev_raidz_cksum_finish() time we can compare it with the good data.
 380  */
 381 static void
 382 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
 383 {
 384         size_t c = (size_t)(uintptr_t)arg;
 385         caddr_t buf;
 386 
 387         raidz_map_t *rm = zio->io_vsd;
 388         size_t size;
 389 
 390         /* set up the report and bump the refcount  */
 391         zcr->zcr_cbdata = rm;
 392         zcr->zcr_cbinfo = c;
 393         zcr->zcr_finish = vdev_raidz_cksum_finish;
 394         zcr->zcr_free = vdev_raidz_cksum_free;
 395 
 396         rm->rm_reports++;
 397         ASSERT3U(rm->rm_reports, >, 0);
 398 
 399         if (rm->rm_datacopy != NULL)
 400                 return;
 401 
 402         /*
 403          * It's the first time we're called for this raidz_map_t, so we need
 404          * to copy the data aside; there's no guarantee that our zio's buffer
 405          * won't be re-used for something else.
 406          *
 407          * Our parity data is already in separate buffers, so there's no need
 408          * to copy them.
 409          */
 410 
 411         size = 0;
 412         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 413                 size += rm->rm_col[c].rc_size;
 414 
 415         buf = rm->rm_datacopy = zio_buf_alloc(size);
 416 
 417         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 418                 raidz_col_t *col = &rm->rm_col[c];
 419 
 420                 bcopy(col->rc_data, buf, col->rc_size);
 421                 col->rc_data = buf;
 422 
 423                 buf += col->rc_size;
 424         }
 425         ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
 426 }
 427 
 428 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 429         vdev_raidz_map_free_vsd,
 430         vdev_raidz_cksum_report
 431 };
 432 
 433 /*
 434  * Divides the IO evenly across all child vdevs; usually, dcols is
 435  * the number of children in the target vdev.
 436  */
 437 static raidz_map_t *
 438 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
 439     uint64_t nparity)
 440 {
 441         raidz_map_t *rm;
 442         /* The starting RAIDZ (parent) vdev sector of the block. */
 443         uint64_t b = zio->io_offset >> unit_shift;
 444         /* The zio's size in units of the vdev's minimum sector size. */
 445         uint64_t s = zio->io_size >> unit_shift;
 446         /* The first column for this stripe. */
 447         uint64_t f = b % dcols;
 448         /* The starting byte offset on each child vdev. */
 449         uint64_t o = (b / dcols) << unit_shift;
 450         uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 451 
 452         /*
 453          * "Quotient": The number of data sectors for this stripe on all but
 454          * the "big column" child vdevs that also contain "remainder" data.
 455          */
 456         q = s / (dcols - nparity);
 457 
 458         /*
 459          * "Remainder": The number of partial stripe data sectors in this I/O.
 460          * This will add a sector to some, but not all, child vdevs.
 461          */
 462         r = s - q * (dcols - nparity);
 463 
 464         /* The number of "big columns" - those which contain remainder data. */
 465         bc = (r == 0 ? 0 : r + nparity);
 466 
 467         /*
 468          * The total number of data and parity sectors associated with
 469          * this I/O.
 470          */
 471         tot = s + nparity * (q + (r == 0 ? 0 : 1));
 472 
 473         /* acols: The columns that will be accessed. */
 474         /* scols: The columns that will be accessed or skipped. */
 475         if (q == 0) {
 476                 /* Our I/O request doesn't span all child vdevs. */
 477                 acols = bc;
 478                 scols = MIN(dcols, roundup(bc, nparity + 1));
 479         } else {
 480                 acols = dcols;
 481                 scols = dcols;
 482         }
 483 
 484         ASSERT3U(acols, <=, scols);
 485 
 486         rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
 487 
 488         rm->rm_cols = acols;
 489         rm->rm_scols = scols;
 490         rm->rm_bigcols = bc;
 491         rm->rm_skipstart = bc;
 492         rm->rm_missingdata = 0;
 493         rm->rm_missingparity = 0;
 494         rm->rm_firstdatacol = nparity;
 495         rm->rm_datacopy = NULL;
 496         rm->rm_reports = 0;
 497         rm->rm_freed = 0;
 498         rm->rm_ecksuminjected = 0;
 499 
 500         asize = 0;
 501 
 502         for (c = 0; c < scols; c++) {
 503                 col = f + c;
 504                 coff = o;
 505                 if (col >= dcols) {
 506                         col -= dcols;
 507                         coff += 1ULL << unit_shift;
 508                 }
 509                 rm->rm_col[c].rc_devidx = col;
 510                 rm->rm_col[c].rc_offset = coff;
 511                 rm->rm_col[c].rc_data = NULL;
 512                 rm->rm_col[c].rc_gdata = NULL;
 513                 rm->rm_col[c].rc_error = 0;
 514                 rm->rm_col[c].rc_tried = 0;
 515                 rm->rm_col[c].rc_skipped = 0;
 516 
 517                 if (c >= acols)
 518                         rm->rm_col[c].rc_size = 0;
 519                 else if (c < bc)
 520                         rm->rm_col[c].rc_size = (q + 1) << unit_shift;
 521                 else
 522                         rm->rm_col[c].rc_size = q << unit_shift;
 523 
 524                 asize += rm->rm_col[c].rc_size;
 525         }
 526 
 527         ASSERT3U(asize, ==, tot << unit_shift);
 528         rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
 529         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 530         ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
 531         ASSERT3U(rm->rm_nskip, <=, nparity);
 532 
 533         for (c = 0; c < rm->rm_firstdatacol; c++)
 534                 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
 535 
 536         rm->rm_col[c].rc_data = zio->io_data;
 537 
 538         for (c = c + 1; c < acols; c++)
 539                 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
 540                     rm->rm_col[c - 1].rc_size;
 541 
 542         /*
 543          * If all data stored spans all columns, there's a danger that parity
 544          * will always be on the same device and, since parity isn't read
 545          * during normal operation, that that device's I/O bandwidth won't be
 546          * used effectively. We therefore switch the parity every 1MB.
 547          *
 548          * ... at least that was, ostensibly, the theory. As a practical
 549          * matter unless we juggle the parity between all devices evenly, we
 550          * won't see any benefit. Further, occasional writes that aren't a
 551          * multiple of the LCM of the number of children and the minimum
 552          * stripe width are sufficient to avoid pessimal behavior.
 553          * Unfortunately, this decision created an implicit on-disk format
 554          * requirement that we need to support for all eternity, but only
 555          * for single-parity RAID-Z.
 556          *
 557          * If we intend to skip a sector in the zeroth column for padding
 558          * we must make sure to note this swap. We will never intend to
 559          * skip the first column since at least one data and one parity
 560          * column must appear in each row.
 561          */
 562         ASSERT(rm->rm_cols >= 2);
 563         ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
 564 
 565         if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
 566                 devidx = rm->rm_col[0].rc_devidx;
 567                 o = rm->rm_col[0].rc_offset;
 568                 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
 569                 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
 570                 rm->rm_col[1].rc_devidx = devidx;
 571                 rm->rm_col[1].rc_offset = o;
 572 
 573                 if (rm->rm_skipstart == 0)
 574                         rm->rm_skipstart = 1;
 575         }
 576 
 577         zio->io_vsd = rm;
 578         zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 579         return (rm);
 580 }
 581 
 582 static void
 583 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 584 {
 585         uint64_t *p, *src, pcount, ccount, i;
 586         int c;
 587 
 588         pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 589 
 590         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 591                 src = rm->rm_col[c].rc_data;
 592                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 593                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 594 
 595                 if (c == rm->rm_firstdatacol) {
 596                         ASSERT(ccount == pcount);
 597                         for (i = 0; i < ccount; i++, src++, p++) {
 598                                 *p = *src;
 599                         }
 600                 } else {
 601                         ASSERT(ccount <= pcount);
 602                         for (i = 0; i < ccount; i++, src++, p++) {
 603                                 *p ^= *src;
 604                         }
 605                 }
 606         }
 607 }
 608 
 609 static void
 610 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 611 {
 612         uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
 613         int c;
 614 
 615         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 616         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 617             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 618 
 619         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 620                 src = rm->rm_col[c].rc_data;
 621                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 622                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 623 
 624                 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 625 
 626                 if (c == rm->rm_firstdatacol) {
 627                         ASSERT(ccnt == pcnt || ccnt == 0);
 628                         for (i = 0; i < ccnt; i++, src++, p++, q++) {
 629                                 *p = *src;
 630                                 *q = *src;
 631                         }
 632                         for (; i < pcnt; i++, src++, p++, q++) {
 633                                 *p = 0;
 634                                 *q = 0;
 635                         }
 636                 } else {
 637                         ASSERT(ccnt <= pcnt);
 638 
 639                         /*
 640                          * Apply the algorithm described above by multiplying
 641                          * the previous result and adding in the new value.
 642                          */
 643                         for (i = 0; i < ccnt; i++, src++, p++, q++) {
 644                                 *p ^= *src;
 645 
 646                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 647                                 *q ^= *src;
 648                         }
 649 
 650                         /*
 651                          * Treat short columns as though they are full of 0s.
 652                          * Note that there's therefore nothing needed for P.
 653                          */
 654                         for (; i < pcnt; i++, q++) {
 655                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 656                         }
 657                 }
 658         }
 659 }
 660 
 661 static void
 662 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
 663 {
 664         uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
 665         int c;
 666 
 667         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 668         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 669             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 670         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 671             rm->rm_col[VDEV_RAIDZ_R].rc_size);
 672 
 673         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 674                 src = rm->rm_col[c].rc_data;
 675                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 676                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 677                 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
 678 
 679                 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 680 
 681                 if (c == rm->rm_firstdatacol) {
 682                         ASSERT(ccnt == pcnt || ccnt == 0);
 683                         for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
 684                                 *p = *src;
 685                                 *q = *src;
 686                                 *r = *src;
 687                         }
 688                         for (; i < pcnt; i++, src++, p++, q++, r++) {
 689                                 *p = 0;
 690                                 *q = 0;
 691                                 *r = 0;
 692                         }
 693                 } else {
 694                         ASSERT(ccnt <= pcnt);
 695 
 696                         /*
 697                          * Apply the algorithm described above by multiplying
 698                          * the previous result and adding in the new value.
 699                          */
 700                         for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
 701                                 *p ^= *src;
 702 
 703                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 704                                 *q ^= *src;
 705 
 706                                 VDEV_RAIDZ_64MUL_4(*r, mask);
 707                                 *r ^= *src;
 708                         }
 709 
 710                         /*
 711                          * Treat short columns as though they are full of 0s.
 712                          * Note that there's therefore nothing needed for P.
 713                          */
 714                         for (; i < pcnt; i++, q++, r++) {
 715                                 VDEV_RAIDZ_64MUL_2(*q, mask);
 716                                 VDEV_RAIDZ_64MUL_4(*r, mask);
 717                         }
 718                 }
 719         }
 720 }
 721 
 722 /*
 723  * Generate RAID parity in the first virtual columns according to the number of
 724  * parity columns available.
 725  */
 726 static void
 727 vdev_raidz_generate_parity(raidz_map_t *rm)
 728 {
 729         switch (rm->rm_firstdatacol) {
 730         case 1:
 731                 vdev_raidz_generate_parity_p(rm);
 732                 break;
 733         case 2:
 734                 vdev_raidz_generate_parity_pq(rm);
 735                 break;
 736         case 3:
 737                 vdev_raidz_generate_parity_pqr(rm);
 738                 break;
 739         default:
 740                 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 741         }
 742 }
 743 
 744 static int
 745 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 746 {
 747         uint64_t *dst, *src, xcount, ccount, count, i;
 748         int x = tgts[0];
 749         int c;
 750 
 751         ASSERT(ntgts == 1);
 752         ASSERT(x >= rm->rm_firstdatacol);
 753         ASSERT(x < rm->rm_cols);
 754 
 755         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
 756         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
 757         ASSERT(xcount > 0);
 758 
 759         src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 760         dst = rm->rm_col[x].rc_data;
 761         for (i = 0; i < xcount; i++, dst++, src++) {
 762                 *dst = *src;
 763         }
 764 
 765         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 766                 src = rm->rm_col[c].rc_data;
 767                 dst = rm->rm_col[x].rc_data;
 768 
 769                 if (c == x)
 770                         continue;
 771 
 772                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 773                 count = MIN(ccount, xcount);
 774 
 775                 for (i = 0; i < count; i++, dst++, src++) {
 776                         *dst ^= *src;
 777                 }
 778         }
 779 
 780         return (1 << VDEV_RAIDZ_P);
 781 }
 782 
 783 static int
 784 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
 785 {
 786         uint64_t *dst, *src, xcount, ccount, count, mask, i;
 787         uint8_t *b;
 788         int x = tgts[0];
 789         int c, j, exp;
 790 
 791         ASSERT(ntgts == 1);
 792 
 793         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
 794         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
 795 
 796         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 797                 src = rm->rm_col[c].rc_data;
 798                 dst = rm->rm_col[x].rc_data;
 799 
 800                 if (c == x)
 801                         ccount = 0;
 802                 else
 803                         ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 804 
 805                 count = MIN(ccount, xcount);
 806 
 807                 if (c == rm->rm_firstdatacol) {
 808                         for (i = 0; i < count; i++, dst++, src++) {
 809                                 *dst = *src;
 810                         }
 811                         for (; i < xcount; i++, dst++) {
 812                                 *dst = 0;
 813                         }
 814 
 815                 } else {
 816                         for (i = 0; i < count; i++, dst++, src++) {
 817                                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 818                                 *dst ^= *src;
 819                         }
 820 
 821                         for (; i < xcount; i++, dst++) {
 822                                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 823                         }
 824                 }
 825         }
 826 
 827         src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 828         dst = rm->rm_col[x].rc_data;
 829         exp = 255 - (rm->rm_cols - 1 - x);
 830 
 831         for (i = 0; i < xcount; i++, dst++, src++) {
 832                 *dst ^= *src;
 833                 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
 834                         *b = vdev_raidz_exp2(*b, exp);
 835                 }
 836         }
 837 
 838         return (1 << VDEV_RAIDZ_Q);
 839 }
 840 
 841 static int
 842 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
 843 {
 844         uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
 845         void *pdata, *qdata;
 846         uint64_t xsize, ysize, i;
 847         int x = tgts[0];
 848         int y = tgts[1];
 849 
 850         ASSERT(ntgts == 2);
 851         ASSERT(x < y);
 852         ASSERT(x >= rm->rm_firstdatacol);
 853         ASSERT(y < rm->rm_cols);
 854 
 855         ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
 856 
 857         /*
 858          * Move the parity data aside -- we're going to compute parity as
 859          * though columns x and y were full of zeros -- Pxy and Qxy. We want to
 860          * reuse the parity generation mechanism without trashing the actual
 861          * parity so we make those columns appear to be full of zeros by
 862          * setting their lengths to zero.
 863          */
 864         pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 865         qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 866         xsize = rm->rm_col[x].rc_size;
 867         ysize = rm->rm_col[y].rc_size;
 868 
 869         rm->rm_col[VDEV_RAIDZ_P].rc_data =
 870             zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
 871         rm->rm_col[VDEV_RAIDZ_Q].rc_data =
 872             zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 873         rm->rm_col[x].rc_size = 0;
 874         rm->rm_col[y].rc_size = 0;
 875 
 876         vdev_raidz_generate_parity_pq(rm);
 877 
 878         rm->rm_col[x].rc_size = xsize;
 879         rm->rm_col[y].rc_size = ysize;
 880 
 881         p = pdata;
 882         q = qdata;
 883         pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 884         qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 885         xd = rm->rm_col[x].rc_data;
 886         yd = rm->rm_col[y].rc_data;
 887 
 888         /*
 889          * We now have:
 890          *      Pxy = P + D_x + D_y
 891          *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
 892          *
 893          * We can then solve for D_x:
 894          *      D_x = A * (P + Pxy) + B * (Q + Qxy)
 895          * where
 896          *      A = 2^(x - y) * (2^(x - y) + 1)^-1
 897          *      B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
 898          *
 899          * With D_x in hand, we can easily solve for D_y:
 900          *      D_y = P + Pxy + D_x
 901          */
 902 
 903         a = vdev_raidz_pow2[255 + x - y];
 904         b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
 905         tmp = 255 - vdev_raidz_log2[a ^ 1];
 906 
 907         aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
 908         bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
 909 
 910         for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
 911                 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
 912                     vdev_raidz_exp2(*q ^ *qxy, bexp);
 913 
 914                 if (i < ysize)
 915                         *yd = *p ^ *pxy ^ *xd;
 916         }
 917 
 918         zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
 919             rm->rm_col[VDEV_RAIDZ_P].rc_size);
 920         zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
 921             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 922 
 923         /*
 924          * Restore the saved parity data.
 925          */
 926         rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
 927         rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
 928 
 929         return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
 930 }
 931 
 932 /* BEGIN CSTYLED */
 933 /*
 934  * In the general case of reconstruction, we must solve the system of linear
 935  * equations defined by the coeffecients used to generate parity as well as
 936  * the contents of the data and parity disks. This can be expressed with
 937  * vectors for the original data (D) and the actual data (d) and parity (p)
 938  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 939  *
 940  *            __   __                     __     __
 941  *            |     |         __     __   |  p_0  |
 942  *            |  V  |         |  D_0  |   | p_m-1 |
 943  *            |     |    x    |   :   | = |  d_0  |
 944  *            |  I  |         | D_n-1 |   |   :   |
 945  *            |     |         ~~     ~~   | d_n-1 |
 946  *            ~~   ~~                     ~~     ~~
 947  *
 948  * I is simply a square identity matrix of size n, and V is a vandermonde
 949  * matrix defined by the coeffecients we chose for the various parity columns
 950  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 951  * computation as well as linear separability.
 952  *
 953  *      __               __               __     __
 954  *      |   1   ..  1 1 1 |               |  p_0  |
 955  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 956  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 957  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 958  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 959  *      |   :       : : : |   |   :   |   |  d_2  |
 960  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 961  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 962  *      |   0   ..  0 0 1 |               | d_n-1 |
 963  *      ~~               ~~               ~~     ~~
 964  *
 965  * Note that I, V, d, and p are known. To compute D, we must invert the
 966  * matrix and use the known data and parity values to reconstruct the unknown
 967  * data values. We begin by removing the rows in V|I and d|p that correspond
 968  * to failed or missing columns; we then make V|I square (n x n) and d|p
 969  * sized n by removing rows corresponding to unused parity from the bottom up
 970  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 971  * using Gauss-Jordan elimination. In the example below we use m=3 parity
 972  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 973  *           __                               __
 974  *           |  1   1   1   1   1   1   1   1  |
 975  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 976  *           |  19 205 116  29  64  16  4   1  |      / /
 977  *           |  1   0   0   0   0   0   0   0  |     / /
 978  *           |  0   1   0   0   0   0   0   0  | <--' /
 979  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 980  *           |  0   0   0   1   0   0   0   0  |
 981  *           |  0   0   0   0   1   0   0   0  |
 982  *           |  0   0   0   0   0   1   0   0  |
 983  *           |  0   0   0   0   0   0   1   0  |
 984  *           |  0   0   0   0   0   0   0   1  |
 985  *           ~~                               ~~
 986  *           __                               __
 987  *           |  1   1   1   1   1   1   1   1  |
 988  *           | 128  64  32  16  8   4   2   1  |
 989  *           |  19 205 116  29  64  16  4   1  |
 990  *           |  1   0   0   0   0   0   0   0  |
 991  *           |  0   1   0   0   0   0   0   0  |
 992  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
 993  *           |  0   0   0   1   0   0   0   0  |
 994  *           |  0   0   0   0   1   0   0   0  |
 995  *           |  0   0   0   0   0   1   0   0  |
 996  *           |  0   0   0   0   0   0   1   0  |
 997  *           |  0   0   0   0   0   0   0   1  |
 998  *           ~~                               ~~
 999  *
1000  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1001  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1002  * matrix is not singular.
1003  * __                                                                 __
1004  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1005  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1006  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1007  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1008  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1009  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1010  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1011  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1012  * ~~                                                                 ~~
1013  * __                                                                 __
1014  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1015  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1016  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1017  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1018  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1019  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1020  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1021  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1022  * ~~                                                                 ~~
1023  * __                                                                 __
1024  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1025  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1026  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1027  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1028  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1029  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1030  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1031  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1032  * ~~                                                                 ~~
1033  * __                                                                 __
1034  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1035  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1036  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1037  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1038  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1039  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1040  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1041  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1042  * ~~                                                                 ~~
1043  * __                                                                 __
1044  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1045  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1046  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1047  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1048  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1049  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1050  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1051  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1052  * ~~                                                                 ~~
1053  * __                                                                 __
1054  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1055  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1056  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1057  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1058  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1059  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1060  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1061  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1062  * ~~                                                                 ~~
1063  *                   __                               __
1064  *                   |  0   0   1   0   0   0   0   0  |
1065  *                   | 167 100  5   41 159 169 217 208 |
1066  *                   | 166 100  4   40 158 168 216 209 |
1067  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1068  *                   |  0   0   0   0   1   0   0   0  |
1069  *                   |  0   0   0   0   0   1   0   0  |
1070  *                   |  0   0   0   0   0   0   1   0  |
1071  *                   |  0   0   0   0   0   0   0   1  |
1072  *                   ~~                               ~~
1073  *
1074  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1075  * of the missing data.
1076  *
1077  * As is apparent from the example above, the only non-trivial rows in the
1078  * inverse matrix correspond to the data disks that we're trying to
1079  * reconstruct. Indeed, those are the only rows we need as the others would
1080  * only be useful for reconstructing data known or assumed to be valid. For
1081  * that reason, we only build the coefficients in the rows that correspond to
1082  * targeted columns.
1083  */
1084 /* END CSTYLED */
1085 
1086 static void
1087 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1088     uint8_t **rows)
1089 {
1090         int i, j;
1091         int pow;
1092 
1093         ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1094 
1095         /*
1096          * Fill in the missing rows of interest.
1097          */
1098         for (i = 0; i < nmap; i++) {
1099                 ASSERT3S(0, <=, map[i]);
1100                 ASSERT3S(map[i], <=, 2);
1101 
1102                 pow = map[i] * n;
1103                 if (pow > 255)
1104                         pow -= 255;
1105                 ASSERT(pow <= 255);
1106 
1107                 for (j = 0; j < n; j++) {
1108                         pow -= map[i];
1109                         if (pow < 0)
1110                                 pow += 255;
1111                         rows[i][j] = vdev_raidz_pow2[pow];
1112                 }
1113         }
1114 }
1115 
1116 static void
1117 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1118     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1119 {
1120         int i, j, ii, jj;
1121         uint8_t log;
1122 
1123         /*
1124          * Assert that the first nmissing entries from the array of used
1125          * columns correspond to parity columns and that subsequent entries
1126          * correspond to data columns.
1127          */
1128         for (i = 0; i < nmissing; i++) {
1129                 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1130         }
1131         for (; i < n; i++) {
1132                 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1133         }
1134 
1135         /*
1136          * First initialize the storage where we'll compute the inverse rows.
1137          */
1138         for (i = 0; i < nmissing; i++) {
1139                 for (j = 0; j < n; j++) {
1140                         invrows[i][j] = (i == j) ? 1 : 0;
1141                 }
1142         }
1143 
1144         /*
1145          * Subtract all trivial rows from the rows of consequence.
1146          */
1147         for (i = 0; i < nmissing; i++) {
1148                 for (j = nmissing; j < n; j++) {
1149                         ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1150                         jj = used[j] - rm->rm_firstdatacol;
1151                         ASSERT3S(jj, <, n);
1152                         invrows[i][j] = rows[i][jj];
1153                         rows[i][jj] = 0;
1154                 }
1155         }
1156 
1157         /*
1158          * For each of the rows of interest, we must normalize it and subtract
1159          * a multiple of it from the other rows.
1160          */
1161         for (i = 0; i < nmissing; i++) {
1162                 for (j = 0; j < missing[i]; j++) {
1163                         ASSERT0(rows[i][j]);
1164                 }
1165                 ASSERT3U(rows[i][missing[i]], !=, 0);
1166 
1167                 /*
1168                  * Compute the inverse of the first element and multiply each
1169                  * element in the row by that value.
1170                  */
1171                 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1172 
1173                 for (j = 0; j < n; j++) {
1174                         rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1175                         invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1176                 }
1177 
1178                 for (ii = 0; ii < nmissing; ii++) {
1179                         if (i == ii)
1180                                 continue;
1181 
1182                         ASSERT3U(rows[ii][missing[i]], !=, 0);
1183 
1184                         log = vdev_raidz_log2[rows[ii][missing[i]]];
1185 
1186                         for (j = 0; j < n; j++) {
1187                                 rows[ii][j] ^=
1188                                     vdev_raidz_exp2(rows[i][j], log);
1189                                 invrows[ii][j] ^=
1190                                     vdev_raidz_exp2(invrows[i][j], log);
1191                         }
1192                 }
1193         }
1194 
1195         /*
1196          * Verify that the data that is left in the rows are properly part of
1197          * an identity matrix.
1198          */
1199         for (i = 0; i < nmissing; i++) {
1200                 for (j = 0; j < n; j++) {
1201                         if (j == missing[i]) {
1202                                 ASSERT3U(rows[i][j], ==, 1);
1203                         } else {
1204                                 ASSERT0(rows[i][j]);
1205                         }
1206                 }
1207         }
1208 }
1209 
1210 static void
1211 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1212     int *missing, uint8_t **invrows, const uint8_t *used)
1213 {
1214         int i, j, x, cc, c;
1215         uint8_t *src;
1216         uint64_t ccount;
1217         uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1218         uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1219         uint8_t log = 0;
1220         uint8_t val;
1221         int ll;
1222         uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1223         uint8_t *p, *pp;
1224         size_t psize;
1225 
1226         psize = sizeof (invlog[0][0]) * n * nmissing;
1227         p = kmem_alloc(psize, KM_SLEEP);
1228 
1229         for (pp = p, i = 0; i < nmissing; i++) {
1230                 invlog[i] = pp;
1231                 pp += n;
1232         }
1233 
1234         for (i = 0; i < nmissing; i++) {
1235                 for (j = 0; j < n; j++) {
1236                         ASSERT3U(invrows[i][j], !=, 0);
1237                         invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1238                 }
1239         }
1240 
1241         for (i = 0; i < n; i++) {
1242                 c = used[i];
1243                 ASSERT3U(c, <, rm->rm_cols);
1244 
1245                 src = rm->rm_col[c].rc_data;
1246                 ccount = rm->rm_col[c].rc_size;
1247                 for (j = 0; j < nmissing; j++) {
1248                         cc = missing[j] + rm->rm_firstdatacol;
1249                         ASSERT3U(cc, >=, rm->rm_firstdatacol);
1250                         ASSERT3U(cc, <, rm->rm_cols);
1251                         ASSERT3U(cc, !=, c);
1252 
1253                         dst[j] = rm->rm_col[cc].rc_data;
1254                         dcount[j] = rm->rm_col[cc].rc_size;
1255                 }
1256 
1257                 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1258 
1259                 for (x = 0; x < ccount; x++, src++) {
1260                         if (*src != 0)
1261                                 log = vdev_raidz_log2[*src];
1262 
1263                         for (cc = 0; cc < nmissing; cc++) {
1264                                 if (x >= dcount[cc])
1265                                         continue;
1266 
1267                                 if (*src == 0) {
1268                                         val = 0;
1269                                 } else {
1270                                         if ((ll = log + invlog[cc][i]) >= 255)
1271                                                 ll -= 255;
1272                                         val = vdev_raidz_pow2[ll];
1273                                 }
1274 
1275                                 if (i == 0)
1276                                         dst[cc][x] = val;
1277                                 else
1278                                         dst[cc][x] ^= val;
1279                         }
1280                 }
1281         }
1282 
1283         kmem_free(p, psize);
1284 }
1285 
1286 static int
1287 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1288 {
1289         int n, i, c, t, tt;
1290         int nmissing_rows;
1291         int missing_rows[VDEV_RAIDZ_MAXPARITY];
1292         int parity_map[VDEV_RAIDZ_MAXPARITY];
1293 
1294         uint8_t *p, *pp;
1295         size_t psize;
1296 
1297         uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1298         uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1299         uint8_t *used;
1300 
1301         int code = 0;
1302 
1303 
1304         n = rm->rm_cols - rm->rm_firstdatacol;
1305 
1306         /*
1307          * Figure out which data columns are missing.
1308          */
1309         nmissing_rows = 0;
1310         for (t = 0; t < ntgts; t++) {
1311                 if (tgts[t] >= rm->rm_firstdatacol) {
1312                         missing_rows[nmissing_rows++] =
1313                             tgts[t] - rm->rm_firstdatacol;
1314                 }
1315         }
1316 
1317         /*
1318          * Figure out which parity columns to use to help generate the missing
1319          * data columns.
1320          */
1321         for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1322                 ASSERT(tt < ntgts);
1323                 ASSERT(c < rm->rm_firstdatacol);
1324 
1325                 /*
1326                  * Skip any targeted parity columns.
1327                  */
1328                 if (c == tgts[tt]) {
1329                         tt++;
1330                         continue;
1331                 }
1332 
1333                 code |= 1 << c;
1334 
1335                 parity_map[i] = c;
1336                 i++;
1337         }
1338 
1339         ASSERT(code != 0);
1340         ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1341 
1342         psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1343             nmissing_rows * n + sizeof (used[0]) * n;
1344         p = kmem_alloc(psize, KM_SLEEP);
1345 
1346         for (pp = p, i = 0; i < nmissing_rows; i++) {
1347                 rows[i] = pp;
1348                 pp += n;
1349                 invrows[i] = pp;
1350                 pp += n;
1351         }
1352         used = pp;
1353 
1354         for (i = 0; i < nmissing_rows; i++) {
1355                 used[i] = parity_map[i];
1356         }
1357 
1358         for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1359                 if (tt < nmissing_rows &&
1360                     c == missing_rows[tt] + rm->rm_firstdatacol) {
1361                         tt++;
1362                         continue;
1363                 }
1364 
1365                 ASSERT3S(i, <, n);
1366                 used[i] = c;
1367                 i++;
1368         }
1369 
1370         /*
1371          * Initialize the interesting rows of the matrix.
1372          */
1373         vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1374 
1375         /*
1376          * Invert the matrix.
1377          */
1378         vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1379             invrows, used);
1380 
1381         /*
1382          * Reconstruct the missing data using the generated matrix.
1383          */
1384         vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1385             invrows, used);
1386 
1387         kmem_free(p, psize);
1388 
1389         return (code);
1390 }
1391 
1392 static int
1393 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1394 {
1395         int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1396         int ntgts;
1397         int i, c;
1398         int code;
1399         int nbadparity, nbaddata;
1400         int parity_valid[VDEV_RAIDZ_MAXPARITY];
1401 
1402         /*
1403          * The tgts list must already be sorted.
1404          */
1405         for (i = 1; i < nt; i++) {
1406                 ASSERT(t[i] > t[i - 1]);
1407         }
1408 
1409         nbadparity = rm->rm_firstdatacol;
1410         nbaddata = rm->rm_cols - nbadparity;
1411         ntgts = 0;
1412         for (i = 0, c = 0; c < rm->rm_cols; c++) {
1413                 if (c < rm->rm_firstdatacol)
1414                         parity_valid[c] = B_FALSE;
1415 
1416                 if (i < nt && c == t[i]) {
1417                         tgts[ntgts++] = c;
1418                         i++;
1419                 } else if (rm->rm_col[c].rc_error != 0) {
1420                         tgts[ntgts++] = c;
1421                 } else if (c >= rm->rm_firstdatacol) {
1422                         nbaddata--;
1423                 } else {
1424                         parity_valid[c] = B_TRUE;
1425                         nbadparity--;
1426                 }
1427         }
1428 
1429         ASSERT(ntgts >= nt);
1430         ASSERT(nbaddata >= 0);
1431         ASSERT(nbaddata + nbadparity == ntgts);
1432 
1433         dt = &tgts[nbadparity];
1434 
1435         /*
1436          * See if we can use any of our optimized reconstruction routines.
1437          */
1438         if (!vdev_raidz_default_to_general) {
1439                 switch (nbaddata) {
1440                 case 1:
1441                         if (parity_valid[VDEV_RAIDZ_P])
1442                                 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1443 
1444                         ASSERT(rm->rm_firstdatacol > 1);
1445 
1446                         if (parity_valid[VDEV_RAIDZ_Q])
1447                                 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1448 
1449                         ASSERT(rm->rm_firstdatacol > 2);
1450                         break;
1451 
1452                 case 2:
1453                         ASSERT(rm->rm_firstdatacol > 1);
1454 
1455                         if (parity_valid[VDEV_RAIDZ_P] &&
1456                             parity_valid[VDEV_RAIDZ_Q])
1457                                 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1458 
1459                         ASSERT(rm->rm_firstdatacol > 2);
1460 
1461                         break;
1462                 }
1463         }
1464 
1465         code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1466         ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1467         ASSERT(code > 0);
1468         return (code);
1469 }
1470 
1471 static int
1472 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1473     uint64_t *ashift)
1474 {
1475         vdev_t *cvd;
1476         uint64_t nparity = vd->vdev_nparity;
1477         int c;
1478         int lasterror = 0;
1479         int numerrors = 0;
1480 
1481         ASSERT(nparity > 0);
1482 
1483         if (nparity > VDEV_RAIDZ_MAXPARITY ||
1484             vd->vdev_children < nparity + 1) {
1485                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1486                 return (SET_ERROR(EINVAL));
1487         }
1488 
1489         vdev_open_children(vd);
1490 
1491         for (c = 0; c < vd->vdev_children; c++) {
1492                 cvd = vd->vdev_child[c];
1493 
1494                 if (cvd->vdev_open_error != 0) {
1495                         lasterror = cvd->vdev_open_error;
1496                         numerrors++;
1497                         continue;
1498                 }
1499 
1500                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1501                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1502                 *ashift = MAX(*ashift, cvd->vdev_ashift);
1503         }
1504 
1505         *asize *= vd->vdev_children;
1506         *max_asize *= vd->vdev_children;
1507 
1508         if (numerrors > nparity) {
1509                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1510                 return (lasterror);
1511         }
1512 
1513         return (0);
1514 }
1515 
1516 static void
1517 vdev_raidz_close(vdev_t *vd)
1518 {
1519         int c;
1520 
1521         for (c = 0; c < vd->vdev_children; c++)
1522                 vdev_close(vd->vdev_child[c]);
1523 }
1524 
1525 static uint64_t
1526 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1527 {
1528         uint64_t asize;
1529         uint64_t ashift = vd->vdev_top->vdev_ashift;
1530         uint64_t cols = vd->vdev_children;
1531         uint64_t nparity = vd->vdev_nparity;
1532 
1533         asize = ((psize - 1) >> ashift) + 1;
1534         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1535         asize = roundup(asize, nparity + 1) << ashift;
1536 
1537         return (asize);
1538 }
1539 
1540 static void
1541 vdev_raidz_child_done(zio_t *zio)
1542 {
1543         raidz_col_t *rc = zio->io_private;
1544 
1545         rc->rc_error = zio->io_error;
1546         rc->rc_tried = 1;
1547         rc->rc_skipped = 0;
1548 }
1549 
1550 /*
1551  * Start an IO operation on a RAIDZ VDev
1552  *
1553  * Outline:
1554  * - For write operations:
1555  *   1. Generate the parity data
1556  *   2. Create child zio write operations to each column's vdev, for both
1557  *      data and parity.
1558  *   3. If the column skips any sectors for padding, create optional dummy
1559  *      write zio children for those areas to improve aggregation continuity.
1560  * - For read operations:
1561  *   1. Create child zio read operations to each data column's vdev to read
1562  *      the range of data required for zio.
1563  *   2. If this is a scrub or resilver operation, or if any of the data
1564  *      vdevs have had errors, then create zio read operations to the parity
1565  *      columns' VDevs as well.
1566  */
1567 static int
1568 vdev_raidz_io_start(zio_t *zio)
1569 {
1570         vdev_t *vd = zio->io_vd;
1571         vdev_t *tvd = vd->vdev_top;
1572         vdev_t *cvd;
1573         raidz_map_t *rm;
1574         raidz_col_t *rc;
1575         int c, i;
1576 
1577         rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1578             vd->vdev_nparity);
1579 
1580         ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1581 
1582         if (zio->io_type == ZIO_TYPE_WRITE) {
1583                 vdev_raidz_generate_parity(rm);
1584 
1585                 for (c = 0; c < rm->rm_cols; c++) {
1586                         rc = &rm->rm_col[c];
1587                         cvd = vd->vdev_child[rc->rc_devidx];
1588                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1589                             rc->rc_offset, rc->rc_data, rc->rc_size,
1590                             zio->io_type, zio->io_priority, 0,
1591                             vdev_raidz_child_done, rc));
1592                 }
1593 
1594                 /*
1595                  * Generate optional I/Os for any skipped sectors to improve
1596                  * aggregation contiguity.
1597                  */
1598                 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1599                         ASSERT(c <= rm->rm_scols);
1600                         if (c == rm->rm_scols)
1601                                 c = 0;
1602                         rc = &rm->rm_col[c];
1603                         cvd = vd->vdev_child[rc->rc_devidx];
1604                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1605                             rc->rc_offset + rc->rc_size, NULL,
1606                             1 << tvd->vdev_ashift,
1607                             zio->io_type, zio->io_priority,
1608                             ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1609                 }
1610 
1611                 return (ZIO_PIPELINE_CONTINUE);
1612         }
1613 
1614         ASSERT(zio->io_type == ZIO_TYPE_READ);
1615 
1616         /*
1617          * Iterate over the columns in reverse order so that we hit the parity
1618          * last -- any errors along the way will force us to read the parity.
1619          */
1620         for (c = rm->rm_cols - 1; c >= 0; c--) {
1621                 rc = &rm->rm_col[c];
1622                 cvd = vd->vdev_child[rc->rc_devidx];
1623                 if (!vdev_readable(cvd)) {
1624                         if (c >= rm->rm_firstdatacol)
1625                                 rm->rm_missingdata++;
1626                         else
1627                                 rm->rm_missingparity++;
1628                         rc->rc_error = SET_ERROR(ENXIO);
1629                         rc->rc_tried = 1;    /* don't even try */
1630                         rc->rc_skipped = 1;
1631                         continue;
1632                 }
1633                 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1634                         if (c >= rm->rm_firstdatacol)
1635                                 rm->rm_missingdata++;
1636                         else
1637                                 rm->rm_missingparity++;
1638                         rc->rc_error = SET_ERROR(ESTALE);
1639                         rc->rc_skipped = 1;
1640                         continue;
1641                 }
1642                 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1643                     (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1644                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1645                             rc->rc_offset, rc->rc_data, rc->rc_size,
1646                             zio->io_type, zio->io_priority, 0,
1647                             vdev_raidz_child_done, rc));
1648                 }
1649         }
1650 
1651         return (ZIO_PIPELINE_CONTINUE);
1652 }
1653 
1654 
1655 /*
1656  * Report a checksum error for a child of a RAID-Z device.
1657  */
1658 static void
1659 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1660 {
1661         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1662 
1663         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1664                 zio_bad_cksum_t zbc;
1665                 raidz_map_t *rm = zio->io_vsd;
1666 
1667                 mutex_enter(&vd->vdev_stat_lock);
1668                 vd->vdev_stat.vs_checksum_errors++;
1669                 mutex_exit(&vd->vdev_stat_lock);
1670 
1671                 zbc.zbc_has_cksum = 0;
1672                 zbc.zbc_injected = rm->rm_ecksuminjected;
1673 
1674                 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1675                     rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1676                     &zbc);
1677         }
1678 }
1679 
1680 /*
1681  * We keep track of whether or not there were any injected errors, so that
1682  * any ereports we generate can note it.
1683  */
1684 static int
1685 raidz_checksum_verify(zio_t *zio)
1686 {
1687         zio_bad_cksum_t zbc;
1688         raidz_map_t *rm = zio->io_vsd;
1689 
1690         int ret = zio_checksum_error(zio, &zbc);
1691         if (ret != 0 && zbc.zbc_injected != 0)
1692                 rm->rm_ecksuminjected = 1;
1693 
1694         return (ret);
1695 }
1696 
1697 /*
1698  * Generate the parity from the data columns. If we tried and were able to
1699  * read the parity without error, verify that the generated parity matches the
1700  * data we read. If it doesn't, we fire off a checksum error. Return the
1701  * number such failures.
1702  */
1703 static int
1704 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1705 {
1706         void *orig[VDEV_RAIDZ_MAXPARITY];
1707         int c, ret = 0;
1708         raidz_col_t *rc;
1709 
1710         for (c = 0; c < rm->rm_firstdatacol; c++) {
1711                 rc = &rm->rm_col[c];
1712                 if (!rc->rc_tried || rc->rc_error != 0)
1713                         continue;
1714                 orig[c] = zio_buf_alloc(rc->rc_size);
1715                 bcopy(rc->rc_data, orig[c], rc->rc_size);
1716         }
1717 
1718         vdev_raidz_generate_parity(rm);
1719 
1720         for (c = 0; c < rm->rm_firstdatacol; c++) {
1721                 rc = &rm->rm_col[c];
1722                 if (!rc->rc_tried || rc->rc_error != 0)
1723                         continue;
1724                 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1725                         raidz_checksum_error(zio, rc, orig[c]);
1726                         rc->rc_error = SET_ERROR(ECKSUM);
1727                         ret++;
1728                 }
1729                 zio_buf_free(orig[c], rc->rc_size);
1730         }
1731 
1732         return (ret);
1733 }
1734 
1735 /*
1736  * Keep statistics on all the ways that we used parity to correct data.
1737  */
1738 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1739 
1740 static int
1741 vdev_raidz_worst_error(raidz_map_t *rm)
1742 {
1743         int error = 0;
1744 
1745         for (int c = 0; c < rm->rm_cols; c++)
1746                 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1747 
1748         return (error);
1749 }
1750 
1751 /*
1752  * Iterate over all combinations of bad data and attempt a reconstruction.
1753  * Note that the algorithm below is non-optimal because it doesn't take into
1754  * account how reconstruction is actually performed. For example, with
1755  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1756  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1757  * cases we'd only use parity information in column 0.
1758  */
1759 static int
1760 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1761 {
1762         raidz_map_t *rm = zio->io_vsd;
1763         raidz_col_t *rc;
1764         void *orig[VDEV_RAIDZ_MAXPARITY];
1765         int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1766         int *tgts = &tstore[1];
1767         int current, next, i, c, n;
1768         int code, ret = 0;
1769 
1770         ASSERT(total_errors < rm->rm_firstdatacol);
1771 
1772         /*
1773          * This simplifies one edge condition.
1774          */
1775         tgts[-1] = -1;
1776 
1777         for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1778                 /*
1779                  * Initialize the targets array by finding the first n columns
1780                  * that contain no error.
1781                  *
1782                  * If there were no data errors, we need to ensure that we're
1783                  * always explicitly attempting to reconstruct at least one
1784                  * data column. To do this, we simply push the highest target
1785                  * up into the data columns.
1786                  */
1787                 for (c = 0, i = 0; i < n; i++) {
1788                         if (i == n - 1 && data_errors == 0 &&
1789                             c < rm->rm_firstdatacol) {
1790                                 c = rm->rm_firstdatacol;
1791                         }
1792 
1793                         while (rm->rm_col[c].rc_error != 0) {
1794                                 c++;
1795                                 ASSERT3S(c, <, rm->rm_cols);
1796                         }
1797 
1798                         tgts[i] = c++;
1799                 }
1800 
1801                 /*
1802                  * Setting tgts[n] simplifies the other edge condition.
1803                  */
1804                 tgts[n] = rm->rm_cols;
1805 
1806                 /*
1807                  * These buffers were allocated in previous iterations.
1808                  */
1809                 for (i = 0; i < n - 1; i++) {
1810                         ASSERT(orig[i] != NULL);
1811                 }
1812 
1813                 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1814 
1815                 current = 0;
1816                 next = tgts[current];
1817 
1818                 while (current != n) {
1819                         tgts[current] = next;
1820                         current = 0;
1821 
1822                         /*
1823                          * Save off the original data that we're going to
1824                          * attempt to reconstruct.
1825                          */
1826                         for (i = 0; i < n; i++) {
1827                                 ASSERT(orig[i] != NULL);
1828                                 c = tgts[i];
1829                                 ASSERT3S(c, >=, 0);
1830                                 ASSERT3S(c, <, rm->rm_cols);
1831                                 rc = &rm->rm_col[c];
1832                                 bcopy(rc->rc_data, orig[i], rc->rc_size);
1833                         }
1834 
1835                         /*
1836                          * Attempt a reconstruction and exit the outer loop on
1837                          * success.
1838                          */
1839                         code = vdev_raidz_reconstruct(rm, tgts, n);
1840                         if (raidz_checksum_verify(zio) == 0) {
1841                                 atomic_inc_64(&raidz_corrected[code]);
1842 
1843                                 for (i = 0; i < n; i++) {
1844                                         c = tgts[i];
1845                                         rc = &rm->rm_col[c];
1846                                         ASSERT(rc->rc_error == 0);
1847                                         if (rc->rc_tried)
1848                                                 raidz_checksum_error(zio, rc,
1849                                                     orig[i]);
1850                                         rc->rc_error = SET_ERROR(ECKSUM);
1851                                 }
1852 
1853                                 ret = code;
1854                                 goto done;
1855                         }
1856 
1857                         /*
1858                          * Restore the original data.
1859                          */
1860                         for (i = 0; i < n; i++) {
1861                                 c = tgts[i];
1862                                 rc = &rm->rm_col[c];
1863                                 bcopy(orig[i], rc->rc_data, rc->rc_size);
1864                         }
1865 
1866                         do {
1867                                 /*
1868                                  * Find the next valid column after the current
1869                                  * position..
1870                                  */
1871                                 for (next = tgts[current] + 1;
1872                                     next < rm->rm_cols &&
1873                                     rm->rm_col[next].rc_error != 0; next++)
1874                                         continue;
1875 
1876                                 ASSERT(next <= tgts[current + 1]);
1877 
1878                                 /*
1879                                  * If that spot is available, we're done here.
1880                                  */
1881                                 if (next != tgts[current + 1])
1882                                         break;
1883 
1884                                 /*
1885                                  * Otherwise, find the next valid column after
1886                                  * the previous position.
1887                                  */
1888                                 for (c = tgts[current - 1] + 1;
1889                                     rm->rm_col[c].rc_error != 0; c++)
1890                                         continue;
1891 
1892                                 tgts[current] = c;
1893                                 current++;
1894 
1895                         } while (current != n);
1896                 }
1897         }
1898         n--;
1899 done:
1900         for (i = 0; i < n; i++) {
1901                 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1902         }
1903 
1904         return (ret);
1905 }
1906 
1907 /*
1908  * Complete an IO operation on a RAIDZ VDev
1909  *
1910  * Outline:
1911  * - For write operations:
1912  *   1. Check for errors on the child IOs.
1913  *   2. Return, setting an error code if too few child VDevs were written
1914  *      to reconstruct the data later.  Note that partial writes are
1915  *      considered successful if they can be reconstructed at all.
1916  * - For read operations:
1917  *   1. Check for errors on the child IOs.
1918  *   2. If data errors occurred:
1919  *      a. Try to reassemble the data from the parity available.
1920  *      b. If we haven't yet read the parity drives, read them now.
1921  *      c. If all parity drives have been read but the data still doesn't
1922  *         reassemble with a correct checksum, then try combinatorial
1923  *         reconstruction.
1924  *      d. If that doesn't work, return an error.
1925  *   3. If there were unexpected errors or this is a resilver operation,
1926  *      rewrite the vdevs that had errors.
1927  */
1928 static void
1929 vdev_raidz_io_done(zio_t *zio)
1930 {
1931         vdev_t *vd = zio->io_vd;
1932         vdev_t *cvd;
1933         raidz_map_t *rm = zio->io_vsd;
1934         raidz_col_t *rc;
1935         int unexpected_errors = 0;
1936         int parity_errors = 0;
1937         int parity_untried = 0;
1938         int data_errors = 0;
1939         int total_errors = 0;
1940         int n, c;
1941         int tgts[VDEV_RAIDZ_MAXPARITY];
1942         int code;
1943 
1944         ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
1945 
1946         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1947         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1948 
1949         for (c = 0; c < rm->rm_cols; c++) {
1950                 rc = &rm->rm_col[c];
1951 
1952                 if (rc->rc_error) {
1953                         ASSERT(rc->rc_error != ECKSUM);      /* child has no bp */
1954 
1955                         if (c < rm->rm_firstdatacol)
1956                                 parity_errors++;
1957                         else
1958                                 data_errors++;
1959 
1960                         if (!rc->rc_skipped)
1961                                 unexpected_errors++;
1962 
1963                         total_errors++;
1964                 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1965                         parity_untried++;
1966                 }
1967         }
1968 
1969         if (zio->io_type == ZIO_TYPE_WRITE) {
1970                 /*
1971                  * XXX -- for now, treat partial writes as a success.
1972                  * (If we couldn't write enough columns to reconstruct
1973                  * the data, the I/O failed.  Otherwise, good enough.)
1974                  *
1975                  * Now that we support write reallocation, it would be better
1976                  * to treat partial failure as real failure unless there are
1977                  * no non-degraded top-level vdevs left, and not update DTLs
1978                  * if we intend to reallocate.
1979                  */
1980                 /* XXPOLICY */
1981                 if (total_errors > rm->rm_firstdatacol)
1982                         zio->io_error = vdev_raidz_worst_error(rm);
1983 
1984                 return;
1985         }
1986 
1987         ASSERT(zio->io_type == ZIO_TYPE_READ);
1988         /*
1989          * There are three potential phases for a read:
1990          *      1. produce valid data from the columns read
1991          *      2. read all disks and try again
1992          *      3. perform combinatorial reconstruction
1993          *
1994          * Each phase is progressively both more expensive and less likely to
1995          * occur. If we encounter more errors than we can repair or all phases
1996          * fail, we have no choice but to return an error.
1997          */
1998 
1999         /*
2000          * If the number of errors we saw was correctable -- less than or equal
2001          * to the number of parity disks read -- attempt to produce data that
2002          * has a valid checksum. Naturally, this case applies in the absence of
2003          * any errors.
2004          */
2005         if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2006                 if (data_errors == 0) {
2007                         if (raidz_checksum_verify(zio) == 0) {
2008                                 /*
2009                                  * If we read parity information (unnecessarily
2010                                  * as it happens since no reconstruction was
2011                                  * needed) regenerate and verify the parity.
2012                                  * We also regenerate parity when resilvering
2013                                  * so we can write it out to the failed device
2014                                  * later.
2015                                  */
2016                                 if (parity_errors + parity_untried <
2017                                     rm->rm_firstdatacol ||
2018                                     (zio->io_flags & ZIO_FLAG_RESILVER)) {
2019                                         n = raidz_parity_verify(zio, rm);
2020                                         unexpected_errors += n;
2021                                         ASSERT(parity_errors + n <=
2022                                             rm->rm_firstdatacol);
2023                                 }
2024                                 goto done;
2025                         }
2026                 } else {
2027                         /*
2028                          * We either attempt to read all the parity columns or
2029                          * none of them. If we didn't try to read parity, we
2030                          * wouldn't be here in the correctable case. There must
2031                          * also have been fewer parity errors than parity
2032                          * columns or, again, we wouldn't be in this code path.
2033                          */
2034                         ASSERT(parity_untried == 0);
2035                         ASSERT(parity_errors < rm->rm_firstdatacol);
2036 
2037                         /*
2038                          * Identify the data columns that reported an error.
2039                          */
2040                         n = 0;
2041                         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2042                                 rc = &rm->rm_col[c];
2043                                 if (rc->rc_error != 0) {
2044                                         ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2045                                         tgts[n++] = c;
2046                                 }
2047                         }
2048 
2049                         ASSERT(rm->rm_firstdatacol >= n);
2050 
2051                         code = vdev_raidz_reconstruct(rm, tgts, n);
2052 
2053                         if (raidz_checksum_verify(zio) == 0) {
2054                                 atomic_inc_64(&raidz_corrected[code]);
2055 
2056                                 /*
2057                                  * If we read more parity disks than were used
2058                                  * for reconstruction, confirm that the other
2059                                  * parity disks produced correct data. This
2060                                  * routine is suboptimal in that it regenerates
2061                                  * the parity that we already used in addition
2062                                  * to the parity that we're attempting to
2063                                  * verify, but this should be a relatively
2064                                  * uncommon case, and can be optimized if it
2065                                  * becomes a problem. Note that we regenerate
2066                                  * parity when resilvering so we can write it
2067                                  * out to failed devices later.
2068                                  */
2069                                 if (parity_errors < rm->rm_firstdatacol - n ||
2070                                     (zio->io_flags & ZIO_FLAG_RESILVER)) {
2071                                         n = raidz_parity_verify(zio, rm);
2072                                         unexpected_errors += n;
2073                                         ASSERT(parity_errors + n <=
2074                                             rm->rm_firstdatacol);
2075                                 }
2076 
2077                                 goto done;
2078                         }
2079                 }
2080         }
2081 
2082         /*
2083          * This isn't a typical situation -- either we got a read error or
2084          * a child silently returned bad data. Read every block so we can
2085          * try again with as much data and parity as we can track down. If
2086          * we've already been through once before, all children will be marked
2087          * as tried so we'll proceed to combinatorial reconstruction.
2088          */
2089         unexpected_errors = 1;
2090         rm->rm_missingdata = 0;
2091         rm->rm_missingparity = 0;
2092 
2093         for (c = 0; c < rm->rm_cols; c++) {
2094                 if (rm->rm_col[c].rc_tried)
2095                         continue;
2096 
2097                 zio_vdev_io_redone(zio);
2098                 do {
2099                         rc = &rm->rm_col[c];
2100                         if (rc->rc_tried)
2101                                 continue;
2102                         zio_nowait(zio_vdev_child_io(zio, NULL,
2103                             vd->vdev_child[rc->rc_devidx],
2104                             rc->rc_offset, rc->rc_data, rc->rc_size,
2105                             zio->io_type, zio->io_priority, 0,
2106                             vdev_raidz_child_done, rc));
2107                 } while (++c < rm->rm_cols);
2108 
2109                 return;
2110         }
2111 
2112         /*
2113          * At this point we've attempted to reconstruct the data given the
2114          * errors we detected, and we've attempted to read all columns. There
2115          * must, therefore, be one or more additional problems -- silent errors
2116          * resulting in invalid data rather than explicit I/O errors resulting
2117          * in absent data. We check if there is enough additional data to
2118          * possibly reconstruct the data and then perform combinatorial
2119          * reconstruction over all possible combinations. If that fails,
2120          * we're cooked.
2121          */
2122         if (total_errors > rm->rm_firstdatacol) {
2123                 zio->io_error = vdev_raidz_worst_error(rm);
2124 
2125         } else if (total_errors < rm->rm_firstdatacol &&
2126             (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2127                 /*
2128                  * If we didn't use all the available parity for the
2129                  * combinatorial reconstruction, verify that the remaining
2130                  * parity is correct.
2131                  */
2132                 if (code != (1 << rm->rm_firstdatacol) - 1)
2133                         (void) raidz_parity_verify(zio, rm);
2134         } else {
2135                 /*
2136                  * We're here because either:
2137                  *
2138                  *      total_errors == rm_first_datacol, or
2139                  *      vdev_raidz_combrec() failed
2140                  *
2141                  * In either case, there is enough bad data to prevent
2142                  * reconstruction.
2143                  *
2144                  * Start checksum ereports for all children which haven't
2145                  * failed, and the IO wasn't speculative.
2146                  */
2147                 zio->io_error = SET_ERROR(ECKSUM);
2148 
2149                 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2150                         for (c = 0; c < rm->rm_cols; c++) {
2151                                 rc = &rm->rm_col[c];
2152                                 if (rc->rc_error == 0) {
2153                                         zio_bad_cksum_t zbc;
2154                                         zbc.zbc_has_cksum = 0;
2155                                         zbc.zbc_injected =
2156                                             rm->rm_ecksuminjected;
2157 
2158                                         zfs_ereport_start_checksum(
2159                                             zio->io_spa,
2160                                             vd->vdev_child[rc->rc_devidx],
2161                                             zio, rc->rc_offset, rc->rc_size,
2162                                             (void *)(uintptr_t)c, &zbc);
2163                                 }
2164                         }
2165                 }
2166         }
2167 
2168 done:
2169         zio_checksum_verified(zio);
2170 
2171         if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2172             (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2173                 /*
2174                  * Use the good data we have in hand to repair damaged children.
2175                  */
2176                 for (c = 0; c < rm->rm_cols; c++) {
2177                         rc = &rm->rm_col[c];
2178                         cvd = vd->vdev_child[rc->rc_devidx];
2179 
2180                         if (rc->rc_error == 0)
2181                                 continue;
2182 
2183                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2184                             rc->rc_offset, rc->rc_data, rc->rc_size,
2185                             ZIO_TYPE_WRITE, zio->io_priority,
2186                             ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2187                             ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2188                 }
2189         }
2190 }
2191 
2192 static void
2193 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2194 {
2195         if (faulted > vd->vdev_nparity)
2196                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2197                     VDEV_AUX_NO_REPLICAS);
2198         else if (degraded + faulted != 0)
2199                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2200         else
2201                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2202 }
2203 
2204 vdev_ops_t vdev_raidz_ops = {
2205         vdev_raidz_open,
2206         vdev_raidz_close,
2207         vdev_raidz_asize,
2208         vdev_raidz_io_start,
2209         vdev_raidz_io_done,
2210         vdev_raidz_state_change,
2211         NULL,
2212         NULL,
2213         VDEV_TYPE_RAIDZ,        /* name of this vdev type */
2214         B_FALSE                 /* not a leaf vdev */
2215 };