/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition
 * rather than field addition).
 * The inverse of a field element A (A^-1) is therefore A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
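
/*
 * To see the 64-bit multiply-by-2 in action on a single lane: for the byte
 * 0x80 the per-byte mask becomes 0xff, so the result is
 * ((0x80 << 1) & 0xfe) ^ 0x1d == 0x1d, exactly (uint8_t)VDEV_RAIDZ_MUL_2(0x80).
 * A minimal (illustrative) check:
 *
 *	uint64_t x = 0x8001020304050607ULL, mask;
 *	VDEV_RAIDZ_64MUL_2(x, mask);
 *	ASSERT3U(x >> 56, ==, (uint8_t)VDEV_RAIDZ_MUL_2(0x80));
 */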

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);
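
/*
 * With these tables, general field multiplication reduces to adding logs
 * mod 255 and exponentiating the sum. A hypothetical helper (shown for
 * illustration only; the code below uses vdev_raidz_exp2() instead):
 *
 *	static uint8_t
 *	vdev_raidz_mul(uint8_t a, uint8_t b)
 *	{
 *		int log;
 *
 *		if (a == 0 || b == 0)
 *			return (0);
 *		log = vdev_raidz_log2[a] + vdev_raidz_log2[b];
 *		if (log > 255)
 *			log -= 255;
 *		return (vdev_raidz_pow2[log]);
 *	}
 *
 * For example, 3 * 5: log2[3] = 25, log2[5] = 50, and pow2[75] = 0x0f.
 */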

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	if (rm->rm_datacopy != NULL)
		zio_buf_free(rm->rm_datacopy, size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size */
	uint64_t s = zio->io_size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;
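
	/*
	 * To illustrate the geometry above (numbers are hypothetical): a
	 * 16K write to a 5-wide raidz1 with 512-byte sectors has s = 32
	 * data sectors over dcols - nparity = 4 data columns, so q = 8,
	 * r = 0, bc = 0 and tot = 32 + 1 * 8 = 40; all five columns are
	 * accessed and rm_nskip = roundup(40, 2) - 40 = 0. A 17K write
	 * instead gives s = 34, q = 8, r = 2, bc = 3, tot = 43 and
	 * rm_nskip = 1 padding sector.
	 */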

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	return (rm);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}
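
/*
 * The PQ and PQR generators below evaluate Q and R by Horner's rule: for
 * each successive data column the running parity is first multiplied by the
 * generator (2 for Q, 4 for R) and the new column is then added in. With
 * three data columns, for instance, the Q loop computes
 * ((D_0 * 2) + D_1) * 2 + D_2 = 4 * D_0 + 2 * D_1 + D_2, which matches the
 * closed form given at the top of this file.
 */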
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int x = tgts[0];
	int c;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}

	return (1 << VDEV_RAIDZ_P);
}
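
/*
 * A sketch of the single-parity-Q recovery performed below: with column x
 * missing, Q + Q' = 2^(ndevs - 1 - x) * D_x, where Q' is Q recomputed with
 * D_x treated as zero. D_x is then recovered by multiplying by the inverse
 * coefficient, 2^-(ndevs - 1 - x) = 2^(255 - (ndevs - 1 - x)), which is
 * exactly the exponent passed to vdev_raidz_exp2() in
 * vdev_raidz_reconstruct_q().
 */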
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
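
/*
 * As a sanity check of the algebra above: substituting D_y = (P + Pxy) + D_x
 * into the Qxy equation gives
 * Q + Qxy = (2^(ndevs-1-x) + 2^(ndevs-1-y)) * D_x + 2^(ndevs-1-y) * (P + Pxy);
 * solving for D_x and simplifying in GF(2^8) yields exactly the A and B
 * coefficients computed from the log and power tables above.
 */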

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *	__   __                     __     __
 *	|     |     __     __       |  p_0  |
 *	|  V  |     |  D_0  |       | p_m-1 |
 *	|     |  x  |   :   |   =   |  d_0  |
 *	|  I  |     | D_n-1 |       |   :   |
 *	|     |     ~~     ~~       | d_n-1 |
 *	~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
 *
 *	__                  __               __     __
 *	| 1     ..  1  1  1 |               |  p_0  |
 *	| 2^n-1 ..  4  2  1 |   __     __   |   :   |
 *	| 4^n-1 .. 16  4  1 |   |  D_0  |   | p_m-1 |
 *	|   1   ..  0  0  0 |   |  D_1  |   |  d_0  |
 *	|   0   ..  0  0  0 | x |  D_2  | = |  d_1  |
 *	|   :       :  :  : |   |   :   |   |  d_2  |
 *	|   0   ..  1  0  0 |   | D_n-1 |   |   :   |
 *	|   0   ..  0  1  0 |   ~~     ~~   |   :   |
 *	|   0   ..  0  0  1 |               | d_n-1 |
 *	~~                  ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *	        __                               __
 *	        |  1   1   1   1   1   1   1   1  |
 *	        | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *	        |  19 205 116  29  64  16   4   1 |      / /
 *	        |  1   0   0   0   0   0   0   0  |     / /
 *	        |  0   1   0   0   0   0   0   0  | <--' /
 *	(V|I) = |  0   0   1   0   0   0   0   0  | <---'
 *	        |  0   0   0   1   0   0   0   0  |
 *	        |  0   0   0   0   1   0   0   0  |
 *	        |  0   0   0   0   0   1   0   0  |
 *	        |  0   0   0   0   0   0   1   0  |
 *	        |  0   0   0   0   0   0   0   1  |
 *	        ~~                               ~~
 *	         __                               __
 *	         |  1   1   1   1   1   1   1   1  |
 *	         |  19 205 116  29  64  16   4   1 |
 *	         |  1   0   0   0   0   0   0   0  |
 *	(V|I)' = |  0   0   0   1   0   0   0   0  |
 *	         |  0   0   0   0   1   0   0   0  |
 *	         |  0   0   0   0   0   1   0   0  |
 *	         |  0   0   0   0   0   0   1   0  |
 *	         |  0   0   0   0   0   0   0   1  |
 *	         ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *	__                                                                 __
 *	|  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 *	| 19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 *	|  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *	|  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *	|  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *	|  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *	|  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *	|  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *	~~                                                                 ~~
 *	__                                                                 __
 *	|  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *	|  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 *	| 19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 *	|  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *	|  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *	|  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *	|  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *	|  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *	~~                                                                 ~~
 *	__                                                                 __
 *	|  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *	|  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *	|  0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1 |
 *	|  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *	|  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *	|  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *	|  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *	|  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *	~~                                                                 ~~
 *	__                                                                 __
 *	|  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *	|  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *	|  0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204 |
 *	|  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *	|  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *	|  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *	|  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *	|  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *	~~                                                                 ~~
 *	__                                                                 __
 *	|  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *	|  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *	|  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 *	|  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *	|  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *	|  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *	|  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *	|  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *	~~                                                                 ~~
 *	__                                                                 __
 *	|  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *	|  0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208 |
 *	|  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 *	|  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *	|  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *	|  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *	|  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *	|  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *	~~                                                                 ~~
 *
 *	             __                               __
 *	             |  0   0   1   0   0   0   0   0  |
 *	             | 167 100   5  41 159 169 217 208 |
 *	             | 166 100   4  40 158 168 216 209 |
 *	(V|I)'^-1 =  |  0   0   0   1   0   0   0   0  |
 *	             |  0   0   0   0   1   0   0   0  |
 *	             |  0   0   0   0   0   1   0   0  |
 *	             |  0   0   0   0   0   0   1   0  |
 *	             |  0   0   0   0   0   0   0   1  |
 *	             ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */
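
/*
 * vdev_raidz_matrix_init() below builds those rows directly from powers of
 * 2: row i corresponds to parity column map[i] (0 for P, 1 for Q, 2 for R),
 * and entry j is (2^map[i])^(n-1-j), computed as vdev_raidz_pow2[] of
 * map[i] * (n-1-j) mod 255. For P (map[i] == 0) every entry is 1, for Q the
 * row reads 2^(n-1) .. 4 2 1, and for R it reads 4^(n-1) .. 16 4 1, matching
 * the Vandermonde rows shown above.
 */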

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
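
/*
 * A small hand-checked example of the elimination above: with n = 2 data
 * columns and both of them missing in a raidz2, the rows of interest are
 *
 *	| 1 1 |		(P)
 *	| 2 1 |		(Q)
 *
 * Gauss-Jordan elimination over GF(2^8) gives the inverse
 *
 *	| 0xf4 0xf4 |
 *	| 0xf5 0xf4 |
 *
 * where 0xf4 = 3^-1 = vdev_raidz_pow2[255 - vdev_raidz_log2[3]], and one can
 * verify that 3 * 0xf4 = 0xf4 ^ (2 * 0xf4) = 0xf4 ^ 0xf5 = 1.
 */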

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
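
/*
 * The reconstruction above is a matrix-vector multiply restricted to the
 * missing rows: for each surviving column i it accumulates
 * invrows[j][i] * src into each missing column j, using the precomputed
 * logs in invlog[] so that every byte multiply costs one addition mod 255
 * and one table lookup.
 */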

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
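
/*
 * The value returned by vdev_raidz_reconstruct_general() (and by the
 * specialized routines above) is a bitmask of the parity columns consulted:
 * bit 0 for P, bit 1 for Q, bit 2 for R. Reconstructing two data columns
 * from P and R, for example, yields
 * (1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_R) == 0x5. The raidz_corrected[]
 * counters further down are indexed by this code.
 */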

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
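
/*
 * Worked example of the asize computation above (illustrative numbers): for
 * a 6-wide raidz2 (cols = 6, nparity = 2) with ashift = 9 and a 3-sector
 * psize, asize starts at 3 sectors, grows by 2 * roundup(3 / 4) = 2 parity
 * sectors to 5, and is then rounded up to a multiple of nparity + 1 = 3,
 * giving 6 sectors, or 3072 bytes.
 */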

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation continuity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
		    &zbc);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Keep statistics on all the ways that we used parity to correct data.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
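
/*
 * The inner loop above advances tgts[] like an odometer. For instance, with
 * columns 0..4 all valid and n = 2, the candidate sets are visited in the
 * order (0,1), (0,2), (1,2), (0,3), (1,3), (2,3), (0,4), (1,4), (2,4),
 * (3,4), stopping as soon as a reconstruction passes the checksum.
 */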
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily,
				 * as it happens, since no reconstruction was
				 * needed), regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
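			 * Each becomes a reconstruction target; e.g. on
			 * raidz3, errors in data columns 4 and 6 yield
			 * tgts[] = { 4, 6 } and n = 2, leaving one parity
			 * column unused and still verifiable below.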
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, provided the I/O wasn't speculative.
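		 * The raw column index rides along as the ereport's private
		 * argument below, identifying which column each report
		 * covers. Speculative I/Os are skipped, presumably because
		 * a real read will follow if the data is actually needed
		 * and the ereports can be generated then.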
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,			/* vdev_op_hold */
	NULL,			/* vdev_op_rele */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
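
/*
 * The stand-alone sketch below (compiled out; illustrative only) mirrors the
 * combination walk in vdev_raidz_combrec() for a fixed target count: the
 * tgts[n] sentinel, the tgts[-1] = -1 edge case, and the carry/ratchet in
 * the do/while loop are the same; only the raidz map is replaced by a small
 * bad[] array and the reconstruction attempt by a printf(). The column
 * count, target count, and bad[] contents are arbitrary choices for the
 * example. Extracted into its own file, it compiles with any C compiler and
 * prints all C(4, 2) = 6 target sets over the four error-free columns.
 */
#if 0
#include <stdio.h>

#define	SKETCH_COLS	5	/* columns in the hypothetical map */
#define	SKETCH_N	2	/* columns to target at once */

static int bad[SKETCH_COLS] = { 0, 1, 0, 0, 0 };	/* column 1 is bad */

int
main(void)
{
	int tstore[SKETCH_N + 2], *tgts = &tstore[1];
	int current, next, c, i;

	tgts[-1] = -1;			/* simplifies the carry below */
	for (c = 0, i = 0; i < SKETCH_N; i++) {	/* first n valid columns */
		while (bad[c])
			c++;
		tgts[i] = c++;
	}
	tgts[SKETCH_N] = SKETCH_COLS;	/* sentinel, as in combrec */

	current = 0;
	next = tgts[current];

	while (current != SKETCH_N) {
		tgts[current] = next;
		current = 0;

		for (i = 0; i < SKETCH_N; i++)	/* "attempt reconstruction" */
			printf("%d%c", tgts[i],
			    i == SKETCH_N - 1 ? '\n' : ' ');

		do {
			/* Find the next valid column after this target. */
			for (next = tgts[current] + 1;
			    next < SKETCH_COLS && bad[next]; next++)
				continue;

			/* If that spot is free, commit it next iteration. */
			if (next != tgts[current + 1])
				break;

			/* Otherwise reset this target and carry upward. */
			for (c = tgts[current - 1] + 1; bad[c]; c++)
				continue;

			tgts[current] = c;
			current++;
		} while (current != SKETCH_N);
	}
	return (0);
}
#endif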