1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2013 by Delphix. All rights reserved. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/spa.h> 29 #include <sys/vdev_impl.h> 30 #include <sys/zio.h> 31 #include <sys/zio_checksum.h> 32 #include <sys/fs/zfs.h> 33 #include <sys/fm/fs/zfs.h> 34 35 /* 36 * Virtual device vector for RAID-Z. 37 * 38 * This vdev supports single, double, and triple parity. For single parity, 39 * we use a simple XOR of all the data columns. For double or triple parity, 40 * we use a special case of Reed-Solomon coding. This extends the 41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 44 * former is also based. The latter is designed to provide higher performance 45 * for writes. 46 * 47 * Note that the Plank paper claimed to support arbitrary N+M, but was then 48 * amended six years later identifying a critical flaw that invalidates its 49 * claims. Nevertheless, the technique can be adapted to work for up to 50 * triple parity. For additional parity, the amendment "Note: Correction to 51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 52 * is viable, but the additional complexity means that write performance will 53 * suffer. 54 * 55 * All of the methods above operate on a Galois field, defined over the 56 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 57 * can be expressed with a single byte. Briefly, the operations on the 58 * field are defined as follows: 59 * 60 * o addition (+) is represented by a bitwise XOR 61 * o subtraction (-) is therefore identical to addition: A + B = A - B 62 * o multiplication of A by 2 is defined by the following bitwise expression: 63 * 64 * (A * 2)_7 = A_6 65 * (A * 2)_6 = A_5 66 * (A * 2)_5 = A_4 67 * (A * 2)_4 = A_3 + A_7 68 * (A * 2)_3 = A_2 + A_7 69 * (A * 2)_2 = A_1 + A_7 70 * (A * 2)_1 = A_0 71 * (A * 2)_0 = A_7 72 * 73 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 74 * As an aside, this multiplication is derived from the error correcting 75 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 76 * 77 * Observe that any number in the field (except for 0) can be expressed as a 78 * power of 2 -- a generator for the field. We store a table of the powers of 79 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 80 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 81 * than field addition). The inverse of a field element A (A^-1) is therefore 82 * A ^ (255 - 1) = A^254. 83 * 84 * The up-to-three parity columns, P, Q, R over several data columns, 85 * D_0, ... D_n-1, can be expressed by field operations: 86 * 87 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 88 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 89 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 90 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 91 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 92 * 93 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 94 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 95 * independent coefficients. (There are no additional coefficients that have 96 * this property which is why the uncorrected Plank method breaks down.) 97 * 98 * See the reconstruction code below for how P, Q and R can used individually 99 * or in concert to recover missing data columns. 100 */ 101 102 typedef struct raidz_col { 103 uint64_t rc_devidx; /* child device index for I/O */ 104 uint64_t rc_offset; /* device offset */ 105 uint64_t rc_size; /* I/O size */ 106 void *rc_data; /* I/O data */ 107 void *rc_gdata; /* used to store the "good" version */ 108 int rc_error; /* I/O error for this device */ 109 uint8_t rc_tried; /* Did we attempt this I/O column? */ 110 uint8_t rc_skipped; /* Did we skip this I/O column? */ 111 } raidz_col_t; 112 113 typedef struct raidz_map { 114 uint64_t rm_cols; /* Regular column count */ 115 uint64_t rm_scols; /* Count including skipped columns */ 116 uint64_t rm_bigcols; /* Number of oversized columns */ 117 uint64_t rm_asize; /* Actual total I/O size */ 118 uint64_t rm_missingdata; /* Count of missing data devices */ 119 uint64_t rm_missingparity; /* Count of missing parity devices */ 120 uint64_t rm_firstdatacol; /* First data column/parity count */ 121 uint64_t rm_nskip; /* Skipped sectors for padding */ 122 uint64_t rm_skipstart; /* Column index of padding start */ 123 void *rm_datacopy; /* rm_asize-buffer of copied data */ 124 uintptr_t rm_reports; /* # of referencing checksum reports */ 125 uint8_t rm_freed; /* map no longer has referencing ZIO */ 126 uint8_t rm_ecksuminjected; /* checksum error was injected */ 127 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 128 } raidz_map_t; 129 130 #define VDEV_RAIDZ_P 0 131 #define VDEV_RAIDZ_Q 1 132 #define VDEV_RAIDZ_R 2 133 134 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 135 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 136 137 /* 138 * We provide a mechanism to perform the field multiplication operation on a 139 * 64-bit value all at once rather than a byte at a time. This works by 140 * creating a mask from the top bit in each byte and using that to 141 * conditionally apply the XOR of 0x1d. 142 */ 143 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 144 { \ 145 (mask) = (x) & 0x8080808080808080ULL; \ 146 (mask) = ((mask) << 1) - ((mask) >> 7); \ 147 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 148 ((mask) & 0x1d1d1d1d1d1d1d1d); \ 149 } 150 151 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 152 { \ 153 VDEV_RAIDZ_64MUL_2((x), mask); \ 154 VDEV_RAIDZ_64MUL_2((x), mask); \ 155 } 156 157 /* 158 * Force reconstruction to use the general purpose method. 159 */ 160 int vdev_raidz_default_to_general; 161 162 /* Powers of 2 in the Galois field defined above. */ 163 static const uint8_t vdev_raidz_pow2[256] = { 164 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 165 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 166 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 167 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 168 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 169 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 170 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 171 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 172 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 173 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 174 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 175 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 176 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 177 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 178 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 179 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 180 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 181 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 182 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 183 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 184 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 185 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 186 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 187 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 188 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 189 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 190 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 191 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 192 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 193 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 194 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 195 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 196 }; 197 /* Logs of 2 in the Galois field defined above. */ 198 static const uint8_t vdev_raidz_log2[256] = { 199 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 200 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 201 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 202 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 203 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 204 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 205 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 206 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 207 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 208 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 209 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 210 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 211 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 212 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 213 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 214 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 215 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 216 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 217 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 218 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 219 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 220 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 221 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 222 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 223 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 224 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 225 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 226 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 227 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 228 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 229 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 230 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 231 }; 232 233 static void vdev_raidz_generate_parity(raidz_map_t *rm); 234 235 /* 236 * Multiply a given number by 2 raised to the given power. 237 */ 238 static uint8_t 239 vdev_raidz_exp2(uint_t a, int exp) 240 { 241 if (a == 0) 242 return (0); 243 244 ASSERT(exp >= 0); 245 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 246 247 exp += vdev_raidz_log2[a]; 248 if (exp > 255) 249 exp -= 255; 250 251 return (vdev_raidz_pow2[exp]); 252 } 253 254 static void 255 vdev_raidz_map_free(raidz_map_t *rm) 256 { 257 int c; 258 size_t size; 259 260 for (c = 0; c < rm->rm_firstdatacol; c++) { 261 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 262 263 if (rm->rm_col[c].rc_gdata != NULL) 264 zio_buf_free(rm->rm_col[c].rc_gdata, 265 rm->rm_col[c].rc_size); 266 } 267 268 size = 0; 269 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 270 size += rm->rm_col[c].rc_size; 271 272 if (rm->rm_datacopy != NULL) 273 zio_buf_free(rm->rm_datacopy, size); 274 275 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 276 } 277 278 static void 279 vdev_raidz_map_free_vsd(zio_t *zio) 280 { 281 raidz_map_t *rm = zio->io_vsd; 282 283 ASSERT0(rm->rm_freed); 284 rm->rm_freed = 1; 285 286 if (rm->rm_reports == 0) 287 vdev_raidz_map_free(rm); 288 } 289 290 /*ARGSUSED*/ 291 static void 292 vdev_raidz_cksum_free(void *arg, size_t ignored) 293 { 294 raidz_map_t *rm = arg; 295 296 ASSERT3U(rm->rm_reports, >, 0); 297 298 if (--rm->rm_reports == 0 && rm->rm_freed != 0) 299 vdev_raidz_map_free(rm); 300 } 301 302 static void 303 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 304 { 305 raidz_map_t *rm = zcr->zcr_cbdata; 306 size_t c = zcr->zcr_cbinfo; 307 size_t x; 308 309 const char *good = NULL; 310 const char *bad = rm->rm_col[c].rc_data; 311 312 if (good_data == NULL) { 313 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 314 return; 315 } 316 317 if (c < rm->rm_firstdatacol) { 318 /* 319 * The first time through, calculate the parity blocks for 320 * the good data (this relies on the fact that the good 321 * data never changes for a given logical ZIO) 322 */ 323 if (rm->rm_col[0].rc_gdata == NULL) { 324 char *bad_parity[VDEV_RAIDZ_MAXPARITY]; 325 char *buf; 326 327 /* 328 * Set up the rm_col[]s to generate the parity for 329 * good_data, first saving the parity bufs and 330 * replacing them with buffers to hold the result. 331 */ 332 for (x = 0; x < rm->rm_firstdatacol; x++) { 333 bad_parity[x] = rm->rm_col[x].rc_data; 334 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = 335 zio_buf_alloc(rm->rm_col[x].rc_size); 336 } 337 338 /* fill in the data columns from good_data */ 339 buf = (char *)good_data; 340 for (; x < rm->rm_cols; x++) { 341 rm->rm_col[x].rc_data = buf; 342 buf += rm->rm_col[x].rc_size; 343 } 344 345 /* 346 * Construct the parity from the good data. 347 */ 348 vdev_raidz_generate_parity(rm); 349 350 /* restore everything back to its original state */ 351 for (x = 0; x < rm->rm_firstdatacol; x++) 352 rm->rm_col[x].rc_data = bad_parity[x]; 353 354 buf = rm->rm_datacopy; 355 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 356 rm->rm_col[x].rc_data = buf; 357 buf += rm->rm_col[x].rc_size; 358 } 359 } 360 361 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 362 good = rm->rm_col[c].rc_gdata; 363 } else { 364 /* adjust good_data to point at the start of our column */ 365 good = good_data; 366 367 for (x = rm->rm_firstdatacol; x < c; x++) 368 good += rm->rm_col[x].rc_size; 369 } 370 371 /* we drop the ereport if it ends up that the data was good */ 372 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 373 } 374 375 /* 376 * Invoked indirectly by zfs_ereport_start_checksum(), called 377 * below when our read operation fails completely. The main point 378 * is to keep a copy of everything we read from disk, so that at 379 * vdev_raidz_cksum_finish() time we can compare it with the good data. 380 */ 381 static void 382 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 383 { 384 size_t c = (size_t)(uintptr_t)arg; 385 caddr_t buf; 386 387 raidz_map_t *rm = zio->io_vsd; 388 size_t size; 389 390 /* set up the report and bump the refcount */ 391 zcr->zcr_cbdata = rm; 392 zcr->zcr_cbinfo = c; 393 zcr->zcr_finish = vdev_raidz_cksum_finish; 394 zcr->zcr_free = vdev_raidz_cksum_free; 395 396 rm->rm_reports++; 397 ASSERT3U(rm->rm_reports, >, 0); 398 399 if (rm->rm_datacopy != NULL) 400 return; 401 402 /* 403 * It's the first time we're called for this raidz_map_t, so we need 404 * to copy the data aside; there's no guarantee that our zio's buffer 405 * won't be re-used for something else. 406 * 407 * Our parity data is already in separate buffers, so there's no need 408 * to copy them. 409 */ 410 411 size = 0; 412 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 413 size += rm->rm_col[c].rc_size; 414 415 buf = rm->rm_datacopy = zio_buf_alloc(size); 416 417 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 418 raidz_col_t *col = &rm->rm_col[c]; 419 420 bcopy(col->rc_data, buf, col->rc_size); 421 col->rc_data = buf; 422 423 buf += col->rc_size; 424 } 425 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); 426 } 427 428 static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 429 vdev_raidz_map_free_vsd, 430 vdev_raidz_cksum_report 431 }; 432 433 /* 434 * Divides the IO evenly across all child vdevs; usually, dcols is 435 * the number of children in the target vdev. 436 */ 437 static raidz_map_t * 438 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 439 uint64_t nparity) 440 { 441 raidz_map_t *rm; 442 /* The starting RAIDZ (parent) vdev sector of the block. */ 443 uint64_t b = zio->io_offset >> unit_shift; 444 /* The zio's size in units of the vdev's minimum sector size. */ 445 uint64_t s = zio->io_size >> unit_shift; 446 /* The first column for this stripe. */ 447 uint64_t f = b % dcols; 448 /* The starting byte offset on each child vdev. */ 449 uint64_t o = (b / dcols) << unit_shift; 450 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 451 452 /* 453 * "Quotient": The number of data sectors for this stripe on all but 454 * the "big column" child vdevs that also contain "remainder" data. 455 */ 456 q = s / (dcols - nparity); 457 458 /* 459 * "Remainder": The number of partial stripe data sectors in this I/O. 460 * This will add a sector to some, but not all, child vdevs. 461 */ 462 r = s - q * (dcols - nparity); 463 464 /* The number of "big columns" - those which contain remainder data. */ 465 bc = (r == 0 ? 0 : r + nparity); 466 467 /* 468 * The total number of data and parity sectors associated with 469 * this I/O. 470 */ 471 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 472 473 /* acols: The columns that will be accessed. */ 474 /* scols: The columns that will be accessed or skipped. */ 475 if (q == 0) { 476 /* Our I/O request doesn't span all child vdevs. */ 477 acols = bc; 478 scols = MIN(dcols, roundup(bc, nparity + 1)); 479 } else { 480 acols = dcols; 481 scols = dcols; 482 } 483 484 ASSERT3U(acols, <=, scols); 485 486 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 487 488 rm->rm_cols = acols; 489 rm->rm_scols = scols; 490 rm->rm_bigcols = bc; 491 rm->rm_skipstart = bc; 492 rm->rm_missingdata = 0; 493 rm->rm_missingparity = 0; 494 rm->rm_firstdatacol = nparity; 495 rm->rm_datacopy = NULL; 496 rm->rm_reports = 0; 497 rm->rm_freed = 0; 498 rm->rm_ecksuminjected = 0; 499 500 asize = 0; 501 502 for (c = 0; c < scols; c++) { 503 col = f + c; 504 coff = o; 505 if (col >= dcols) { 506 col -= dcols; 507 coff += 1ULL << unit_shift; 508 } 509 rm->rm_col[c].rc_devidx = col; 510 rm->rm_col[c].rc_offset = coff; 511 rm->rm_col[c].rc_data = NULL; 512 rm->rm_col[c].rc_gdata = NULL; 513 rm->rm_col[c].rc_error = 0; 514 rm->rm_col[c].rc_tried = 0; 515 rm->rm_col[c].rc_skipped = 0; 516 517 if (c >= acols) 518 rm->rm_col[c].rc_size = 0; 519 else if (c < bc) 520 rm->rm_col[c].rc_size = (q + 1) << unit_shift; 521 else 522 rm->rm_col[c].rc_size = q << unit_shift; 523 524 asize += rm->rm_col[c].rc_size; 525 } 526 527 ASSERT3U(asize, ==, tot << unit_shift); 528 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 529 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 530 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 531 ASSERT3U(rm->rm_nskip, <=, nparity); 532 533 for (c = 0; c < rm->rm_firstdatacol; c++) 534 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 535 536 rm->rm_col[c].rc_data = zio->io_data; 537 538 for (c = c + 1; c < acols; c++) 539 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 540 rm->rm_col[c - 1].rc_size; 541 542 /* 543 * If all data stored spans all columns, there's a danger that parity 544 * will always be on the same device and, since parity isn't read 545 * during normal operation, that that device's I/O bandwidth won't be 546 * used effectively. We therefore switch the parity every 1MB. 547 * 548 * ... at least that was, ostensibly, the theory. As a practical 549 * matter unless we juggle the parity between all devices evenly, we 550 * won't see any benefit. Further, occasional writes that aren't a 551 * multiple of the LCM of the number of children and the minimum 552 * stripe width are sufficient to avoid pessimal behavior. 553 * Unfortunately, this decision created an implicit on-disk format 554 * requirement that we need to support for all eternity, but only 555 * for single-parity RAID-Z. 556 * 557 * If we intend to skip a sector in the zeroth column for padding 558 * we must make sure to note this swap. We will never intend to 559 * skip the first column since at least one data and one parity 560 * column must appear in each row. 561 */ 562 ASSERT(rm->rm_cols >= 2); 563 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 564 565 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 566 devidx = rm->rm_col[0].rc_devidx; 567 o = rm->rm_col[0].rc_offset; 568 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 569 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 570 rm->rm_col[1].rc_devidx = devidx; 571 rm->rm_col[1].rc_offset = o; 572 573 if (rm->rm_skipstart == 0) 574 rm->rm_skipstart = 1; 575 } 576 577 zio->io_vsd = rm; 578 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 579 return (rm); 580 } 581 582 static void 583 vdev_raidz_generate_parity_p(raidz_map_t *rm) 584 { 585 uint64_t *p, *src, pcount, ccount, i; 586 int c; 587 588 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 589 590 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 591 src = rm->rm_col[c].rc_data; 592 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 593 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 594 595 if (c == rm->rm_firstdatacol) { 596 ASSERT(ccount == pcount); 597 for (i = 0; i < ccount; i++, src++, p++) { 598 *p = *src; 599 } 600 } else { 601 ASSERT(ccount <= pcount); 602 for (i = 0; i < ccount; i++, src++, p++) { 603 *p ^= *src; 604 } 605 } 606 } 607 } 608 609 static void 610 vdev_raidz_generate_parity_pq(raidz_map_t *rm) 611 { 612 uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 613 int c; 614 615 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 616 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 617 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 618 619 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 620 src = rm->rm_col[c].rc_data; 621 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 622 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 623 624 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 625 626 if (c == rm->rm_firstdatacol) { 627 ASSERT(ccnt == pcnt || ccnt == 0); 628 for (i = 0; i < ccnt; i++, src++, p++, q++) { 629 *p = *src; 630 *q = *src; 631 } 632 for (; i < pcnt; i++, src++, p++, q++) { 633 *p = 0; 634 *q = 0; 635 } 636 } else { 637 ASSERT(ccnt <= pcnt); 638 639 /* 640 * Apply the algorithm described above by multiplying 641 * the previous result and adding in the new value. 642 */ 643 for (i = 0; i < ccnt; i++, src++, p++, q++) { 644 *p ^= *src; 645 646 VDEV_RAIDZ_64MUL_2(*q, mask); 647 *q ^= *src; 648 } 649 650 /* 651 * Treat short columns as though they are full of 0s. 652 * Note that there's therefore nothing needed for P. 653 */ 654 for (; i < pcnt; i++, q++) { 655 VDEV_RAIDZ_64MUL_2(*q, mask); 656 } 657 } 658 } 659 } 660 661 static void 662 vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 663 { 664 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 665 int c; 666 667 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 668 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 669 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 670 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 671 rm->rm_col[VDEV_RAIDZ_R].rc_size); 672 673 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 674 src = rm->rm_col[c].rc_data; 675 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 676 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 677 r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 678 679 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 680 681 if (c == rm->rm_firstdatacol) { 682 ASSERT(ccnt == pcnt || ccnt == 0); 683 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 684 *p = *src; 685 *q = *src; 686 *r = *src; 687 } 688 for (; i < pcnt; i++, src++, p++, q++, r++) { 689 *p = 0; 690 *q = 0; 691 *r = 0; 692 } 693 } else { 694 ASSERT(ccnt <= pcnt); 695 696 /* 697 * Apply the algorithm described above by multiplying 698 * the previous result and adding in the new value. 699 */ 700 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 701 *p ^= *src; 702 703 VDEV_RAIDZ_64MUL_2(*q, mask); 704 *q ^= *src; 705 706 VDEV_RAIDZ_64MUL_4(*r, mask); 707 *r ^= *src; 708 } 709 710 /* 711 * Treat short columns as though they are full of 0s. 712 * Note that there's therefore nothing needed for P. 713 */ 714 for (; i < pcnt; i++, q++, r++) { 715 VDEV_RAIDZ_64MUL_2(*q, mask); 716 VDEV_RAIDZ_64MUL_4(*r, mask); 717 } 718 } 719 } 720 } 721 722 /* 723 * Generate RAID parity in the first virtual columns according to the number of 724 * parity columns available. 725 */ 726 static void 727 vdev_raidz_generate_parity(raidz_map_t *rm) 728 { 729 switch (rm->rm_firstdatacol) { 730 case 1: 731 vdev_raidz_generate_parity_p(rm); 732 break; 733 case 2: 734 vdev_raidz_generate_parity_pq(rm); 735 break; 736 case 3: 737 vdev_raidz_generate_parity_pqr(rm); 738 break; 739 default: 740 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 741 } 742 } 743 744 static int 745 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 746 { 747 uint64_t *dst, *src, xcount, ccount, count, i; 748 int x = tgts[0]; 749 int c; 750 751 ASSERT(ntgts == 1); 752 ASSERT(x >= rm->rm_firstdatacol); 753 ASSERT(x < rm->rm_cols); 754 755 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 756 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 757 ASSERT(xcount > 0); 758 759 src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 760 dst = rm->rm_col[x].rc_data; 761 for (i = 0; i < xcount; i++, dst++, src++) { 762 *dst = *src; 763 } 764 765 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 766 src = rm->rm_col[c].rc_data; 767 dst = rm->rm_col[x].rc_data; 768 769 if (c == x) 770 continue; 771 772 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 773 count = MIN(ccount, xcount); 774 775 for (i = 0; i < count; i++, dst++, src++) { 776 *dst ^= *src; 777 } 778 } 779 780 return (1 << VDEV_RAIDZ_P); 781 } 782 783 static int 784 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 785 { 786 uint64_t *dst, *src, xcount, ccount, count, mask, i; 787 uint8_t *b; 788 int x = tgts[0]; 789 int c, j, exp; 790 791 ASSERT(ntgts == 1); 792 793 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 794 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 795 796 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 797 src = rm->rm_col[c].rc_data; 798 dst = rm->rm_col[x].rc_data; 799 800 if (c == x) 801 ccount = 0; 802 else 803 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 804 805 count = MIN(ccount, xcount); 806 807 if (c == rm->rm_firstdatacol) { 808 for (i = 0; i < count; i++, dst++, src++) { 809 *dst = *src; 810 } 811 for (; i < xcount; i++, dst++) { 812 *dst = 0; 813 } 814 815 } else { 816 for (i = 0; i < count; i++, dst++, src++) { 817 VDEV_RAIDZ_64MUL_2(*dst, mask); 818 *dst ^= *src; 819 } 820 821 for (; i < xcount; i++, dst++) { 822 VDEV_RAIDZ_64MUL_2(*dst, mask); 823 } 824 } 825 } 826 827 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 828 dst = rm->rm_col[x].rc_data; 829 exp = 255 - (rm->rm_cols - 1 - x); 830 831 for (i = 0; i < xcount; i++, dst++, src++) { 832 *dst ^= *src; 833 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 834 *b = vdev_raidz_exp2(*b, exp); 835 } 836 } 837 838 return (1 << VDEV_RAIDZ_Q); 839 } 840 841 static int 842 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 843 { 844 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 845 void *pdata, *qdata; 846 uint64_t xsize, ysize, i; 847 int x = tgts[0]; 848 int y = tgts[1]; 849 850 ASSERT(ntgts == 2); 851 ASSERT(x < y); 852 ASSERT(x >= rm->rm_firstdatacol); 853 ASSERT(y < rm->rm_cols); 854 855 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 856 857 /* 858 * Move the parity data aside -- we're going to compute parity as 859 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 860 * reuse the parity generation mechanism without trashing the actual 861 * parity so we make those columns appear to be full of zeros by 862 * setting their lengths to zero. 863 */ 864 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 865 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 866 xsize = rm->rm_col[x].rc_size; 867 ysize = rm->rm_col[y].rc_size; 868 869 rm->rm_col[VDEV_RAIDZ_P].rc_data = 870 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 871 rm->rm_col[VDEV_RAIDZ_Q].rc_data = 872 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 873 rm->rm_col[x].rc_size = 0; 874 rm->rm_col[y].rc_size = 0; 875 876 vdev_raidz_generate_parity_pq(rm); 877 878 rm->rm_col[x].rc_size = xsize; 879 rm->rm_col[y].rc_size = ysize; 880 881 p = pdata; 882 q = qdata; 883 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 884 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 885 xd = rm->rm_col[x].rc_data; 886 yd = rm->rm_col[y].rc_data; 887 888 /* 889 * We now have: 890 * Pxy = P + D_x + D_y 891 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 892 * 893 * We can then solve for D_x: 894 * D_x = A * (P + Pxy) + B * (Q + Qxy) 895 * where 896 * A = 2^(x - y) * (2^(x - y) + 1)^-1 897 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 898 * 899 * With D_x in hand, we can easily solve for D_y: 900 * D_y = P + Pxy + D_x 901 */ 902 903 a = vdev_raidz_pow2[255 + x - y]; 904 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 905 tmp = 255 - vdev_raidz_log2[a ^ 1]; 906 907 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 908 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 909 910 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 911 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 912 vdev_raidz_exp2(*q ^ *qxy, bexp); 913 914 if (i < ysize) 915 *yd = *p ^ *pxy ^ *xd; 916 } 917 918 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 919 rm->rm_col[VDEV_RAIDZ_P].rc_size); 920 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 921 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 922 923 /* 924 * Restore the saved parity data. 925 */ 926 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 927 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 928 929 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 930 } 931 932 /* BEGIN CSTYLED */ 933 /* 934 * In the general case of reconstruction, we must solve the system of linear 935 * equations defined by the coeffecients used to generate parity as well as 936 * the contents of the data and parity disks. This can be expressed with 937 * vectors for the original data (D) and the actual data (d) and parity (p) 938 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 939 * 940 * __ __ __ __ 941 * | | __ __ | p_0 | 942 * | V | | D_0 | | p_m-1 | 943 * | | x | : | = | d_0 | 944 * | I | | D_n-1 | | : | 945 * | | ~~ ~~ | d_n-1 | 946 * ~~ ~~ ~~ ~~ 947 * 948 * I is simply a square identity matrix of size n, and V is a vandermonde 949 * matrix defined by the coeffecients we chose for the various parity columns 950 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 951 * computation as well as linear separability. 952 * 953 * __ __ __ __ 954 * | 1 .. 1 1 1 | | p_0 | 955 * | 2^n-1 .. 4 2 1 | __ __ | : | 956 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 957 * | 1 .. 0 0 0 | | D_1 | | d_0 | 958 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 959 * | : : : : | | : | | d_2 | 960 * | 0 .. 1 0 0 | | D_n-1 | | : | 961 * | 0 .. 0 1 0 | ~~ ~~ | : | 962 * | 0 .. 0 0 1 | | d_n-1 | 963 * ~~ ~~ ~~ ~~ 964 * 965 * Note that I, V, d, and p are known. To compute D, we must invert the 966 * matrix and use the known data and parity values to reconstruct the unknown 967 * data values. We begin by removing the rows in V|I and d|p that correspond 968 * to failed or missing columns; we then make V|I square (n x n) and d|p 969 * sized n by removing rows corresponding to unused parity from the bottom up 970 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 971 * using Gauss-Jordan elimination. In the example below we use m=3 parity 972 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 973 * __ __ 974 * | 1 1 1 1 1 1 1 1 | 975 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 976 * | 19 205 116 29 64 16 4 1 | / / 977 * | 1 0 0 0 0 0 0 0 | / / 978 * | 0 1 0 0 0 0 0 0 | <--' / 979 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 980 * | 0 0 0 1 0 0 0 0 | 981 * | 0 0 0 0 1 0 0 0 | 982 * | 0 0 0 0 0 1 0 0 | 983 * | 0 0 0 0 0 0 1 0 | 984 * | 0 0 0 0 0 0 0 1 | 985 * ~~ ~~ 986 * __ __ 987 * | 1 1 1 1 1 1 1 1 | 988 * | 128 64 32 16 8 4 2 1 | 989 * | 19 205 116 29 64 16 4 1 | 990 * | 1 0 0 0 0 0 0 0 | 991 * | 0 1 0 0 0 0 0 0 | 992 * (V|I)' = | 0 0 1 0 0 0 0 0 | 993 * | 0 0 0 1 0 0 0 0 | 994 * | 0 0 0 0 1 0 0 0 | 995 * | 0 0 0 0 0 1 0 0 | 996 * | 0 0 0 0 0 0 1 0 | 997 * | 0 0 0 0 0 0 0 1 | 998 * ~~ ~~ 999 * 1000 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1001 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1002 * matrix is not singular. 1003 * __ __ 1004 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1005 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1006 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1007 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1008 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1009 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1010 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1011 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1012 * ~~ ~~ 1013 * __ __ 1014 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1015 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1016 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1017 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1018 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1019 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1020 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1021 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1022 * ~~ ~~ 1023 * __ __ 1024 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1025 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1026 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1027 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1028 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1029 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1030 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1031 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1032 * ~~ ~~ 1033 * __ __ 1034 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1035 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1036 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1037 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1038 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1039 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1040 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1041 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1042 * ~~ ~~ 1043 * __ __ 1044 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1045 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1046 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1047 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1048 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1049 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1050 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1051 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1052 * ~~ ~~ 1053 * __ __ 1054 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1055 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1056 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1057 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1058 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1059 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1060 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1061 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1062 * ~~ ~~ 1063 * __ __ 1064 * | 0 0 1 0 0 0 0 0 | 1065 * | 167 100 5 41 159 169 217 208 | 1066 * | 166 100 4 40 158 168 216 209 | 1067 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1068 * | 0 0 0 0 1 0 0 0 | 1069 * | 0 0 0 0 0 1 0 0 | 1070 * | 0 0 0 0 0 0 1 0 | 1071 * | 0 0 0 0 0 0 0 1 | 1072 * ~~ ~~ 1073 * 1074 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1075 * of the missing data. 1076 * 1077 * As is apparent from the example above, the only non-trivial rows in the 1078 * inverse matrix correspond to the data disks that we're trying to 1079 * reconstruct. Indeed, those are the only rows we need as the others would 1080 * only be useful for reconstructing data known or assumed to be valid. For 1081 * that reason, we only build the coefficients in the rows that correspond to 1082 * targeted columns. 1083 */ 1084 /* END CSTYLED */ 1085 1086 static void 1087 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1088 uint8_t **rows) 1089 { 1090 int i, j; 1091 int pow; 1092 1093 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1094 1095 /* 1096 * Fill in the missing rows of interest. 1097 */ 1098 for (i = 0; i < nmap; i++) { 1099 ASSERT3S(0, <=, map[i]); 1100 ASSERT3S(map[i], <=, 2); 1101 1102 pow = map[i] * n; 1103 if (pow > 255) 1104 pow -= 255; 1105 ASSERT(pow <= 255); 1106 1107 for (j = 0; j < n; j++) { 1108 pow -= map[i]; 1109 if (pow < 0) 1110 pow += 255; 1111 rows[i][j] = vdev_raidz_pow2[pow]; 1112 } 1113 } 1114 } 1115 1116 static void 1117 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1118 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1119 { 1120 int i, j, ii, jj; 1121 uint8_t log; 1122 1123 /* 1124 * Assert that the first nmissing entries from the array of used 1125 * columns correspond to parity columns and that subsequent entries 1126 * correspond to data columns. 1127 */ 1128 for (i = 0; i < nmissing; i++) { 1129 ASSERT3S(used[i], <, rm->rm_firstdatacol); 1130 } 1131 for (; i < n; i++) { 1132 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1133 } 1134 1135 /* 1136 * First initialize the storage where we'll compute the inverse rows. 1137 */ 1138 for (i = 0; i < nmissing; i++) { 1139 for (j = 0; j < n; j++) { 1140 invrows[i][j] = (i == j) ? 1 : 0; 1141 } 1142 } 1143 1144 /* 1145 * Subtract all trivial rows from the rows of consequence. 1146 */ 1147 for (i = 0; i < nmissing; i++) { 1148 for (j = nmissing; j < n; j++) { 1149 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1150 jj = used[j] - rm->rm_firstdatacol; 1151 ASSERT3S(jj, <, n); 1152 invrows[i][j] = rows[i][jj]; 1153 rows[i][jj] = 0; 1154 } 1155 } 1156 1157 /* 1158 * For each of the rows of interest, we must normalize it and subtract 1159 * a multiple of it from the other rows. 1160 */ 1161 for (i = 0; i < nmissing; i++) { 1162 for (j = 0; j < missing[i]; j++) { 1163 ASSERT0(rows[i][j]); 1164 } 1165 ASSERT3U(rows[i][missing[i]], !=, 0); 1166 1167 /* 1168 * Compute the inverse of the first element and multiply each 1169 * element in the row by that value. 1170 */ 1171 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1172 1173 for (j = 0; j < n; j++) { 1174 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1175 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1176 } 1177 1178 for (ii = 0; ii < nmissing; ii++) { 1179 if (i == ii) 1180 continue; 1181 1182 ASSERT3U(rows[ii][missing[i]], !=, 0); 1183 1184 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1185 1186 for (j = 0; j < n; j++) { 1187 rows[ii][j] ^= 1188 vdev_raidz_exp2(rows[i][j], log); 1189 invrows[ii][j] ^= 1190 vdev_raidz_exp2(invrows[i][j], log); 1191 } 1192 } 1193 } 1194 1195 /* 1196 * Verify that the data that is left in the rows are properly part of 1197 * an identity matrix. 1198 */ 1199 for (i = 0; i < nmissing; i++) { 1200 for (j = 0; j < n; j++) { 1201 if (j == missing[i]) { 1202 ASSERT3U(rows[i][j], ==, 1); 1203 } else { 1204 ASSERT0(rows[i][j]); 1205 } 1206 } 1207 } 1208 } 1209 1210 static void 1211 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1212 int *missing, uint8_t **invrows, const uint8_t *used) 1213 { 1214 int i, j, x, cc, c; 1215 uint8_t *src; 1216 uint64_t ccount; 1217 uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 1218 uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 1219 uint8_t log = 0; 1220 uint8_t val; 1221 int ll; 1222 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1223 uint8_t *p, *pp; 1224 size_t psize; 1225 1226 psize = sizeof (invlog[0][0]) * n * nmissing; 1227 p = kmem_alloc(psize, KM_SLEEP); 1228 1229 for (pp = p, i = 0; i < nmissing; i++) { 1230 invlog[i] = pp; 1231 pp += n; 1232 } 1233 1234 for (i = 0; i < nmissing; i++) { 1235 for (j = 0; j < n; j++) { 1236 ASSERT3U(invrows[i][j], !=, 0); 1237 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1238 } 1239 } 1240 1241 for (i = 0; i < n; i++) { 1242 c = used[i]; 1243 ASSERT3U(c, <, rm->rm_cols); 1244 1245 src = rm->rm_col[c].rc_data; 1246 ccount = rm->rm_col[c].rc_size; 1247 for (j = 0; j < nmissing; j++) { 1248 cc = missing[j] + rm->rm_firstdatacol; 1249 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1250 ASSERT3U(cc, <, rm->rm_cols); 1251 ASSERT3U(cc, !=, c); 1252 1253 dst[j] = rm->rm_col[cc].rc_data; 1254 dcount[j] = rm->rm_col[cc].rc_size; 1255 } 1256 1257 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1258 1259 for (x = 0; x < ccount; x++, src++) { 1260 if (*src != 0) 1261 log = vdev_raidz_log2[*src]; 1262 1263 for (cc = 0; cc < nmissing; cc++) { 1264 if (x >= dcount[cc]) 1265 continue; 1266 1267 if (*src == 0) { 1268 val = 0; 1269 } else { 1270 if ((ll = log + invlog[cc][i]) >= 255) 1271 ll -= 255; 1272 val = vdev_raidz_pow2[ll]; 1273 } 1274 1275 if (i == 0) 1276 dst[cc][x] = val; 1277 else 1278 dst[cc][x] ^= val; 1279 } 1280 } 1281 } 1282 1283 kmem_free(p, psize); 1284 } 1285 1286 static int 1287 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1288 { 1289 int n, i, c, t, tt; 1290 int nmissing_rows; 1291 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1292 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1293 1294 uint8_t *p, *pp; 1295 size_t psize; 1296 1297 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1298 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1299 uint8_t *used; 1300 1301 int code = 0; 1302 1303 1304 n = rm->rm_cols - rm->rm_firstdatacol; 1305 1306 /* 1307 * Figure out which data columns are missing. 1308 */ 1309 nmissing_rows = 0; 1310 for (t = 0; t < ntgts; t++) { 1311 if (tgts[t] >= rm->rm_firstdatacol) { 1312 missing_rows[nmissing_rows++] = 1313 tgts[t] - rm->rm_firstdatacol; 1314 } 1315 } 1316 1317 /* 1318 * Figure out which parity columns to use to help generate the missing 1319 * data columns. 1320 */ 1321 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1322 ASSERT(tt < ntgts); 1323 ASSERT(c < rm->rm_firstdatacol); 1324 1325 /* 1326 * Skip any targeted parity columns. 1327 */ 1328 if (c == tgts[tt]) { 1329 tt++; 1330 continue; 1331 } 1332 1333 code |= 1 << c; 1334 1335 parity_map[i] = c; 1336 i++; 1337 } 1338 1339 ASSERT(code != 0); 1340 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1341 1342 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1343 nmissing_rows * n + sizeof (used[0]) * n; 1344 p = kmem_alloc(psize, KM_SLEEP); 1345 1346 for (pp = p, i = 0; i < nmissing_rows; i++) { 1347 rows[i] = pp; 1348 pp += n; 1349 invrows[i] = pp; 1350 pp += n; 1351 } 1352 used = pp; 1353 1354 for (i = 0; i < nmissing_rows; i++) { 1355 used[i] = parity_map[i]; 1356 } 1357 1358 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1359 if (tt < nmissing_rows && 1360 c == missing_rows[tt] + rm->rm_firstdatacol) { 1361 tt++; 1362 continue; 1363 } 1364 1365 ASSERT3S(i, <, n); 1366 used[i] = c; 1367 i++; 1368 } 1369 1370 /* 1371 * Initialize the interesting rows of the matrix. 1372 */ 1373 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1374 1375 /* 1376 * Invert the matrix. 1377 */ 1378 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1379 invrows, used); 1380 1381 /* 1382 * Reconstruct the missing data using the generated matrix. 1383 */ 1384 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1385 invrows, used); 1386 1387 kmem_free(p, psize); 1388 1389 return (code); 1390 } 1391 1392 static int 1393 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1394 { 1395 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1396 int ntgts; 1397 int i, c; 1398 int code; 1399 int nbadparity, nbaddata; 1400 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1401 1402 /* 1403 * The tgts list must already be sorted. 1404 */ 1405 for (i = 1; i < nt; i++) { 1406 ASSERT(t[i] > t[i - 1]); 1407 } 1408 1409 nbadparity = rm->rm_firstdatacol; 1410 nbaddata = rm->rm_cols - nbadparity; 1411 ntgts = 0; 1412 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1413 if (c < rm->rm_firstdatacol) 1414 parity_valid[c] = B_FALSE; 1415 1416 if (i < nt && c == t[i]) { 1417 tgts[ntgts++] = c; 1418 i++; 1419 } else if (rm->rm_col[c].rc_error != 0) { 1420 tgts[ntgts++] = c; 1421 } else if (c >= rm->rm_firstdatacol) { 1422 nbaddata--; 1423 } else { 1424 parity_valid[c] = B_TRUE; 1425 nbadparity--; 1426 } 1427 } 1428 1429 ASSERT(ntgts >= nt); 1430 ASSERT(nbaddata >= 0); 1431 ASSERT(nbaddata + nbadparity == ntgts); 1432 1433 dt = &tgts[nbadparity]; 1434 1435 /* 1436 * See if we can use any of our optimized reconstruction routines. 1437 */ 1438 if (!vdev_raidz_default_to_general) { 1439 switch (nbaddata) { 1440 case 1: 1441 if (parity_valid[VDEV_RAIDZ_P]) 1442 return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1443 1444 ASSERT(rm->rm_firstdatacol > 1); 1445 1446 if (parity_valid[VDEV_RAIDZ_Q]) 1447 return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1448 1449 ASSERT(rm->rm_firstdatacol > 2); 1450 break; 1451 1452 case 2: 1453 ASSERT(rm->rm_firstdatacol > 1); 1454 1455 if (parity_valid[VDEV_RAIDZ_P] && 1456 parity_valid[VDEV_RAIDZ_Q]) 1457 return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1458 1459 ASSERT(rm->rm_firstdatacol > 2); 1460 1461 break; 1462 } 1463 } 1464 1465 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1466 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1467 ASSERT(code > 0); 1468 return (code); 1469 } 1470 1471 static int 1472 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1473 uint64_t *ashift) 1474 { 1475 vdev_t *cvd; 1476 uint64_t nparity = vd->vdev_nparity; 1477 int c; 1478 int lasterror = 0; 1479 int numerrors = 0; 1480 1481 ASSERT(nparity > 0); 1482 1483 if (nparity > VDEV_RAIDZ_MAXPARITY || 1484 vd->vdev_children < nparity + 1) { 1485 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1486 return (SET_ERROR(EINVAL)); 1487 } 1488 1489 vdev_open_children(vd); 1490 1491 for (c = 0; c < vd->vdev_children; c++) { 1492 cvd = vd->vdev_child[c]; 1493 1494 if (cvd->vdev_open_error != 0) { 1495 lasterror = cvd->vdev_open_error; 1496 numerrors++; 1497 continue; 1498 } 1499 1500 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1501 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1502 *ashift = MAX(*ashift, cvd->vdev_ashift); 1503 } 1504 1505 *asize *= vd->vdev_children; 1506 *max_asize *= vd->vdev_children; 1507 1508 if (numerrors > nparity) { 1509 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1510 return (lasterror); 1511 } 1512 1513 return (0); 1514 } 1515 1516 static void 1517 vdev_raidz_close(vdev_t *vd) 1518 { 1519 int c; 1520 1521 for (c = 0; c < vd->vdev_children; c++) 1522 vdev_close(vd->vdev_child[c]); 1523 } 1524 1525 static uint64_t 1526 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1527 { 1528 uint64_t asize; 1529 uint64_t ashift = vd->vdev_top->vdev_ashift; 1530 uint64_t cols = vd->vdev_children; 1531 uint64_t nparity = vd->vdev_nparity; 1532 1533 asize = ((psize - 1) >> ashift) + 1; 1534 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1535 asize = roundup(asize, nparity + 1) << ashift; 1536 1537 return (asize); 1538 } 1539 1540 static void 1541 vdev_raidz_child_done(zio_t *zio) 1542 { 1543 raidz_col_t *rc = zio->io_private; 1544 1545 rc->rc_error = zio->io_error; 1546 rc->rc_tried = 1; 1547 rc->rc_skipped = 0; 1548 } 1549 1550 /* 1551 * Start an IO operation on a RAIDZ VDev 1552 * 1553 * Outline: 1554 * - For write operations: 1555 * 1. Generate the parity data 1556 * 2. Create child zio write operations to each column's vdev, for both 1557 * data and parity. 1558 * 3. If the column skips any sectors for padding, create optional dummy 1559 * write zio children for those areas to improve aggregation continuity. 1560 * - For read operations: 1561 * 1. Create child zio read operations to each data column's vdev to read 1562 * the range of data required for zio. 1563 * 2. If this is a scrub or resilver operation, or if any of the data 1564 * vdevs have had errors, then create zio read operations to the parity 1565 * columns' VDevs as well. 1566 */ 1567 static int 1568 vdev_raidz_io_start(zio_t *zio) 1569 { 1570 vdev_t *vd = zio->io_vd; 1571 vdev_t *tvd = vd->vdev_top; 1572 vdev_t *cvd; 1573 raidz_map_t *rm; 1574 raidz_col_t *rc; 1575 int c, i; 1576 1577 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 1578 vd->vdev_nparity); 1579 1580 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1581 1582 if (zio->io_type == ZIO_TYPE_WRITE) { 1583 vdev_raidz_generate_parity(rm); 1584 1585 for (c = 0; c < rm->rm_cols; c++) { 1586 rc = &rm->rm_col[c]; 1587 cvd = vd->vdev_child[rc->rc_devidx]; 1588 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1589 rc->rc_offset, rc->rc_data, rc->rc_size, 1590 zio->io_type, zio->io_priority, 0, 1591 vdev_raidz_child_done, rc)); 1592 } 1593 1594 /* 1595 * Generate optional I/Os for any skipped sectors to improve 1596 * aggregation contiguity. 1597 */ 1598 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1599 ASSERT(c <= rm->rm_scols); 1600 if (c == rm->rm_scols) 1601 c = 0; 1602 rc = &rm->rm_col[c]; 1603 cvd = vd->vdev_child[rc->rc_devidx]; 1604 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1605 rc->rc_offset + rc->rc_size, NULL, 1606 1 << tvd->vdev_ashift, 1607 zio->io_type, zio->io_priority, 1608 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1609 } 1610 1611 return (ZIO_PIPELINE_CONTINUE); 1612 } 1613 1614 ASSERT(zio->io_type == ZIO_TYPE_READ); 1615 1616 /* 1617 * Iterate over the columns in reverse order so that we hit the parity 1618 * last -- any errors along the way will force us to read the parity. 1619 */ 1620 for (c = rm->rm_cols - 1; c >= 0; c--) { 1621 rc = &rm->rm_col[c]; 1622 cvd = vd->vdev_child[rc->rc_devidx]; 1623 if (!vdev_readable(cvd)) { 1624 if (c >= rm->rm_firstdatacol) 1625 rm->rm_missingdata++; 1626 else 1627 rm->rm_missingparity++; 1628 rc->rc_error = SET_ERROR(ENXIO); 1629 rc->rc_tried = 1; /* don't even try */ 1630 rc->rc_skipped = 1; 1631 continue; 1632 } 1633 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1634 if (c >= rm->rm_firstdatacol) 1635 rm->rm_missingdata++; 1636 else 1637 rm->rm_missingparity++; 1638 rc->rc_error = SET_ERROR(ESTALE); 1639 rc->rc_skipped = 1; 1640 continue; 1641 } 1642 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 1643 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1644 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1645 rc->rc_offset, rc->rc_data, rc->rc_size, 1646 zio->io_type, zio->io_priority, 0, 1647 vdev_raidz_child_done, rc)); 1648 } 1649 } 1650 1651 return (ZIO_PIPELINE_CONTINUE); 1652 } 1653 1654 1655 /* 1656 * Report a checksum error for a child of a RAID-Z device. 1657 */ 1658 static void 1659 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 1660 { 1661 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1662 1663 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1664 zio_bad_cksum_t zbc; 1665 raidz_map_t *rm = zio->io_vsd; 1666 1667 mutex_enter(&vd->vdev_stat_lock); 1668 vd->vdev_stat.vs_checksum_errors++; 1669 mutex_exit(&vd->vdev_stat_lock); 1670 1671 zbc.zbc_has_cksum = 0; 1672 zbc.zbc_injected = rm->rm_ecksuminjected; 1673 1674 zfs_ereport_post_checksum(zio->io_spa, vd, zio, 1675 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, 1676 &zbc); 1677 } 1678 } 1679 1680 /* 1681 * We keep track of whether or not there were any injected errors, so that 1682 * any ereports we generate can note it. 1683 */ 1684 static int 1685 raidz_checksum_verify(zio_t *zio) 1686 { 1687 zio_bad_cksum_t zbc; 1688 raidz_map_t *rm = zio->io_vsd; 1689 1690 int ret = zio_checksum_error(zio, &zbc); 1691 if (ret != 0 && zbc.zbc_injected != 0) 1692 rm->rm_ecksuminjected = 1; 1693 1694 return (ret); 1695 } 1696 1697 /* 1698 * Generate the parity from the data columns. If we tried and were able to 1699 * read the parity without error, verify that the generated parity matches the 1700 * data we read. If it doesn't, we fire off a checksum error. Return the 1701 * number such failures. 1702 */ 1703 static int 1704 raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 1705 { 1706 void *orig[VDEV_RAIDZ_MAXPARITY]; 1707 int c, ret = 0; 1708 raidz_col_t *rc; 1709 1710 for (c = 0; c < rm->rm_firstdatacol; c++) { 1711 rc = &rm->rm_col[c]; 1712 if (!rc->rc_tried || rc->rc_error != 0) 1713 continue; 1714 orig[c] = zio_buf_alloc(rc->rc_size); 1715 bcopy(rc->rc_data, orig[c], rc->rc_size); 1716 } 1717 1718 vdev_raidz_generate_parity(rm); 1719 1720 for (c = 0; c < rm->rm_firstdatacol; c++) { 1721 rc = &rm->rm_col[c]; 1722 if (!rc->rc_tried || rc->rc_error != 0) 1723 continue; 1724 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1725 raidz_checksum_error(zio, rc, orig[c]); 1726 rc->rc_error = SET_ERROR(ECKSUM); 1727 ret++; 1728 } 1729 zio_buf_free(orig[c], rc->rc_size); 1730 } 1731 1732 return (ret); 1733 } 1734 1735 /* 1736 * Keep statistics on all the ways that we used parity to correct data. 1737 */ 1738 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 1739 1740 static int 1741 vdev_raidz_worst_error(raidz_map_t *rm) 1742 { 1743 int error = 0; 1744 1745 for (int c = 0; c < rm->rm_cols; c++) 1746 error = zio_worst_error(error, rm->rm_col[c].rc_error); 1747 1748 return (error); 1749 } 1750 1751 /* 1752 * Iterate over all combinations of bad data and attempt a reconstruction. 1753 * Note that the algorithm below is non-optimal because it doesn't take into 1754 * account how reconstruction is actually performed. For example, with 1755 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1756 * is targeted as invalid as if columns 1 and 4 are targeted since in both 1757 * cases we'd only use parity information in column 0. 1758 */ 1759 static int 1760 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 1761 { 1762 raidz_map_t *rm = zio->io_vsd; 1763 raidz_col_t *rc; 1764 void *orig[VDEV_RAIDZ_MAXPARITY]; 1765 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1766 int *tgts = &tstore[1]; 1767 int current, next, i, c, n; 1768 int code, ret = 0; 1769 1770 ASSERT(total_errors < rm->rm_firstdatacol); 1771 1772 /* 1773 * This simplifies one edge condition. 1774 */ 1775 tgts[-1] = -1; 1776 1777 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1778 /* 1779 * Initialize the targets array by finding the first n columns 1780 * that contain no error. 1781 * 1782 * If there were no data errors, we need to ensure that we're 1783 * always explicitly attempting to reconstruct at least one 1784 * data column. To do this, we simply push the highest target 1785 * up into the data columns. 1786 */ 1787 for (c = 0, i = 0; i < n; i++) { 1788 if (i == n - 1 && data_errors == 0 && 1789 c < rm->rm_firstdatacol) { 1790 c = rm->rm_firstdatacol; 1791 } 1792 1793 while (rm->rm_col[c].rc_error != 0) { 1794 c++; 1795 ASSERT3S(c, <, rm->rm_cols); 1796 } 1797 1798 tgts[i] = c++; 1799 } 1800 1801 /* 1802 * Setting tgts[n] simplifies the other edge condition. 1803 */ 1804 tgts[n] = rm->rm_cols; 1805 1806 /* 1807 * These buffers were allocated in previous iterations. 1808 */ 1809 for (i = 0; i < n - 1; i++) { 1810 ASSERT(orig[i] != NULL); 1811 } 1812 1813 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); 1814 1815 current = 0; 1816 next = tgts[current]; 1817 1818 while (current != n) { 1819 tgts[current] = next; 1820 current = 0; 1821 1822 /* 1823 * Save off the original data that we're going to 1824 * attempt to reconstruct. 1825 */ 1826 for (i = 0; i < n; i++) { 1827 ASSERT(orig[i] != NULL); 1828 c = tgts[i]; 1829 ASSERT3S(c, >=, 0); 1830 ASSERT3S(c, <, rm->rm_cols); 1831 rc = &rm->rm_col[c]; 1832 bcopy(rc->rc_data, orig[i], rc->rc_size); 1833 } 1834 1835 /* 1836 * Attempt a reconstruction and exit the outer loop on 1837 * success. 1838 */ 1839 code = vdev_raidz_reconstruct(rm, tgts, n); 1840 if (raidz_checksum_verify(zio) == 0) { 1841 atomic_inc_64(&raidz_corrected[code]); 1842 1843 for (i = 0; i < n; i++) { 1844 c = tgts[i]; 1845 rc = &rm->rm_col[c]; 1846 ASSERT(rc->rc_error == 0); 1847 if (rc->rc_tried) 1848 raidz_checksum_error(zio, rc, 1849 orig[i]); 1850 rc->rc_error = SET_ERROR(ECKSUM); 1851 } 1852 1853 ret = code; 1854 goto done; 1855 } 1856 1857 /* 1858 * Restore the original data. 1859 */ 1860 for (i = 0; i < n; i++) { 1861 c = tgts[i]; 1862 rc = &rm->rm_col[c]; 1863 bcopy(orig[i], rc->rc_data, rc->rc_size); 1864 } 1865 1866 do { 1867 /* 1868 * Find the next valid column after the current 1869 * position.. 1870 */ 1871 for (next = tgts[current] + 1; 1872 next < rm->rm_cols && 1873 rm->rm_col[next].rc_error != 0; next++) 1874 continue; 1875 1876 ASSERT(next <= tgts[current + 1]); 1877 1878 /* 1879 * If that spot is available, we're done here. 1880 */ 1881 if (next != tgts[current + 1]) 1882 break; 1883 1884 /* 1885 * Otherwise, find the next valid column after 1886 * the previous position. 1887 */ 1888 for (c = tgts[current - 1] + 1; 1889 rm->rm_col[c].rc_error != 0; c++) 1890 continue; 1891 1892 tgts[current] = c; 1893 current++; 1894 1895 } while (current != n); 1896 } 1897 } 1898 n--; 1899 done: 1900 for (i = 0; i < n; i++) { 1901 zio_buf_free(orig[i], rm->rm_col[0].rc_size); 1902 } 1903 1904 return (ret); 1905 } 1906 1907 /* 1908 * Complete an IO operation on a RAIDZ VDev 1909 * 1910 * Outline: 1911 * - For write operations: 1912 * 1. Check for errors on the child IOs. 1913 * 2. Return, setting an error code if too few child VDevs were written 1914 * to reconstruct the data later. Note that partial writes are 1915 * considered successful if they can be reconstructed at all. 1916 * - For read operations: 1917 * 1. Check for errors on the child IOs. 1918 * 2. If data errors occurred: 1919 * a. Try to reassemble the data from the parity available. 1920 * b. If we haven't yet read the parity drives, read them now. 1921 * c. If all parity drives have been read but the data still doesn't 1922 * reassemble with a correct checksum, then try combinatorial 1923 * reconstruction. 1924 * d. If that doesn't work, return an error. 1925 * 3. If there were unexpected errors or this is a resilver operation, 1926 * rewrite the vdevs that had errors. 1927 */ 1928 static void 1929 vdev_raidz_io_done(zio_t *zio) 1930 { 1931 vdev_t *vd = zio->io_vd; 1932 vdev_t *cvd; 1933 raidz_map_t *rm = zio->io_vsd; 1934 raidz_col_t *rc; 1935 int unexpected_errors = 0; 1936 int parity_errors = 0; 1937 int parity_untried = 0; 1938 int data_errors = 0; 1939 int total_errors = 0; 1940 int n, c; 1941 int tgts[VDEV_RAIDZ_MAXPARITY]; 1942 int code; 1943 1944 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 1945 1946 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 1947 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 1948 1949 for (c = 0; c < rm->rm_cols; c++) { 1950 rc = &rm->rm_col[c]; 1951 1952 if (rc->rc_error) { 1953 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 1954 1955 if (c < rm->rm_firstdatacol) 1956 parity_errors++; 1957 else 1958 data_errors++; 1959 1960 if (!rc->rc_skipped) 1961 unexpected_errors++; 1962 1963 total_errors++; 1964 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 1965 parity_untried++; 1966 } 1967 } 1968 1969 if (zio->io_type == ZIO_TYPE_WRITE) { 1970 /* 1971 * XXX -- for now, treat partial writes as a success. 1972 * (If we couldn't write enough columns to reconstruct 1973 * the data, the I/O failed. Otherwise, good enough.) 1974 * 1975 * Now that we support write reallocation, it would be better 1976 * to treat partial failure as real failure unless there are 1977 * no non-degraded top-level vdevs left, and not update DTLs 1978 * if we intend to reallocate. 1979 */ 1980 /* XXPOLICY */ 1981 if (total_errors > rm->rm_firstdatacol) 1982 zio->io_error = vdev_raidz_worst_error(rm); 1983 1984 return; 1985 } 1986 1987 ASSERT(zio->io_type == ZIO_TYPE_READ); 1988 /* 1989 * There are three potential phases for a read: 1990 * 1. produce valid data from the columns read 1991 * 2. read all disks and try again 1992 * 3. perform combinatorial reconstruction 1993 * 1994 * Each phase is progressively both more expensive and less likely to 1995 * occur. If we encounter more errors than we can repair or all phases 1996 * fail, we have no choice but to return an error. 1997 */ 1998 1999 /* 2000 * If the number of errors we saw was correctable -- less than or equal 2001 * to the number of parity disks read -- attempt to produce data that 2002 * has a valid checksum. Naturally, this case applies in the absence of 2003 * any errors. 2004 */ 2005 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 2006 if (data_errors == 0) { 2007 if (raidz_checksum_verify(zio) == 0) { 2008 /* 2009 * If we read parity information (unnecessarily 2010 * as it happens since no reconstruction was 2011 * needed) regenerate and verify the parity. 2012 * We also regenerate parity when resilvering 2013 * so we can write it out to the failed device 2014 * later. 2015 */ 2016 if (parity_errors + parity_untried < 2017 rm->rm_firstdatacol || 2018 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2019 n = raidz_parity_verify(zio, rm); 2020 unexpected_errors += n; 2021 ASSERT(parity_errors + n <= 2022 rm->rm_firstdatacol); 2023 } 2024 goto done; 2025 } 2026 } else { 2027 /* 2028 * We either attempt to read all the parity columns or 2029 * none of them. If we didn't try to read parity, we 2030 * wouldn't be here in the correctable case. There must 2031 * also have been fewer parity errors than parity 2032 * columns or, again, we wouldn't be in this code path. 2033 */ 2034 ASSERT(parity_untried == 0); 2035 ASSERT(parity_errors < rm->rm_firstdatacol); 2036 2037 /* 2038 * Identify the data columns that reported an error. 2039 */ 2040 n = 0; 2041 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 2042 rc = &rm->rm_col[c]; 2043 if (rc->rc_error != 0) { 2044 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 2045 tgts[n++] = c; 2046 } 2047 } 2048 2049 ASSERT(rm->rm_firstdatacol >= n); 2050 2051 code = vdev_raidz_reconstruct(rm, tgts, n); 2052 2053 if (raidz_checksum_verify(zio) == 0) { 2054 atomic_inc_64(&raidz_corrected[code]); 2055 2056 /* 2057 * If we read more parity disks than were used 2058 * for reconstruction, confirm that the other 2059 * parity disks produced correct data. This 2060 * routine is suboptimal in that it regenerates 2061 * the parity that we already used in addition 2062 * to the parity that we're attempting to 2063 * verify, but this should be a relatively 2064 * uncommon case, and can be optimized if it 2065 * becomes a problem. Note that we regenerate 2066 * parity when resilvering so we can write it 2067 * out to failed devices later. 2068 */ 2069 if (parity_errors < rm->rm_firstdatacol - n || 2070 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2071 n = raidz_parity_verify(zio, rm); 2072 unexpected_errors += n; 2073 ASSERT(parity_errors + n <= 2074 rm->rm_firstdatacol); 2075 } 2076 2077 goto done; 2078 } 2079 } 2080 } 2081 2082 /* 2083 * This isn't a typical situation -- either we got a read error or 2084 * a child silently returned bad data. Read every block so we can 2085 * try again with as much data and parity as we can track down. If 2086 * we've already been through once before, all children will be marked 2087 * as tried so we'll proceed to combinatorial reconstruction. 2088 */ 2089 unexpected_errors = 1; 2090 rm->rm_missingdata = 0; 2091 rm->rm_missingparity = 0; 2092 2093 for (c = 0; c < rm->rm_cols; c++) { 2094 if (rm->rm_col[c].rc_tried) 2095 continue; 2096 2097 zio_vdev_io_redone(zio); 2098 do { 2099 rc = &rm->rm_col[c]; 2100 if (rc->rc_tried) 2101 continue; 2102 zio_nowait(zio_vdev_child_io(zio, NULL, 2103 vd->vdev_child[rc->rc_devidx], 2104 rc->rc_offset, rc->rc_data, rc->rc_size, 2105 zio->io_type, zio->io_priority, 0, 2106 vdev_raidz_child_done, rc)); 2107 } while (++c < rm->rm_cols); 2108 2109 return; 2110 } 2111 2112 /* 2113 * At this point we've attempted to reconstruct the data given the 2114 * errors we detected, and we've attempted to read all columns. There 2115 * must, therefore, be one or more additional problems -- silent errors 2116 * resulting in invalid data rather than explicit I/O errors resulting 2117 * in absent data. We check if there is enough additional data to 2118 * possibly reconstruct the data and then perform combinatorial 2119 * reconstruction over all possible combinations. If that fails, 2120 * we're cooked. 2121 */ 2122 if (total_errors > rm->rm_firstdatacol) { 2123 zio->io_error = vdev_raidz_worst_error(rm); 2124 2125 } else if (total_errors < rm->rm_firstdatacol && 2126 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 2127 /* 2128 * If we didn't use all the available parity for the 2129 * combinatorial reconstruction, verify that the remaining 2130 * parity is correct. 2131 */ 2132 if (code != (1 << rm->rm_firstdatacol) - 1) 2133 (void) raidz_parity_verify(zio, rm); 2134 } else { 2135 /* 2136 * We're here because either: 2137 * 2138 * total_errors == rm_first_datacol, or 2139 * vdev_raidz_combrec() failed 2140 * 2141 * In either case, there is enough bad data to prevent 2142 * reconstruction. 2143 * 2144 * Start checksum ereports for all children which haven't 2145 * failed, and the IO wasn't speculative. 2146 */ 2147 zio->io_error = SET_ERROR(ECKSUM); 2148 2149 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2150 for (c = 0; c < rm->rm_cols; c++) { 2151 rc = &rm->rm_col[c]; 2152 if (rc->rc_error == 0) { 2153 zio_bad_cksum_t zbc; 2154 zbc.zbc_has_cksum = 0; 2155 zbc.zbc_injected = 2156 rm->rm_ecksuminjected; 2157 2158 zfs_ereport_start_checksum( 2159 zio->io_spa, 2160 vd->vdev_child[rc->rc_devidx], 2161 zio, rc->rc_offset, rc->rc_size, 2162 (void *)(uintptr_t)c, &zbc); 2163 } 2164 } 2165 } 2166 } 2167 2168 done: 2169 zio_checksum_verified(zio); 2170 2171 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2172 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2173 /* 2174 * Use the good data we have in hand to repair damaged children. 2175 */ 2176 for (c = 0; c < rm->rm_cols; c++) { 2177 rc = &rm->rm_col[c]; 2178 cvd = vd->vdev_child[rc->rc_devidx]; 2179 2180 if (rc->rc_error == 0) 2181 continue; 2182 2183 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2184 rc->rc_offset, rc->rc_data, rc->rc_size, 2185 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2186 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2187 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2188 } 2189 } 2190 } 2191 2192 static void 2193 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2194 { 2195 if (faulted > vd->vdev_nparity) 2196 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2197 VDEV_AUX_NO_REPLICAS); 2198 else if (degraded + faulted != 0) 2199 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2200 else 2201 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2202 } 2203 2204 vdev_ops_t vdev_raidz_ops = { 2205 vdev_raidz_open, 2206 vdev_raidz_close, 2207 vdev_raidz_asize, 2208 vdev_raidz_io_start, 2209 vdev_raidz_io_done, 2210 vdev_raidz_state_change, 2211 NULL, 2212 NULL, 2213 VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2214 B_FALSE /* not a leaf vdev */ 2215 };