Print this page
3741 zfs needs better comments
Submitted by: Will Andrews <willa@spectralogic.com>
Submitted by: Justin Gibbs <justing@spectralogic.com>
Submitted by: Alan Somers <alans@spectralogic.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ new/usr/src/uts/common/fs/zfs/vdev_raidz.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 25 */
26 26
27 27 #include <sys/zfs_context.h>
28 28 #include <sys/spa.h>
29 29 #include <sys/vdev_impl.h>
30 30 #include <sys/zio.h>
31 31 #include <sys/zio_checksum.h>
32 32 #include <sys/fs/zfs.h>
33 33 #include <sys/fm/fs/zfs.h>
34 34
35 35 /*
36 36 * Virtual device vector for RAID-Z.
37 37 *
38 38 * This vdev supports single, double, and triple parity. For single parity,
39 39 * we use a simple XOR of all the data columns. For double or triple parity,
40 40 * we use a special case of Reed-Solomon coding. This extends the
41 41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 44 * former is also based. The latter is designed to provide higher performance
45 45 * for writes.
46 46 *
47 47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 48 * amended six years later identifying a critical flaw that invalidates its
49 49 * claims. Nevertheless, the technique can be adapted to work for up to
50 50 * triple parity. For additional parity, the amendment "Note: Correction to
51 51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 52 * is viable, but the additional complexity means that write performance will
53 53 * suffer.
54 54 *
55 55 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
57 57 * can be expressed with a single byte. Briefly, the operations on the
58 58 * field are defined as follows:
59 59 *
60 60 * o addition (+) is represented by a bitwise XOR
61 61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 62 * o multiplication of A by 2 is defined by the following bitwise expression:
63 63 * (A * 2)_7 = A_6
64 64 * (A * 2)_6 = A_5
65 65 * (A * 2)_5 = A_4
66 66 * (A * 2)_4 = A_3 + A_7
67 67 * (A * 2)_3 = A_2 + A_7
68 68 * (A * 2)_2 = A_1 + A_7
69 69 * (A * 2)_1 = A_0
70 70 * (A * 2)_0 = A_7
71 71 *
72 72 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73 73 * As an aside, this multiplication is derived from the error correcting
74 74 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
75 75 *
76 76 * Observe that any number in the field (except for 0) can be expressed as a
77 77 * power of 2 -- a generator for the field. We store a table of the powers of
78 78 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79 79 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80 80 * than field addition). The inverse of a field element A (A^-1) is therefore
81 81 * A ^ (255 - 1) = A^254.
82 82 *
83 83 * The up-to-three parity columns, P, Q, R over several data columns,
84 84 * D_0, ... D_n-1, can be expressed by field operations:
85 85 *
86 86 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
87 87 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88 88 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89 89 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90 90 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
91 91 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
93 93 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94 94 * independent coefficients. (There are no additional coefficients that have
95 95 * this property which is why the uncorrected Plank method breaks down.)
96 96 *
 * See the reconstruction code below for how P, Q and R can be used individually
98 98 * or in concert to recover missing data columns.
99 99 */
100 100
/*
 * Per-column state for a single RAID-Z I/O.  One raidz_col_t exists for
 * every parity and data column the logical I/O maps onto, including columns
 * that are only skipped for padding (those have rc_size == 0).
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
111 111
/*
 * Describes how one logical I/O is spread across the columns (child vdevs)
 * of a RAID-Z group; built by vdev_raidz_map_alloc() and attached to the
 * zio as its vsd.  Parity occupies columns [0, rm_firstdatacol).
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
128 128
/* Column indices of the parity columns; parity always comes first. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/* Multiply one byte by 2 (and by 4) in the Galois field described above. */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
135 135
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * The do { } while (0) wrapper makes each macro expand to a single
 * statement, so invocations compose correctly with if/else and other
 * unbraced control flow (a bare { } block followed by ';' does not).
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
	do { \
		(mask) = (x) & 0x8080808080808080ULL; \
		(mask) = ((mask) << 1) - ((mask) >> 7); \
		(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
		    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
	} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
	do { \
		VDEV_RAIDZ_64MUL_2((x), mask); \
		VDEV_RAIDZ_64MUL_2((x), mask); \
	} while (0)
155 155
/*
 * Force reconstruction to use the general purpose method.
 *
 * NOTE(review): tunable global; presumably consulted by the reconstruction
 * dispatch code further down in this file to bypass the specialized
 * P/Q/PQ paths -- confirm against the full file.
 */
int vdev_raidz_default_to_general;
160 160
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 *
 * vdev_raidz_pow2[i] == 2^i in the field; since 2 generates all non-zero
 * elements, the sequence cycles with period 255 (the final entry equals the
 * first).  vdev_raidz_log2[] is the inverse mapping; entry 0 is unused
 * filler, as zero has no logarithm.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/* Forward declaration: vdev_raidz_cksum_finish() calls this before its def. */
static void vdev_raidz_generate_parity(raidz_map_t *rm);
235 235
236 236 /*
237 237 * Multiply a given number by 2 raised to the given power.
238 238 */
239 239 static uint8_t
240 240 vdev_raidz_exp2(uint_t a, int exp)
241 241 {
242 242 if (a == 0)
243 243 return (0);
244 244
245 245 ASSERT(exp >= 0);
246 246 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
247 247
248 248 exp += vdev_raidz_log2[a];
249 249 if (exp > 255)
250 250 exp -= 255;
251 251
252 252 return (vdev_raidz_pow2[exp]);
253 253 }
254 254
255 255 static void
256 256 vdev_raidz_map_free(raidz_map_t *rm)
257 257 {
258 258 int c;
259 259 size_t size;
260 260
261 261 for (c = 0; c < rm->rm_firstdatacol; c++) {
262 262 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
263 263
264 264 if (rm->rm_col[c].rc_gdata != NULL)
265 265 zio_buf_free(rm->rm_col[c].rc_gdata,
266 266 rm->rm_col[c].rc_size);
267 267 }
268 268
269 269 size = 0;
270 270 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
271 271 size += rm->rm_col[c].rc_size;
272 272
273 273 if (rm->rm_datacopy != NULL)
274 274 zio_buf_free(rm->rm_datacopy, size);
275 275
276 276 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
277 277 }
278 278
279 279 static void
280 280 vdev_raidz_map_free_vsd(zio_t *zio)
281 281 {
282 282 raidz_map_t *rm = zio->io_vsd;
283 283
284 284 ASSERT0(rm->rm_freed);
285 285 rm->rm_freed = 1;
286 286
287 287 if (rm->rm_reports == 0)
288 288 vdev_raidz_map_free(rm);
289 289 }
290 290
291 291 /*ARGSUSED*/
292 292 static void
293 293 vdev_raidz_cksum_free(void *arg, size_t ignored)
294 294 {
295 295 raidz_map_t *rm = arg;
296 296
297 297 ASSERT3U(rm->rm_reports, >, 0);
298 298
299 299 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
300 300 vdev_raidz_map_free(rm);
301 301 }
302 302
/*
 * Checksum-report completion callback.  Compare the data actually read
 * ("bad") for column c against what should have been read ("good"), and
 * finish the ereport with both.  For a parity column the good version must
 * first be regenerated from the good data; for a data column it is simply
 * the matching slice of good_data.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;	/* index of the suspect column */
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	/* Without the good data there is nothing to compare against. */
	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/*
			 * Repoint the data columns at the saved copy in
			 * rm_datacopy (made by vdev_raidz_cksum_report()).
			 */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}
375 375
376 376 /*
377 377 * Invoked indirectly by zfs_ereport_start_checksum(), called
378 378 * below when our read operation fails completely. The main point
379 379 * is to keep a copy of everything we read from disk, so that at
380 380 * vdev_raidz_cksum_finish() time we can compare it with the good data.
381 381 */
382 382 static void
383 383 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
384 384 {
385 385 size_t c = (size_t)(uintptr_t)arg;
386 386 caddr_t buf;
387 387
388 388 raidz_map_t *rm = zio->io_vsd;
389 389 size_t size;
390 390
391 391 /* set up the report and bump the refcount */
392 392 zcr->zcr_cbdata = rm;
393 393 zcr->zcr_cbinfo = c;
394 394 zcr->zcr_finish = vdev_raidz_cksum_finish;
395 395 zcr->zcr_free = vdev_raidz_cksum_free;
396 396
397 397 rm->rm_reports++;
398 398 ASSERT3U(rm->rm_reports, >, 0);
399 399
400 400 if (rm->rm_datacopy != NULL)
401 401 return;
402 402
403 403 /*
404 404 * It's the first time we're called for this raidz_map_t, so we need
405 405 * to copy the data aside; there's no guarantee that our zio's buffer
406 406 * won't be re-used for something else.
407 407 *
408 408 * Our parity data is already in separate buffers, so there's no need
409 409 * to copy them.
410 410 */
411 411
412 412 size = 0;
413 413 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
414 414 size += rm->rm_col[c].rc_size;
415 415
416 416 buf = rm->rm_datacopy = zio_buf_alloc(size);
417 417
418 418 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
419 419 raidz_col_t *col = &rm->rm_col[c];
420 420
421 421 bcopy(col->rc_data, buf, col->rc_size);
422 422 col->rc_data = buf;
423 423
↓ open down ↓ |
423 lines elided |
↑ open up ↑ |
424 424 buf += col->rc_size;
425 425 }
426 426 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
427 427 }
428 428
/*
 * io_vsd callbacks for RAID-Z zios: how to free the raidz_map_t and how to
 * set up a checksum report for a failed read.
 */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
433 433
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size */
	uint64_t s = zio->io_size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/* Initialize each column's geometry: device index, offset and size. */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrapped past the last child; move to the next row. */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get their own buffers ... */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	/* ... while data columns point directly into the zio's buffer. */
	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	return (rm);
}
555 582
556 583 static void
557 584 vdev_raidz_generate_parity_p(raidz_map_t *rm)
558 585 {
559 586 uint64_t *p, *src, pcount, ccount, i;
560 587 int c;
561 588
562 589 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
563 590
564 591 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
565 592 src = rm->rm_col[c].rc_data;
566 593 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
567 594 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
568 595
569 596 if (c == rm->rm_firstdatacol) {
570 597 ASSERT(ccount == pcount);
571 598 for (i = 0; i < ccount; i++, src++, p++) {
572 599 *p = *src;
573 600 }
574 601 } else {
575 602 ASSERT(ccount <= pcount);
576 603 for (i = 0; i < ccount; i++, src++, p++) {
577 604 *p ^= *src;
578 605 }
579 606 }
580 607 }
581 608 }
582 609
/*
 * Generate double (P and Q) parity.  P is the plain XOR of the data
 * columns; Q is accumulated Horner-style, multiplying the running value by
 * 2 in the Galois field (bytewise, via VDEV_RAIDZ_64MUL_2) before adding
 * each successive column -- see the block comment at the top of this file.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* The first column seeds both accumulators. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
634 661
/*
 * Generate triple (P, Q and R) parity.  P is the plain XOR of the data
 * columns; Q and R are accumulated Horner-style with field generators 2
 * and 4 respectively (bytewise, via the VDEV_RAIDZ_64MUL_* macros) -- see
 * the block comment at the top of this file.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* The first column seeds all three accumulators. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
695 722
696 723 /*
697 724 * Generate RAID parity in the first virtual columns according to the number of
698 725 * parity columns available.
699 726 */
700 727 static void
701 728 vdev_raidz_generate_parity(raidz_map_t *rm)
702 729 {
703 730 switch (rm->rm_firstdatacol) {
704 731 case 1:
705 732 vdev_raidz_generate_parity_p(rm);
706 733 break;
707 734 case 2:
708 735 vdev_raidz_generate_parity_pq(rm);
709 736 break;
710 737 case 3:
711 738 vdev_raidz_generate_parity_pqr(rm);
712 739 break;
713 740 default:
714 741 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
715 742 }
716 743 }
717 744
718 745 static int
719 746 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
720 747 {
721 748 uint64_t *dst, *src, xcount, ccount, count, i;
722 749 int x = tgts[0];
723 750 int c;
724 751
725 752 ASSERT(ntgts == 1);
726 753 ASSERT(x >= rm->rm_firstdatacol);
727 754 ASSERT(x < rm->rm_cols);
728 755
729 756 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
730 757 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
731 758 ASSERT(xcount > 0);
732 759
733 760 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
734 761 dst = rm->rm_col[x].rc_data;
735 762 for (i = 0; i < xcount; i++, dst++, src++) {
736 763 *dst = *src;
737 764 }
738 765
739 766 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
740 767 src = rm->rm_col[c].rc_data;
741 768 dst = rm->rm_col[x].rc_data;
742 769
743 770 if (c == x)
744 771 continue;
745 772
746 773 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
747 774 count = MIN(ccount, xcount);
748 775
749 776 for (i = 0; i < count; i++, dst++, src++) {
750 777 *dst ^= *src;
751 778 }
752 779 }
753 780
754 781 return (1 << VDEV_RAIDZ_P);
755 782 }
756 783
/*
 * Reconstruct a single missing data column (tgts[0]) using only Q parity.
 * First recompute Q as though the missing column were zero (Horner-style,
 * as in parity generation), then add the stored Q and divide out the
 * missing column's coefficient by multiplying each byte by its inverse.
 * Returns a bitmask naming the parity column used (Q).
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	/* Accumulate Q' over the surviving columns, treating x as zero. */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	/* Exponent of the inverse of column x's coefficient 2^(ndevs-1-x). */
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		/* Multiply each of the 8 bytes in this word by 2^exp. */
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
814 841
/*
 * Reconstruct two missing data columns (tgts[0] < tgts[1]) using both the
 * P and Q parity columns; see the derivation in the comment mid-function.
 * Returns a bitmask naming the parity columns used (P and Q).
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	/* Restore the true sizes of the missing columns. */
	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	/* tmp is the log of (2^(x-y) + 1)^-1, the shared inverse factor. */
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		/* Column y may be shorter than column x. */
		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
905 932
906 933 /* BEGIN CSTYLED */
907 934 /*
908 935 * In the general case of reconstruction, we must solve the system of linear
909 936 * equations defined by the coefficients used to generate parity as well as
910 937 * the contents of the data and parity disks. This can be expressed with
911 938 * vectors for the original data (D) and the actual data (d) and parity (p)
912 939 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
913 940 *
914 941 * __ __ __ __
915 942 * | | __ __ | p_0 |
916 943 * | V | | D_0 | | p_m-1 |
917 944 * | | x | : | = | d_0 |
918 945 * | I | | D_n-1 | | : |
919 946 * | | ~~ ~~ | d_n-1 |
920 947 * ~~ ~~ ~~ ~~
921 948 *
922 949 * I is simply a square identity matrix of size n, and V is a vandermonde
923 950 * matrix defined by the coefficients we chose for the various parity columns
924 951 * (1, 2, 4). Note that these values were chosen for simplicity and speedy
925 952 * computation, as well as for linear separability.
926 953 *
927 954 * __ __ __ __
928 955 * | 1 .. 1 1 1 | | p_0 |
929 956 * | 2^n-1 .. 4 2 1 | __ __ | : |
930 957 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
931 958 * | 1 .. 0 0 0 | | D_1 | | d_0 |
932 959 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
933 960 * | : : : : | | : | | d_2 |
934 961 * | 0 .. 1 0 0 | | D_n-1 | | : |
935 962 * | 0 .. 0 1 0 | ~~ ~~ | : |
936 963 * | 0 .. 0 0 1 | | d_n-1 |
937 964 * ~~ ~~ ~~ ~~
938 965 *
939 966 * Note that I, V, d, and p are known. To compute D, we must invert the
940 967 * matrix and use the known data and parity values to reconstruct the unknown
941 968 * data values. We begin by removing the rows in V|I and d|p that correspond
942 969 * to failed or missing columns; we then make V|I square (n x n) and d|p
943 970 * sized n by removing rows corresponding to unused parity from the bottom up
944 971 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
945 972 * using Gauss-Jordan elimination. In the example below we use m=3 parity
946 973 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
947 974 * __ __
948 975 * | 1 1 1 1 1 1 1 1 |
949 976 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
950 977 * | 19 205 116 29 64 16 4 1 | / /
951 978 * | 1 0 0 0 0 0 0 0 | / /
952 979 * | 0 1 0 0 0 0 0 0 | <--' /
953 980 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
954 981 * | 0 0 0 1 0 0 0 0 |
955 982 * | 0 0 0 0 1 0 0 0 |
956 983 * | 0 0 0 0 0 1 0 0 |
957 984 * | 0 0 0 0 0 0 1 0 |
958 985 * | 0 0 0 0 0 0 0 1 |
959 986 * ~~ ~~
960 987 * __ __
961 988 * | 1 1 1 1 1 1 1 1 |
962 989 * | 128 64 32 16 8 4 2 1 |
963 990 * | 19 205 116 29 64 16 4 1 |
964 991 * | 1 0 0 0 0 0 0 0 |
965 992 * | 0 1 0 0 0 0 0 0 |
966 993 * (V|I)' = | 0 0 1 0 0 0 0 0 |
967 994 * | 0 0 0 1 0 0 0 0 |
968 995 * | 0 0 0 0 1 0 0 0 |
969 996 * | 0 0 0 0 0 1 0 0 |
970 997 * | 0 0 0 0 0 0 1 0 |
971 998 * | 0 0 0 0 0 0 0 1 |
972 999 * ~~ ~~
973 1000 *
974 1001 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
975 1002 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
976 1003 * matrix is not singular.
977 1004 * __ __
978 1005 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
979 1006 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
980 1007 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
981 1008 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
982 1009 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
983 1010 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
984 1011 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
985 1012 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
986 1013 * ~~ ~~
987 1014 * __ __
988 1015 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
989 1016 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
990 1017 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
991 1018 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
992 1019 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
993 1020 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
994 1021 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
995 1022 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
996 1023 * ~~ ~~
997 1024 * __ __
998 1025 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
999 1026 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1000 1027 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1001 1028 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1002 1029 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1003 1030 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1004 1031 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1005 1032 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1006 1033 * ~~ ~~
1007 1034 * __ __
1008 1035 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1009 1036 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1010 1037 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1011 1038 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1012 1039 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1013 1040 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1014 1041 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1015 1042 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1016 1043 * ~~ ~~
1017 1044 * __ __
1018 1045 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1019 1046 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1020 1047 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1021 1048 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1022 1049 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1023 1050 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1024 1051 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1025 1052 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1026 1053 * ~~ ~~
1027 1054 * __ __
1028 1055 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1029 1056 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1030 1057 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1031 1058 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1032 1059 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1033 1060 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1034 1061 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1035 1062 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1036 1063 * ~~ ~~
1037 1064 * __ __
1038 1065 * | 0 0 1 0 0 0 0 0 |
1039 1066 * | 167 100 5 41 159 169 217 208 |
1040 1067 * | 166 100 4 40 158 168 216 209 |
1041 1068 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1042 1069 * | 0 0 0 0 1 0 0 0 |
1043 1070 * | 0 0 0 0 0 1 0 0 |
1044 1071 * | 0 0 0 0 0 0 1 0 |
1045 1072 * | 0 0 0 0 0 0 0 1 |
1046 1073 * ~~ ~~
1047 1074 *
1048 1075 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1049 1076 * of the missing data.
1050 1077 *
1051 1078 * As is apparent from the example above, the only non-trivial rows in the
1052 1079 * inverse matrix correspond to the data disks that we're trying to
1053 1080 * reconstruct. Indeed, those are the only rows we need as the others would
1054 1081 * only be useful for reconstructing data known or assumed to be valid. For
1055 1082 * that reason, we only build the coefficients in the rows that correspond to
1056 1083 * targeted columns.
1057 1084 */
1058 1085 /* END CSTYLED */
1059 1086
/*
 * Initialize the rows of the reconstruction matrix that correspond to the
 * parity columns listed in map[] (values 0..2 per the asserts, i.e. P, Q,
 * or R).  Entry (i, j) is set to 2^(map[i] * (n - 1 - j)) in GF(2^8); the
 * exponent is walked down map[i] at a time and kept reduced mod 255.
 * n must equal the number of data columns (asserted).
 */
1060 1087 static void
1061 1088 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1062 1089 uint8_t **rows)
1063 1090 {
1064 1091 int i, j;
1065 1092 int pow;
1066 1093
1067 1094 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1068 1095
1069 1096 /*
1070 1097 * Fill in the missing rows of interest.
1071 1098 */
1072 1099 for (i = 0; i < nmap; i++) {
1073 1100 ASSERT3S(0, <=, map[i]);
1074 1101 ASSERT3S(map[i], <=, 2);
1075 1102
/* Start from the exponent for the (virtual) column just left of row start. */
1076 1103 pow = map[i] * n;
1077 1104 if (pow > 255)
1078 1105 pow -= 255;
1079 1106 ASSERT(pow <= 255);
1080 1107
1081 1108 for (j = 0; j < n; j++) {
1082 1109 pow -= map[i];
1083 1110 if (pow < 0)
1084 1111 pow += 255;
1085 1112 rows[i][j] = vdev_raidz_pow2[pow];
1086 1113 }
1087 1114 }
1088 1115 }
1089 1116
/*
 * Invert the non-trivial portion of the reconstruction matrix using
 * Gauss-Jordan elimination over GF(2^8).  rows[] holds the nmissing rows
 * of interest; the corresponding rows of the inverse accumulate in
 * invrows[].  used[] lists the surviving columns: the first nmissing
 * entries must be parity columns and the remainder data columns
 * (asserted below).  On completion the rows are verified to have been
 * reduced to (part of) an identity matrix.
 */
1090 1117 static void
1091 1118 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1092 1119 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1093 1120 {
1094 1121 int i, j, ii, jj;
1095 1122 uint8_t log;
1096 1123
1097 1124 /*
1098 1125 * Assert that the first nmissing entries from the array of used
1099 1126 * columns correspond to parity columns and that subsequent entries
1100 1127 * correspond to data columns.
1101 1128 */
1102 1129 for (i = 0; i < nmissing; i++) {
1103 1130 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1104 1131 }
1105 1132 for (; i < n; i++) {
1106 1133 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1107 1134 }
1108 1135
1109 1136 /*
1110 1137 * First initialize the storage where we'll compute the inverse rows.
1111 1138 */
1112 1139 for (i = 0; i < nmissing; i++) {
1113 1140 for (j = 0; j < n; j++) {
1114 1141 invrows[i][j] = (i == j) ? 1 : 0;
1115 1142 }
1116 1143 }
1117 1144
1118 1145 /*
1119 1146 * Subtract all trivial rows from the rows of consequence.
1120 1147 */
1121 1148 for (i = 0; i < nmissing; i++) {
1122 1149 for (j = nmissing; j < n; j++) {
1123 1150 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1124 1151 jj = used[j] - rm->rm_firstdatacol;
1125 1152 ASSERT3S(jj, <, n);
1126 1153 invrows[i][j] = rows[i][jj];
1127 1154 rows[i][jj] = 0;
1128 1155 }
1129 1156 }
1130 1157
1131 1158 /*
1132 1159 * For each of the rows of interest, we must normalize it and subtract
1133 1160 * a multiple of it from the other rows.
1134 1161 */
1135 1162 for (i = 0; i < nmissing; i++) {
1136 1163 for (j = 0; j < missing[i]; j++) {
1137 1164 ASSERT0(rows[i][j]);
1138 1165 }
1139 1166 ASSERT3U(rows[i][missing[i]], !=, 0);
1140 1167
1141 1168 /*
1142 1169 * Compute the inverse of the first element and multiply each
1143 1170 * element in the row by that value.
1144 1171 */
/* In GF(2^8), a^-1 = a^(255 - log2(a)); multiply == add logs. */
1145 1172 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1146 1173
1147 1174 for (j = 0; j < n; j++) {
1148 1175 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1149 1176 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1150 1177 }
1151 1178
/* Eliminate this row's pivot column from every other row (XOR == subtract). */
1152 1179 for (ii = 0; ii < nmissing; ii++) {
1153 1180 if (i == ii)
1154 1181 continue;
1155 1182
1156 1183 ASSERT3U(rows[ii][missing[i]], !=, 0);
1157 1184
1158 1185 log = vdev_raidz_log2[rows[ii][missing[i]]];
1159 1186
1160 1187 for (j = 0; j < n; j++) {
1161 1188 rows[ii][j] ^=
1162 1189 vdev_raidz_exp2(rows[i][j], log);
1163 1190 invrows[ii][j] ^=
1164 1191 vdev_raidz_exp2(invrows[i][j], log);
1165 1192 }
1166 1193 }
1167 1194 }
1168 1195
1169 1196 /*
1170 1197 * Verify that the data that is left in the rows are properly part of
1171 1198 * an identity matrix.
1172 1199 */
1173 1200 for (i = 0; i < nmissing; i++) {
1174 1201 for (j = 0; j < n; j++) {
1175 1202 if (j == missing[i]) {
1176 1203 ASSERT3U(rows[i][j], ==, 1);
1177 1204 } else {
1178 1205 ASSERT0(rows[i][j]);
1179 1206 }
1180 1207 }
1181 1208 }
1182 1209 }
1183 1210
/*
 * Regenerate the missing data columns (missing[]) by multiplying the
 * inverted matrix rows (invrows) against the surviving columns listed in
 * used[].  All GF(2^8) multiplications are performed in log space: the
 * inverse-row entries are converted to logs up front (invlog), so each
 * product costs one addition mod 255 and one table lookup.
 */
1184 1211 static void
1185 1212 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1186 1213 int *missing, uint8_t **invrows, const uint8_t *used)
1187 1214 {
1188 1215 int i, j, x, cc, c;
1189 1216 uint8_t *src;
1190 1217 uint64_t ccount;
1191 1218 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1192 1219 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1193 1220 uint8_t log = 0;
1194 1221 uint8_t val;
1195 1222 int ll;
1196 1223 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1197 1224 uint8_t *p, *pp;
1198 1225 size_t psize;
1199 1226
/* One allocation carved into nmissing rows of n log values each. */
1200 1227 psize = sizeof (invlog[0][0]) * n * nmissing;
1201 1228 p = kmem_alloc(psize, KM_SLEEP);
1202 1229
1203 1230 for (pp = p, i = 0; i < nmissing; i++) {
1204 1231 invlog[i] = pp;
1205 1232 pp += n;
1206 1233 }
1207 1234
1208 1235 for (i = 0; i < nmissing; i++) {
1209 1236 for (j = 0; j < n; j++) {
1210 1237 ASSERT3U(invrows[i][j], !=, 0);
1211 1238 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1212 1239 }
1213 1240 }
1214 1241
/* Accumulate each surviving column's contribution into every target. */
1215 1242 for (i = 0; i < n; i++) {
1216 1243 c = used[i];
1217 1244 ASSERT3U(c, <, rm->rm_cols);
1218 1245
1219 1246 src = rm->rm_col[c].rc_data;
1220 1247 ccount = rm->rm_col[c].rc_size;
1221 1248 for (j = 0; j < nmissing; j++) {
1222 1249 cc = missing[j] + rm->rm_firstdatacol;
1223 1250 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1224 1251 ASSERT3U(cc, <, rm->rm_cols);
1225 1252 ASSERT3U(cc, !=, c);
1226 1253
1227 1254 dst[j] = rm->rm_col[cc].rc_data;
1228 1255 dcount[j] = rm->rm_col[cc].rc_size;
1229 1256 }
1230 1257
1231 1258 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1232 1259
1233 1260 for (x = 0; x < ccount; x++, src++) {
1234 1261 if (*src != 0)
1235 1262 log = vdev_raidz_log2[*src];
1236 1263
1237 1264 for (cc = 0; cc < nmissing; cc++) {
1238 1265 if (x >= dcount[cc])
1239 1266 continue;
1240 1267
/* 0 has no log; its product is 0 by definition. */
1241 1268 if (*src == 0) {
1242 1269 val = 0;
1243 1270 } else {
1244 1271 if ((ll = log + invlog[cc][i]) >= 255)
1245 1272 ll -= 255;
1246 1273 val = vdev_raidz_pow2[ll];
1247 1274 }
1248 1275
/* First column initializes the target; later ones XOR in. */
1249 1276 if (i == 0)
1250 1277 dst[cc][x] = val;
1251 1278 else
1252 1279 dst[cc][x] ^= val;
1253 1280 }
1254 1281 }
1255 1282 }
1256 1283
1257 1284 kmem_free(p, psize);
1258 1285 }
1259 1286
/*
 * General-case reconstruction: rebuild any combination of missing data
 * columns (given in tgts[], which may also include parity columns that
 * are simply skipped) by building, inverting, and applying the
 * appropriate submatrix.  Returns a bitmask with one bit set for each
 * parity column actually used.
 */
1260 1287 static int
1261 1288 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1262 1289 {
1263 1290 int n, i, c, t, tt;
1264 1291 int nmissing_rows;
1265 1292 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1266 1293 int parity_map[VDEV_RAIDZ_MAXPARITY];
1267 1294
1268 1295 uint8_t *p, *pp;
1269 1296 size_t psize;
1270 1297
1271 1298 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1272 1299 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1273 1300 uint8_t *used;
1274 1301
1275 1302 int code = 0;
1276 1303
1277 1304
/* n is the number of data columns. */
1278 1305 n = rm->rm_cols - rm->rm_firstdatacol;
1279 1306
1280 1307 /*
1281 1308 * Figure out which data columns are missing.
1282 1309 */
1283 1310 nmissing_rows = 0;
1284 1311 for (t = 0; t < ntgts; t++) {
1285 1312 if (tgts[t] >= rm->rm_firstdatacol) {
1286 1313 missing_rows[nmissing_rows++] =
1287 1314 tgts[t] - rm->rm_firstdatacol;
1288 1315 }
1289 1316 }
1290 1317
1291 1318 /*
1292 1319 * Figure out which parity columns to use to help generate the missing
1293 1320 * data columns.
1294 1321 */
1295 1322 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1296 1323 ASSERT(tt < ntgts);
1297 1324 ASSERT(c < rm->rm_firstdatacol);
1298 1325
1299 1326 /*
1300 1327 * Skip any targeted parity columns.
1301 1328 */
1302 1329 if (c == tgts[tt]) {
1303 1330 tt++;
1304 1331 continue;
1305 1332 }
1306 1333
1307 1334 code |= 1 << c;
1308 1335
1309 1336 parity_map[i] = c;
1310 1337 i++;
1311 1338 }
1312 1339
1313 1340 ASSERT(code != 0);
1314 1341 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1315 1342
/* One allocation for the rows, inverse rows, and used-column list. */
1316 1343 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1317 1344 nmissing_rows * n + sizeof (used[0]) * n;
1318 1345 p = kmem_alloc(psize, KM_SLEEP);
1319 1346
1320 1347 for (pp = p, i = 0; i < nmissing_rows; i++) {
1321 1348 rows[i] = pp;
1322 1349 pp += n;
1323 1350 invrows[i] = pp;
1324 1351 pp += n;
1325 1352 }
1326 1353 used = pp;
1327 1354
/* used[] starts with the chosen parity columns... */
1328 1355 for (i = 0; i < nmissing_rows; i++) {
1329 1356 used[i] = parity_map[i];
1330 1357 }
1331 1358
/* ...followed by every surviving (non-missing) data column. */
1332 1359 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1333 1360 if (tt < nmissing_rows &&
1334 1361 c == missing_rows[tt] + rm->rm_firstdatacol) {
1335 1362 tt++;
1336 1363 continue;
1337 1364 }
1338 1365
1339 1366 ASSERT3S(i, <, n);
1340 1367 used[i] = c;
1341 1368 i++;
1342 1369 }
1343 1370
1344 1371 /*
1345 1372 * Initialize the interesting rows of the matrix.
1346 1373 */
1347 1374 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1348 1375
1349 1376 /*
1350 1377 * Invert the matrix.
1351 1378 */
1352 1379 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1353 1380 invrows, used);
1354 1381
1355 1382 /*
1356 1383 * Reconstruct the missing data using the generated matrix.
1357 1384 */
1358 1385 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1359 1386 invrows, used);
1360 1387
1361 1388 kmem_free(p, psize);
1362 1389
1363 1390 return (code);
1364 1391 }
1365 1392
/*
 * Reconstruct the columns listed in t[] (sorted ascending, asserted),
 * together with any additional columns already marked in error in the
 * map.  Dispatches to the optimized P, Q, or P+Q routines when the
 * failure pattern allows (and the vdev_raidz_default_to_general tunable
 * is off); otherwise falls back to the general matrix-based code.
 * Returns the bitmask of parity columns used.
 */
1366 1393 static int
1367 1394 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1368 1395 {
1369 1396 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1370 1397 int ntgts;
1371 1398 int i, c;
1372 1399 int code;
1373 1400 int nbadparity, nbaddata;
1374 1401 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1375 1402
1376 1403 /*
1377 1404 * The tgts list must already be sorted.
1378 1405 */
1379 1406 for (i = 1; i < nt; i++) {
1380 1407 ASSERT(t[i] > t[i - 1]);
1381 1408 }
1382 1409
/*
 * Walk all columns, merging the caller's targets with columns already
 * in error; count down nbadparity/nbaddata for each good column found.
 */
1383 1410 nbadparity = rm->rm_firstdatacol;
1384 1411 nbaddata = rm->rm_cols - nbadparity;
1385 1412 ntgts = 0;
1386 1413 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1387 1414 if (c < rm->rm_firstdatacol)
1388 1415 parity_valid[c] = B_FALSE;
1389 1416
1390 1417 if (i < nt && c == t[i]) {
1391 1418 tgts[ntgts++] = c;
1392 1419 i++;
1393 1420 } else if (rm->rm_col[c].rc_error != 0) {
1394 1421 tgts[ntgts++] = c;
1395 1422 } else if (c >= rm->rm_firstdatacol) {
1396 1423 nbaddata--;
1397 1424 } else {
1398 1425 parity_valid[c] = B_TRUE;
1399 1426 nbadparity--;
1400 1427 }
1401 1428 }
1402 1429
1403 1430 ASSERT(ntgts >= nt);
1404 1431 ASSERT(nbaddata >= 0);
1405 1432 ASSERT(nbaddata + nbadparity == ntgts);
1406 1433
/* tgts[] is sorted, so the data targets start after the parity targets. */
1407 1434 dt = &tgts[nbadparity];
1408 1435
1409 1436 /*
1410 1437 * See if we can use any of our optimized reconstruction routines.
1411 1438 */
1412 1439 if (!vdev_raidz_default_to_general) {
1413 1440 switch (nbaddata) {
1414 1441 case 1:
1415 1442 if (parity_valid[VDEV_RAIDZ_P])
1416 1443 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1417 1444
1418 1445 ASSERT(rm->rm_firstdatacol > 1);
1419 1446
1420 1447 if (parity_valid[VDEV_RAIDZ_Q])
1421 1448 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1422 1449
1423 1450 ASSERT(rm->rm_firstdatacol > 2);
1424 1451 break;
1425 1452
1426 1453 case 2:
1427 1454 ASSERT(rm->rm_firstdatacol > 1);
1428 1455
1429 1456 if (parity_valid[VDEV_RAIDZ_P] &&
1430 1457 parity_valid[VDEV_RAIDZ_Q])
1431 1458 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1432 1459
1433 1460 ASSERT(rm->rm_firstdatacol > 2);
1434 1461
1435 1462 break;
1436 1463 }
1437 1464 }
1438 1465
1439 1466 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1440 1467 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1441 1468 ASSERT(code > 0);
1442 1469 return (code);
1443 1470 }
1444 1471
/*
 * Open all child vdevs and derive the RAID-Z vdev's aggregate asize,
 * max_asize, and ashift.  Fails with EINVAL (VDEV_AUX_BAD_LABEL) for an
 * invalid configuration, or with the last child open error
 * (VDEV_AUX_NO_REPLICAS) if more than nparity children failed to open.
 */
1445 1472 static int
1446 1473 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1447 1474 uint64_t *ashift)
1448 1475 {
1449 1476 vdev_t *cvd;
1450 1477 uint64_t nparity = vd->vdev_nparity;
1451 1478 int c;
1452 1479 int lasterror = 0;
1453 1480 int numerrors = 0;
1454 1481
1455 1482 ASSERT(nparity > 0);
1456 1483
1457 1484 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1458 1485 vd->vdev_children < nparity + 1) {
1459 1486 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1460 1487 return (SET_ERROR(EINVAL));
1461 1488 }
1462 1489
1463 1490 vdev_open_children(vd);
1464 1491
1465 1492 for (c = 0; c < vd->vdev_children; c++) {
1466 1493 cvd = vd->vdev_child[c];
1467 1494
1468 1495 if (cvd->vdev_open_error != 0) {
1469 1496 lasterror = cvd->vdev_open_error;
1470 1497 numerrors++;
1471 1498 continue;
1472 1499 }
1473 1500
/* Track the minimum child sizes and the maximum child ashift. */
1474 1501 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1475 1502 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1476 1503 *ashift = MAX(*ashift, cvd->vdev_ashift);
1477 1504 }
1478 1505
1479 1506 *asize *= vd->vdev_children;
1480 1507 *max_asize *= vd->vdev_children;
1481 1508
1482 1509 if (numerrors > nparity) {
1483 1510 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1484 1511 return (lasterror);
1485 1512 }
1486 1513
1487 1514 return (0);
1488 1515 }
1489 1516
/*
 * Close every child vdev of this RAID-Z vdev.
 */
1490 1517 static void
1491 1518 vdev_raidz_close(vdev_t *vd)
1492 1519 {
1493 1520 int c;
1494 1521
1495 1522 for (c = 0; c < vd->vdev_children; c++)
1496 1523 vdev_close(vd->vdev_child[c]);
1497 1524 }
1498 1525
/*
 * Convert a logical (psize) size into the space allocated on this
 * RAID-Z vdev: round psize up to whole sectors, add one parity sector
 * for every (cols - nparity) data sectors (rounded up), then round the
 * total up to a multiple of (nparity + 1) sectors.
 */
1499 1526 static uint64_t
1500 1527 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1501 1528 {
1502 1529 uint64_t asize;
1503 1530 uint64_t ashift = vd->vdev_top->vdev_ashift;
1504 1531 uint64_t cols = vd->vdev_children;
1505 1532 uint64_t nparity = vd->vdev_nparity;
1506 1533
1507 1534 asize = ((psize - 1) >> ashift) + 1;
1508 1535 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1509 1536 asize = roundup(asize, nparity + 1) << ashift;
1510 1537
1511 1538 return (asize);
1512 1539 }
1513 1540
↓ open down ↓ |
1053 lines elided |
↑ open up ↑ |
/*
 * Child-zio completion callback: record the child's result in the
 * raidz column (passed as io_private) that issued it.
 */
1514 1541 static void
1515 1542 vdev_raidz_child_done(zio_t *zio)
1516 1543 {
1517 1544 raidz_col_t *rc = zio->io_private;
1518 1545
1519 1546 rc->rc_error = zio->io_error;
1520 1547 rc->rc_tried = 1;
1521 1548 rc->rc_skipped = 0;
1522 1549 }
1523 1550
1551 +/*
1552 + * Start an IO operation on a RAIDZ VDev
1553 + *
1554 + * Outline:
1555 + * - For write operations:
1556 + * 1. Generate the parity data
1557 + * 2. Create child zio write operations to each column's vdev, for both
1558 + * data and parity.
1559 + * 3. If the column skips any sectors for padding, create optional dummy
1560 + * write zio children for those areas to improve aggregation continuity.
1561 + * - For read operations:
1562 + * 1. Create child zio read operations to each data column's vdev to read
1563 + * the range of data required for zio.
1564 + * 2. If this is a scrub or resilver operation, or if any of the data
1565 + * vdevs have had errors, then create zio read operations to the parity
1566 + * columns' VDevs as well.
1567 + */
1524 1568 static int
1525 1569 vdev_raidz_io_start(zio_t *zio)
1526 1570 {
1527 1571 vdev_t *vd = zio->io_vd;
1528 1572 vdev_t *tvd = vd->vdev_top;
1529 1573 vdev_t *cvd;
1530 1574 raidz_map_t *rm;
1531 1575 raidz_col_t *rc;
1532 1576 int c, i;
1533 1577
/* Map the logical I/O range onto per-child columns (data + parity). */
1534 1578 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1535 1579 vd->vdev_nparity);
1536 1580
1537 1581 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1538 1582
1539 1583 if (zio->io_type == ZIO_TYPE_WRITE) {
1540 1584 vdev_raidz_generate_parity(rm);
1541 1585
/* Issue one child write per column, parity and data alike. */
1542 1586 for (c = 0; c < rm->rm_cols; c++) {
1543 1587 rc = &rm->rm_col[c];
1544 1588 cvd = vd->vdev_child[rc->rc_devidx];
1545 1589 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1546 1590 rc->rc_offset, rc->rc_data, rc->rc_size,
1547 1591 zio->io_type, zio->io_priority, 0,
1548 1592 vdev_raidz_child_done, rc));
1549 1593 }
1550 1594
1551 1595 /*
1552 1596 * Generate optional I/Os for any skipped sectors to improve
1553 1597 * aggregation contiguity.
1554 1598 */
1555 1599 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1556 1600 ASSERT(c <= rm->rm_scols);
1557 1601 if (c == rm->rm_scols)
1558 1602 c = 0;
1559 1603 rc = &rm->rm_col[c];
1560 1604 cvd = vd->vdev_child[rc->rc_devidx];
1561 1605 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1562 1606 rc->rc_offset + rc->rc_size, NULL,
1563 1607 1 << tvd->vdev_ashift,
1564 1608 zio->io_type, zio->io_priority,
1565 1609 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1566 1610 }
1567 1611
1568 1612 return (ZIO_PIPELINE_CONTINUE);
1569 1613 }
1570 1614
1571 1615 ASSERT(zio->io_type == ZIO_TYPE_READ);
1572 1616
1573 1617 /*
1574 1618 * Iterate over the columns in reverse order so that we hit the parity
1575 1619 * last -- any errors along the way will force us to read the parity.
1576 1620 */
1577 1621 for (c = rm->rm_cols - 1; c >= 0; c--) {
1578 1622 rc = &rm->rm_col[c];
1579 1623 cvd = vd->vdev_child[rc->rc_devidx];
1580 1624 if (!vdev_readable(cvd)) {
1581 1625 if (c >= rm->rm_firstdatacol)
1582 1626 rm->rm_missingdata++;
1583 1627 else
1584 1628 rm->rm_missingparity++;
1585 1629 rc->rc_error = SET_ERROR(ENXIO);
1586 1630 rc->rc_tried = 1; /* don't even try */
1587 1631 rc->rc_skipped = 1;
1588 1632 continue;
1589 1633 }
/* Device is readable, but its DTL says this txg's data is missing. */
1590 1634 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1591 1635 if (c >= rm->rm_firstdatacol)
1592 1636 rm->rm_missingdata++;
1593 1637 else
1594 1638 rm->rm_missingparity++;
1595 1639 rc->rc_error = SET_ERROR(ESTALE);
1596 1640 rc->rc_skipped = 1;
1597 1641 continue;
1598 1642 }
/*
 * Read every data column; read parity columns too when data is
 * already known missing, or for scrub/resilver.
 */
1599 1643 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1600 1644 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1601 1645 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1602 1646 rc->rc_offset, rc->rc_data, rc->rc_size,
1603 1647 zio->io_type, zio->io_priority, 0,
1604 1648 vdev_raidz_child_done, rc));
1605 1649 }
1606 1650 }
1607 1651
1608 1652 return (ZIO_PIPELINE_CONTINUE);
1609 1653 }
1610 1654
1611 1655
1612 1656 /*
1613 1657 * Report a checksum error for a child of a RAID-Z device.
1614 1658 */
1615 1659 static void
1616 1660 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1617 1661 {
1618 1662 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1619 1663
/* Only bump stats and post an ereport for non-speculative I/O. */
1620 1664 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1621 1665 zio_bad_cksum_t zbc;
1622 1666 raidz_map_t *rm = zio->io_vsd;
1623 1667
1624 1668 mutex_enter(&vd->vdev_stat_lock);
1625 1669 vd->vdev_stat.vs_checksum_errors++;
1626 1670 mutex_exit(&vd->vdev_stat_lock);
1627 1671
1628 1672 zbc.zbc_has_cksum = 0;
1629 1673 zbc.zbc_injected = rm->rm_ecksuminjected;
1630 1674
/* bad_data is the (pre-reconstruction) data we believe to be wrong. */
1631 1675 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1632 1676 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1633 1677 &zbc);
1634 1678 }
1635 1679 }
1636 1680
1637 1681 /*
1638 1682 * We keep track of whether or not there were any injected errors, so that
1639 1683 * any ereports we generate can note it.
1640 1684 */
/*
 * Verify the zio's checksum against its (possibly reconstructed) data.
 * Returns 0 on success, nonzero on mismatch; a failure caused by error
 * injection is recorded in the map for later ereport annotation.
 */
1641 1685 static int
1642 1686 raidz_checksum_verify(zio_t *zio)
1643 1687 {
1644 1688 zio_bad_cksum_t zbc;
1645 1689 raidz_map_t *rm = zio->io_vsd;
1646 1690
1647 1691 int ret = zio_checksum_error(zio, &zbc);
1648 1692 if (ret != 0 && zbc.zbc_injected != 0)
1649 1693 rm->rm_ecksuminjected = 1;
1650 1694
1651 1695 return (ret);
1652 1696 }
1653 1697
1654 1698 /*
1655 1699 * Generate the parity from the data columns. If we tried and were able to
1656 1700 * read the parity without error, verify that the generated parity matches the
1657 1701 * data we read. If it doesn't, we fire off a checksum error. Return the
1658 1702 * number of such failures.
1659 1703 */
1660 1704 static int
1661 1705 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1662 1706 {
1663 1707 void *orig[VDEV_RAIDZ_MAXPARITY];
1664 1708 int c, ret = 0;
1665 1709 raidz_col_t *rc;
1666 1710
/* Save a copy of each parity column that was read without error. */
1667 1711 for (c = 0; c < rm->rm_firstdatacol; c++) {
1668 1712 rc = &rm->rm_col[c];
1669 1713 if (!rc->rc_tried || rc->rc_error != 0)
1670 1714 continue;
1671 1715 orig[c] = zio_buf_alloc(rc->rc_size);
1672 1716 bcopy(rc->rc_data, orig[c], rc->rc_size);
1673 1717 }
1674 1718
/* Regenerate parity from the data columns, then compare with what we read. */
1675 1719 vdev_raidz_generate_parity(rm);
1676 1720
1677 1721 for (c = 0; c < rm->rm_firstdatacol; c++) {
1678 1722 rc = &rm->rm_col[c];
1679 1723 if (!rc->rc_tried || rc->rc_error != 0)
1680 1724 continue;
1681 1725 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1682 1726 raidz_checksum_error(zio, rc, orig[c]);
1683 1727 rc->rc_error = SET_ERROR(ECKSUM);
1684 1728 ret++;
1685 1729 }
1686 1730 zio_buf_free(orig[c], rc->rc_size);
1687 1731 }
1688 1732
1689 1733 return (ret);
1690 1734 }
1691 1735
1692 1736 /*
1693 1737 * Keep statistics on all the ways that we used parity to correct data.
 * Indexed by the parity-usage bitmask returned by the reconstruction
 * routines (bit c set means parity column c was used).
1694 1738 */
1695 1739 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1696 1740
/*
 * Return the most severe rc_error across all columns of the map, as
 * ranked by zio_worst_error(); 0 if no column had an error.
 */
1697 1741 static int
1698 1742 vdev_raidz_worst_error(raidz_map_t *rm)
1699 1743 {
1700 1744 int error = 0;
1701 1745
1702 1746 for (int c = 0; c < rm->rm_cols; c++)
1703 1747 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1704 1748
1705 1749 return (error);
1706 1750 }
1707 1751
1708 1752 /*
1709 1753 * Iterate over all combinations of bad data and attempt a reconstruction.
1710 1754 * Note that the algorithm below is non-optimal because it doesn't take into
1711 1755 * account how reconstruction is actually performed. For example, with
1712 1756 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1713 1757 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1714 1758 * cases we'd only use parity information in column 0.
1715 1759 */
1716 1760 static int
1717 1761 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1718 1762 {
1719 1763 raidz_map_t *rm = zio->io_vsd;
1720 1764 raidz_col_t *rc;
1721 1765 void *orig[VDEV_RAIDZ_MAXPARITY];
1722 1766 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1723 1767 int *tgts = &tstore[1];
1724 1768 int current, next, i, c, n;
1725 1769 int code, ret = 0;
1726 1770
1727 1771 ASSERT(total_errors < rm->rm_firstdatacol);
1728 1772
1729 1773 /*
1730 1774 * This simplifies one edge condition.
1731 1775 */
1732 1776 tgts[-1] = -1;
1733 1777
1734 1778 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1735 1779 /*
1736 1780 * Initialize the targets array by finding the first n columns
1737 1781 * that contain no error.
1738 1782 *
1739 1783 * If there were no data errors, we need to ensure that we're
1740 1784 * always explicitly attempting to reconstruct at least one
1741 1785 * data column. To do this, we simply push the highest target
1742 1786 * up into the data columns.
1743 1787 */
1744 1788 for (c = 0, i = 0; i < n; i++) {
1745 1789 if (i == n - 1 && data_errors == 0 &&
1746 1790 c < rm->rm_firstdatacol) {
1747 1791 c = rm->rm_firstdatacol;
1748 1792 }
1749 1793
1750 1794 while (rm->rm_col[c].rc_error != 0) {
1751 1795 c++;
1752 1796 ASSERT3S(c, <, rm->rm_cols);
1753 1797 }
1754 1798
1755 1799 tgts[i] = c++;
1756 1800 }
1757 1801
1758 1802 /*
1759 1803 * Setting tgts[n] simplifies the other edge condition.
1760 1804 */
1761 1805 tgts[n] = rm->rm_cols;
1762 1806
1763 1807 /*
1764 1808 * These buffers were allocated in previous iterations.
1765 1809 */
1766 1810 for (i = 0; i < n - 1; i++) {
1767 1811 ASSERT(orig[i] != NULL);
1768 1812 }
1769 1813
1770 1814 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1771 1815
1772 1816 current = 0;
1773 1817 next = tgts[current];
1774 1818
1775 1819 while (current != n) {
1776 1820 tgts[current] = next;
1777 1821 current = 0;
1778 1822
1779 1823 /*
1780 1824 * Save off the original data that we're going to
1781 1825 * attempt to reconstruct.
1782 1826 */
1783 1827 for (i = 0; i < n; i++) {
1784 1828 ASSERT(orig[i] != NULL);
1785 1829 c = tgts[i];
1786 1830 ASSERT3S(c, >=, 0);
1787 1831 ASSERT3S(c, <, rm->rm_cols);
1788 1832 rc = &rm->rm_col[c];
1789 1833 bcopy(rc->rc_data, orig[i], rc->rc_size);
1790 1834 }
1791 1835
1792 1836 /*
1793 1837 * Attempt a reconstruction and exit the outer loop on
1794 1838 * success.
1795 1839 */
1796 1840 code = vdev_raidz_reconstruct(rm, tgts, n);
1797 1841 if (raidz_checksum_verify(zio) == 0) {
1798 1842 atomic_inc_64(&raidz_corrected[code]);
1799 1843
1800 1844 for (i = 0; i < n; i++) {
1801 1845 c = tgts[i];
1802 1846 rc = &rm->rm_col[c];
1803 1847 ASSERT(rc->rc_error == 0);
1804 1848 if (rc->rc_tried)
1805 1849 raidz_checksum_error(zio, rc,
1806 1850 orig[i]);
1807 1851 rc->rc_error = SET_ERROR(ECKSUM);
1808 1852 }
1809 1853
1810 1854 ret = code;
1811 1855 goto done;
1812 1856 }
1813 1857
1814 1858 /*
1815 1859 * Restore the original data.
1816 1860 */
1817 1861 for (i = 0; i < n; i++) {
1818 1862 c = tgts[i];
1819 1863 rc = &rm->rm_col[c];
1820 1864 bcopy(orig[i], rc->rc_data, rc->rc_size);
1821 1865 }
1822 1866
1823 1867 do {
1824 1868 /*
1825 1869 * Find the next valid column after the current
1826 1870			 * position.
1827 1871 */
1828 1872 for (next = tgts[current] + 1;
1829 1873 next < rm->rm_cols &&
1830 1874 rm->rm_col[next].rc_error != 0; next++)
1831 1875 continue;
1832 1876
1833 1877 ASSERT(next <= tgts[current + 1]);
1834 1878
1835 1879 /*
1836 1880 * If that spot is available, we're done here.
1837 1881 */
1838 1882 if (next != tgts[current + 1])
1839 1883 break;
1840 1884
1841 1885 /*
1842 1886 * Otherwise, find the next valid column after
1843 1887 * the previous position.
1844 1888 */
1845 1889 for (c = tgts[current - 1] + 1;
1846 1890 rm->rm_col[c].rc_error != 0; c++)
1847 1891 continue;
1848 1892
1849 1893 tgts[current] = c;
1850 1894 current++;
1851 1895
1852 1896 } while (current != n);
1853 1897 }
↓ open down ↓ |
320 lines elided |
↑ open up ↑ |
1854 1898 }
1855 1899 n--;
1856 1900 done:
1857 1901 for (i = 0; i < n; i++) {
1858 1902 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1859 1903 }
1860 1904
1861 1905 return (ret);
1862 1906 }
1863 1907
1908 +/*
1909 + * Complete an IO operation on a RAIDZ VDev
1910 + *
1911 + * Outline:
1912 + * - For write operations:
1913 + * 1. Check for errors on the child IOs.
1914 + * 2. Return, setting an error code if too few child VDevs were written
1915 + * to reconstruct the data later. Note that partial writes are
1916 + * considered successful if they can be reconstructed at all.
1917 + * - For read operations:
1918 + * 1. Check for errors on the child IOs.
1919 + * 2. If data errors occurred:
1920 + * a. Try to reassemble the data from the parity available.
1921 + * b. If we haven't yet read the parity drives, read them now.
1922 + * c. If all parity drives have been read but the data still doesn't
1923 + * reassemble with a correct checksum, then try combinatorial
1924 + * reconstruction.
1925 + * d. If that doesn't work, return an error.
1926 + * 3. If there were unexpected errors or this is a resilver operation,
1927 + * rewrite the vdevs that had errors.
1928 + */
1864 1929 static void
1865 1930 vdev_raidz_io_done(zio_t *zio)
1866 1931 {
1867 1932 vdev_t *vd = zio->io_vd;
1868 1933 vdev_t *cvd;
1869 1934 raidz_map_t *rm = zio->io_vsd;
1870 1935 raidz_col_t *rc;
1871 1936 int unexpected_errors = 0;
1872 1937 int parity_errors = 0;
1873 1938 int parity_untried = 0;
1874 1939 int data_errors = 0;
1875 1940 int total_errors = 0;
1876 1941 int n, c;
1877 1942 int tgts[VDEV_RAIDZ_MAXPARITY];
1878 1943 int code;
1879 1944
1880 1945 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1881 1946
1882 1947 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1883 1948 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1884 1949
1885 1950 for (c = 0; c < rm->rm_cols; c++) {
1886 1951 rc = &rm->rm_col[c];
1887 1952
1888 1953 if (rc->rc_error) {
1889 1954 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1890 1955
1891 1956 if (c < rm->rm_firstdatacol)
1892 1957 parity_errors++;
1893 1958 else
1894 1959 data_errors++;
1895 1960
1896 1961 if (!rc->rc_skipped)
1897 1962 unexpected_errors++;
1898 1963
1899 1964 total_errors++;
1900 1965 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1901 1966 parity_untried++;
1902 1967 }
1903 1968 }
1904 1969
1905 1970 if (zio->io_type == ZIO_TYPE_WRITE) {
1906 1971 /*
1907 1972 * XXX -- for now, treat partial writes as a success.
1908 1973 * (If we couldn't write enough columns to reconstruct
1909 1974 * the data, the I/O failed. Otherwise, good enough.)
1910 1975 *
1911 1976 * Now that we support write reallocation, it would be better
1912 1977 * to treat partial failure as real failure unless there are
1913 1978 * no non-degraded top-level vdevs left, and not update DTLs
1914 1979 * if we intend to reallocate.
1915 1980 */
1916 1981 /* XXPOLICY */
1917 1982 if (total_errors > rm->rm_firstdatacol)
1918 1983 zio->io_error = vdev_raidz_worst_error(rm);
1919 1984
1920 1985 return;
1921 1986 }
1922 1987
1923 1988 ASSERT(zio->io_type == ZIO_TYPE_READ);
1924 1989 /*
1925 1990 * There are three potential phases for a read:
1926 1991 * 1. produce valid data from the columns read
1927 1992 * 2. read all disks and try again
1928 1993 * 3. perform combinatorial reconstruction
1929 1994 *
1930 1995 * Each phase is progressively both more expensive and less likely to
1931 1996 * occur. If we encounter more errors than we can repair or all phases
1932 1997 * fail, we have no choice but to return an error.
1933 1998 */
1934 1999
1935 2000 /*
1936 2001 * If the number of errors we saw was correctable -- less than or equal
1937 2002 * to the number of parity disks read -- attempt to produce data that
1938 2003 * has a valid checksum. Naturally, this case applies in the absence of
1939 2004 * any errors.
1940 2005 */
1941 2006 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1942 2007 if (data_errors == 0) {
1943 2008 if (raidz_checksum_verify(zio) == 0) {
1944 2009 /*
1945 2010 * If we read parity information (unnecessarily
1946 2011 * as it happens since no reconstruction was
1947 2012 * needed) regenerate and verify the parity.
1948 2013 * We also regenerate parity when resilvering
1949 2014 * so we can write it out to the failed device
1950 2015 * later.
1951 2016 */
1952 2017 if (parity_errors + parity_untried <
1953 2018 rm->rm_firstdatacol ||
1954 2019 (zio->io_flags & ZIO_FLAG_RESILVER)) {
1955 2020 n = raidz_parity_verify(zio, rm);
1956 2021 unexpected_errors += n;
1957 2022 ASSERT(parity_errors + n <=
1958 2023 rm->rm_firstdatacol);
1959 2024 }
1960 2025 goto done;
1961 2026 }
1962 2027 } else {
1963 2028 /*
1964 2029 * We either attempt to read all the parity columns or
1965 2030 * none of them. If we didn't try to read parity, we
1966 2031 * wouldn't be here in the correctable case. There must
1967 2032 * also have been fewer parity errors than parity
1968 2033 * columns or, again, we wouldn't be in this code path.
1969 2034 */
1970 2035 ASSERT(parity_untried == 0);
1971 2036 ASSERT(parity_errors < rm->rm_firstdatacol);
1972 2037
1973 2038 /*
1974 2039 * Identify the data columns that reported an error.
1975 2040 */
1976 2041 n = 0;
1977 2042 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1978 2043 rc = &rm->rm_col[c];
1979 2044 if (rc->rc_error != 0) {
1980 2045 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1981 2046 tgts[n++] = c;
1982 2047 }
1983 2048 }
1984 2049
1985 2050 ASSERT(rm->rm_firstdatacol >= n);
1986 2051
1987 2052 code = vdev_raidz_reconstruct(rm, tgts, n);
1988 2053
1989 2054 if (raidz_checksum_verify(zio) == 0) {
1990 2055 atomic_inc_64(&raidz_corrected[code]);
1991 2056
1992 2057 /*
1993 2058 * If we read more parity disks than were used
1994 2059 * for reconstruction, confirm that the other
1995 2060 * parity disks produced correct data. This
1996 2061 * routine is suboptimal in that it regenerates
1997 2062 * the parity that we already used in addition
1998 2063 * to the parity that we're attempting to
1999 2064 * verify, but this should be a relatively
2000 2065 * uncommon case, and can be optimized if it
2001 2066 * becomes a problem. Note that we regenerate
2002 2067 * parity when resilvering so we can write it
2003 2068 * out to failed devices later.
2004 2069 */
2005 2070 if (parity_errors < rm->rm_firstdatacol - n ||
2006 2071 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2007 2072 n = raidz_parity_verify(zio, rm);
2008 2073 unexpected_errors += n;
2009 2074 ASSERT(parity_errors + n <=
2010 2075 rm->rm_firstdatacol);
2011 2076 }
2012 2077
2013 2078 goto done;
2014 2079 }
2015 2080 }
2016 2081 }
2017 2082
2018 2083 /*
2019 2084 * This isn't a typical situation -- either we got a read error or
2020 2085 * a child silently returned bad data. Read every block so we can
2021 2086 * try again with as much data and parity as we can track down. If
2022 2087 * we've already been through once before, all children will be marked
2023 2088 * as tried so we'll proceed to combinatorial reconstruction.
2024 2089 */
2025 2090 unexpected_errors = 1;
2026 2091 rm->rm_missingdata = 0;
2027 2092 rm->rm_missingparity = 0;
2028 2093
2029 2094 for (c = 0; c < rm->rm_cols; c++) {
2030 2095 if (rm->rm_col[c].rc_tried)
2031 2096 continue;
2032 2097
2033 2098 zio_vdev_io_redone(zio);
2034 2099 do {
2035 2100 rc = &rm->rm_col[c];
2036 2101 if (rc->rc_tried)
2037 2102 continue;
2038 2103 zio_nowait(zio_vdev_child_io(zio, NULL,
2039 2104 vd->vdev_child[rc->rc_devidx],
2040 2105 rc->rc_offset, rc->rc_data, rc->rc_size,
2041 2106 zio->io_type, zio->io_priority, 0,
2042 2107 vdev_raidz_child_done, rc));
2043 2108 } while (++c < rm->rm_cols);
2044 2109
2045 2110 return;
2046 2111 }
2047 2112
2048 2113 /*
2049 2114 * At this point we've attempted to reconstruct the data given the
2050 2115 * errors we detected, and we've attempted to read all columns. There
2051 2116 * must, therefore, be one or more additional problems -- silent errors
2052 2117 * resulting in invalid data rather than explicit I/O errors resulting
2053 2118 * in absent data. We check if there is enough additional data to
2054 2119 * possibly reconstruct the data and then perform combinatorial
2055 2120 * reconstruction over all possible combinations. If that fails,
2056 2121 * we're cooked.
2057 2122 */
2058 2123 if (total_errors > rm->rm_firstdatacol) {
2059 2124 zio->io_error = vdev_raidz_worst_error(rm);
2060 2125
2061 2126 } else if (total_errors < rm->rm_firstdatacol &&
2062 2127 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2063 2128 /*
2064 2129 * If we didn't use all the available parity for the
2065 2130 * combinatorial reconstruction, verify that the remaining
2066 2131 * parity is correct.
2067 2132 */
2068 2133 if (code != (1 << rm->rm_firstdatacol) - 1)
2069 2134 (void) raidz_parity_verify(zio, rm);
2070 2135 } else {
2071 2136 /*
2072 2137 * We're here because either:
2073 2138 *
2074 2139 * total_errors == rm_first_datacol, or
2075 2140 * vdev_raidz_combrec() failed
2076 2141 *
2077 2142 * In either case, there is enough bad data to prevent
2078 2143 * reconstruction.
2079 2144 *
2080 2145 * Start checksum ereports for all children which haven't
2081 2146 * failed, and the IO wasn't speculative.
2082 2147 */
2083 2148 zio->io_error = SET_ERROR(ECKSUM);
2084 2149
2085 2150 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2086 2151 for (c = 0; c < rm->rm_cols; c++) {
2087 2152 rc = &rm->rm_col[c];
2088 2153 if (rc->rc_error == 0) {
2089 2154 zio_bad_cksum_t zbc;
2090 2155 zbc.zbc_has_cksum = 0;
2091 2156 zbc.zbc_injected =
2092 2157 rm->rm_ecksuminjected;
2093 2158
2094 2159 zfs_ereport_start_checksum(
2095 2160 zio->io_spa,
2096 2161 vd->vdev_child[rc->rc_devidx],
2097 2162 zio, rc->rc_offset, rc->rc_size,
2098 2163 (void *)(uintptr_t)c, &zbc);
2099 2164 }
2100 2165 }
2101 2166 }
2102 2167 }
2103 2168
2104 2169 done:
2105 2170 zio_checksum_verified(zio);
2106 2171
2107 2172 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2108 2173 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2109 2174 /*
2110 2175 * Use the good data we have in hand to repair damaged children.
2111 2176 */
2112 2177 for (c = 0; c < rm->rm_cols; c++) {
2113 2178 rc = &rm->rm_col[c];
2114 2179 cvd = vd->vdev_child[rc->rc_devidx];
2115 2180
2116 2181 if (rc->rc_error == 0)
2117 2182 continue;
2118 2183
2119 2184 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2120 2185 rc->rc_offset, rc->rc_data, rc->rc_size,
2121 2186 ZIO_TYPE_WRITE, zio->io_priority,
2122 2187 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2123 2188 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2124 2189 }
2125 2190 }
2126 2191 }
2127 2192
2128 2193 static void
2129 2194 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2130 2195 {
2131 2196 if (faulted > vd->vdev_nparity)
2132 2197 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2133 2198 VDEV_AUX_NO_REPLICAS);
2134 2199 else if (degraded + faulted != 0)
2135 2200 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2136 2201 else
2137 2202 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2138 2203 }
2139 2204
/*
 * Operation vector that plugs the RAID-Z implementation into the generic
 * vdev framework.  Entries are positional; order must match vdev_ops_t.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,			/* unused slot — presumably hold; confirm against vdev_ops_t */
	NULL,			/* unused slot — presumably rele; confirm against vdev_ops_t */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
↓ open down ↓ |
278 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX