Print this page
4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ new/usr/src/uts/common/fs/zfs/vdev_raidz.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 25 */
26 26
27 27 #include <sys/zfs_context.h>
28 28 #include <sys/spa.h>
29 29 #include <sys/vdev_impl.h>
30 30 #include <sys/zio.h>
31 31 #include <sys/zio_checksum.h>
32 32 #include <sys/fs/zfs.h>
33 33 #include <sys/fm/fs/zfs.h>
34 34
35 35 /*
36 36 * Virtual device vector for RAID-Z.
37 37 *
38 38 * This vdev supports single, double, and triple parity. For single parity,
39 39 * we use a simple XOR of all the data columns. For double or triple parity,
40 40 * we use a special case of Reed-Solomon coding. This extends the
41 41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 44 * former is also based. The latter is designed to provide higher performance
45 45 * for writes.
46 46 *
47 47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 48 * amended six years later identifying a critical flaw that invalidates its
49 49 * claims. Nevertheless, the technique can be adapted to work for up to
50 50 * triple parity. For additional parity, the amendment "Note: Correction to
51 51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 52 * is viable, but the additional complexity means that write performance will
53 53 * suffer.
54 54 *
55 55 * All of the methods above operate on a Galois field, defined over the
56 56  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
57 57 * can be expressed with a single byte. Briefly, the operations on the
58 58 * field are defined as follows:
59 59 *
60 60 * o addition (+) is represented by a bitwise XOR
61 61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 62 * o multiplication of A by 2 is defined by the following bitwise expression:
63 63 *
64 64 * (A * 2)_7 = A_6
65 65 * (A * 2)_6 = A_5
66 66 * (A * 2)_5 = A_4
67 67 * (A * 2)_4 = A_3 + A_7
68 68 * (A * 2)_3 = A_2 + A_7
69 69 * (A * 2)_2 = A_1 + A_7
70 70 * (A * 2)_1 = A_0
71 71 * (A * 2)_0 = A_7
72 72 *
73 73 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
74 74 * As an aside, this multiplication is derived from the error correcting
75 75 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
76 76 *
77 77 * Observe that any number in the field (except for 0) can be expressed as a
78 78 * power of 2 -- a generator for the field. We store a table of the powers of
79 79 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
80 80 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
81 81 * than field addition). The inverse of a field element A (A^-1) is therefore
82 82 * A ^ (255 - 1) = A^254.
83 83 *
84 84 * The up-to-three parity columns, P, Q, R over several data columns,
85 85 * D_0, ... D_n-1, can be expressed by field operations:
86 86 *
87 87 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
88 88 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
89 89 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
90 90 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
91 91 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
92 92 *
93 93  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
94 94 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
95 95 * independent coefficients. (There are no additional coefficients that have
96 96 * this property which is why the uncorrected Plank method breaks down.)
97 97 *
98 98  * See the reconstruction code below for how P, Q and R can be used individually
99 99 * or in concert to recover missing data columns.
100 100 */
101 101
/*
 * One column of a RAID-Z I/O: the slice of the logical I/O that lands on
 * a single child vdev (either a parity column or a data column).
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
112 112
/*
 * Layout of one logical I/O across a RAID-Z vdev's children.  Columns
 * rm_col[0 .. rm_firstdatacol-1] hold parity (P, Q, R in that order);
 * the remaining columns hold data.  Allocated by vdev_raidz_map_alloc()
 * and attached to the zio as its vsd; freed by vdev_raidz_map_free().
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
129 129
/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single byte by 2 (respectively 4) in GF(2^8).  The conditional
 * XOR with 0x1d reduces the product by the field's primitive polynomial
 * x^8 + x^4 + x^3 + x^2 + 1 (see the block comment above).
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * After the second line, (mask) holds 0xff in each byte whose high bit was
 * set and 0x00 elsewhere: per byte, (0x80 << 1) - (0x80 >> 7) == 0xff.
 * The 0xfe mask on the shifted value keeps each byte's low bit from
 * leaking in from its neighbor.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

/* Multiply each byte of a 64-bit value by 4: two applications of *2. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
156 156
/*
 * Tunable: when nonzero, force reconstruction to use the general purpose
 * method rather than the parity-specific routines below (zero by default).
 */
int vdev_raidz_default_to_general;
161 161
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] == 2^i.
 * The multiplicative group has order 255, so entry 255 wraps back to 0x01.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs of 2 in the Galois field defined above: vdev_raidz_log2[a] == log_2(a),
 * the inverse of vdev_raidz_pow2.  Entry 0 is a placeholder -- log(0) is
 * undefined -- and callers must special-case zero before indexing (see
 * vdev_raidz_exp2()).
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
232 232
/* Forward declaration: called by vdev_raidz_cksum_finish() before its definition. */
static void vdev_raidz_generate_parity(raidz_map_t *rm);
234 234
235 235 /*
236 236 * Multiply a given number by 2 raised to the given power.
237 237 */
238 238 static uint8_t
239 239 vdev_raidz_exp2(uint_t a, int exp)
240 240 {
241 241 if (a == 0)
242 242 return (0);
243 243
244 244 ASSERT(exp >= 0);
245 245 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
246 246
247 247 exp += vdev_raidz_log2[a];
248 248 if (exp > 255)
249 249 exp -= 255;
250 250
251 251 return (vdev_raidz_pow2[exp]);
252 252 }
253 253
254 254 static void
255 255 vdev_raidz_map_free(raidz_map_t *rm)
256 256 {
257 257 int c;
258 258 size_t size;
259 259
260 260 for (c = 0; c < rm->rm_firstdatacol; c++) {
261 261 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
262 262
263 263 if (rm->rm_col[c].rc_gdata != NULL)
264 264 zio_buf_free(rm->rm_col[c].rc_gdata,
265 265 rm->rm_col[c].rc_size);
266 266 }
267 267
268 268 size = 0;
269 269 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
270 270 size += rm->rm_col[c].rc_size;
271 271
272 272 if (rm->rm_datacopy != NULL)
273 273 zio_buf_free(rm->rm_datacopy, size);
274 274
275 275 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
276 276 }
277 277
278 278 static void
279 279 vdev_raidz_map_free_vsd(zio_t *zio)
280 280 {
281 281 raidz_map_t *rm = zio->io_vsd;
282 282
283 283 ASSERT0(rm->rm_freed);
284 284 rm->rm_freed = 1;
285 285
286 286 if (rm->rm_reports == 0)
287 287 vdev_raidz_map_free(rm);
288 288 }
289 289
290 290 /*ARGSUSED*/
291 291 static void
292 292 vdev_raidz_cksum_free(void *arg, size_t ignored)
293 293 {
294 294 raidz_map_t *rm = arg;
295 295
296 296 ASSERT3U(rm->rm_reports, >, 0);
297 297
298 298 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
299 299 vdev_raidz_map_free(rm);
300 300 }
301 301
/*
 * Finalize a checksum ereport for column zcr->zcr_cbinfo: locate (or, for
 * parity columns, regenerate) the "good" version of the column and hand it,
 * together with the bad data we kept aside, to zfs_ereport_finish_checksum().
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;	/* column this report is about */
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	/* No good data available: file an abbreviated report. */
	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/* point data columns back at the saved aside copy */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		/* the good parity was cached in rc_gdata above */
		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}
374 374
375 375 /*
376 376 * Invoked indirectly by zfs_ereport_start_checksum(), called
377 377 * below when our read operation fails completely. The main point
378 378 * is to keep a copy of everything we read from disk, so that at
379 379 * vdev_raidz_cksum_finish() time we can compare it with the good data.
380 380 */
381 381 static void
382 382 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
383 383 {
384 384 size_t c = (size_t)(uintptr_t)arg;
385 385 caddr_t buf;
386 386
387 387 raidz_map_t *rm = zio->io_vsd;
388 388 size_t size;
389 389
390 390 /* set up the report and bump the refcount */
391 391 zcr->zcr_cbdata = rm;
392 392 zcr->zcr_cbinfo = c;
393 393 zcr->zcr_finish = vdev_raidz_cksum_finish;
394 394 zcr->zcr_free = vdev_raidz_cksum_free;
395 395
396 396 rm->rm_reports++;
397 397 ASSERT3U(rm->rm_reports, >, 0);
398 398
399 399 if (rm->rm_datacopy != NULL)
400 400 return;
401 401
402 402 /*
403 403 * It's the first time we're called for this raidz_map_t, so we need
404 404 * to copy the data aside; there's no guarantee that our zio's buffer
405 405 * won't be re-used for something else.
406 406 *
407 407 * Our parity data is already in separate buffers, so there's no need
408 408 * to copy them.
409 409 */
410 410
411 411 size = 0;
412 412 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
413 413 size += rm->rm_col[c].rc_size;
414 414
415 415 buf = rm->rm_datacopy = zio_buf_alloc(size);
416 416
417 417 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
418 418 raidz_col_t *col = &rm->rm_col[c];
419 419
420 420 bcopy(col->rc_data, buf, col->rc_size);
421 421 col->rc_data = buf;
422 422
423 423 buf += col->rc_size;
424 424 }
425 425 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
426 426 }
427 427
/* Per-zio state callbacks: map teardown and checksum-report capture. */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
432 432
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Builds the raidz_map_t describing how the zio's data and its parity are
 * laid out across the children, attaches it to the zio as its vsd, and
 * returns it.  Data columns point directly into zio->io_data; parity
 * columns get freshly allocated buffers (parity itself is computed later).
 */
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/*
	 * Assign each column its child device, offset, and size.  Columns
	 * wrap around to the first child (at the next sector) when the
	 * stripe extends past the last child.
	 */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get private buffers. */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	/* The first data column starts at the head of the zio's buffer ... */
	rm->rm_col[c].rc_data = zio->io_data;

	/* ... and the rest follow contiguously. */
	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	return (rm);
}
581 581
582 582 static void
583 583 vdev_raidz_generate_parity_p(raidz_map_t *rm)
584 584 {
585 585 uint64_t *p, *src, pcount, ccount, i;
586 586 int c;
587 587
588 588 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
589 589
590 590 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
591 591 src = rm->rm_col[c].rc_data;
592 592 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
593 593 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
594 594
595 595 if (c == rm->rm_firstdatacol) {
596 596 ASSERT(ccount == pcount);
597 597 for (i = 0; i < ccount; i++, src++, p++) {
598 598 *p = *src;
599 599 }
600 600 } else {
601 601 ASSERT(ccount <= pcount);
602 602 for (i = 0; i < ccount; i++, src++, p++) {
603 603 *p ^= *src;
604 604 }
605 605 }
606 606 }
607 607 }
608 608
/*
 * Generate P (XOR) and Q (Reed-Solomon with generator 2) parity in a single
 * pass over the data columns, using Horner's rule for Q: each step
 * multiplies the accumulator by 2 and adds the next column.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* Seed P and Q from the first column; zero any tail. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
660 660
/*
 * Generate P (XOR), Q (Reed-Solomon, generator 2), and R (generator 4)
 * parity in a single pass over the data columns, using Horner's rule for
 * Q and R.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* Seed all three parities from the first column. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
721 721
722 722 /*
723 723 * Generate RAID parity in the first virtual columns according to the number of
724 724 * parity columns available.
725 725 */
726 726 static void
727 727 vdev_raidz_generate_parity(raidz_map_t *rm)
728 728 {
729 729 switch (rm->rm_firstdatacol) {
730 730 case 1:
731 731 vdev_raidz_generate_parity_p(rm);
732 732 break;
733 733 case 2:
734 734 vdev_raidz_generate_parity_pq(rm);
735 735 break;
736 736 case 3:
737 737 vdev_raidz_generate_parity_pqr(rm);
738 738 break;
739 739 default:
740 740 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
741 741 }
742 742 }
743 743
744 744 static int
745 745 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
746 746 {
747 747 uint64_t *dst, *src, xcount, ccount, count, i;
748 748 int x = tgts[0];
749 749 int c;
750 750
751 751 ASSERT(ntgts == 1);
752 752 ASSERT(x >= rm->rm_firstdatacol);
753 753 ASSERT(x < rm->rm_cols);
754 754
755 755 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
756 756 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
757 757 ASSERT(xcount > 0);
758 758
759 759 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
760 760 dst = rm->rm_col[x].rc_data;
761 761 for (i = 0; i < xcount; i++, dst++, src++) {
762 762 *dst = *src;
763 763 }
764 764
765 765 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
766 766 src = rm->rm_col[c].rc_data;
767 767 dst = rm->rm_col[x].rc_data;
768 768
769 769 if (c == x)
770 770 continue;
771 771
772 772 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
773 773 count = MIN(ccount, xcount);
774 774
775 775 for (i = 0; i < count; i++, dst++, src++) {
776 776 *dst ^= *src;
777 777 }
778 778 }
779 779
780 780 return (1 << VDEV_RAIDZ_P);
781 781 }
782 782
/*
 * Reconstruct a single missing data column (tgts[0]) from the Q parity:
 * recompute Q as though the missing column were zero, XOR in the real Q,
 * then divide out the column's coefficient by multiplying each byte by
 * 2^(255 - (ndevs - 1 - x)) -- the coefficient's multiplicative inverse.
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	/* Horner's rule over the data columns, with column x treated as 0. */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			/* Short columns contribute 0; just keep multiplying. */
			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/*
	 * XOR in the real Q, then multiply each of the 8 bytes of every
	 * 64-bit word by the inverse coefficient to recover D_x.
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
840 840
/*
 * Reconstruct two missing data columns (x = tgts[0] < y = tgts[1]) using
 * both the P and Q parities; see the in-line derivation below for the
 * linear algebra.  Returns the bitmask of parity columns used.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	/* Columns are sized in non-increasing order, so x's size bounds y's. */
	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* A and B as log-domain exponents, via the identity A^-1 = A^254. */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		/* y may be a shorter column than x. */
		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
931 931
932 932 /* BEGIN CSTYLED */
933 933 /*
934 934 * In the general case of reconstruction, we must solve the system of linear
935 935  * equations defined by the coefficients used to generate parity as well as
936 936 * the contents of the data and parity disks. This can be expressed with
937 937 * vectors for the original data (D) and the actual data (d) and parity (p)
938 938 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
939 939 *
940 940 * __ __ __ __
941 941 * | | __ __ | p_0 |
942 942 * | V | | D_0 | | p_m-1 |
943 943 * | | x | : | = | d_0 |
944 944 * | I | | D_n-1 | | : |
945 945 * | | ~~ ~~ | d_n-1 |
946 946 * ~~ ~~ ~~ ~~
947 947 *
948 948 * I is simply a square identity matrix of size n, and V is a vandermonde
949 949  * matrix defined by the coefficients we chose for the various parity columns
950 950 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
951 951 * computation as well as linear separability.
952 952 *
953 953 * __ __ __ __
954 954 * | 1 .. 1 1 1 | | p_0 |
955 955 * | 2^n-1 .. 4 2 1 | __ __ | : |
956 956 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
957 957 * | 1 .. 0 0 0 | | D_1 | | d_0 |
958 958 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
959 959 * | : : : : | | : | | d_2 |
960 960 * | 0 .. 1 0 0 | | D_n-1 | | : |
961 961 * | 0 .. 0 1 0 | ~~ ~~ | : |
962 962 * | 0 .. 0 0 1 | | d_n-1 |
963 963 * ~~ ~~ ~~ ~~
964 964 *
965 965 * Note that I, V, d, and p are known. To compute D, we must invert the
966 966 * matrix and use the known data and parity values to reconstruct the unknown
967 967 * data values. We begin by removing the rows in V|I and d|p that correspond
968 968 * to failed or missing columns; we then make V|I square (n x n) and d|p
969 969 * sized n by removing rows corresponding to unused parity from the bottom up
970 970 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
971 971 * using Gauss-Jordan elimination. In the example below we use m=3 parity
972 972 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
973 973 * __ __
974 974 * | 1 1 1 1 1 1 1 1 |
975 975 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
976 976 * | 19 205 116 29 64 16 4 1 | / /
977 977 * | 1 0 0 0 0 0 0 0 | / /
978 978 * | 0 1 0 0 0 0 0 0 | <--' /
979 979 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
980 980 * | 0 0 0 1 0 0 0 0 |
981 981 * | 0 0 0 0 1 0 0 0 |
982 982 * | 0 0 0 0 0 1 0 0 |
983 983 * | 0 0 0 0 0 0 1 0 |
984 984 * | 0 0 0 0 0 0 0 1 |
985 985 * ~~ ~~
986 986 * __ __
987 987 * | 1 1 1 1 1 1 1 1 |
988 988 * | 128 64 32 16 8 4 2 1 |
989 989 * | 19 205 116 29 64 16 4 1 |
990 990 * | 1 0 0 0 0 0 0 0 |
991 991 * | 0 1 0 0 0 0 0 0 |
992 992 * (V|I)' = | 0 0 1 0 0 0 0 0 |
993 993 * | 0 0 0 1 0 0 0 0 |
994 994 * | 0 0 0 0 1 0 0 0 |
995 995 * | 0 0 0 0 0 1 0 0 |
996 996 * | 0 0 0 0 0 0 1 0 |
997 997 * | 0 0 0 0 0 0 0 1 |
998 998 * ~~ ~~
999 999 *
1000 1000 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1001 1001 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1002 1002 * matrix is not singular.
1003 1003 * __ __
1004 1004 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1005 1005 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1006 1006 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1007 1007 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1008 1008 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1009 1009 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1010 1010 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1011 1011 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1012 1012 * ~~ ~~
1013 1013 * __ __
1014 1014 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1015 1015 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1016 1016 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1017 1017 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1018 1018 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1019 1019 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1020 1020 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1021 1021 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1022 1022 * ~~ ~~
1023 1023 * __ __
1024 1024 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1025 1025 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1026 1026 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1027 1027 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1028 1028 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1029 1029 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1030 1030 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1031 1031 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1032 1032 * ~~ ~~
1033 1033 * __ __
1034 1034 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1035 1035 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1036 1036 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1037 1037 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1038 1038 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1039 1039 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1040 1040 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1041 1041 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1042 1042 * ~~ ~~
1043 1043 * __ __
1044 1044 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1045 1045 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1046 1046 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1047 1047 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1048 1048 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1049 1049 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1050 1050 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1051 1051 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1052 1052 * ~~ ~~
1053 1053 * __ __
1054 1054 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1055 1055 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1056 1056 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1057 1057 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1058 1058 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1059 1059 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1060 1060 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1061 1061 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1062 1062 * ~~ ~~
1063 1063 * __ __
1064 1064 * | 0 0 1 0 0 0 0 0 |
1065 1065 * | 167 100 5 41 159 169 217 208 |
1066 1066 * | 166 100 4 40 158 168 216 209 |
1067 1067 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1068 1068 * | 0 0 0 0 1 0 0 0 |
1069 1069 * | 0 0 0 0 0 1 0 0 |
1070 1070 * | 0 0 0 0 0 0 1 0 |
1071 1071 * | 0 0 0 0 0 0 0 1 |
1072 1072 * ~~ ~~
1073 1073 *
1074 1074 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1075 1075 * of the missing data.
1076 1076 *
1077 1077 * As is apparent from the example above, the only non-trivial rows in the
1078 1078 * inverse matrix correspond to the data disks that we're trying to
1079 1079 * reconstruct. Indeed, those are the only rows we need as the others would
1080 1080 * only be useful for reconstructing data known or assumed to be valid. For
1081 1081 * that reason, we only build the coefficients in the rows that correspond to
1082 1082 * targeted columns.
1083 1083 */
1084 1084 /* END CSTYLED */
1085 1085
/*
 * Populate the rows of the Vandermonde (parity-generation) matrix that
 * correspond to the parity columns listed in map[].  Each map[i] is the
 * parity generator exponent (0, 1, or 2 for P, Q, R -- i.e. generators
 * 1, 2, and 4), and row i receives the coefficients
 * (2^map[i])^(n - 1 - j) for j = 0 .. n-1, with exponents kept mod 255
 * since the GF(2^8) multiplicative group has order 255.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	/* n is the number of data columns. */
	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/* Start at map[i] * n and reduce mod 255 up front. */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			/* Walk the exponent down by map[i] per column. */
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
1115 1115
/*
 * Invert, via Gauss-Jordan elimination over GF(2^8), the square submatrix
 * whose rows correspond to the missing data columns.  rows[] holds the
 * nmissing rows of interest (built by vdev_raidz_matrix_init); the inverse
 * accumulates in invrows[].  used[] lists the n matrix columns in order:
 * the first nmissing entries are the parity columns employed, the rest are
 * the intact data columns.  In GF(2^8) addition/subtraction is XOR and
 * multiplication is done in log space via vdev_raidz_exp2().
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			/* Start invrows as the identity. */
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			/* Move the known-data coefficient to the inverse. */
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				/* XOR is subtraction in GF(2^8). */
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1209 1209
/*
 * Regenerate the missing data columns in place by multiplying the rows of
 * the inverse matrix (invrows) by the surviving columns listed in used[].
 * Multiplications are performed in log space: invlog caches the logs of the
 * inverse-matrix entries, and each source byte's log is looked up once per
 * byte and reused across all missing columns.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/* One scratch row of n log values per missing column. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute the logs of all inverse-matrix entries. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each surviving column into the missing columns. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/* log of zero is undefined; handled below. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Shorter columns lack this byte offset. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* Multiply via log addition mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first surviving column initializes the
				 * destination; subsequent ones XOR-accumulate.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1285 1285
/*
 * General-case reconstruction for any combination of up to
 * VDEV_RAIDZ_MAXPARITY failed columns: build the relevant rows of the
 * dispersal matrix, invert them, and multiply by the surviving columns
 * (see the block comment above for the underlying linear algebra).
 * Returns a bitmask of the parity columns used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	/* n is the number of data columns. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * One allocation holds the matrix rows, the inverse rows, and the
	 * used-column list, carved up below.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* used[] lists the chosen parity columns first... */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* ...followed by the intact data columns (i continues from above). */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1391 1391
/*
 * Top-level reconstruction dispatch.  Merge the caller's sorted target
 * list t[] with any columns that already have recorded errors, then use
 * the fastest routine able to handle the failure pattern: the optimized
 * P, Q, or P+Q special cases when their parity is intact, otherwise the
 * general matrix-based code.  Returns a bitmask of the parity columns
 * used for the reconstruction.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/* Assume everything is bad, then subtract the good columns. */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			/* Explicitly targeted by the caller. */
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			/* Already known to be in error. */
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/* tgts[] is sorted, so the bad data columns follow the bad parity. */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1470 1470
1471 1471 static int
1472 1472 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1473 1473 uint64_t *ashift)
1474 1474 {
1475 1475 vdev_t *cvd;
1476 1476 uint64_t nparity = vd->vdev_nparity;
1477 1477 int c;
1478 1478 int lasterror = 0;
1479 1479 int numerrors = 0;
1480 1480
1481 1481 ASSERT(nparity > 0);
1482 1482
1483 1483 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1484 1484 vd->vdev_children < nparity + 1) {
1485 1485 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1486 1486 return (SET_ERROR(EINVAL));
1487 1487 }
1488 1488
1489 1489 vdev_open_children(vd);
1490 1490
1491 1491 for (c = 0; c < vd->vdev_children; c++) {
1492 1492 cvd = vd->vdev_child[c];
1493 1493
1494 1494 if (cvd->vdev_open_error != 0) {
1495 1495 lasterror = cvd->vdev_open_error;
1496 1496 numerrors++;
1497 1497 continue;
1498 1498 }
1499 1499
1500 1500 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1501 1501 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1502 1502 *ashift = MAX(*ashift, cvd->vdev_ashift);
1503 1503 }
1504 1504
1505 1505 *asize *= vd->vdev_children;
1506 1506 *max_asize *= vd->vdev_children;
1507 1507
1508 1508 if (numerrors > nparity) {
1509 1509 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1510 1510 return (lasterror);
1511 1511 }
1512 1512
1513 1513 return (0);
1514 1514 }
1515 1515
1516 1516 static void
1517 1517 vdev_raidz_close(vdev_t *vd)
1518 1518 {
1519 1519 int c;
1520 1520
1521 1521 for (c = 0; c < vd->vdev_children; c++)
1522 1522 vdev_close(vd->vdev_child[c]);
1523 1523 }
1524 1524
1525 1525 static uint64_t
1526 1526 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1527 1527 {
1528 1528 uint64_t asize;
1529 1529 uint64_t ashift = vd->vdev_top->vdev_ashift;
1530 1530 uint64_t cols = vd->vdev_children;
1531 1531 uint64_t nparity = vd->vdev_nparity;
1532 1532
1533 1533 asize = ((psize - 1) >> ashift) + 1;
1534 1534 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1535 1535 asize = roundup(asize, nparity + 1) << ashift;
1536 1536
1537 1537 return (asize);
1538 1538 }
1539 1539
1540 1540 static void
1541 1541 vdev_raidz_child_done(zio_t *zio)
1542 1542 {
1543 1543 raidz_col_t *rc = zio->io_private;
1544 1544
1545 1545 rc->rc_error = zio->io_error;
1546 1546 rc->rc_tried = 1;
1547 1547 rc->rc_skipped = 0;
1548 1548 }
1549 1549
1550 1550 /*
1551 1551 * Start an IO operation on a RAIDZ VDev
1552 1552 *
1553 1553 * Outline:
1554 1554 * - For write operations:
1555 1555 * 1. Generate the parity data
1556 1556 * 2. Create child zio write operations to each column's vdev, for both
1557 1557 * data and parity.
1558 1558 * 3. If the column skips any sectors for padding, create optional dummy
1559 1559 * write zio children for those areas to improve aggregation continuity.
1560 1560 * - For read operations:
1561 1561 * 1. Create child zio read operations to each data column's vdev to read
1562 1562 * the range of data required for zio.
1563 1563 * 2. If this is a scrub or resilver operation, or if any of the data
1564 1564 * vdevs have had errors, then create zio read operations to the parity
1565 1565 * columns' VDevs as well.
1566 1566 */
static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/* Map the logical I/O range onto columns of the child vdevs. */
	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Issue one child write per column (parity and data). */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			/* Skip sectors wrap around to column 0. */
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			/* Device is gone; note the hole, don't issue I/O. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		/* Device may lack this txg's data (per its dirty-time log). */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Always read the data columns; read parity only when data
		 * is already missing or this is a scrub/resilver.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
1653 1653
1654 1654
1655 1655 /*
1656 1656 * Report a checksum error for a child of a RAID-Z device.
1657 1657 */
1658 1658 static void
1659 1659 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1660 1660 {
1661 1661 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1662 1662
1663 1663 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1664 1664 zio_bad_cksum_t zbc;
1665 1665 raidz_map_t *rm = zio->io_vsd;
1666 1666
1667 1667 mutex_enter(&vd->vdev_stat_lock);
1668 1668 vd->vdev_stat.vs_checksum_errors++;
1669 1669 mutex_exit(&vd->vdev_stat_lock);
1670 1670
1671 1671 zbc.zbc_has_cksum = 0;
1672 1672 zbc.zbc_injected = rm->rm_ecksuminjected;
1673 1673
1674 1674 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1675 1675 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1676 1676 &zbc);
1677 1677 }
1678 1678 }
1679 1679
1680 1680 /*
1681 1681 * We keep track of whether or not there were any injected errors, so that
1682 1682 * any ereports we generate can note it.
1683 1683 */
1684 1684 static int
1685 1685 raidz_checksum_verify(zio_t *zio)
1686 1686 {
1687 1687 zio_bad_cksum_t zbc;
1688 1688 raidz_map_t *rm = zio->io_vsd;
1689 1689
1690 1690 int ret = zio_checksum_error(zio, &zbc);
1691 1691 if (ret != 0 && zbc.zbc_injected != 0)
1692 1692 rm->rm_ecksuminjected = 1;
1693 1693
1694 1694 return (ret);
1695 1695 }
1696 1696
/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/* Save a copy of each successfully-read parity column. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	/* Regenerate parity in place from the data columns. */
	vdev_raidz_generate_parity(rm);

	/* Compare regenerated parity against what was actually read. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}
1734 1734
1735 1735 /*
1736 1736 * Keep statistics on all the ways that we used parity to correct data.
1737 1737 */
1738 1738 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1739 1739
1740 1740 static int
1741 1741 vdev_raidz_worst_error(raidz_map_t *rm)
1742 1742 {
1743 1743 int error = 0;
1744 1744
1745 1745 for (int c = 0; c < rm->rm_cols; c++)
1746 1746 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1747 1747
1748 1748 return (error);
1749 1749 }
1750 1750
/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * total_errors is the number of columns already known bad; data_errors is
 * how many of those are data columns.  Returns the parity-usage bitmask of
 * the successful reconstruction, or 0 if no combination verified.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	/* tgts has a sentinel slot on each side (tgts[-1] and tgts[n]). */
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/* Try assuming 1 extra bad column, then 2, up to remaining parity. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/* Enumerate every n-combination of error-free columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * Mark the reconstructed columns in error so
				 * they will be repaired; report any that had
				 * actually been read successfully.
				 */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			/* Advance to the next combination of targets. */
			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * The for loop exits with n one past the number of buffers we
	 * allocated; pull it back so the cleanup below frees exactly n.
	 */
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
1906 1906
1907 1907 /*
1908 1908 * Complete an IO operation on a RAIDZ VDev
1909 1909 *
1910 1910 * Outline:
1911 1911 * - For write operations:
1912 1912 * 1. Check for errors on the child IOs.
1913 1913 * 2. Return, setting an error code if too few child VDevs were written
1914 1914 * to reconstruct the data later. Note that partial writes are
1915 1915 * considered successful if they can be reconstructed at all.
1916 1916 * - For read operations:
1917 1917 * 1. Check for errors on the child IOs.
1918 1918 * 2. If data errors occurred:
1919 1919 * a. Try to reassemble the data from the parity available.
1920 1920 * b. If we haven't yet read the parity drives, read them now.
1921 1921 * c. If all parity drives have been read but the data still doesn't
1922 1922 * reassemble with a correct checksum, then try combinatorial
1923 1923 * reconstruction.
1924 1924 * d. If that doesn't work, return an error.
1925 1925 * 3. If there were unexpected errors or this is a resilver operation,
1926 1926 * rewrite the vdevs that had errors.
1927 1927 */
1928 1928 static void
1929 1929 vdev_raidz_io_done(zio_t *zio)
1930 1930 {
1931 1931 vdev_t *vd = zio->io_vd;
1932 1932 vdev_t *cvd;
1933 1933 raidz_map_t *rm = zio->io_vsd;
1934 1934 raidz_col_t *rc;
1935 1935 int unexpected_errors = 0;
1936 1936 int parity_errors = 0;
1937 1937 int parity_untried = 0;
1938 1938 int data_errors = 0;
1939 1939 int total_errors = 0;
1940 1940 int n, c;
1941 1941 int tgts[VDEV_RAIDZ_MAXPARITY];
1942 1942 int code;
1943 1943
1944 1944 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1945 1945
1946 1946 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1947 1947 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1948 1948
1949 1949 for (c = 0; c < rm->rm_cols; c++) {
1950 1950 rc = &rm->rm_col[c];
1951 1951
1952 1952 if (rc->rc_error) {
1953 1953 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1954 1954
1955 1955 if (c < rm->rm_firstdatacol)
1956 1956 parity_errors++;
1957 1957 else
1958 1958 data_errors++;
1959 1959
1960 1960 if (!rc->rc_skipped)
1961 1961 unexpected_errors++;
1962 1962
1963 1963 total_errors++;
1964 1964 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1965 1965 parity_untried++;
1966 1966 }
1967 1967 }
1968 1968
1969 1969 if (zio->io_type == ZIO_TYPE_WRITE) {
1970 1970 /*
1971 1971 * XXX -- for now, treat partial writes as a success.
1972 1972 * (If we couldn't write enough columns to reconstruct
1973 1973 * the data, the I/O failed. Otherwise, good enough.)
1974 1974 *
1975 1975 * Now that we support write reallocation, it would be better
1976 1976 * to treat partial failure as real failure unless there are
1977 1977 * no non-degraded top-level vdevs left, and not update DTLs
1978 1978 * if we intend to reallocate.
1979 1979 */
1980 1980 /* XXPOLICY */
1981 1981 if (total_errors > rm->rm_firstdatacol)
1982 1982 zio->io_error = vdev_raidz_worst_error(rm);
1983 1983
1984 1984 return;
1985 1985 }
1986 1986
1987 1987 ASSERT(zio->io_type == ZIO_TYPE_READ);
1988 1988 /*
1989 1989 * There are three potential phases for a read:
1990 1990 * 1. produce valid data from the columns read
1991 1991 * 2. read all disks and try again
1992 1992 * 3. perform combinatorial reconstruction
1993 1993 *
1994 1994 * Each phase is progressively both more expensive and less likely to
1995 1995 * occur. If we encounter more errors than we can repair or all phases
1996 1996 * fail, we have no choice but to return an error.
1997 1997 */
1998 1998
1999 1999 /*
2000 2000 * If the number of errors we saw was correctable -- less than or equal
2001 2001 * to the number of parity disks read -- attempt to produce data that
2002 2002 * has a valid checksum. Naturally, this case applies in the absence of
2003 2003 * any errors.
2004 2004 */
2005 2005 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2006 2006 if (data_errors == 0) {
2007 2007 if (raidz_checksum_verify(zio) == 0) {
2008 2008 /*
2009 2009 * If we read parity information (unnecessarily
2010 2010 * as it happens since no reconstruction was
2011 2011 * needed) regenerate and verify the parity.
2012 2012 * We also regenerate parity when resilvering
2013 2013 * so we can write it out to the failed device
2014 2014 * later.
2015 2015 */
2016 2016 if (parity_errors + parity_untried <
2017 2017 rm->rm_firstdatacol ||
2018 2018 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2019 2019 n = raidz_parity_verify(zio, rm);
2020 2020 unexpected_errors += n;
2021 2021 ASSERT(parity_errors + n <=
2022 2022 rm->rm_firstdatacol);
2023 2023 }
2024 2024 goto done;
2025 2025 }
2026 2026 } else {
2027 2027 /*
2028 2028 * We either attempt to read all the parity columns or
2029 2029 * none of them. If we didn't try to read parity, we
2030 2030 * wouldn't be here in the correctable case. There must
2031 2031 * also have been fewer parity errors than parity
2032 2032 * columns or, again, we wouldn't be in this code path.
2033 2033 */
2034 2034 ASSERT(parity_untried == 0);
2035 2035 ASSERT(parity_errors < rm->rm_firstdatacol);
2036 2036
2037 2037 /*
2038 2038 * Identify the data columns that reported an error.
2039 2039 */
2040 2040 n = 0;
2041 2041 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2042 2042 rc = &rm->rm_col[c];
2043 2043 if (rc->rc_error != 0) {
2044 2044 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2045 2045 tgts[n++] = c;
2046 2046 }
2047 2047 }
2048 2048
2049 2049 ASSERT(rm->rm_firstdatacol >= n);
2050 2050
2051 2051 code = vdev_raidz_reconstruct(rm, tgts, n);
2052 2052
2053 2053 if (raidz_checksum_verify(zio) == 0) {
2054 2054 atomic_inc_64(&raidz_corrected[code]);
2055 2055
2056 2056 /*
2057 2057 * If we read more parity disks than were used
2058 2058 * for reconstruction, confirm that the other
2059 2059 * parity disks produced correct data. This
2060 2060 * routine is suboptimal in that it regenerates
2061 2061 * the parity that we already used in addition
2062 2062 * to the parity that we're attempting to
2063 2063 * verify, but this should be a relatively
2064 2064 * uncommon case, and can be optimized if it
2065 2065 * becomes a problem. Note that we regenerate
2066 2066 * parity when resilvering so we can write it
2067 2067 * out to failed devices later.
2068 2068 */
2069 2069 if (parity_errors < rm->rm_firstdatacol - n ||
2070 2070 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2071 2071 n = raidz_parity_verify(zio, rm);
2072 2072 unexpected_errors += n;
2073 2073 ASSERT(parity_errors + n <=
2074 2074 rm->rm_firstdatacol);
2075 2075 }
2076 2076
2077 2077 goto done;
2078 2078 }
2079 2079 }
2080 2080 }
2081 2081
2082 2082 /*
2083 2083 * This isn't a typical situation -- either we got a read error or
2084 2084 * a child silently returned bad data. Read every block so we can
2085 2085 * try again with as much data and parity as we can track down. If
2086 2086 * we've already been through once before, all children will be marked
2087 2087 * as tried so we'll proceed to combinatorial reconstruction.
2088 2088 */
2089 2089 unexpected_errors = 1;
2090 2090 rm->rm_missingdata = 0;
2091 2091 rm->rm_missingparity = 0;
2092 2092
2093 2093 for (c = 0; c < rm->rm_cols; c++) {
2094 2094 if (rm->rm_col[c].rc_tried)
2095 2095 continue;
2096 2096
2097 2097 zio_vdev_io_redone(zio);
2098 2098 do {
2099 2099 rc = &rm->rm_col[c];
2100 2100 if (rc->rc_tried)
2101 2101 continue;
2102 2102 zio_nowait(zio_vdev_child_io(zio, NULL,
2103 2103 vd->vdev_child[rc->rc_devidx],
2104 2104 rc->rc_offset, rc->rc_data, rc->rc_size,
2105 2105 zio->io_type, zio->io_priority, 0,
2106 2106 vdev_raidz_child_done, rc));
2107 2107 } while (++c < rm->rm_cols);
2108 2108
2109 2109 return;
2110 2110 }
2111 2111
2112 2112 /*
2113 2113 * At this point we've attempted to reconstruct the data given the
2114 2114 * errors we detected, and we've attempted to read all columns. There
2115 2115 * must, therefore, be one or more additional problems -- silent errors
2116 2116 * resulting in invalid data rather than explicit I/O errors resulting
2117 2117 * in absent data. We check if there is enough additional data to
2118 2118 * possibly reconstruct the data and then perform combinatorial
2119 2119 * reconstruction over all possible combinations. If that fails,
2120 2120 * we're cooked.
2121 2121 */
2122 2122 if (total_errors > rm->rm_firstdatacol) {
2123 2123 zio->io_error = vdev_raidz_worst_error(rm);
2124 2124
2125 2125 } else if (total_errors < rm->rm_firstdatacol &&
2126 2126 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2127 2127 /*
2128 2128 * If we didn't use all the available parity for the
2129 2129 * combinatorial reconstruction, verify that the remaining
2130 2130 * parity is correct.
2131 2131 */
2132 2132 if (code != (1 << rm->rm_firstdatacol) - 1)
2133 2133 (void) raidz_parity_verify(zio, rm);
2134 2134 } else {
2135 2135 /*
2136 2136 * We're here because either:
2137 2137 *
2138 2138 * total_errors == rm_first_datacol, or
2139 2139 * vdev_raidz_combrec() failed
2140 2140 *
2141 2141 * In either case, there is enough bad data to prevent
2142 2142 * reconstruction.
2143 2143 *
2144 2144 * Start checksum ereports for all children which haven't
2145 2145 * failed, and the IO wasn't speculative.
2146 2146 */
2147 2147 zio->io_error = SET_ERROR(ECKSUM);
2148 2148
2149 2149 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2150 2150 for (c = 0; c < rm->rm_cols; c++) {
2151 2151 rc = &rm->rm_col[c];
2152 2152 if (rc->rc_error == 0) {
2153 2153 zio_bad_cksum_t zbc;
2154 2154 zbc.zbc_has_cksum = 0;
2155 2155 zbc.zbc_injected =
2156 2156 rm->rm_ecksuminjected;
2157 2157
2158 2158 zfs_ereport_start_checksum(
2159 2159 zio->io_spa,
2160 2160 vd->vdev_child[rc->rc_devidx],
2161 2161 zio, rc->rc_offset, rc->rc_size,
2162 2162 (void *)(uintptr_t)c, &zbc);
2163 2163 }
2164 2164 }
2165 2165 }
2166 2166 }
2167 2167
2168 2168 done:
2169 2169 zio_checksum_verified(zio);
2170 2170
2171 2171 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2172 2172 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2173 2173 /*
2174 2174 * Use the good data we have in hand to repair damaged children.
↓ open down ↓ |
2174 lines elided |
↑ open up ↑ |
2175 2175 */
2176 2176 for (c = 0; c < rm->rm_cols; c++) {
2177 2177 rc = &rm->rm_col[c];
2178 2178 cvd = vd->vdev_child[rc->rc_devidx];
2179 2179
2180 2180 if (rc->rc_error == 0)
2181 2181 continue;
2182 2182
2183 2183 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2184 2184 rc->rc_offset, rc->rc_data, rc->rc_size,
2185 - ZIO_TYPE_WRITE, zio->io_priority,
2185 + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2186 2186 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2187 2187 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2188 2188 }
2189 2189 }
2190 2190 }
2191 2191
2192 2192 static void
2193 2193 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2194 2194 {
2195 2195 if (faulted > vd->vdev_nparity)
2196 2196 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2197 2197 VDEV_AUX_NO_REPLICAS);
2198 2198 else if (degraded + faulted != 0)
2199 2199 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2200 2200 else
2201 2201 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2202 2202 }
2203 2203
/*
 * Virtual device operations vector for RAID-Z vdevs.  Entry order must
 * match the vdev_ops_t declaration (see vdev_impl.h).
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,		/* open */
	vdev_raidz_close,		/* close */
	vdev_raidz_asize,		/* allocated-size calculation */
	vdev_raidz_io_start,		/* I/O start */
	vdev_raidz_io_done,		/* I/O completion (this file, above) */
	vdev_raidz_state_change,	/* child state-change notification */
	NULL,				/* NOTE(review): presumably hold callback — confirm against vdev_ops_t */
	NULL,				/* NOTE(review): presumably rele callback — confirm against vdev_ops_t */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
↓ open down ↓ |
20 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX