Print this page
3006 VERIFY[S,U,P] and ASSERT[S,U,P] frequently check if first argument is zero
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ new/usr/src/uts/common/fs/zfs/vdev_raidz.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2012 by Delphix. All rights reserved.
25 25 */
26 26
27 27 #include <sys/zfs_context.h>
28 28 #include <sys/spa.h>
29 29 #include <sys/vdev_impl.h>
30 30 #include <sys/zio.h>
31 31 #include <sys/zio_checksum.h>
32 32 #include <sys/fs/zfs.h>
33 33 #include <sys/fm/fs/zfs.h>
34 34
35 35 /*
36 36 * Virtual device vector for RAID-Z.
37 37 *
38 38 * This vdev supports single, double, and triple parity. For single parity,
39 39 * we use a simple XOR of all the data columns. For double or triple parity,
40 40 * we use a special case of Reed-Solomon coding. This extends the
41 41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 44 * former is also based. The latter is designed to provide higher performance
45 45 * for writes.
46 46 *
47 47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 48 * amended six years later identifying a critical flaw that invalidates its
49 49 * claims. Nevertheless, the technique can be adapted to work for up to
50 50 * triple parity. For additional parity, the amendment "Note: Correction to
51 51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 52 * is viable, but the additional complexity means that write performance will
53 53 * suffer.
54 54 *
55 55 * All of the methods above operate on a Galois field, defined over the
 56 56 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
57 57 * can be expressed with a single byte. Briefly, the operations on the
58 58 * field are defined as follows:
59 59 *
60 60 * o addition (+) is represented by a bitwise XOR
61 61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 62 * o multiplication of A by 2 is defined by the following bitwise expression:
63 63 * (A * 2)_7 = A_6
64 64 * (A * 2)_6 = A_5
65 65 * (A * 2)_5 = A_4
66 66 * (A * 2)_4 = A_3 + A_7
67 67 * (A * 2)_3 = A_2 + A_7
68 68 * (A * 2)_2 = A_1 + A_7
69 69 * (A * 2)_1 = A_0
70 70 * (A * 2)_0 = A_7
71 71 *
72 72 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73 73 * As an aside, this multiplication is derived from the error correcting
74 74 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
75 75 *
76 76 * Observe that any number in the field (except for 0) can be expressed as a
77 77 * power of 2 -- a generator for the field. We store a table of the powers of
78 78 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79 79 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80 80 * than field addition). The inverse of a field element A (A^-1) is therefore
81 81 * A ^ (255 - 1) = A^254.
82 82 *
83 83 * The up-to-three parity columns, P, Q, R over several data columns,
84 84 * D_0, ... D_n-1, can be expressed by field operations:
85 85 *
86 86 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
87 87 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88 88 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89 89 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90 90 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
91 91 *
 92 92 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
93 93 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94 94 * independent coefficients. (There are no additional coefficients that have
95 95 * this property which is why the uncorrected Plank method breaks down.)
96 96 *
 97 97 * See the reconstruction code below for how P, Q and R can be used individually
98 98 * or in concert to recover missing data columns.
99 99 */
100 100
/*
 * Per-column state for one RAID-Z I/O: where the column lands on its
 * child vdev, the buffer it reads/writes, and its error/attempt status.
 */
101 101 typedef struct raidz_col {
102 102 	uint64_t rc_devidx;		/* child device index for I/O */
103 103 	uint64_t rc_offset;		/* device offset */
104 104 	uint64_t rc_size;		/* I/O size */
105 105 	void *rc_data;			/* I/O data */
106 106 	void *rc_gdata;			/* used to store the "good" version */
107 107 	int rc_error;			/* I/O error for this device */
108 108 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
109 109 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
110 110 } raidz_col_t;
111 111
/*
 * Describes how one logical zio maps onto the columns of a RAID-Z group.
 * Allocated by vdev_raidz_map_alloc() with rm_scols trailing rm_col[]
 * entries; freed by vdev_raidz_map_free() once no checksum reports
 * (rm_reports) reference it.
 */
112 112 typedef struct raidz_map {
113 113 	uint64_t rm_cols;		/* Regular column count */
114 114 	uint64_t rm_scols;		/* Count including skipped columns */
115 115 	uint64_t rm_bigcols;		/* Number of oversized columns */
116 116 	uint64_t rm_asize;		/* Actual total I/O size */
117 117 	uint64_t rm_missingdata;	/* Count of missing data devices */
118 118 	uint64_t rm_missingparity;	/* Count of missing parity devices */
119 119 	uint64_t rm_firstdatacol;	/* First data column/parity count */
120 120 	uint64_t rm_nskip;		/* Skipped sectors for padding */
121 121 	uint64_t rm_skipstart;	/* Column index of padding start */
122 122 	void *rm_datacopy;	/* rm_asize-buffer of copied data */
123 123 	uintptr_t rm_reports;	/* # of referencing checksum reports */
124 124 	uint8_t	rm_freed;	/* map no longer has referencing ZIO */
125 125 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
126 126 	raidz_col_t rm_col[1];	/* Flexible array of I/O columns */
127 127 } raidz_map_t;
128 128
/* Indices of the parity columns within rm_col[]. */
129 129 #define	VDEV_RAIDZ_P		0
130 130 #define	VDEV_RAIDZ_Q		1
131 131 #define	VDEV_RAIDZ_R		2
132 132
/*
 * Multiply a single byte by 2 (and by 4 = two doublings) in the Galois
 * field; reduction uses the 0x1d polynomial described at top of file.
 */
133 133 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
134 134 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
135 135
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * These are statement macros: the do { } while (0) wrapper makes each
 * expand to exactly one statement, so they compose safely with
 * unbraced if/else (the old bare-brace form did not).
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
	do { \
		/* High bit of each byte selects bytes that will overflow. */ \
		(mask) = (x) & 0x8080808080808080ULL; \
		/* Turn each selected high bit into a 0xff byte mask. */ \
		(mask) = ((mask) << 1) - ((mask) >> 7); \
		/* Double every byte, reducing overflowed bytes by 0x1d. */ \
		(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
		    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
	} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
	do { \
		VDEV_RAIDZ_64MUL_2((x), mask); \
		VDEV_RAIDZ_64MUL_2((x), mask); \
	} while (0)
155 155
156 156 /*
157 157  * Force reconstruction to use the general purpose method.
158 158  */
/*
 * Tunable: when nonzero, bypass the specialized P/Q/R reconstruction
 * routines. NOTE(review): consulted by the reconstruction dispatch
 * path, which is not visible in this chunk — confirm against the rest
 * of the file.
 */
159 159 int vdev_raidz_default_to_general;
160 160
161 161 /*
162 162 * These two tables represent powers and logs of 2 in the Galois field defined
163 163 * above. These values were computed by repeatedly multiplying by 2 as above.
164 164 */
/*
 * vdev_raidz_pow2[i] == 2^i in the field; note the cycle length is 255,
 * so the final entry (index 255) wraps back to 0x01 == vdev_raidz_pow2[0].
 */
165 165 static const uint8_t vdev_raidz_pow2[256] = {
166 166 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
167 167 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
168 168 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
169 169 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
170 170 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
171 171 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
172 172 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
173 173 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
174 174 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
175 175 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
176 176 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
177 177 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
178 178 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
179 179 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
180 180 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
181 181 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
182 182 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
183 183 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
184 184 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
185 185 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
186 186 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
187 187 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
188 188 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
189 189 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
190 190 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
191 191 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
192 192 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
193 193 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
194 194 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
195 195 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
196 196 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
197 197 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
198 198 };
/*
 * vdev_raidz_log2[i] == log base 2 of i in the field. Entry 0 is a
 * placeholder (log of 0 is undefined); callers such as
 * vdev_raidz_exp2() special-case a == 0 before consulting this table.
 */
199 199 static const uint8_t vdev_raidz_log2[256] = {
200 200 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
201 201 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
202 202 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
203 203 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
204 204 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
205 205 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
206 206 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
207 207 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
208 208 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
209 209 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
210 210 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
211 211 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
212 212 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
213 213 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
214 214 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
215 215 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
216 216 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
217 217 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
218 218 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
219 219 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
220 220 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
221 221 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
222 222 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
223 223 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
224 224 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
225 225 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
226 226 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
227 227 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
228 228 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
229 229 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
230 230 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
231 231 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
232 232 };
233 233
234 234 static void vdev_raidz_generate_parity(raidz_map_t *rm);
235 235
236 236 /*
237 237 * Multiply a given number by 2 raised to the given power.
238 238 */
239 239 static uint8_t
240 240 vdev_raidz_exp2(uint_t a, int exp)
241 241 {
242 242 if (a == 0)
243 243 return (0);
244 244
245 245 ASSERT(exp >= 0);
246 246 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
247 247
248 248 exp += vdev_raidz_log2[a];
249 249 if (exp > 255)
250 250 exp -= 255;
251 251
252 252 return (vdev_raidz_pow2[exp]);
253 253 }
254 254
/*
 * Release everything a raidz map owns: the parity-column buffers (and
 * any "good" parity copies built by vdev_raidz_cksum_finish()), the
 * contiguous data copy made by vdev_raidz_cksum_report(), and finally
 * the map structure itself.
 */
255 255 static void
256 256 vdev_raidz_map_free(raidz_map_t *rm)
257 257 {
258 258 	int c;
259 259 	size_t size;
260 260
	/* Columns below rm_firstdatacol are parity and were zio_buf_alloc'd. */
261 261 	for (c = 0; c < rm->rm_firstdatacol; c++) {
262 262 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
263 263
264 264 		if (rm->rm_col[c].rc_gdata != NULL)
265 265 			zio_buf_free(rm->rm_col[c].rc_gdata,
266 266 			    rm->rm_col[c].rc_size);
267 267 	}
268 268
	/*
	 * rm_datacopy, if present, spans all data columns; recompute that
	 * total to free it with the size it was allocated with.
	 */
269 269 	size = 0;
270 270 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
271 271 		size += rm->rm_col[c].rc_size;
272 272
273 273 	if (rm->rm_datacopy != NULL)
 ↓ open down ↓ |
273 lines elided |
 ↑ open up ↑ |
274 274 		zio_buf_free(rm->rm_datacopy, size);
275 275
	/* The map was allocated with rm_scols trailing rm_col[] entries. */
276 276 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
277 277 }
278 278
/*
 * VSD (vdev-specific data) free callback, invoked when the owning zio
 * completes. Mark the map as no longer referenced by a zio; free it
 * now only if no outstanding checksum reports still reference it
 * (otherwise vdev_raidz_cksum_free() frees it when the last report
 * drops).
 */
279 279 static void
280 280 vdev_raidz_map_free_vsd(zio_t *zio)
281 281 {
282 282 	raidz_map_t *rm = zio->io_vsd;
283 283
284     -	ASSERT3U(rm->rm_freed, ==, 0);
    284 +	ASSERT0(rm->rm_freed);
285 285 	rm->rm_freed = 1;
286 286
287 287 	if (rm->rm_reports == 0)
288 288 		vdev_raidz_map_free(rm);
289 289 }
290 290
/*
 * Checksum-report free callback: drop one report reference on the map.
 * If this was the last reference and the owning zio has already
 * finished (rm_freed set by vdev_raidz_map_free_vsd()), free the map.
 */
291 291 /*ARGSUSED*/
292 292 static void
293 293 vdev_raidz_cksum_free(void *arg, size_t ignored)
294 294 {
295 295 	raidz_map_t *rm = arg;
296 296
297 297 	ASSERT3U(rm->rm_reports, >, 0);
298 298
299 299 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
300 300 		vdev_raidz_map_free(rm);
301 301 }
302 302
/*
 * Finish a checksum ereport for column c: determine the "good" version
 * of that column from good_data (the known-correct logical data) and
 * hand both good and bad buffers to zfs_ereport_finish_checksum().
 * For a parity column, the good parity is generated on first call and
 * cached in rc_gdata for subsequent reports on this map.
 */
303 303 static void
304 304 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
305 305 {
306 306 	raidz_map_t *rm = zcr->zcr_cbdata;
307 307 	size_t c = zcr->zcr_cbinfo;
308 308 	size_t x;
309 309
310 310 	const char *good = NULL;
311 311 	const char *bad = rm->rm_col[c].rc_data;
312 312
	/* No good data available: file the report without a comparison. */
313 313 	if (good_data == NULL) {
314 314 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
315 315 		return;
316 316 	}
317 317
318 318 	if (c < rm->rm_firstdatacol) {
319 319 		/*
320 320 		 * The first time through, calculate the parity blocks for
321 321 		 * the good data (this relies on the fact that the good
322 322 		 * data never changes for a given logical ZIO)
323 323 		 */
324 324 		if (rm->rm_col[0].rc_gdata == NULL) {
325 325 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
326 326 			char *buf;
327 327
328 328 			/*
329 329 			 * Set up the rm_col[]s to generate the parity for
330 330 			 * good_data, first saving the parity bufs and
331 331 			 * replacing them with buffers to hold the result.
332 332 			 */
333 333 			for (x = 0; x < rm->rm_firstdatacol; x++) {
334 334 				bad_parity[x] = rm->rm_col[x].rc_data;
335 335 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
336 336 				    zio_buf_alloc(rm->rm_col[x].rc_size);
337 337 			}
338 338
339 339 			/* fill in the data columns from good_data */
340 340 			buf = (char *)good_data;
341 341 			for (; x < rm->rm_cols; x++) {
342 342 				rm->rm_col[x].rc_data = buf;
343 343 				buf += rm->rm_col[x].rc_size;
344 344 			}
345 345
346 346 			/*
347 347 			 * Construct the parity from the good data.
348 348 			 */
349 349 			vdev_raidz_generate_parity(rm);
350 350
351 351 			/* restore everything back to its original state */
352 352 			for (x = 0; x < rm->rm_firstdatacol; x++)
353 353 				rm->rm_col[x].rc_data = bad_parity[x];
354 354
			/*
			 * Data columns point back into rm_datacopy, the copy
			 * made by vdev_raidz_cksum_report().
			 */
355 355 			buf = rm->rm_datacopy;
356 356 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
357 357 				rm->rm_col[x].rc_data = buf;
358 358 				buf += rm->rm_col[x].rc_size;
359 359 			}
360 360 		}
361 361
362 362 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
363 363 		good = rm->rm_col[c].rc_gdata;
364 364 	} else {
365 365 		/* adjust good_data to point at the start of our column */
366 366 		good = good_data;
367 367
368 368 		for (x = rm->rm_firstdatacol; x < c; x++)
369 369 			good += rm->rm_col[x].rc_size;
370 370 	}
371 371
372 372 	/* we drop the ereport if it ends up that the data was good */
373 373 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
374 374 }
375 375
376 376 /*
377 377  * Invoked indirectly by zfs_ereport_start_checksum(), called
378 378  * below when our read operation fails completely.  The main point
379 379  * is to keep a copy of everything we read from disk, so that at
380 380  * vdev_raidz_cksum_finish() time we can compare it with the good data.
381 381  */
382 382 static void
383 383 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
384 384 {
	/* arg encodes the column index this report is for. */
385 385 	size_t c = (size_t)(uintptr_t)arg;
386 386 	caddr_t buf;
387 387
388 388 	raidz_map_t *rm = zio->io_vsd;
389 389 	size_t size;
390 390
391 391 	/* set up the report and bump the refcount */
392 392 	zcr->zcr_cbdata = rm;
393 393 	zcr->zcr_cbinfo = c;
394 394 	zcr->zcr_finish = vdev_raidz_cksum_finish;
395 395 	zcr->zcr_free = vdev_raidz_cksum_free;
396 396
397 397 	rm->rm_reports++;
398 398 	ASSERT3U(rm->rm_reports, >, 0);
399 399
	/* A previous report already copied the data aside; nothing more to do. */
400 400 	if (rm->rm_datacopy != NULL)
401 401 		return;
402 402
403 403 	/*
404 404 	 * It's the first time we're called for this raidz_map_t, so we need
405 405 	 * to copy the data aside; there's no guarantee that our zio's buffer
406 406 	 * won't be re-used for something else.
407 407 	 *
408 408 	 * Our parity data is already in separate buffers, so there's no need
409 409 	 * to copy them.
410 410 	 */
411 411
412 412 	size = 0;
413 413 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
414 414 		size += rm->rm_col[c].rc_size;
415 415
416 416 	buf = rm->rm_datacopy = zio_buf_alloc(size);
417 417
	/* Copy each data column and repoint it into the stable copy. */
418 418 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
419 419 		raidz_col_t *col = &rm->rm_col[c];
420 420
421 421 		bcopy(col->rc_data, buf, col->rc_size);
422 422 		col->rc_data = buf;
423 423
424 424 		buf += col->rc_size;
425 425 	}
426 426 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
427 427 }
428 428
/* VSD callbacks installed on raidz zios by vdev_raidz_map_alloc(). */
429 429 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
430 430 	vdev_raidz_map_free_vsd,
431 431 	vdev_raidz_cksum_report
432 432 };
433 433
/*
 * Map a logical zio onto the physical columns of a RAID-Z group:
 * compute each column's child device index, device offset and size,
 * allocate the parity buffers, and carve the zio's data buffer into
 * the data columns. Installs the map as the zio's VSD.
 *
 * unit_shift: log2 of the device sector size; dcols: number of child
 * vdevs; nparity: number of parity columns (1-3).
 */
434 434 static raidz_map_t *
435 435 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
436 436     uint64_t nparity)
437 437 {
438 438 	raidz_map_t *rm;
	/* b: starting sector; s: I/O size in sectors. */
439 439 	uint64_t b = zio->io_offset >> unit_shift;
440 440 	uint64_t s = zio->io_size >> unit_shift;
	/* f: first column touched; o: base offset on each device. */
441 441 	uint64_t f = b % dcols;
442 442 	uint64_t o = (b / dcols) << unit_shift;
443 443 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
444 444
	/*
	 * q: full rows per data column; r: leftover sectors; bc: count of
	 * "big" columns that get one extra sector; tot: total sectors
	 * including parity.
	 */
445 445 	q = s / (dcols - nparity);
446 446 	r = s - q * (dcols - nparity);
447 447 	bc = (r == 0 ? 0 : r + nparity);
448 448 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
449 449
450 450 	if (q == 0) {
451 451 		acols = bc;
452 452 		scols = MIN(dcols, roundup(bc, nparity + 1));
453 453 	} else {
454 454 		acols = dcols;
455 455 		scols = dcols;
456 456 	}
457 457
458 458 	ASSERT3U(acols, <=, scols);
459 459
460 460 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
461 461
462 462 	rm->rm_cols = acols;
463 463 	rm->rm_scols = scols;
464 464 	rm->rm_bigcols = bc;
465 465 	rm->rm_skipstart = bc;
466 466 	rm->rm_missingdata = 0;
467 467 	rm->rm_missingparity = 0;
468 468 	rm->rm_firstdatacol = nparity;
469 469 	rm->rm_datacopy = NULL;
470 470 	rm->rm_reports = 0;
471 471 	rm->rm_freed = 0;
472 472 	rm->rm_ecksuminjected = 0;
473 473
474 474 	asize = 0;
475 475
	/* Lay out every column (including zero-size skipped ones). */
476 476 	for (c = 0; c < scols; c++) {
477 477 		col = f + c;
478 478 		coff = o;
		/* Wrap around the device list; wrapped columns sit one row lower. */
479 479 		if (col >= dcols) {
480 480 			col -= dcols;
481 481 			coff += 1ULL << unit_shift;
482 482 		}
483 483 		rm->rm_col[c].rc_devidx = col;
484 484 		rm->rm_col[c].rc_offset = coff;
485 485 		rm->rm_col[c].rc_data = NULL;
486 486 		rm->rm_col[c].rc_gdata = NULL;
487 487 		rm->rm_col[c].rc_error = 0;
488 488 		rm->rm_col[c].rc_tried = 0;
489 489 		rm->rm_col[c].rc_skipped = 0;
490 490
491 491 		if (c >= acols)
492 492 			rm->rm_col[c].rc_size = 0;
493 493 		else if (c < bc)
494 494 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
495 495 		else
496 496 			rm->rm_col[c].rc_size = q << unit_shift;
497 497
498 498 		asize += rm->rm_col[c].rc_size;
499 499 	}
500 500
501 501 	ASSERT3U(asize, ==, tot << unit_shift);
502 502 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
503 503 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
504 504 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
505 505 	ASSERT3U(rm->rm_nskip, <=, nparity);
506 506
	/* Parity columns get their own buffers; data columns share zio->io_data. */
507 507 	for (c = 0; c < rm->rm_firstdatacol; c++)
508 508 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
509 509
510 510 	rm->rm_col[c].rc_data = zio->io_data;
511 511
512 512 	for (c = c + 1; c < acols; c++)
513 513 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
514 514 		    rm->rm_col[c - 1].rc_size;
515 515
516 516 	/*
517 517 	 * If all data stored spans all columns, there's a danger that parity
518 518 	 * will always be on the same device and, since parity isn't read
519 519 	 * during normal operation, that that device's I/O bandwidth won't be
520 520 	 * used effectively. We therefore switch the parity every 1MB.
521 521 	 *
522 522 	 * ... at least that was, ostensibly, the theory. As a practical
523 523 	 * matter unless we juggle the parity between all devices evenly, we
524 524 	 * won't see any benefit. Further, occasional writes that aren't a
525 525 	 * multiple of the LCM of the number of children and the minimum
526 526 	 * stripe width are sufficient to avoid pessimal behavior.
527 527 	 * Unfortunately, this decision created an implicit on-disk format
528 528 	 * requirement that we need to support for all eternity, but only
529 529 	 * for single-parity RAID-Z.
530 530 	 *
531 531 	 * If we intend to skip a sector in the zeroth column for padding
532 532 	 * we must make sure to note this swap. We will never intend to
533 533 	 * skip the first column since at least one data and one parity
534 534 	 * column must appear in each row.
535 535 	 */
536 536 	ASSERT(rm->rm_cols >= 2);
537 537 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
538 538
539 539 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
540 540 		devidx = rm->rm_col[0].rc_devidx;
541 541 		o = rm->rm_col[0].rc_offset;
542 542 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
543 543 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
544 544 		rm->rm_col[1].rc_devidx = devidx;
545 545 		rm->rm_col[1].rc_offset = o;
546 546
547 547 		if (rm->rm_skipstart == 0)
548 548 			rm->rm_skipstart = 1;
549 549 	}
550 550
551 551 	zio->io_vsd = rm;
552 552 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
553 553 	return (rm);
554 554 }
555 555
/*
 * Generate single (P) parity: the XOR of all data columns. The first
 * data column is copied into P; every subsequent column is XORed in.
 */
556 556 static void
557 557 vdev_raidz_generate_parity_p(raidz_map_t *rm)
558 558 {
559 559 	uint64_t *p, *src, pcount, ccount, i;
560 560 	int c;
561 561
562 562 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
563 563
564 564 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
565 565 		src = rm->rm_col[c].rc_data;
566 566 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
567 567 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
568 568
569 569 		if (c == rm->rm_firstdatacol) {
570 570 			ASSERT(ccount == pcount);
571 571 			for (i = 0; i < ccount; i++, src++, p++) {
572 572 				*p = *src;
573 573 			}
574 574 		} else {
575 575 			ASSERT(ccount <= pcount);
576 576 			for (i = 0; i < ccount; i++, src++, p++) {
577 577 				*p ^= *src;
578 578 			}
579 579 		}
580 580 	}
581 581 }
582 582
/*
 * Generate double (P + Q) parity. P is the plain XOR of the data
 * columns; Q is a Reed-Solomon code built by repeatedly multiplying
 * the running Q by 2 in GF(2^8) before XORing in each column (Horner's
 * scheme per the block comment at the top of the file).
 */
583 583 static void
584 584 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
585 585 {
586 586 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
587 587 	int c;
588 588
589 589 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
590 590 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
591 591 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
592 592
593 593 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
594 594 		src = rm->rm_col[c].rc_data;
595 595 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
596 596 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
597 597
598 598 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
599 599
		/* First column: initialize P and Q (zero-fill any short tail). */
600 600 		if (c == rm->rm_firstdatacol) {
601 601 			ASSERT(ccnt == pcnt || ccnt == 0);
602 602 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
603 603 				*p = *src;
604 604 				*q = *src;
605 605 			}
606 606 			for (; i < pcnt; i++, src++, p++, q++) {
607 607 				*p = 0;
608 608 				*q = 0;
609 609 			}
610 610 		} else {
611 611 			ASSERT(ccnt <= pcnt);
612 612
613 613 			/*
614 614 			 * Apply the algorithm described above by multiplying
615 615 			 * the previous result and adding in the new value.
616 616 			 */
617 617 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
618 618 				*p ^= *src;
619 619
620 620 				VDEV_RAIDZ_64MUL_2(*q, mask);
621 621 				*q ^= *src;
622 622 			}
623 623
624 624 			/*
625 625 			 * Treat short columns as though they are full of 0s.
626 626 			 * Note that there's therefore nothing needed for P.
627 627 			 */
628 628 			for (; i < pcnt; i++, q++) {
629 629 				VDEV_RAIDZ_64MUL_2(*q, mask);
630 630 			}
631 631 		}
632 632 	}
633 633 }
634 634
/*
 * Generate triple (P + Q + R) parity. Identical structure to the PQ
 * case, with a third code R that uses generator 4 (two doublings per
 * column) instead of 2.
 */
635 635 static void
636 636 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
637 637 {
638 638 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
639 639 	int c;
640 640
641 641 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
642 642 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
643 643 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
644 644 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
645 645 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
646 646
647 647 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
648 648 		src = rm->rm_col[c].rc_data;
649 649 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
650 650 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
651 651 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
652 652
653 653 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
654 654
		/* First column: initialize P, Q, R (zero-fill any short tail). */
655 655 		if (c == rm->rm_firstdatacol) {
656 656 			ASSERT(ccnt == pcnt || ccnt == 0);
657 657 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
658 658 				*p = *src;
659 659 				*q = *src;
660 660 				*r = *src;
661 661 			}
662 662 			for (; i < pcnt; i++, src++, p++, q++, r++) {
663 663 				*p = 0;
664 664 				*q = 0;
665 665 				*r = 0;
666 666 			}
667 667 		} else {
668 668 			ASSERT(ccnt <= pcnt);
669 669
670 670 			/*
671 671 			 * Apply the algorithm described above by multiplying
672 672 			 * the previous result and adding in the new value.
673 673 			 */
674 674 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
675 675 				*p ^= *src;
676 676
677 677 				VDEV_RAIDZ_64MUL_2(*q, mask);
678 678 				*q ^= *src;
679 679
680 680 				VDEV_RAIDZ_64MUL_4(*r, mask);
681 681 				*r ^= *src;
682 682 			}
683 683
684 684 			/*
685 685 			 * Treat short columns as though they are full of 0s.
686 686 			 * Note that there's therefore nothing needed for P.
687 687 			 */
688 688 			for (; i < pcnt; i++, q++, r++) {
689 689 				VDEV_RAIDZ_64MUL_2(*q, mask);
690 690 				VDEV_RAIDZ_64MUL_4(*r, mask);
691 691 			}
692 692 		}
693 693 	}
694 694 }
695 695
696 696 /*
697 697 * Generate RAID parity in the first virtual columns according to the number of
698 698 * parity columns available.
699 699 */
700 700 static void
701 701 vdev_raidz_generate_parity(raidz_map_t *rm)
702 702 {
703 703 switch (rm->rm_firstdatacol) {
704 704 case 1:
705 705 vdev_raidz_generate_parity_p(rm);
706 706 break;
707 707 case 2:
708 708 vdev_raidz_generate_parity_pq(rm);
709 709 break;
710 710 case 3:
711 711 vdev_raidz_generate_parity_pqr(rm);
712 712 break;
713 713 default:
714 714 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
715 715 }
716 716 }
717 717
/*
 * Reconstruct a single missing data column (tgts[0]) from P parity:
 * start from a copy of P and XOR in every surviving data column.
 * Returns a bitmask of the parity columns used (P only).
 */
718 718 static int
719 719 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
720 720 {
721 721 	uint64_t *dst, *src, xcount, ccount, count, i;
722 722 	int x = tgts[0];
723 723 	int c;
724 724
725 725 	ASSERT(ntgts == 1);
726 726 	ASSERT(x >= rm->rm_firstdatacol);
727 727 	ASSERT(x < rm->rm_cols);
728 728
729 729 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
730 730 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
731 731 	ASSERT(xcount > 0);
732 732
	/* Seed the target column with the P parity. */
733 733 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
734 734 	dst = rm->rm_col[x].rc_data;
735 735 	for (i = 0; i < xcount; i++, dst++, src++) {
736 736 		*dst = *src;
737 737 	}
738 738
	/* XOR in all other data columns, bounded by the shorter of the two. */
739 739 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
740 740 		src = rm->rm_col[c].rc_data;
741 741 		dst = rm->rm_col[x].rc_data;
742 742
743 743 		if (c == x)
744 744 			continue;
745 745
746 746 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
747 747 		count = MIN(ccount, xcount);
748 748
749 749 		for (i = 0; i < count; i++, dst++, src++) {
750 750 			*dst ^= *src;
751 751 		}
752 752 	}
753 753
754 754 	return (1 << VDEV_RAIDZ_P);
755 755 }
756 756
/*
 * Reconstruct a single missing data column (tgts[0]) from Q parity:
 * regenerate the Q contribution of the surviving columns (treating the
 * missing column as zero), XOR with the stored Q, then divide out the
 * missing column's coefficient by multiplying by its inverse,
 * byte-by-byte via the log/pow tables. Returns the parity mask used.
 */
757 757 static int
758 758 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
759 759 {
760 760 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
761 761 	uint8_t *b;
762 762 	int x = tgts[0];
763 763 	int c, j, exp;
764 764
765 765 	ASSERT(ntgts == 1);
766 766
767 767 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
768 768 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
769 769
	/* Horner evaluation over the data columns, with column x zeroed. */
770 770 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
771 771 		src = rm->rm_col[c].rc_data;
772 772 		dst = rm->rm_col[x].rc_data;
773 773
774 774 		if (c == x)
775 775 			ccount = 0;
776 776 		else
777 777 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
778 778
779 779 		count = MIN(ccount, xcount);
780 780
781 781 		if (c == rm->rm_firstdatacol) {
782 782 			for (i = 0; i < count; i++, dst++, src++) {
783 783 				*dst = *src;
784 784 			}
785 785 			for (; i < xcount; i++, dst++) {
786 786 				*dst = 0;
787 787 			}
788 788
789 789 		} else {
790 790 			for (i = 0; i < count; i++, dst++, src++) {
791 791 				VDEV_RAIDZ_64MUL_2(*dst, mask);
792 792 				*dst ^= *src;
793 793 			}
794 794
795 795 			for (; i < xcount; i++, dst++) {
796 796 				VDEV_RAIDZ_64MUL_2(*dst, mask);
797 797 			}
798 798 		}
799 799 	}
800 800
	/*
	 * dst now holds Q', the Q of the surviving data. Q ^ Q' leaves
	 * 2^(ndevs - 1 - x) * D_x; multiply by the inverse (exponent
	 * 255 - log of the coefficient) to recover D_x, one byte at a time.
	 */
801 801 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
802 802 	dst = rm->rm_col[x].rc_data;
803 803 	exp = 255 - (rm->rm_cols - 1 - x);
804 804
805 805 	for (i = 0; i < xcount; i++, dst++, src++) {
806 806 		*dst ^= *src;
807 807 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
808 808 			*b = vdev_raidz_exp2(*b, exp);
809 809 		}
810 810 	}
811 811
812 812 	return (1 << VDEV_RAIDZ_Q);
813 813 }
814 814
/*
 * Reconstruct two missing data columns (tgts[0] < tgts[1]) using both
 * P and Q. Computes Pxy/Qxy (parity with both target columns zeroed)
 * via the normal parity generator, then solves the 2x2 linear system
 * described in the inline comment below. Returns the parity mask used.
 */
815 815 static int
816 816 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
817 817 {
818 818 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
819 819 	void *pdata, *qdata;
820 820 	uint64_t xsize, ysize, i;
821 821 	int x = tgts[0];
822 822 	int y = tgts[1];
823 823
824 824 	ASSERT(ntgts == 2);
825 825 	ASSERT(x < y);
826 826 	ASSERT(x >= rm->rm_firstdatacol);
827 827 	ASSERT(y < rm->rm_cols);
828 828
829 829 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
830 830
831 831 	/*
832 832 	 * Move the parity data aside -- we're going to compute parity as
833 833 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
834 834 	 * reuse the parity generation mechanism without trashing the actual
835 835 	 * parity so we make those columns appear to be full of zeros by
836 836 	 * setting their lengths to zero.
837 837 	 */
838 838 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
839 839 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
840 840 	xsize = rm->rm_col[x].rc_size;
841 841 	ysize = rm->rm_col[y].rc_size;
842 842
843 843 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
844 844 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
845 845 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
846 846 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
847 847 	rm->rm_col[x].rc_size = 0;
848 848 	rm->rm_col[y].rc_size = 0;
849 849
850 850 	vdev_raidz_generate_parity_pq(rm);
851 851
	/* Restore the real column sizes before solving. */
852 852 	rm->rm_col[x].rc_size = xsize;
853 853 	rm->rm_col[y].rc_size = ysize;
854 854
855 855 	p = pdata;
856 856 	q = qdata;
857 857 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
858 858 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
859 859 	xd = rm->rm_col[x].rc_data;
860 860 	yd = rm->rm_col[y].rc_data;
861 861
862 862 	/*
863 863 	 * We now have:
864 864 	 *	Pxy = P + D_x + D_y
865 865 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
866 866 	 *
867 867 	 * We can then solve for D_x:
868 868 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
869 869 	 * where
870 870 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
871 871 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
872 872 	 *
873 873 	 * With D_x in hand, we can easily solve for D_y:
874 874 	 *	D_y = P + Pxy + D_x
875 875 	 */
876 876
	/* A and B as field elements; tmp is the exponent of (a ^ 1)^-1. */
877 877 	a = vdev_raidz_pow2[255 + x - y];
878 878 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
879 879 	tmp = 255 - vdev_raidz_log2[a ^ 1];
880 880
881 881 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
882 882 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
883 883
884 884 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
885 885 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
886 886 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
887 887
888 888 		if (i < ysize)
889 889 			*yd = *p ^ *pxy ^ *xd;
890 890 	}
891 891
892 892 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
893 893 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
894 894 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
895 895 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
896 896
897 897 	/*
898 898 	 * Restore the saved parity data.
899 899 	 */
900 900 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
901 901 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
902 902
903 903 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
904 904 }
905 905
906 906 /* BEGIN CSTYLED */
907 907 /*
908 908 * In the general case of reconstruction, we must solve the system of linear
909 909  * equations defined by the coefficients used to generate parity as well as
910 910 * the contents of the data and parity disks. This can be expressed with
911 911 * vectors for the original data (D) and the actual data (d) and parity (p)
912 912 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
913 913 *
914 914 * __ __ __ __
915 915 * | | __ __ | p_0 |
916 916 * | V | | D_0 | | p_m-1 |
917 917 * | | x | : | = | d_0 |
918 918 * | I | | D_n-1 | | : |
919 919 * | | ~~ ~~ | d_n-1 |
920 920 * ~~ ~~ ~~ ~~
921 921 *
922 922 * I is simply a square identity matrix of size n, and V is a vandermonde
923 923  * matrix defined by the coefficients we chose for the various parity columns
924 924 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
925 925 * computation as well as linear separability.
926 926 *
927 927 * __ __ __ __
928 928 * | 1 .. 1 1 1 | | p_0 |
929 929 * | 2^n-1 .. 4 2 1 | __ __ | : |
930 930 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
931 931 * | 1 .. 0 0 0 | | D_1 | | d_0 |
932 932 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
933 933 * | : : : : | | : | | d_2 |
934 934 * | 0 .. 1 0 0 | | D_n-1 | | : |
935 935 * | 0 .. 0 1 0 | ~~ ~~ | : |
936 936 * | 0 .. 0 0 1 | | d_n-1 |
937 937 * ~~ ~~ ~~ ~~
938 938 *
939 939 * Note that I, V, d, and p are known. To compute D, we must invert the
940 940 * matrix and use the known data and parity values to reconstruct the unknown
941 941 * data values. We begin by removing the rows in V|I and d|p that correspond
942 942 * to failed or missing columns; we then make V|I square (n x n) and d|p
943 943 * sized n by removing rows corresponding to unused parity from the bottom up
944 944 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
945 945 * using Gauss-Jordan elimination. In the example below we use m=3 parity
946 946 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
947 947 * __ __
948 948 * | 1 1 1 1 1 1 1 1 |
949 949 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
950 950 * | 19 205 116 29 64 16 4 1 | / /
951 951 * | 1 0 0 0 0 0 0 0 | / /
952 952 * | 0 1 0 0 0 0 0 0 | <--' /
953 953 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
954 954 * | 0 0 0 1 0 0 0 0 |
955 955 * | 0 0 0 0 1 0 0 0 |
956 956 * | 0 0 0 0 0 1 0 0 |
957 957 * | 0 0 0 0 0 0 1 0 |
958 958 * | 0 0 0 0 0 0 0 1 |
959 959 * ~~ ~~
960 960 * __ __
961 961 * | 1 1 1 1 1 1 1 1 |
962 962 * | 128 64 32 16 8 4 2 1 |
963 963 * | 19 205 116 29 64 16 4 1 |
964 964 * | 1 0 0 0 0 0 0 0 |
965 965 * | 0 1 0 0 0 0 0 0 |
966 966 * (V|I)' = | 0 0 1 0 0 0 0 0 |
967 967 * | 0 0 0 1 0 0 0 0 |
968 968 * | 0 0 0 0 1 0 0 0 |
969 969 * | 0 0 0 0 0 1 0 0 |
970 970 * | 0 0 0 0 0 0 1 0 |
971 971 * | 0 0 0 0 0 0 0 1 |
972 972 * ~~ ~~
973 973 *
974 974 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
975 975 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
976 976 * matrix is not singular.
977 977 * __ __
978 978 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
979 979 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
980 980 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
981 981 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
982 982 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
983 983 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
984 984 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
985 985 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
986 986 * ~~ ~~
987 987 * __ __
988 988 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
989 989 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
990 990 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
991 991 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
992 992 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
993 993 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
994 994 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
995 995 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
996 996 * ~~ ~~
997 997 * __ __
998 998 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
999 999 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1000 1000 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1001 1001 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1002 1002 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1003 1003 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1004 1004 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1005 1005 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1006 1006 * ~~ ~~
1007 1007 * __ __
1008 1008 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1009 1009 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1010 1010 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1011 1011 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1012 1012 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1013 1013 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1014 1014 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1015 1015 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1016 1016 * ~~ ~~
1017 1017 * __ __
1018 1018 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1019 1019 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1020 1020 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1021 1021 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1022 1022 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1023 1023 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1024 1024 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1025 1025 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1026 1026 * ~~ ~~
1027 1027 * __ __
1028 1028 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1029 1029 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1030 1030 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1031 1031 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1032 1032 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1033 1033 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1034 1034 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1035 1035 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1036 1036 * ~~ ~~
1037 1037 * __ __
1038 1038 * | 0 0 1 0 0 0 0 0 |
1039 1039 * | 167 100 5 41 159 169 217 208 |
1040 1040 * | 166 100 4 40 158 168 216 209 |
1041 1041 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1042 1042 * | 0 0 0 0 1 0 0 0 |
1043 1043 * | 0 0 0 0 0 1 0 0 |
1044 1044 * | 0 0 0 0 0 0 1 0 |
1045 1045 * | 0 0 0 0 0 0 0 1 |
1046 1046 * ~~ ~~
1047 1047 *
1048 1048 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1049 1049 * of the missing data.
1050 1050 *
1051 1051 * As is apparent from the example above, the only non-trivial rows in the
1052 1052 * inverse matrix correspond to the data disks that we're trying to
1053 1053 * reconstruct. Indeed, those are the only rows we need as the others would
1054 1054 * only be useful for reconstructing data known or assumed to be valid. For
1055 1055 * that reason, we only build the coefficients in the rows that correspond to
1056 1056 * targeted columns.
1057 1057 */
1058 1058 /* END CSTYLED */
1059 1059
1060 1060 static void
1061 1061 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1062 1062 uint8_t **rows)
1063 1063 {
1064 1064 int i, j;
1065 1065 int pow;
1066 1066
1067 1067 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1068 1068
1069 1069 /*
1070 1070 * Fill in the missing rows of interest.
1071 1071 */
1072 1072 for (i = 0; i < nmap; i++) {
1073 1073 ASSERT3S(0, <=, map[i]);
1074 1074 ASSERT3S(map[i], <=, 2);
1075 1075
1076 1076 pow = map[i] * n;
1077 1077 if (pow > 255)
1078 1078 pow -= 255;
1079 1079 ASSERT(pow <= 255);
1080 1080
1081 1081 for (j = 0; j < n; j++) {
1082 1082 pow -= map[i];
1083 1083 if (pow < 0)
1084 1084 pow += 255;
1085 1085 rows[i][j] = vdev_raidz_pow2[pow];
1086 1086 }
1087 1087 }
1088 1088 }
1089 1089
1090 1090 static void
1091 1091 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1092 1092 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1093 1093 {
1094 1094 int i, j, ii, jj;
1095 1095 uint8_t log;
1096 1096
1097 1097 /*
1098 1098 * Assert that the first nmissing entries from the array of used
1099 1099 * columns correspond to parity columns and that subsequent entries
1100 1100 * correspond to data columns.
1101 1101 */
1102 1102 for (i = 0; i < nmissing; i++) {
1103 1103 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1104 1104 }
1105 1105 for (; i < n; i++) {
1106 1106 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1107 1107 }
1108 1108
1109 1109 /*
1110 1110 * First initialize the storage where we'll compute the inverse rows.
1111 1111 */
1112 1112 for (i = 0; i < nmissing; i++) {
1113 1113 for (j = 0; j < n; j++) {
1114 1114 invrows[i][j] = (i == j) ? 1 : 0;
1115 1115 }
1116 1116 }
1117 1117
1118 1118 /*
1119 1119 * Subtract all trivial rows from the rows of consequence.
1120 1120 */
1121 1121 for (i = 0; i < nmissing; i++) {
1122 1122 for (j = nmissing; j < n; j++) {
1123 1123 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1124 1124 jj = used[j] - rm->rm_firstdatacol;
1125 1125 ASSERT3S(jj, <, n);
1126 1126 invrows[i][j] = rows[i][jj];
↓ open down ↓ |
832 lines elided |
↑ open up ↑ |
1127 1127 rows[i][jj] = 0;
1128 1128 }
1129 1129 }
1130 1130
1131 1131 /*
1132 1132 * For each of the rows of interest, we must normalize it and subtract
1133 1133 * a multiple of it from the other rows.
1134 1134 */
1135 1135 for (i = 0; i < nmissing; i++) {
1136 1136 for (j = 0; j < missing[i]; j++) {
1137 - ASSERT3U(rows[i][j], ==, 0);
1137 + ASSERT0(rows[i][j]);
1138 1138 }
1139 1139 ASSERT3U(rows[i][missing[i]], !=, 0);
1140 1140
1141 1141 /*
1142 1142 * Compute the inverse of the first element and multiply each
1143 1143 * element in the row by that value.
1144 1144 */
1145 1145 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1146 1146
1147 1147 for (j = 0; j < n; j++) {
1148 1148 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1149 1149 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1150 1150 }
1151 1151
1152 1152 for (ii = 0; ii < nmissing; ii++) {
1153 1153 if (i == ii)
1154 1154 continue;
1155 1155
1156 1156 ASSERT3U(rows[ii][missing[i]], !=, 0);
1157 1157
1158 1158 log = vdev_raidz_log2[rows[ii][missing[i]]];
1159 1159
1160 1160 for (j = 0; j < n; j++) {
1161 1161 rows[ii][j] ^=
1162 1162 vdev_raidz_exp2(rows[i][j], log);
1163 1163 invrows[ii][j] ^=
1164 1164 vdev_raidz_exp2(invrows[i][j], log);
1165 1165 }
1166 1166 }
1167 1167 }
↓ open down ↓ |
20 lines elided |
↑ open up ↑ |
1168 1168
1169 1169 /*
1170 1170 * Verify that the data that is left in the rows are properly part of
1171 1171 * an identity matrix.
1172 1172 */
1173 1173 for (i = 0; i < nmissing; i++) {
1174 1174 for (j = 0; j < n; j++) {
1175 1175 if (j == missing[i]) {
1176 1176 ASSERT3U(rows[i][j], ==, 1);
1177 1177 } else {
1178 - ASSERT3U(rows[i][j], ==, 0);
1178 + ASSERT0(rows[i][j]);
1179 1179 }
1180 1180 }
1181 1181 }
1182 1182 }
1183 1183
1184 1184 static void
1185 1185 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1186 1186 int *missing, uint8_t **invrows, const uint8_t *used)
1187 1187 {
1188 1188 int i, j, x, cc, c;
1189 1189 uint8_t *src;
1190 1190 uint64_t ccount;
1191 1191 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1192 1192 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1193 1193 uint8_t log, val;
1194 1194 int ll;
1195 1195 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1196 1196 uint8_t *p, *pp;
1197 1197 size_t psize;
1198 1198
1199 1199 psize = sizeof (invlog[0][0]) * n * nmissing;
1200 1200 p = kmem_alloc(psize, KM_SLEEP);
1201 1201
1202 1202 for (pp = p, i = 0; i < nmissing; i++) {
1203 1203 invlog[i] = pp;
1204 1204 pp += n;
1205 1205 }
1206 1206
1207 1207 for (i = 0; i < nmissing; i++) {
1208 1208 for (j = 0; j < n; j++) {
1209 1209 ASSERT3U(invrows[i][j], !=, 0);
1210 1210 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1211 1211 }
1212 1212 }
1213 1213
1214 1214 for (i = 0; i < n; i++) {
1215 1215 c = used[i];
1216 1216 ASSERT3U(c, <, rm->rm_cols);
1217 1217
1218 1218 src = rm->rm_col[c].rc_data;
1219 1219 ccount = rm->rm_col[c].rc_size;
1220 1220 for (j = 0; j < nmissing; j++) {
1221 1221 cc = missing[j] + rm->rm_firstdatacol;
1222 1222 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1223 1223 ASSERT3U(cc, <, rm->rm_cols);
1224 1224 ASSERT3U(cc, !=, c);
1225 1225
1226 1226 dst[j] = rm->rm_col[cc].rc_data;
1227 1227 dcount[j] = rm->rm_col[cc].rc_size;
1228 1228 }
1229 1229
1230 1230 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1231 1231
1232 1232 for (x = 0; x < ccount; x++, src++) {
1233 1233 if (*src != 0)
1234 1234 log = vdev_raidz_log2[*src];
1235 1235
1236 1236 for (cc = 0; cc < nmissing; cc++) {
1237 1237 if (x >= dcount[cc])
1238 1238 continue;
1239 1239
1240 1240 if (*src == 0) {
1241 1241 val = 0;
1242 1242 } else {
1243 1243 if ((ll = log + invlog[cc][i]) >= 255)
1244 1244 ll -= 255;
1245 1245 val = vdev_raidz_pow2[ll];
1246 1246 }
1247 1247
1248 1248 if (i == 0)
1249 1249 dst[cc][x] = val;
1250 1250 else
1251 1251 dst[cc][x] ^= val;
1252 1252 }
1253 1253 }
1254 1254 }
1255 1255
1256 1256 kmem_free(p, psize);
1257 1257 }
1258 1258
1259 1259 static int
1260 1260 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1261 1261 {
1262 1262 int n, i, c, t, tt;
1263 1263 int nmissing_rows;
1264 1264 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1265 1265 int parity_map[VDEV_RAIDZ_MAXPARITY];
1266 1266
1267 1267 uint8_t *p, *pp;
1268 1268 size_t psize;
1269 1269
1270 1270 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1271 1271 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1272 1272 uint8_t *used;
1273 1273
1274 1274 int code = 0;
1275 1275
1276 1276
1277 1277 n = rm->rm_cols - rm->rm_firstdatacol;
1278 1278
1279 1279 /*
1280 1280 * Figure out which data columns are missing.
1281 1281 */
1282 1282 nmissing_rows = 0;
1283 1283 for (t = 0; t < ntgts; t++) {
1284 1284 if (tgts[t] >= rm->rm_firstdatacol) {
1285 1285 missing_rows[nmissing_rows++] =
1286 1286 tgts[t] - rm->rm_firstdatacol;
1287 1287 }
1288 1288 }
1289 1289
1290 1290 /*
1291 1291 * Figure out which parity columns to use to help generate the missing
1292 1292 * data columns.
1293 1293 */
1294 1294 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1295 1295 ASSERT(tt < ntgts);
1296 1296 ASSERT(c < rm->rm_firstdatacol);
1297 1297
1298 1298 /*
1299 1299 * Skip any targeted parity columns.
1300 1300 */
1301 1301 if (c == tgts[tt]) {
1302 1302 tt++;
1303 1303 continue;
1304 1304 }
1305 1305
1306 1306 code |= 1 << c;
1307 1307
1308 1308 parity_map[i] = c;
1309 1309 i++;
1310 1310 }
1311 1311
1312 1312 ASSERT(code != 0);
1313 1313 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1314 1314
1315 1315 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1316 1316 nmissing_rows * n + sizeof (used[0]) * n;
1317 1317 p = kmem_alloc(psize, KM_SLEEP);
1318 1318
1319 1319 for (pp = p, i = 0; i < nmissing_rows; i++) {
1320 1320 rows[i] = pp;
1321 1321 pp += n;
1322 1322 invrows[i] = pp;
1323 1323 pp += n;
1324 1324 }
1325 1325 used = pp;
1326 1326
1327 1327 for (i = 0; i < nmissing_rows; i++) {
1328 1328 used[i] = parity_map[i];
1329 1329 }
1330 1330
1331 1331 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1332 1332 if (tt < nmissing_rows &&
1333 1333 c == missing_rows[tt] + rm->rm_firstdatacol) {
1334 1334 tt++;
1335 1335 continue;
1336 1336 }
1337 1337
1338 1338 ASSERT3S(i, <, n);
1339 1339 used[i] = c;
1340 1340 i++;
1341 1341 }
1342 1342
1343 1343 /*
1344 1344 * Initialize the interesting rows of the matrix.
1345 1345 */
1346 1346 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1347 1347
1348 1348 /*
1349 1349 * Invert the matrix.
1350 1350 */
1351 1351 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1352 1352 invrows, used);
1353 1353
1354 1354 /*
1355 1355 * Reconstruct the missing data using the generated matrix.
1356 1356 */
1357 1357 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1358 1358 invrows, used);
1359 1359
1360 1360 kmem_free(p, psize);
1361 1361
1362 1362 return (code);
1363 1363 }
1364 1364
1365 1365 static int
1366 1366 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1367 1367 {
1368 1368 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1369 1369 int ntgts;
1370 1370 int i, c;
1371 1371 int code;
1372 1372 int nbadparity, nbaddata;
1373 1373 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1374 1374
1375 1375 /*
1376 1376 * The tgts list must already be sorted.
1377 1377 */
1378 1378 for (i = 1; i < nt; i++) {
1379 1379 ASSERT(t[i] > t[i - 1]);
1380 1380 }
1381 1381
1382 1382 nbadparity = rm->rm_firstdatacol;
1383 1383 nbaddata = rm->rm_cols - nbadparity;
1384 1384 ntgts = 0;
1385 1385 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1386 1386 if (c < rm->rm_firstdatacol)
1387 1387 parity_valid[c] = B_FALSE;
1388 1388
1389 1389 if (i < nt && c == t[i]) {
1390 1390 tgts[ntgts++] = c;
1391 1391 i++;
1392 1392 } else if (rm->rm_col[c].rc_error != 0) {
1393 1393 tgts[ntgts++] = c;
1394 1394 } else if (c >= rm->rm_firstdatacol) {
1395 1395 nbaddata--;
1396 1396 } else {
1397 1397 parity_valid[c] = B_TRUE;
1398 1398 nbadparity--;
1399 1399 }
1400 1400 }
1401 1401
1402 1402 ASSERT(ntgts >= nt);
1403 1403 ASSERT(nbaddata >= 0);
1404 1404 ASSERT(nbaddata + nbadparity == ntgts);
1405 1405
1406 1406 dt = &tgts[nbadparity];
1407 1407
1408 1408 /*
1409 1409 * See if we can use any of our optimized reconstruction routines.
1410 1410 */
1411 1411 if (!vdev_raidz_default_to_general) {
1412 1412 switch (nbaddata) {
1413 1413 case 1:
1414 1414 if (parity_valid[VDEV_RAIDZ_P])
1415 1415 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1416 1416
1417 1417 ASSERT(rm->rm_firstdatacol > 1);
1418 1418
1419 1419 if (parity_valid[VDEV_RAIDZ_Q])
1420 1420 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1421 1421
1422 1422 ASSERT(rm->rm_firstdatacol > 2);
1423 1423 break;
1424 1424
1425 1425 case 2:
1426 1426 ASSERT(rm->rm_firstdatacol > 1);
1427 1427
1428 1428 if (parity_valid[VDEV_RAIDZ_P] &&
1429 1429 parity_valid[VDEV_RAIDZ_Q])
1430 1430 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1431 1431
1432 1432 ASSERT(rm->rm_firstdatacol > 2);
1433 1433
1434 1434 break;
1435 1435 }
1436 1436 }
1437 1437
1438 1438 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1439 1439 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1440 1440 ASSERT(code > 0);
1441 1441 return (code);
1442 1442 }
1443 1443
1444 1444 static int
1445 1445 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1446 1446 uint64_t *ashift)
1447 1447 {
1448 1448 vdev_t *cvd;
1449 1449 uint64_t nparity = vd->vdev_nparity;
1450 1450 int c;
1451 1451 int lasterror = 0;
1452 1452 int numerrors = 0;
1453 1453
1454 1454 ASSERT(nparity > 0);
1455 1455
1456 1456 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1457 1457 vd->vdev_children < nparity + 1) {
1458 1458 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1459 1459 return (EINVAL);
1460 1460 }
1461 1461
1462 1462 vdev_open_children(vd);
1463 1463
1464 1464 for (c = 0; c < vd->vdev_children; c++) {
1465 1465 cvd = vd->vdev_child[c];
1466 1466
1467 1467 if (cvd->vdev_open_error != 0) {
1468 1468 lasterror = cvd->vdev_open_error;
1469 1469 numerrors++;
1470 1470 continue;
1471 1471 }
1472 1472
1473 1473 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1474 1474 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1475 1475 *ashift = MAX(*ashift, cvd->vdev_ashift);
1476 1476 }
1477 1477
1478 1478 *asize *= vd->vdev_children;
1479 1479 *max_asize *= vd->vdev_children;
1480 1480
1481 1481 if (numerrors > nparity) {
1482 1482 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1483 1483 return (lasterror);
1484 1484 }
1485 1485
1486 1486 return (0);
1487 1487 }
1488 1488
1489 1489 static void
1490 1490 vdev_raidz_close(vdev_t *vd)
1491 1491 {
1492 1492 int c;
1493 1493
1494 1494 for (c = 0; c < vd->vdev_children; c++)
1495 1495 vdev_close(vd->vdev_child[c]);
1496 1496 }
1497 1497
1498 1498 static uint64_t
1499 1499 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1500 1500 {
1501 1501 uint64_t asize;
1502 1502 uint64_t ashift = vd->vdev_top->vdev_ashift;
1503 1503 uint64_t cols = vd->vdev_children;
1504 1504 uint64_t nparity = vd->vdev_nparity;
1505 1505
1506 1506 asize = ((psize - 1) >> ashift) + 1;
1507 1507 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1508 1508 asize = roundup(asize, nparity + 1) << ashift;
1509 1509
1510 1510 return (asize);
1511 1511 }
1512 1512
1513 1513 static void
1514 1514 vdev_raidz_child_done(zio_t *zio)
1515 1515 {
1516 1516 raidz_col_t *rc = zio->io_private;
1517 1517
1518 1518 rc->rc_error = zio->io_error;
1519 1519 rc->rc_tried = 1;
1520 1520 rc->rc_skipped = 0;
1521 1521 }
1522 1522
1523 1523 static int
1524 1524 vdev_raidz_io_start(zio_t *zio)
1525 1525 {
1526 1526 vdev_t *vd = zio->io_vd;
1527 1527 vdev_t *tvd = vd->vdev_top;
1528 1528 vdev_t *cvd;
1529 1529 raidz_map_t *rm;
1530 1530 raidz_col_t *rc;
1531 1531 int c, i;
1532 1532
1533 1533 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1534 1534 vd->vdev_nparity);
1535 1535
1536 1536 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1537 1537
1538 1538 if (zio->io_type == ZIO_TYPE_WRITE) {
1539 1539 vdev_raidz_generate_parity(rm);
1540 1540
1541 1541 for (c = 0; c < rm->rm_cols; c++) {
1542 1542 rc = &rm->rm_col[c];
1543 1543 cvd = vd->vdev_child[rc->rc_devidx];
1544 1544 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1545 1545 rc->rc_offset, rc->rc_data, rc->rc_size,
1546 1546 zio->io_type, zio->io_priority, 0,
1547 1547 vdev_raidz_child_done, rc));
1548 1548 }
1549 1549
1550 1550 /*
1551 1551 * Generate optional I/Os for any skipped sectors to improve
1552 1552 * aggregation contiguity.
1553 1553 */
1554 1554 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1555 1555 ASSERT(c <= rm->rm_scols);
1556 1556 if (c == rm->rm_scols)
1557 1557 c = 0;
1558 1558 rc = &rm->rm_col[c];
1559 1559 cvd = vd->vdev_child[rc->rc_devidx];
1560 1560 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1561 1561 rc->rc_offset + rc->rc_size, NULL,
1562 1562 1 << tvd->vdev_ashift,
1563 1563 zio->io_type, zio->io_priority,
1564 1564 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1565 1565 }
1566 1566
1567 1567 return (ZIO_PIPELINE_CONTINUE);
1568 1568 }
1569 1569
1570 1570 ASSERT(zio->io_type == ZIO_TYPE_READ);
1571 1571
1572 1572 /*
1573 1573 * Iterate over the columns in reverse order so that we hit the parity
1574 1574 * last -- any errors along the way will force us to read the parity.
1575 1575 */
1576 1576 for (c = rm->rm_cols - 1; c >= 0; c--) {
1577 1577 rc = &rm->rm_col[c];
1578 1578 cvd = vd->vdev_child[rc->rc_devidx];
1579 1579 if (!vdev_readable(cvd)) {
1580 1580 if (c >= rm->rm_firstdatacol)
1581 1581 rm->rm_missingdata++;
1582 1582 else
1583 1583 rm->rm_missingparity++;
1584 1584 rc->rc_error = ENXIO;
1585 1585 rc->rc_tried = 1; /* don't even try */
1586 1586 rc->rc_skipped = 1;
1587 1587 continue;
1588 1588 }
1589 1589 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1590 1590 if (c >= rm->rm_firstdatacol)
1591 1591 rm->rm_missingdata++;
1592 1592 else
1593 1593 rm->rm_missingparity++;
1594 1594 rc->rc_error = ESTALE;
1595 1595 rc->rc_skipped = 1;
1596 1596 continue;
1597 1597 }
1598 1598 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1599 1599 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1600 1600 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1601 1601 rc->rc_offset, rc->rc_data, rc->rc_size,
1602 1602 zio->io_type, zio->io_priority, 0,
1603 1603 vdev_raidz_child_done, rc));
1604 1604 }
1605 1605 }
1606 1606
1607 1607 return (ZIO_PIPELINE_CONTINUE);
1608 1608 }
1609 1609
1610 1610
1611 1611 /*
1612 1612 * Report a checksum error for a child of a RAID-Z device.
1613 1613 */
1614 1614 static void
1615 1615 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1616 1616 {
1617 1617 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1618 1618
1619 1619 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1620 1620 zio_bad_cksum_t zbc;
1621 1621 raidz_map_t *rm = zio->io_vsd;
1622 1622
1623 1623 mutex_enter(&vd->vdev_stat_lock);
1624 1624 vd->vdev_stat.vs_checksum_errors++;
1625 1625 mutex_exit(&vd->vdev_stat_lock);
1626 1626
1627 1627 zbc.zbc_has_cksum = 0;
1628 1628 zbc.zbc_injected = rm->rm_ecksuminjected;
1629 1629
1630 1630 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1631 1631 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1632 1632 &zbc);
1633 1633 }
1634 1634 }
1635 1635
1636 1636 /*
1637 1637 * We keep track of whether or not there were any injected errors, so that
1638 1638 * any ereports we generate can note it.
1639 1639 */
1640 1640 static int
1641 1641 raidz_checksum_verify(zio_t *zio)
1642 1642 {
1643 1643 zio_bad_cksum_t zbc;
1644 1644 raidz_map_t *rm = zio->io_vsd;
1645 1645
1646 1646 int ret = zio_checksum_error(zio, &zbc);
1647 1647 if (ret != 0 && zbc.zbc_injected != 0)
1648 1648 rm->rm_ecksuminjected = 1;
1649 1649
1650 1650 return (ret);
1651 1651 }
1652 1652
1653 1653 /*
1654 1654 * Generate the parity from the data columns. If we tried and were able to
1655 1655 * read the parity without error, verify that the generated parity matches the
1656 1656 * data we read. If it doesn't, we fire off a checksum error. Return the
1657 1657 * number such failures.
1658 1658 */
1659 1659 static int
1660 1660 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1661 1661 {
1662 1662 void *orig[VDEV_RAIDZ_MAXPARITY];
1663 1663 int c, ret = 0;
1664 1664 raidz_col_t *rc;
1665 1665
1666 1666 for (c = 0; c < rm->rm_firstdatacol; c++) {
1667 1667 rc = &rm->rm_col[c];
1668 1668 if (!rc->rc_tried || rc->rc_error != 0)
1669 1669 continue;
1670 1670 orig[c] = zio_buf_alloc(rc->rc_size);
1671 1671 bcopy(rc->rc_data, orig[c], rc->rc_size);
1672 1672 }
1673 1673
1674 1674 vdev_raidz_generate_parity(rm);
1675 1675
1676 1676 for (c = 0; c < rm->rm_firstdatacol; c++) {
1677 1677 rc = &rm->rm_col[c];
1678 1678 if (!rc->rc_tried || rc->rc_error != 0)
1679 1679 continue;
1680 1680 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1681 1681 raidz_checksum_error(zio, rc, orig[c]);
1682 1682 rc->rc_error = ECKSUM;
1683 1683 ret++;
1684 1684 }
1685 1685 zio_buf_free(orig[c], rc->rc_size);
1686 1686 }
1687 1687
1688 1688 return (ret);
1689 1689 }
1690 1690
1691 1691 /*
1692 1692 * Keep statistics on all the ways that we used parity to correct data.
1693 1693 */
1694 1694 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1695 1695
1696 1696 static int
1697 1697 vdev_raidz_worst_error(raidz_map_t *rm)
1698 1698 {
1699 1699 int error = 0;
1700 1700
1701 1701 for (int c = 0; c < rm->rm_cols; c++)
1702 1702 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1703 1703
1704 1704 return (error);
1705 1705 }
1706 1706
1707 1707 /*
1708 1708 * Iterate over all combinations of bad data and attempt a reconstruction.
1709 1709 * Note that the algorithm below is non-optimal because it doesn't take into
1710 1710 * account how reconstruction is actually performed. For example, with
1711 1711 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1712 1712 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1713 1713 * cases we'd only use parity information in column 0.
1714 1714 */
1715 1715 static int
1716 1716 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1717 1717 {
1718 1718 raidz_map_t *rm = zio->io_vsd;
1719 1719 raidz_col_t *rc;
1720 1720 void *orig[VDEV_RAIDZ_MAXPARITY];
1721 1721 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1722 1722 int *tgts = &tstore[1];
1723 1723 int current, next, i, c, n;
1724 1724 int code, ret = 0;
1725 1725
1726 1726 ASSERT(total_errors < rm->rm_firstdatacol);
1727 1727
1728 1728 /*
1729 1729 * This simplifies one edge condition.
1730 1730 */
1731 1731 tgts[-1] = -1;
1732 1732
1733 1733 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1734 1734 /*
1735 1735 * Initialize the targets array by finding the first n columns
1736 1736 * that contain no error.
1737 1737 *
1738 1738 * If there were no data errors, we need to ensure that we're
1739 1739 * always explicitly attempting to reconstruct at least one
1740 1740 * data column. To do this, we simply push the highest target
1741 1741 * up into the data columns.
1742 1742 */
1743 1743 for (c = 0, i = 0; i < n; i++) {
1744 1744 if (i == n - 1 && data_errors == 0 &&
1745 1745 c < rm->rm_firstdatacol) {
1746 1746 c = rm->rm_firstdatacol;
1747 1747 }
1748 1748
1749 1749 while (rm->rm_col[c].rc_error != 0) {
1750 1750 c++;
1751 1751 ASSERT3S(c, <, rm->rm_cols);
1752 1752 }
1753 1753
1754 1754 tgts[i] = c++;
1755 1755 }
1756 1756
1757 1757 /*
1758 1758 * Setting tgts[n] simplifies the other edge condition.
1759 1759 */
1760 1760 tgts[n] = rm->rm_cols;
1761 1761
1762 1762 /*
1763 1763 * These buffers were allocated in previous iterations.
1764 1764 */
1765 1765 for (i = 0; i < n - 1; i++) {
1766 1766 ASSERT(orig[i] != NULL);
1767 1767 }
1768 1768
1769 1769 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1770 1770
1771 1771 current = 0;
1772 1772 next = tgts[current];
1773 1773
1774 1774 while (current != n) {
1775 1775 tgts[current] = next;
1776 1776 current = 0;
1777 1777
1778 1778 /*
1779 1779 * Save off the original data that we're going to
1780 1780 * attempt to reconstruct.
1781 1781 */
1782 1782 for (i = 0; i < n; i++) {
1783 1783 ASSERT(orig[i] != NULL);
1784 1784 c = tgts[i];
1785 1785 ASSERT3S(c, >=, 0);
1786 1786 ASSERT3S(c, <, rm->rm_cols);
1787 1787 rc = &rm->rm_col[c];
1788 1788 bcopy(rc->rc_data, orig[i], rc->rc_size);
1789 1789 }
1790 1790
1791 1791 /*
1792 1792 * Attempt a reconstruction and exit the outer loop on
1793 1793 * success.
1794 1794 */
1795 1795 code = vdev_raidz_reconstruct(rm, tgts, n);
1796 1796 if (raidz_checksum_verify(zio) == 0) {
1797 1797 atomic_inc_64(&raidz_corrected[code]);
1798 1798
1799 1799 for (i = 0; i < n; i++) {
1800 1800 c = tgts[i];
1801 1801 rc = &rm->rm_col[c];
1802 1802 ASSERT(rc->rc_error == 0);
1803 1803 if (rc->rc_tried)
1804 1804 raidz_checksum_error(zio, rc,
1805 1805 orig[i]);
1806 1806 rc->rc_error = ECKSUM;
1807 1807 }
1808 1808
1809 1809 ret = code;
1810 1810 goto done;
1811 1811 }
1812 1812
1813 1813 /*
1814 1814 * Restore the original data.
1815 1815 */
1816 1816 for (i = 0; i < n; i++) {
1817 1817 c = tgts[i];
1818 1818 rc = &rm->rm_col[c];
1819 1819 bcopy(orig[i], rc->rc_data, rc->rc_size);
1820 1820 }
1821 1821
1822 1822 do {
1823 1823 /*
1824 1824 * Find the next valid column after the current
1825 1825 * position..
1826 1826 */
1827 1827 for (next = tgts[current] + 1;
1828 1828 next < rm->rm_cols &&
1829 1829 rm->rm_col[next].rc_error != 0; next++)
1830 1830 continue;
1831 1831
1832 1832 ASSERT(next <= tgts[current + 1]);
1833 1833
1834 1834 /*
1835 1835 * If that spot is available, we're done here.
1836 1836 */
1837 1837 if (next != tgts[current + 1])
1838 1838 break;
1839 1839
1840 1840 /*
1841 1841 * Otherwise, find the next valid column after
1842 1842 * the previous position.
1843 1843 */
1844 1844 for (c = tgts[current - 1] + 1;
1845 1845 rm->rm_col[c].rc_error != 0; c++)
1846 1846 continue;
1847 1847
1848 1848 tgts[current] = c;
1849 1849 current++;
1850 1850
1851 1851 } while (current != n);
1852 1852 }
1853 1853 }
1854 1854 n--;
1855 1855 done:
1856 1856 for (i = 0; i < n; i++) {
1857 1857 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1858 1858 }
1859 1859
1860 1860 return (ret);
1861 1861 }
1862 1862
1863 1863 static void
1864 1864 vdev_raidz_io_done(zio_t *zio)
1865 1865 {
1866 1866 vdev_t *vd = zio->io_vd;
1867 1867 vdev_t *cvd;
1868 1868 raidz_map_t *rm = zio->io_vsd;
1869 1869 raidz_col_t *rc;
1870 1870 int unexpected_errors = 0;
1871 1871 int parity_errors = 0;
1872 1872 int parity_untried = 0;
1873 1873 int data_errors = 0;
1874 1874 int total_errors = 0;
1875 1875 int n, c;
1876 1876 int tgts[VDEV_RAIDZ_MAXPARITY];
1877 1877 int code;
1878 1878
1879 1879 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1880 1880
1881 1881 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1882 1882 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1883 1883
1884 1884 for (c = 0; c < rm->rm_cols; c++) {
1885 1885 rc = &rm->rm_col[c];
1886 1886
1887 1887 if (rc->rc_error) {
1888 1888 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1889 1889
1890 1890 if (c < rm->rm_firstdatacol)
1891 1891 parity_errors++;
1892 1892 else
1893 1893 data_errors++;
1894 1894
1895 1895 if (!rc->rc_skipped)
1896 1896 unexpected_errors++;
1897 1897
1898 1898 total_errors++;
1899 1899 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1900 1900 parity_untried++;
1901 1901 }
1902 1902 }
1903 1903
1904 1904 if (zio->io_type == ZIO_TYPE_WRITE) {
1905 1905 /*
1906 1906 * XXX -- for now, treat partial writes as a success.
1907 1907 * (If we couldn't write enough columns to reconstruct
1908 1908 * the data, the I/O failed. Otherwise, good enough.)
1909 1909 *
1910 1910 * Now that we support write reallocation, it would be better
1911 1911 * to treat partial failure as real failure unless there are
1912 1912 * no non-degraded top-level vdevs left, and not update DTLs
1913 1913 * if we intend to reallocate.
1914 1914 */
1915 1915 /* XXPOLICY */
1916 1916 if (total_errors > rm->rm_firstdatacol)
1917 1917 zio->io_error = vdev_raidz_worst_error(rm);
1918 1918
1919 1919 return;
1920 1920 }
1921 1921
1922 1922 ASSERT(zio->io_type == ZIO_TYPE_READ);
1923 1923 /*
1924 1924 * There are three potential phases for a read:
1925 1925 * 1. produce valid data from the columns read
1926 1926 * 2. read all disks and try again
1927 1927 * 3. perform combinatorial reconstruction
1928 1928 *
1929 1929 * Each phase is progressively both more expensive and less likely to
1930 1930 * occur. If we encounter more errors than we can repair or all phases
1931 1931 * fail, we have no choice but to return an error.
1932 1932 */
1933 1933
1934 1934 /*
1935 1935 * If the number of errors we saw was correctable -- less than or equal
1936 1936 * to the number of parity disks read -- attempt to produce data that
1937 1937 * has a valid checksum. Naturally, this case applies in the absence of
1938 1938 * any errors.
1939 1939 */
1940 1940 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1941 1941 if (data_errors == 0) {
1942 1942 if (raidz_checksum_verify(zio) == 0) {
1943 1943 /*
1944 1944 * If we read parity information (unnecessarily
1945 1945 * as it happens since no reconstruction was
1946 1946 * needed) regenerate and verify the parity.
1947 1947 * We also regenerate parity when resilvering
1948 1948 * so we can write it out to the failed device
1949 1949 * later.
1950 1950 */
1951 1951 if (parity_errors + parity_untried <
1952 1952 rm->rm_firstdatacol ||
1953 1953 (zio->io_flags & ZIO_FLAG_RESILVER)) {
1954 1954 n = raidz_parity_verify(zio, rm);
1955 1955 unexpected_errors += n;
1956 1956 ASSERT(parity_errors + n <=
1957 1957 rm->rm_firstdatacol);
1958 1958 }
1959 1959 goto done;
1960 1960 }
1961 1961 } else {
1962 1962 /*
1963 1963 * We either attempt to read all the parity columns or
1964 1964 * none of them. If we didn't try to read parity, we
1965 1965 * wouldn't be here in the correctable case. There must
1966 1966 * also have been fewer parity errors than parity
1967 1967 * columns or, again, we wouldn't be in this code path.
1968 1968 */
1969 1969 ASSERT(parity_untried == 0);
1970 1970 ASSERT(parity_errors < rm->rm_firstdatacol);
1971 1971
1972 1972 /*
1973 1973 * Identify the data columns that reported an error.
1974 1974 */
1975 1975 n = 0;
1976 1976 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1977 1977 rc = &rm->rm_col[c];
1978 1978 if (rc->rc_error != 0) {
1979 1979 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1980 1980 tgts[n++] = c;
1981 1981 }
1982 1982 }
1983 1983
1984 1984 ASSERT(rm->rm_firstdatacol >= n);
1985 1985
1986 1986 code = vdev_raidz_reconstruct(rm, tgts, n);
1987 1987
1988 1988 if (raidz_checksum_verify(zio) == 0) {
1989 1989 atomic_inc_64(&raidz_corrected[code]);
1990 1990
1991 1991 /*
1992 1992 * If we read more parity disks than were used
1993 1993 * for reconstruction, confirm that the other
1994 1994 * parity disks produced correct data. This
1995 1995 * routine is suboptimal in that it regenerates
1996 1996 * the parity that we already used in addition
1997 1997 * to the parity that we're attempting to
1998 1998 * verify, but this should be a relatively
1999 1999 * uncommon case, and can be optimized if it
2000 2000 * becomes a problem. Note that we regenerate
2001 2001 * parity when resilvering so we can write it
2002 2002 * out to failed devices later.
2003 2003 */
2004 2004 if (parity_errors < rm->rm_firstdatacol - n ||
2005 2005 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2006 2006 n = raidz_parity_verify(zio, rm);
2007 2007 unexpected_errors += n;
2008 2008 ASSERT(parity_errors + n <=
2009 2009 rm->rm_firstdatacol);
2010 2010 }
2011 2011
2012 2012 goto done;
2013 2013 }
2014 2014 }
2015 2015 }
2016 2016
2017 2017 /*
2018 2018 * This isn't a typical situation -- either we got a read error or
2019 2019 * a child silently returned bad data. Read every block so we can
2020 2020 * try again with as much data and parity as we can track down. If
2021 2021 * we've already been through once before, all children will be marked
2022 2022 * as tried so we'll proceed to combinatorial reconstruction.
2023 2023 */
2024 2024 unexpected_errors = 1;
2025 2025 rm->rm_missingdata = 0;
2026 2026 rm->rm_missingparity = 0;
2027 2027
2028 2028 for (c = 0; c < rm->rm_cols; c++) {
2029 2029 if (rm->rm_col[c].rc_tried)
2030 2030 continue;
2031 2031
2032 2032 zio_vdev_io_redone(zio);
2033 2033 do {
2034 2034 rc = &rm->rm_col[c];
2035 2035 if (rc->rc_tried)
2036 2036 continue;
2037 2037 zio_nowait(zio_vdev_child_io(zio, NULL,
2038 2038 vd->vdev_child[rc->rc_devidx],
2039 2039 rc->rc_offset, rc->rc_data, rc->rc_size,
2040 2040 zio->io_type, zio->io_priority, 0,
2041 2041 vdev_raidz_child_done, rc));
2042 2042 } while (++c < rm->rm_cols);
2043 2043
2044 2044 return;
2045 2045 }
2046 2046
2047 2047 /*
2048 2048 * At this point we've attempted to reconstruct the data given the
2049 2049 * errors we detected, and we've attempted to read all columns. There
2050 2050 * must, therefore, be one or more additional problems -- silent errors
2051 2051 * resulting in invalid data rather than explicit I/O errors resulting
2052 2052 * in absent data. We check if there is enough additional data to
2053 2053 * possibly reconstruct the data and then perform combinatorial
2054 2054 * reconstruction over all possible combinations. If that fails,
2055 2055 * we're cooked.
2056 2056 */
2057 2057 if (total_errors > rm->rm_firstdatacol) {
2058 2058 zio->io_error = vdev_raidz_worst_error(rm);
2059 2059
2060 2060 } else if (total_errors < rm->rm_firstdatacol &&
2061 2061 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2062 2062 /*
2063 2063 * If we didn't use all the available parity for the
2064 2064 * combinatorial reconstruction, verify that the remaining
2065 2065 * parity is correct.
2066 2066 */
2067 2067 if (code != (1 << rm->rm_firstdatacol) - 1)
2068 2068 (void) raidz_parity_verify(zio, rm);
2069 2069 } else {
2070 2070 /*
2071 2071 * We're here because either:
2072 2072 *
2073 2073 * total_errors == rm_first_datacol, or
2074 2074 * vdev_raidz_combrec() failed
2075 2075 *
2076 2076 * In either case, there is enough bad data to prevent
2077 2077 * reconstruction.
2078 2078 *
2079 2079 * Start checksum ereports for all children which haven't
2080 2080 * failed, and the IO wasn't speculative.
2081 2081 */
2082 2082 zio->io_error = ECKSUM;
2083 2083
2084 2084 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2085 2085 for (c = 0; c < rm->rm_cols; c++) {
2086 2086 rc = &rm->rm_col[c];
2087 2087 if (rc->rc_error == 0) {
2088 2088 zio_bad_cksum_t zbc;
2089 2089 zbc.zbc_has_cksum = 0;
2090 2090 zbc.zbc_injected =
2091 2091 rm->rm_ecksuminjected;
2092 2092
2093 2093 zfs_ereport_start_checksum(
2094 2094 zio->io_spa,
2095 2095 vd->vdev_child[rc->rc_devidx],
2096 2096 zio, rc->rc_offset, rc->rc_size,
2097 2097 (void *)(uintptr_t)c, &zbc);
2098 2098 }
2099 2099 }
2100 2100 }
2101 2101 }
2102 2102
2103 2103 done:
2104 2104 zio_checksum_verified(zio);
2105 2105
2106 2106 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2107 2107 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2108 2108 /*
2109 2109 * Use the good data we have in hand to repair damaged children.
2110 2110 */
2111 2111 for (c = 0; c < rm->rm_cols; c++) {
2112 2112 rc = &rm->rm_col[c];
2113 2113 cvd = vd->vdev_child[rc->rc_devidx];
2114 2114
2115 2115 if (rc->rc_error == 0)
2116 2116 continue;
2117 2117
2118 2118 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2119 2119 rc->rc_offset, rc->rc_data, rc->rc_size,
2120 2120 ZIO_TYPE_WRITE, zio->io_priority,
2121 2121 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2122 2122 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2123 2123 }
2124 2124 }
2125 2125 }
2126 2126
2127 2127 static void
2128 2128 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2129 2129 {
2130 2130 if (faulted > vd->vdev_nparity)
2131 2131 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2132 2132 VDEV_AUX_NO_REPLICAS);
2133 2133 else if (degraded + faulted != 0)
2134 2134 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2135 2135 else
2136 2136 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2137 2137 }
2138 2138
/*
 * Operations vector registering the RAID-Z implementation with the
 * generic vdev framework.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,	/* NOTE(review): unused op slot -- confirm against vdev_ops_t */
	NULL,	/* NOTE(review): unused op slot -- confirm against vdev_ops_t */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
↓ open down ↓ |
962 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX