Print this page
3742 zfs comments need cleaner, more consistent style
Submitted by: Will Andrews <willa@spectralogic.com>
Submitted by: Alan Somers <alans@spectralogic.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Eric Schrock <eric.schrock@delphix.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ new/usr/src/uts/common/fs/zfs/vdev_raidz.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 25 */
26 26
27 27 #include <sys/zfs_context.h>
28 28 #include <sys/spa.h>
29 29 #include <sys/vdev_impl.h>
30 30 #include <sys/zio.h>
31 31 #include <sys/zio_checksum.h>
32 32 #include <sys/fs/zfs.h>
33 33 #include <sys/fm/fs/zfs.h>
34 34
35 35 /*
36 36 * Virtual device vector for RAID-Z.
37 37 *
38 38 * This vdev supports single, double, and triple parity. For single parity,
39 39 * we use a simple XOR of all the data columns. For double or triple parity,
40 40 * we use a special case of Reed-Solomon coding. This extends the
41 41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 44 * former is also based. The latter is designed to provide higher performance
45 45 * for writes.
46 46 *
47 47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 48 * amended six years later identifying a critical flaw that invalidates its
49 49 * claims. Nevertheless, the technique can be adapted to work for up to
50 50 * triple parity. For additional parity, the amendment "Note: Correction to
51 51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 52 * is viable, but the additional complexity means that write performance will
↓ open down ↓ |
52 lines elided |
↑ open up ↑ |
53 53 * suffer.
54 54 *
55 55 * All of the methods above operate on a Galois field, defined over the
56 56 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
57 57 * can be expressed with a single byte. Briefly, the operations on the
58 58 * field are defined as follows:
59 59 *
60 60 * o addition (+) is represented by a bitwise XOR
61 61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 62 * o multiplication of A by 2 is defined by the following bitwise expression:
63 + *
63 64 * (A * 2)_7 = A_6
64 65 * (A * 2)_6 = A_5
65 66 * (A * 2)_5 = A_4
66 67 * (A * 2)_4 = A_3 + A_7
67 68 * (A * 2)_3 = A_2 + A_7
68 69 * (A * 2)_2 = A_1 + A_7
69 70 * (A * 2)_1 = A_0
70 71 * (A * 2)_0 = A_7
71 72 *
72 73 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73 74 * As an aside, this multiplication is derived from the error correcting
74 75 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
75 76 *
76 77 * Observe that any number in the field (except for 0) can be expressed as a
77 78 * power of 2 -- a generator for the field. We store a table of the powers of
78 79 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79 80 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80 81 * than field addition). The inverse of a field element A (A^-1) is therefore
81 82 * A ^ (255 - 1) = A^254.
82 83 *
83 84 * The up-to-three parity columns, P, Q, R over several data columns,
84 85 * D_0, ... D_n-1, can be expressed by field operations:
85 86 *
86 87 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
87 88 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88 89 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89 90 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90 91 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
91 92 *
92 93 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
93 94 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94 95 * independent coefficients. (There are no additional coefficients that have
95 96 * this property which is why the uncorrected Plank method breaks down.)
96 97 *
97 98 * See the reconstruction code below for how P, Q and R can used individually
98 99 * or in concert to recover missing data columns.
99 100 */
100 101
/*
 * One column of a RAID-Z I/O: the portion of the logical block that lands
 * on a single child vdev (either a parity column or a data column).
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
↓ open down ↓ |
38 lines elided |
↑ open up ↑ |
111 112
/*
 * The geometry of a RAID-Z I/O: how the logical block maps onto per-child
 * columns, plus the bookkeeping needed for checksum-ereport handling.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
128 129
/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single byte (a GF(2^8) element) by 2 or 4, reducing by the
 * field's primitive polynomial (0x11d) whenever the high bit is set.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
135 136
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * The macros are wrapped in do { } while (0) so that each expands to a
 * single statement and remains safe in unbraced if/else bodies; the 0x1d
 * replication carries an explicit ULL suffix so the constant is 64 bits
 * wide on every platform.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
	do { \
		(mask) = (x) & 0x8080808080808080ULL; \
		(mask) = ((mask) << 1) - ((mask) >> 7); \
		(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
		    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
	} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
	do { \
		VDEV_RAIDZ_64MUL_2((x), mask); \
		VDEV_RAIDZ_64MUL_2((x), mask); \
	} while (0)
155 156
/*
 * When nonzero, force reconstruction to use the general purpose method
 * rather than the specialized P/Q/R code paths.
 */
int vdev_raidz_default_to_general;
160 161
/*
 * Powers of 2 in the Galois field defined above.  These values were
 * computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs of 2 in the Galois field defined above.  This is the inverse of
 * vdev_raidz_pow2[] for nonzero elements (log of 0 is undefined).
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
233 232
234 233 static void vdev_raidz_generate_parity(raidz_map_t *rm);
235 234
236 235 /*
237 236 * Multiply a given number by 2 raised to the given power.
238 237 */
239 238 static uint8_t
240 239 vdev_raidz_exp2(uint_t a, int exp)
241 240 {
242 241 if (a == 0)
243 242 return (0);
244 243
245 244 ASSERT(exp >= 0);
246 245 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
247 246
248 247 exp += vdev_raidz_log2[a];
249 248 if (exp > 255)
250 249 exp -= 255;
251 250
252 251 return (vdev_raidz_pow2[exp]);
253 252 }
254 253
255 254 static void
256 255 vdev_raidz_map_free(raidz_map_t *rm)
257 256 {
258 257 int c;
259 258 size_t size;
260 259
261 260 for (c = 0; c < rm->rm_firstdatacol; c++) {
262 261 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
263 262
264 263 if (rm->rm_col[c].rc_gdata != NULL)
265 264 zio_buf_free(rm->rm_col[c].rc_gdata,
266 265 rm->rm_col[c].rc_size);
267 266 }
268 267
269 268 size = 0;
270 269 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
271 270 size += rm->rm_col[c].rc_size;
272 271
273 272 if (rm->rm_datacopy != NULL)
274 273 zio_buf_free(rm->rm_datacopy, size);
275 274
276 275 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
277 276 }
278 277
279 278 static void
280 279 vdev_raidz_map_free_vsd(zio_t *zio)
281 280 {
282 281 raidz_map_t *rm = zio->io_vsd;
283 282
284 283 ASSERT0(rm->rm_freed);
285 284 rm->rm_freed = 1;
286 285
287 286 if (rm->rm_reports == 0)
288 287 vdev_raidz_map_free(rm);
289 288 }
290 289
291 290 /*ARGSUSED*/
292 291 static void
293 292 vdev_raidz_cksum_free(void *arg, size_t ignored)
294 293 {
295 294 raidz_map_t *rm = arg;
296 295
297 296 ASSERT3U(rm->rm_reports, >, 0);
298 297
299 298 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
300 299 vdev_raidz_map_free(rm);
301 300 }
302 301
/*
 * Finish a pending checksum ereport for column c: compare the data we
 * actually read (saved in rc_data/rm_datacopy) against the known-good
 * data for that column, and emit or drop the ereport accordingly.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;	/* column under report */
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		/* No good data available: finish without a comparison. */
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/* repoint the data columns at the saved copy */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}
375 374
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;	/* column being reported */
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	/* Only the first report for this map needs to copy the data. */
	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	/* Copy each data column aside and repoint it at the copy. */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}
428 427
/* zio vsd callbacks used for every RAID-Z I/O's raidz_map_t. */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
433 432
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size */
	uint64_t s = zio->io_size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/* Lay out each column's device index, offset, and size. */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrapped around: next row on the child vdevs. */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get freshly allocated buffers... */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	/* ...while data columns point into the zio's own buffer. */
	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	return (rm);
}
582 581
583 582 static void
584 583 vdev_raidz_generate_parity_p(raidz_map_t *rm)
585 584 {
586 585 uint64_t *p, *src, pcount, ccount, i;
587 586 int c;
588 587
589 588 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
590 589
591 590 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
592 591 src = rm->rm_col[c].rc_data;
593 592 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
594 593 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
595 594
596 595 if (c == rm->rm_firstdatacol) {
597 596 ASSERT(ccount == pcount);
598 597 for (i = 0; i < ccount; i++, src++, p++) {
599 598 *p = *src;
600 599 }
601 600 } else {
602 601 ASSERT(ccount <= pcount);
603 602 for (i = 0; i < ccount; i++, src++, p++) {
604 603 *p ^= *src;
605 604 }
606 605 }
607 606 }
608 607 }
609 608
/*
 * Compute the P (XOR) and Q (Reed-Solomon) parity columns from the data
 * columns, treating short columns as though they were zero-padded.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* The first column initializes P and Q directly. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
661 660
/*
 * Compute the P, Q, and R parity columns from the data columns, treating
 * short columns as though they were zero-padded.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* The first column initializes P, Q, and R directly. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
722 721
723 722 /*
724 723 * Generate RAID parity in the first virtual columns according to the number of
725 724 * parity columns available.
726 725 */
727 726 static void
728 727 vdev_raidz_generate_parity(raidz_map_t *rm)
729 728 {
730 729 switch (rm->rm_firstdatacol) {
731 730 case 1:
732 731 vdev_raidz_generate_parity_p(rm);
733 732 break;
734 733 case 2:
735 734 vdev_raidz_generate_parity_pq(rm);
736 735 break;
737 736 case 3:
738 737 vdev_raidz_generate_parity_pqr(rm);
739 738 break;
740 739 default:
741 740 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
742 741 }
743 742 }
744 743
745 744 static int
746 745 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
747 746 {
748 747 uint64_t *dst, *src, xcount, ccount, count, i;
749 748 int x = tgts[0];
750 749 int c;
751 750
752 751 ASSERT(ntgts == 1);
753 752 ASSERT(x >= rm->rm_firstdatacol);
754 753 ASSERT(x < rm->rm_cols);
755 754
756 755 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
757 756 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
758 757 ASSERT(xcount > 0);
759 758
760 759 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
761 760 dst = rm->rm_col[x].rc_data;
762 761 for (i = 0; i < xcount; i++, dst++, src++) {
763 762 *dst = *src;
764 763 }
765 764
766 765 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
767 766 src = rm->rm_col[c].rc_data;
768 767 dst = rm->rm_col[x].rc_data;
769 768
770 769 if (c == x)
771 770 continue;
772 771
773 772 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
774 773 count = MIN(ccount, xcount);
775 774
776 775 for (i = 0; i < count; i++, dst++, src++) {
777 776 *dst ^= *src;
778 777 }
779 778 }
780 779
781 780 return (1 << VDEV_RAIDZ_P);
782 781 }
783 782
/*
 * Reconstruct a single missing data column (tgts[0]) using the Q
 * (Reed-Solomon) parity column.  Returns the bitmask of parity columns
 * used.
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	/*
	 * Accumulate Q as if the missing column (and any short tail) were
	 * zeros, building the result directly in the target column.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/*
	 * XOR in the stored Q, then divide out the missing column's
	 * coefficient (2^(ndevs - 1 - x)) one Galois-field byte at a time.
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
841 840
/*
 * Reconstruct two missing data columns, x and y (x < y), using both the
 * P and Q parity columns.  Returns the bitmask of parity columns used:
 * (1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q).
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* GF(2^8) coefficients A and B from the derivation above. */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	/* tmp is the log of the multiplicative inverse of (a ^ 1). */
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		/* Column y may be shorter than column x. */
		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
932 931
933 932 /* BEGIN CSTYLED */
934 933 /*
935 934 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
937 936 * the contents of the data and parity disks. This can be expressed with
938 937 * vectors for the original data (D) and the actual data (d) and parity (p)
939 938 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
940 939 *
941 940 * __ __ __ __
942 941 * | | __ __ | p_0 |
943 942 * | V | | D_0 | | p_m-1 |
944 943 * | | x | : | = | d_0 |
945 944 * | I | | D_n-1 | | : |
946 945 * | | ~~ ~~ | d_n-1 |
947 946 * ~~ ~~ ~~ ~~
948 947 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
951 950 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
952 951 * computation as well as linear separability.
953 952 *
954 953 * __ __ __ __
955 954 * | 1 .. 1 1 1 | | p_0 |
956 955 * | 2^n-1 .. 4 2 1 | __ __ | : |
957 956 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
958 957 * | 1 .. 0 0 0 | | D_1 | | d_0 |
959 958 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
960 959 * | : : : : | | : | | d_2 |
961 960 * | 0 .. 1 0 0 | | D_n-1 | | : |
962 961 * | 0 .. 0 1 0 | ~~ ~~ | : |
963 962 * | 0 .. 0 0 1 | | d_n-1 |
964 963 * ~~ ~~ ~~ ~~
965 964 *
966 965 * Note that I, V, d, and p are known. To compute D, we must invert the
967 966 * matrix and use the known data and parity values to reconstruct the unknown
968 967 * data values. We begin by removing the rows in V|I and d|p that correspond
969 968 * to failed or missing columns; we then make V|I square (n x n) and d|p
970 969 * sized n by removing rows corresponding to unused parity from the bottom up
971 970 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
972 971 * using Gauss-Jordan elimination. In the example below we use m=3 parity
973 972 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
974 973 * __ __
975 974 * | 1 1 1 1 1 1 1 1 |
976 975 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
977 976 * | 19 205 116 29 64 16 4 1 | / /
978 977 * | 1 0 0 0 0 0 0 0 | / /
979 978 * | 0 1 0 0 0 0 0 0 | <--' /
980 979 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
981 980 * | 0 0 0 1 0 0 0 0 |
982 981 * | 0 0 0 0 1 0 0 0 |
983 982 * | 0 0 0 0 0 1 0 0 |
984 983 * | 0 0 0 0 0 0 1 0 |
985 984 * | 0 0 0 0 0 0 0 1 |
986 985 * ~~ ~~
987 986 * __ __
988 987 * | 1 1 1 1 1 1 1 1 |
989 988 * | 128 64 32 16 8 4 2 1 |
990 989 * | 19 205 116 29 64 16 4 1 |
991 990 * | 1 0 0 0 0 0 0 0 |
992 991 * | 0 1 0 0 0 0 0 0 |
993 992 * (V|I)' = | 0 0 1 0 0 0 0 0 |
994 993 * | 0 0 0 1 0 0 0 0 |
995 994 * | 0 0 0 0 1 0 0 0 |
996 995 * | 0 0 0 0 0 1 0 0 |
997 996 * | 0 0 0 0 0 0 1 0 |
998 997 * | 0 0 0 0 0 0 0 1 |
999 998 * ~~ ~~
1000 999 *
1001 1000 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1002 1001 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1003 1002 * matrix is not singular.
1004 1003 * __ __
1005 1004 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1006 1005 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1007 1006 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1008 1007 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1009 1008 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1010 1009 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1011 1010 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1012 1011 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1013 1012 * ~~ ~~
1014 1013 * __ __
1015 1014 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1016 1015 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1017 1016 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1018 1017 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1019 1018 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1020 1019 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1021 1020 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1022 1021 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1023 1022 * ~~ ~~
1024 1023 * __ __
1025 1024 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1026 1025 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1027 1026 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1028 1027 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1029 1028 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1030 1029 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1031 1030 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1032 1031 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1033 1032 * ~~ ~~
1034 1033 * __ __
1035 1034 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1036 1035 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1037 1036 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1038 1037 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1039 1038 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1040 1039 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1041 1040 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1042 1041 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1043 1042 * ~~ ~~
1044 1043 * __ __
1045 1044 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1046 1045 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1047 1046 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1048 1047 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1049 1048 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1050 1049 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1051 1050 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1052 1051 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1053 1052 * ~~ ~~
1054 1053 * __ __
1055 1054 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1056 1055 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1057 1056 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1058 1057 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1059 1058 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1060 1059 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1061 1060 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1062 1061 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1063 1062 * ~~ ~~
1064 1063 * __ __
1065 1064 * | 0 0 1 0 0 0 0 0 |
1066 1065 * | 167 100 5 41 159 169 217 208 |
1067 1066 * | 166 100 4 40 158 168 216 209 |
1068 1067 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1069 1068 * | 0 0 0 0 1 0 0 0 |
1070 1069 * | 0 0 0 0 0 1 0 0 |
1071 1070 * | 0 0 0 0 0 0 1 0 |
1072 1071 * | 0 0 0 0 0 0 0 1 |
1073 1072 * ~~ ~~
1074 1073 *
1075 1074 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1076 1075 * of the missing data.
1077 1076 *
1078 1077 * As is apparent from the example above, the only non-trivial rows in the
1079 1078 * inverse matrix correspond to the data disks that we're trying to
1080 1079 * reconstruct. Indeed, those are the only rows we need as the others would
1081 1080 * only be useful for reconstructing data known or assumed to be valid. For
1082 1081 * that reason, we only build the coefficients in the rows that correspond to
1083 1082 * targeted columns.
1084 1083 */
1085 1084 /* END CSTYLED */
1086 1085
/*
 * Populate the rows of the reconstruction matrix corresponding to the
 * parity columns in use.  Row i holds the Vandermonde coefficients for
 * parity column map[i] (0 = P, 1 = Q, 2 = R): entry j is
 * 2^(map[i] * (n - 1 - j)) in GF(2^8).
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/*
		 * Start the exponent at map[i] * n, reduced mod 255 (the
		 * multiplicative order of the field generator) so it stays
		 * a valid index into vdev_raidz_pow2.
		 */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			/* Step the exponent down by map[i] per column. */
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
1116 1115
/*
 * Invert the submatrix corresponding to the missing data columns using
 * Gauss-Jordan elimination over GF(2^8).  On entry, rows[] contains the
 * coefficient rows built by vdev_raidz_matrix_init(); on exit, invrows[]
 * holds the rows of the inverse needed for reconstruction and rows[] has
 * been reduced to (part of) the identity matrix, which the final loop
 * verifies.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			/*
			 * In GF(2^8) addition is XOR, so subtracting the
			 * scaled row i is XORing with row i multiplied by
			 * rows[ii][missing[i]] (multiplication done by
			 * adding logs).
			 */
			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1210 1209
/*
 * Reconstruct the missing data columns by multiplying the surviving
 * columns (in the order given by used[]) through the relevant rows of
 * the inverted matrix, accumulating into the missing columns' buffers.
 * Multiplication is performed in log space over GF(2^8).
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/*
	 * Precompute the logs of the inverse-matrix entries so the inner
	 * loop can multiply by adding logarithms mod 255.
	 */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		/*
		 * NOTE(review): missing[0] indexes rm_col without the
		 * rm_firstdatacol offset that the loop above applies --
		 * confirm this assertion checks the intended column.
		 */
		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/* log2[0] is undefined; only update for nonzero. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* Multiply: add logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first source column initializes the
				 * destination; later columns are XORed
				 * (GF(2^8) addition) into it.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1286 1285
/*
 * Reconstruct any combination of missing columns with the general matrix
 * method: build the relevant rows of the dispersal matrix, invert them,
 * and multiply the surviving columns through the inverse (see the block
 * comment above).  Returns the bitmask of parity columns used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Allocate one contiguous chunk and carve it into the matrix rows,
	 * the inverse rows, and the used-column list.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The first used[] entries are the parity columns we rely on. */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * The remainder are the surviving data columns (i continues from
	 * the loop above).
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1392 1391
/*
 * Reconstruct the columns listed in t[] (which must be sorted), plus any
 * columns already marked in error, using the cheapest applicable routine.
 * Returns the bitmask of parity columns used.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/*
	 * Merge the caller's targets with any columns that have recorded
	 * errors, tracking how many parity and data columns are bad and
	 * which parity columns remain usable.
	 */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/*
	 * tgts[] is in column order, so the bad parity columns come first;
	 * dt points at the data-column portion of the list.
	 */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	/* Fall back to the general matrix-based reconstruction. */
	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1471 1470
/*
 * Open a RAID-Z vdev: validate the parity configuration, open all child
 * vdevs, and derive this vdev's asize, max_asize, and ashift from them.
 * Fails only when more children fail to open than parity can cover.
 */
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	/* Need at least nparity + 1 children for a valid raidz config. */
	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		/*
		 * Clamp to the smallest child size (the -1/+1 presumably
		 * keeps the MIN well-defined for the caller-initialized
		 * value -- confirm against caller); largest ashift wins.
		 */
		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}
1516 1515
1517 1516 static void
1518 1517 vdev_raidz_close(vdev_t *vd)
1519 1518 {
1520 1519 int c;
1521 1520
1522 1521 for (c = 0; c < vd->vdev_children; c++)
1523 1522 vdev_close(vd->vdev_child[c]);
1524 1523 }
1525 1524
1526 1525 static uint64_t
1527 1526 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1528 1527 {
1529 1528 uint64_t asize;
1530 1529 uint64_t ashift = vd->vdev_top->vdev_ashift;
1531 1530 uint64_t cols = vd->vdev_children;
1532 1531 uint64_t nparity = vd->vdev_nparity;
1533 1532
1534 1533 asize = ((psize - 1) >> ashift) + 1;
1535 1534 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1536 1535 asize = roundup(asize, nparity + 1) << ashift;
1537 1536
1538 1537 return (asize);
1539 1538 }
1540 1539
1541 1540 static void
1542 1541 vdev_raidz_child_done(zio_t *zio)
1543 1542 {
1544 1543 raidz_col_t *rc = zio->io_private;
1545 1544
1546 1545 rc->rc_error = zio->io_error;
1547 1546 rc->rc_tried = 1;
1548 1547 rc->rc_skipped = 0;
1549 1548 }
1550 1549
1551 1550 /*
1552 1551 * Start an IO operation on a RAIDZ VDev
1553 1552 *
1554 1553 * Outline:
1555 1554 * - For write operations:
1556 1555 * 1. Generate the parity data
1557 1556 * 2. Create child zio write operations to each column's vdev, for both
1558 1557 * data and parity.
1559 1558 * 3. If the column skips any sectors for padding, create optional dummy
1560 1559 * write zio children for those areas to improve aggregation continuity.
1561 1560 * - For read operations:
1562 1561 * 1. Create child zio read operations to each data column's vdev to read
1563 1562 * the range of data required for zio.
1564 1563 * 2. If this is a scrub or resilver operation, or if any of the data
1565 1564 * vdevs have had errors, then create zio read operations to the parity
1566 1565 * columns' VDevs as well.
1567 1566 */
static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/* Lay the zio's data out across the columns of the RAID-Z stripe. */
	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Issue a child write for every column, data and parity. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			/* Device is unreadable; mark the column missing. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			/* Device may be missing this txg's data: stale. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Read all data columns; read parity columns too when data
		 * is already known missing or this is a scrub/resilver.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
1654 1653
1655 1654
/*
 * Report a checksum error for a child of a RAID-Z device.
 *
 * bad_data is the saved contents of the failing column and is passed
 * along to the ereport.  Speculative I/Os neither bump the error counter
 * nor post an ereport.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		/* No expected checksum; note whether it was injected. */
		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
		    &zbc);
	}
}
1680 1679
1681 1680 /*
1682 1681 * We keep track of whether or not there were any injected errors, so that
1683 1682 * any ereports we generate can note it.
1684 1683 */
1685 1684 static int
1686 1685 raidz_checksum_verify(zio_t *zio)
1687 1686 {
1688 1687 zio_bad_cksum_t zbc;
1689 1688 raidz_map_t *rm = zio->io_vsd;
1690 1689
1691 1690 int ret = zio_checksum_error(zio, &zbc);
1692 1691 if (ret != 0 && zbc.zbc_injected != 0)
1693 1692 rm->rm_ecksuminjected = 1;
1694 1693
1695 1694 return (ret);
1696 1695 }
1697 1696
/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches
 * the data we read. If it doesn't, we fire off a checksum error. Returns
 * the number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/* Save a copy of each parity column that was read successfully. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	/* Regenerate parity from the data columns in place. */
	vdev_raidz_generate_parity(rm);

	/* Compare regenerated parity against what was read from disk. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}
1735 1734
/*
 * Keep statistics on all the ways that we used parity to correct data;
 * the array is indexed by the code returned by the reconstruction
 * routines (a bitmask of the parity columns used).
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1740 1739
1741 1740 static int
1742 1741 vdev_raidz_worst_error(raidz_map_t *rm)
1743 1742 {
1744 1743 int error = 0;
1745 1744
1746 1745 for (int c = 0; c < rm->rm_cols; c++)
1747 1746 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1748 1747
1749 1748 return (error);
1750 1749 }
1751 1750
1752 1751 /*
1753 1752 * Iterate over all combinations of bad data and attempt a reconstruction.
1754 1753 * Note that the algorithm below is non-optimal because it doesn't take into
1755 1754 * account how reconstruction is actually performed. For example, with
1756 1755 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1757 1756 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1758 1757 * cases we'd only use parity information in column 0.
1759 1758 */
1760 1759 static int
1761 1760 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1762 1761 {
1763 1762 raidz_map_t *rm = zio->io_vsd;
1764 1763 raidz_col_t *rc;
1765 1764 void *orig[VDEV_RAIDZ_MAXPARITY];
1766 1765 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1767 1766 int *tgts = &tstore[1];
1768 1767 int current, next, i, c, n;
1769 1768 int code, ret = 0;
1770 1769
1771 1770 ASSERT(total_errors < rm->rm_firstdatacol);
1772 1771
1773 1772 /*
1774 1773 * This simplifies one edge condition.
1775 1774 */
1776 1775 tgts[-1] = -1;
1777 1776
1778 1777 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1779 1778 /*
1780 1779 * Initialize the targets array by finding the first n columns
1781 1780 * that contain no error.
1782 1781 *
1783 1782 * If there were no data errors, we need to ensure that we're
1784 1783 * always explicitly attempting to reconstruct at least one
1785 1784 * data column. To do this, we simply push the highest target
1786 1785 * up into the data columns.
1787 1786 */
1788 1787 for (c = 0, i = 0; i < n; i++) {
1789 1788 if (i == n - 1 && data_errors == 0 &&
1790 1789 c < rm->rm_firstdatacol) {
1791 1790 c = rm->rm_firstdatacol;
1792 1791 }
1793 1792
1794 1793 while (rm->rm_col[c].rc_error != 0) {
1795 1794 c++;
1796 1795 ASSERT3S(c, <, rm->rm_cols);
1797 1796 }
1798 1797
1799 1798 tgts[i] = c++;
1800 1799 }
1801 1800
1802 1801 /*
1803 1802 * Setting tgts[n] simplifies the other edge condition.
1804 1803 */
1805 1804 tgts[n] = rm->rm_cols;
1806 1805
1807 1806 /*
1808 1807 * These buffers were allocated in previous iterations.
1809 1808 */
1810 1809 for (i = 0; i < n - 1; i++) {
1811 1810 ASSERT(orig[i] != NULL);
1812 1811 }
1813 1812
1814 1813 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1815 1814
1816 1815 current = 0;
1817 1816 next = tgts[current];
1818 1817
1819 1818 while (current != n) {
1820 1819 tgts[current] = next;
1821 1820 current = 0;
1822 1821
1823 1822 /*
1824 1823 * Save off the original data that we're going to
1825 1824 * attempt to reconstruct.
1826 1825 */
1827 1826 for (i = 0; i < n; i++) {
1828 1827 ASSERT(orig[i] != NULL);
1829 1828 c = tgts[i];
1830 1829 ASSERT3S(c, >=, 0);
1831 1830 ASSERT3S(c, <, rm->rm_cols);
1832 1831 rc = &rm->rm_col[c];
1833 1832 bcopy(rc->rc_data, orig[i], rc->rc_size);
1834 1833 }
1835 1834
1836 1835 /*
1837 1836 * Attempt a reconstruction and exit the outer loop on
1838 1837 * success.
1839 1838 */
1840 1839 code = vdev_raidz_reconstruct(rm, tgts, n);
1841 1840 if (raidz_checksum_verify(zio) == 0) {
1842 1841 atomic_inc_64(&raidz_corrected[code]);
1843 1842
1844 1843 for (i = 0; i < n; i++) {
1845 1844 c = tgts[i];
1846 1845 rc = &rm->rm_col[c];
1847 1846 ASSERT(rc->rc_error == 0);
1848 1847 if (rc->rc_tried)
1849 1848 raidz_checksum_error(zio, rc,
1850 1849 orig[i]);
1851 1850 rc->rc_error = SET_ERROR(ECKSUM);
1852 1851 }
1853 1852
1854 1853 ret = code;
1855 1854 goto done;
1856 1855 }
1857 1856
1858 1857 /*
1859 1858 * Restore the original data.
1860 1859 */
1861 1860 for (i = 0; i < n; i++) {
1862 1861 c = tgts[i];
1863 1862 rc = &rm->rm_col[c];
1864 1863 bcopy(orig[i], rc->rc_data, rc->rc_size);
1865 1864 }
1866 1865
1867 1866 do {
1868 1867 /*
1869 1868 * Find the next valid column after the current
				 * position.
1871 1870 */
1872 1871 for (next = tgts[current] + 1;
1873 1872 next < rm->rm_cols &&
1874 1873 rm->rm_col[next].rc_error != 0; next++)
1875 1874 continue;
1876 1875
1877 1876 ASSERT(next <= tgts[current + 1]);
1878 1877
1879 1878 /*
1880 1879 * If that spot is available, we're done here.
1881 1880 */
1882 1881 if (next != tgts[current + 1])
1883 1882 break;
1884 1883
1885 1884 /*
1886 1885 * Otherwise, find the next valid column after
1887 1886 * the previous position.
1888 1887 */
1889 1888 for (c = tgts[current - 1] + 1;
1890 1889 rm->rm_col[c].rc_error != 0; c++)
1891 1890 continue;
1892 1891
1893 1892 tgts[current] = c;
1894 1893 current++;
1895 1894
1896 1895 } while (current != n);
1897 1896 }
1898 1897 }
1899 1898 n--;
1900 1899 done:
1901 1900 for (i = 0; i < n; i++) {
1902 1901 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1903 1902 }
1904 1903
1905 1904 return (ret);
1906 1905 }
1907 1906
/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;	/* errors on columns not marked skipped */
	int parity_errors = 0;		/* errors on parity columns */
	int parity_untried = 0;		/* parity columns that were never read */
	int data_errors = 0;		/* errors on data columns */
	int total_errors = 0;		/* parity_errors + data_errors */
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];	/* columns targeted for reconstruction */
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Tally the number and kind of errors reported by the child I/Os,
	 * and note any parity columns that were never attempted.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	/*
	 * Find the first untried column; if one exists, reissue reads for it
	 * and every untried column after it, then return and wait for those
	 * child I/Os to call back into this routine.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
2192 2191
2193 2192 static void
2194 2193 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2195 2194 {
2196 2195 if (faulted > vd->vdev_nparity)
2197 2196 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2198 2197 VDEV_AUX_NO_REPLICAS);
2199 2198 else if (degraded + faulted != 0)
2200 2199 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2201 2200 else
2202 2201 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2203 2202 }
2204 2203
/*
 * Operations vector registered for RAID-Z vdevs.  Members are positional
 * per vdev_ops_t (declared in vdev_impl.h); the two NULL entries are
 * optional hooks that RAID-Z does not implement -- NOTE(review): confirm
 * their names against the vdev_ops_t declaration.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
↓ open down ↓ |
2008 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX