5269 zfs: zpool import slow
PORTING: this code relies on the property of taskq_wait to wait
until no more tasks are queued and no more tasks are active. As
we always queue new tasks from within other tasks, taskq_wait
reliably waits for the full recursion to finish, even though we
enqueue new tasks after taskq_wait has been called.
On platforms other than illumos, taskq_wait may not have this
property.
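
For illustration only (this is not part of the webrev): a minimal sketch of the dispatch-from-within-a-task pattern the PORTING note describes, using the same illumos taskq calls that appear later in vdev.c (taskq_create, taskq_dispatch, taskq_wait, taskq_destroy). The names load_tq, vdev_load_task, and vdev_load_tree are hypothetical; the point is that the single taskq_wait() only returns once the whole recursion of dispatches has drained.

#include <sys/zfs_context.h>
#include <sys/vdev_impl.h>

static taskq_t *load_tq;	/* shared by every task in this sketch */

static void
vdev_load_task(void *arg)
{
	vdev_t *vd = arg;

	/* ... per-vdev work would go here ... */

	/*
	 * Queue the children from within this task.  On illumos the
	 * taskq_wait() in vdev_load_tree() does not return until no
	 * tasks are queued and none are active, so it also covers
	 * these later dispatches.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		VERIFY(taskq_dispatch(load_tq, vdev_load_task,
		    vd->vdev_child[c], TQ_SLEEP) != NULL);
}

static void
vdev_load_tree(vdev_t *rvd)
{
	load_tq = taskq_create("vdev_load", 8, minclsyspri, 8, INT_MAX,
	    TASKQ_PREPOPULATE);

	VERIFY(taskq_dispatch(load_tq, vdev_load_task, rvd,
	    TQ_SLEEP) != NULL);

	/*
	 * Waits for the entire recursion, including tasks enqueued
	 * after this call was made.  Other platforms may need an
	 * explicit completion count instead.
	 */
	taskq_wait(load_tq);
	taskq_destroy(load_tq);
}
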
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
--- old/usr/src/uts/common/fs/zfs/vdev.c
+++ new/usr/src/uts/common/fs/zfs/vdev.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
25 25 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
26 26 */
27 27
28 28 #include <sys/zfs_context.h>
29 29 #include <sys/fm/fs/zfs.h>
30 30 #include <sys/spa.h>
31 31 #include <sys/spa_impl.h>
32 32 #include <sys/dmu.h>
33 33 #include <sys/dmu_tx.h>
34 34 #include <sys/vdev_impl.h>
35 35 #include <sys/uberblock_impl.h>
36 36 #include <sys/metaslab.h>
37 37 #include <sys/metaslab_impl.h>
38 38 #include <sys/space_map.h>
39 39 #include <sys/space_reftree.h>
40 40 #include <sys/zio.h>
41 41 #include <sys/zap.h>
42 42 #include <sys/fs/zfs.h>
43 43 #include <sys/arc.h>
44 44 #include <sys/zil.h>
45 45 #include <sys/dsl_scan.h>
46 46
47 47 /*
48 48 * Virtual device management.
49 49 */
50 50
51 51 static vdev_ops_t *vdev_ops_table[] = {
52 52 &vdev_root_ops,
53 53 &vdev_raidz_ops,
54 54 &vdev_mirror_ops,
55 55 &vdev_replacing_ops,
56 56 &vdev_spare_ops,
57 57 &vdev_disk_ops,
58 58 &vdev_file_ops,
59 59 &vdev_missing_ops,
60 60 &vdev_hole_ops,
61 61 NULL
62 62 };
63 63
64 64 /* maximum scrub/resilver I/O queue per leaf vdev */
65 65 int zfs_scrub_limit = 10;
66 66
67 67 /*
68 68 * When a vdev is added, it will be divided into approximately (but no
69 69 * more than) this number of metaslabs.
70 70 */
71 71 int metaslabs_per_vdev = 200;
72 72
73 73 /*
74 74 * Given a vdev type, return the appropriate ops vector.
75 75 */
76 76 static vdev_ops_t *
77 77 vdev_getops(const char *type)
78 78 {
79 79 vdev_ops_t *ops, **opspp;
80 80
81 81 for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
82 82 if (strcmp(ops->vdev_op_type, type) == 0)
83 83 break;
84 84
85 85 return (ops);
86 86 }
87 87
88 88 /*
89 89 * Default asize function: return the MAX of psize with the asize of
90 90 * all children. This is what's used by anything other than RAID-Z.
91 91 */
92 92 uint64_t
93 93 vdev_default_asize(vdev_t *vd, uint64_t psize)
94 94 {
95 95 uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
96 96 uint64_t csize;
97 97
98 98 for (int c = 0; c < vd->vdev_children; c++) {
99 99 csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
100 100 asize = MAX(asize, csize);
101 101 }
102 102
103 103 return (asize);
104 104 }
105 105
106 106 /*
107 107 * Get the minimum allocatable size. We define the allocatable size as
108 108 * the vdev's asize rounded to the nearest metaslab. This allows us to
109 109 * replace or attach devices which don't have the same physical size but
110 110 * can still satisfy the same number of allocations.
111 111 */
112 112 uint64_t
113 113 vdev_get_min_asize(vdev_t *vd)
114 114 {
115 115 vdev_t *pvd = vd->vdev_parent;
116 116
117 117 /*
118 118 * If our parent is NULL (inactive spare or cache) or is the root,
119 119 * just return our own asize.
120 120 */
121 121 if (pvd == NULL)
122 122 return (vd->vdev_asize);
123 123
124 124 /*
125 125 * The top-level vdev just returns the allocatable size rounded
126 126 * to the nearest metaslab.
127 127 */
128 128 if (vd == vd->vdev_top)
129 129 return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
130 130
131 131 /*
132 132 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
133 133 * so each child must provide at least 1/Nth of its asize.
134 134 */
135 135 if (pvd->vdev_ops == &vdev_raidz_ops)
136 136 return (pvd->vdev_min_asize / pvd->vdev_children);
137 137
138 138 return (pvd->vdev_min_asize);
139 139 }
140 140
141 141 void
142 142 vdev_set_min_asize(vdev_t *vd)
143 143 {
144 144 vd->vdev_min_asize = vdev_get_min_asize(vd);
145 145
146 146 for (int c = 0; c < vd->vdev_children; c++)
147 147 vdev_set_min_asize(vd->vdev_child[c]);
148 148 }
149 149
150 150 vdev_t *
151 151 vdev_lookup_top(spa_t *spa, uint64_t vdev)
152 152 {
153 153 vdev_t *rvd = spa->spa_root_vdev;
154 154
155 155 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
156 156
157 157 if (vdev < rvd->vdev_children) {
158 158 ASSERT(rvd->vdev_child[vdev] != NULL);
159 159 return (rvd->vdev_child[vdev]);
160 160 }
161 161
162 162 return (NULL);
163 163 }
164 164
165 165 vdev_t *
166 166 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
167 167 {
168 168 vdev_t *mvd;
169 169
170 170 if (vd->vdev_guid == guid)
171 171 return (vd);
172 172
173 173 for (int c = 0; c < vd->vdev_children; c++)
174 174 if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
175 175 NULL)
176 176 return (mvd);
177 177
178 178 return (NULL);
179 179 }
180 180
181 +static int
182 +vdev_count_leaves_impl(vdev_t *vd)
183 +{
184 + int n = 0;
185 +
186 + if (vd->vdev_ops->vdev_op_leaf)
187 + return (1);
188 +
189 + for (int c = 0; c < vd->vdev_children; c++)
190 + n += vdev_count_leaves_impl(vd->vdev_child[c]);
191 +
192 + return (n);
193 +}
194 +
195 +int
196 +vdev_count_leaves(spa_t *spa)
197 +{
198 + return (vdev_count_leaves_impl(spa->spa_root_vdev));
199 +}
200 +
181 201 void
182 202 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
183 203 {
184 204 size_t oldsize, newsize;
185 205 uint64_t id = cvd->vdev_id;
186 206 vdev_t **newchild;
187 207
188 208 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
189 209 ASSERT(cvd->vdev_parent == NULL);
190 210
191 211 cvd->vdev_parent = pvd;
192 212
193 213 if (pvd == NULL)
194 214 return;
195 215
196 216 ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
197 217
198 218 oldsize = pvd->vdev_children * sizeof (vdev_t *);
199 219 pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
200 220 newsize = pvd->vdev_children * sizeof (vdev_t *);
201 221
202 222 newchild = kmem_zalloc(newsize, KM_SLEEP);
203 223 if (pvd->vdev_child != NULL) {
204 224 bcopy(pvd->vdev_child, newchild, oldsize);
205 225 kmem_free(pvd->vdev_child, oldsize);
206 226 }
207 227
208 228 pvd->vdev_child = newchild;
209 229 pvd->vdev_child[id] = cvd;
210 230
211 231 cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
212 232 ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
213 233
214 234 /*
215 235 * Walk up all ancestors to update guid sum.
216 236 */
217 237 for (; pvd != NULL; pvd = pvd->vdev_parent)
218 238 pvd->vdev_guid_sum += cvd->vdev_guid_sum;
219 239 }
220 240
221 241 void
222 242 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
223 243 {
224 244 int c;
225 245 uint_t id = cvd->vdev_id;
226 246
227 247 ASSERT(cvd->vdev_parent == pvd);
228 248
229 249 if (pvd == NULL)
230 250 return;
231 251
232 252 ASSERT(id < pvd->vdev_children);
233 253 ASSERT(pvd->vdev_child[id] == cvd);
234 254
235 255 pvd->vdev_child[id] = NULL;
236 256 cvd->vdev_parent = NULL;
237 257
238 258 for (c = 0; c < pvd->vdev_children; c++)
239 259 if (pvd->vdev_child[c])
240 260 break;
241 261
242 262 if (c == pvd->vdev_children) {
243 263 kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
244 264 pvd->vdev_child = NULL;
245 265 pvd->vdev_children = 0;
246 266 }
247 267
248 268 /*
249 269 * Walk up all ancestors to update guid sum.
250 270 */
251 271 for (; pvd != NULL; pvd = pvd->vdev_parent)
252 272 pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
253 273 }
254 274
255 275 /*
256 276 * Remove any holes in the child array.
257 277 */
258 278 void
259 279 vdev_compact_children(vdev_t *pvd)
260 280 {
261 281 vdev_t **newchild, *cvd;
262 282 int oldc = pvd->vdev_children;
263 283 int newc;
264 284
265 285 ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
266 286
267 287 for (int c = newc = 0; c < oldc; c++)
268 288 if (pvd->vdev_child[c])
269 289 newc++;
270 290
271 291 newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
272 292
273 293 for (int c = newc = 0; c < oldc; c++) {
274 294 if ((cvd = pvd->vdev_child[c]) != NULL) {
275 295 newchild[newc] = cvd;
276 296 cvd->vdev_id = newc++;
277 297 }
278 298 }
279 299
280 300 kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
281 301 pvd->vdev_child = newchild;
282 302 pvd->vdev_children = newc;
283 303 }
284 304
285 305 /*
286 306 * Allocate and minimally initialize a vdev_t.
287 307 */
288 308 vdev_t *
289 309 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
290 310 {
291 311 vdev_t *vd;
292 312
293 313 vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
294 314
295 315 if (spa->spa_root_vdev == NULL) {
296 316 ASSERT(ops == &vdev_root_ops);
297 317 spa->spa_root_vdev = vd;
298 318 spa->spa_load_guid = spa_generate_guid(NULL);
299 319 }
300 320
301 321 if (guid == 0 && ops != &vdev_hole_ops) {
302 322 if (spa->spa_root_vdev == vd) {
303 323 /*
304 324 * The root vdev's guid will also be the pool guid,
305 325 * which must be unique among all pools.
306 326 */
307 327 guid = spa_generate_guid(NULL);
308 328 } else {
309 329 /*
310 330 * Any other vdev's guid must be unique within the pool.
311 331 */
312 332 guid = spa_generate_guid(spa);
313 333 }
314 334 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
315 335 }
316 336
317 337 vd->vdev_spa = spa;
318 338 vd->vdev_id = id;
319 339 vd->vdev_guid = guid;
320 340 vd->vdev_guid_sum = guid;
321 341 vd->vdev_ops = ops;
322 342 vd->vdev_state = VDEV_STATE_CLOSED;
323 343 vd->vdev_ishole = (ops == &vdev_hole_ops);
324 344
325 345 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
326 346 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
327 347 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
328 348 for (int t = 0; t < DTL_TYPES; t++) {
329 349 vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
330 350 &vd->vdev_dtl_lock);
331 351 }
332 352 txg_list_create(&vd->vdev_ms_list,
333 353 offsetof(struct metaslab, ms_txg_node));
334 354 txg_list_create(&vd->vdev_dtl_list,
335 355 offsetof(struct vdev, vdev_dtl_node));
336 356 vd->vdev_stat.vs_timestamp = gethrtime();
337 357 vdev_queue_init(vd);
338 358 vdev_cache_init(vd);
339 359
340 360 return (vd);
341 361 }
342 362
343 363 /*
344 364 * Allocate a new vdev. The 'alloctype' is used to control whether we are
345 365 * creating a new vdev or loading an existing one - the behavior is slightly
346 366 * different for each case.
347 367 */
348 368 int
349 369 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
350 370 int alloctype)
351 371 {
352 372 vdev_ops_t *ops;
353 373 char *type;
354 374 uint64_t guid = 0, islog, nparity;
355 375 vdev_t *vd;
356 376
357 377 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
358 378
359 379 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
360 380 return (SET_ERROR(EINVAL));
361 381
362 382 if ((ops = vdev_getops(type)) == NULL)
363 383 return (SET_ERROR(EINVAL));
364 384
365 385 /*
366 386 * If this is a load, get the vdev guid from the nvlist.
367 387 * Otherwise, vdev_alloc_common() will generate one for us.
368 388 */
369 389 if (alloctype == VDEV_ALLOC_LOAD) {
370 390 uint64_t label_id;
371 391
372 392 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
373 393 label_id != id)
374 394 return (SET_ERROR(EINVAL));
375 395
376 396 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
377 397 return (SET_ERROR(EINVAL));
378 398 } else if (alloctype == VDEV_ALLOC_SPARE) {
379 399 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
380 400 return (SET_ERROR(EINVAL));
381 401 } else if (alloctype == VDEV_ALLOC_L2CACHE) {
382 402 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
383 403 return (SET_ERROR(EINVAL));
384 404 } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
385 405 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
386 406 return (SET_ERROR(EINVAL));
387 407 }
388 408
389 409 /*
390 410 * The first allocated vdev must be of type 'root'.
391 411 */
392 412 if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
393 413 return (SET_ERROR(EINVAL));
394 414
395 415 /*
396 416 * Determine whether we're a log vdev.
397 417 */
398 418 islog = 0;
399 419 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
400 420 if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
401 421 return (SET_ERROR(ENOTSUP));
402 422
403 423 if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
404 424 return (SET_ERROR(ENOTSUP));
405 425
406 426 /*
407 427 * Set the nparity property for RAID-Z vdevs.
408 428 */
409 429 nparity = -1ULL;
410 430 if (ops == &vdev_raidz_ops) {
411 431 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
412 432 &nparity) == 0) {
413 433 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
414 434 return (SET_ERROR(EINVAL));
415 435 /*
416 436 * Previous versions could only support 1 or 2 parity
417 437 * device.
418 438 */
419 439 if (nparity > 1 &&
420 440 spa_version(spa) < SPA_VERSION_RAIDZ2)
421 441 return (SET_ERROR(ENOTSUP));
422 442 if (nparity > 2 &&
423 443 spa_version(spa) < SPA_VERSION_RAIDZ3)
424 444 return (SET_ERROR(ENOTSUP));
425 445 } else {
426 446 /*
427 447 * We require the parity to be specified for SPAs that
428 448 * support multiple parity levels.
429 449 */
430 450 if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
431 451 return (SET_ERROR(EINVAL));
432 452 /*
433 453 * Otherwise, we default to 1 parity device for RAID-Z.
434 454 */
435 455 nparity = 1;
436 456 }
437 457 } else {
438 458 nparity = 0;
439 459 }
440 460 ASSERT(nparity != -1ULL);
441 461
442 462 vd = vdev_alloc_common(spa, id, guid, ops);
443 463
444 464 vd->vdev_islog = islog;
445 465 vd->vdev_nparity = nparity;
446 466
447 467 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
448 468 vd->vdev_path = spa_strdup(vd->vdev_path);
449 469 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
450 470 vd->vdev_devid = spa_strdup(vd->vdev_devid);
451 471 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
452 472 &vd->vdev_physpath) == 0)
453 473 vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
454 474 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
455 475 vd->vdev_fru = spa_strdup(vd->vdev_fru);
456 476
457 477 /*
458 478 * Set the whole_disk property. If it's not specified, leave the value
459 479 * as -1.
460 480 */
461 481 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
462 482 &vd->vdev_wholedisk) != 0)
463 483 vd->vdev_wholedisk = -1ULL;
464 484
465 485 /*
466 486 * Look for the 'not present' flag. This will only be set if the device
467 487 * was not present at the time of import.
468 488 */
469 489 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
470 490 &vd->vdev_not_present);
471 491
472 492 /*
473 493 * Get the alignment requirement.
474 494 */
475 495 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
476 496
477 497 /*
478 498 * Retrieve the vdev creation time.
479 499 */
480 500 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
481 501 &vd->vdev_crtxg);
482 502
483 503 /*
484 504 * If we're a top-level vdev, try to load the allocation parameters.
485 505 */
486 506 if (parent && !parent->vdev_parent &&
487 507 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
488 508 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
489 509 &vd->vdev_ms_array);
490 510 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
491 511 &vd->vdev_ms_shift);
492 512 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
493 513 &vd->vdev_asize);
494 514 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
495 515 &vd->vdev_removing);
496 516 }
497 517
498 518 if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
499 519 ASSERT(alloctype == VDEV_ALLOC_LOAD ||
500 520 alloctype == VDEV_ALLOC_ADD ||
501 521 alloctype == VDEV_ALLOC_SPLIT ||
502 522 alloctype == VDEV_ALLOC_ROOTPOOL);
503 523 vd->vdev_mg = metaslab_group_create(islog ?
504 524 spa_log_class(spa) : spa_normal_class(spa), vd);
505 525 }
506 526
507 527 /*
508 528 * If we're a leaf vdev, try to load the DTL object and other state.
509 529 */
510 530 if (vd->vdev_ops->vdev_op_leaf &&
511 531 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
512 532 alloctype == VDEV_ALLOC_ROOTPOOL)) {
513 533 if (alloctype == VDEV_ALLOC_LOAD) {
514 534 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
515 535 &vd->vdev_dtl_object);
516 536 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
517 537 &vd->vdev_unspare);
518 538 }
519 539
520 540 if (alloctype == VDEV_ALLOC_ROOTPOOL) {
521 541 uint64_t spare = 0;
522 542
523 543 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
524 544 &spare) == 0 && spare)
525 545 spa_spare_add(vd);
526 546 }
527 547
528 548 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
529 549 &vd->vdev_offline);
530 550
531 551 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
532 552 &vd->vdev_resilver_txg);
533 553
534 554 /*
535 555 * When importing a pool, we want to ignore the persistent fault
536 556 * state, as the diagnosis made on another system may not be
537 557 * valid in the current context. Local vdevs will
538 558 * remain in the faulted state.
539 559 */
540 560 if (spa_load_state(spa) == SPA_LOAD_OPEN) {
541 561 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
542 562 &vd->vdev_faulted);
543 563 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
544 564 &vd->vdev_degraded);
545 565 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
546 566 &vd->vdev_removed);
547 567
548 568 if (vd->vdev_faulted || vd->vdev_degraded) {
549 569 char *aux;
550 570
551 571 vd->vdev_label_aux =
552 572 VDEV_AUX_ERR_EXCEEDED;
553 573 if (nvlist_lookup_string(nv,
554 574 ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
555 575 strcmp(aux, "external") == 0)
556 576 vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
557 577 }
558 578 }
559 579 }
560 580
561 581 /*
562 582 * Add ourselves to the parent's list of children.
563 583 */
564 584 vdev_add_child(parent, vd);
565 585
566 586 *vdp = vd;
567 587
568 588 return (0);
569 589 }
570 590
571 591 void
572 592 vdev_free(vdev_t *vd)
573 593 {
574 594 spa_t *spa = vd->vdev_spa;
575 595
576 596 /*
577 597 * vdev_free() implies closing the vdev first. This is simpler than
578 598 * trying to ensure complicated semantics for all callers.
579 599 */
580 600 vdev_close(vd);
581 601
582 602 ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
583 603 ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
584 604
585 605 /*
586 606 * Free all children.
587 607 */
588 608 for (int c = 0; c < vd->vdev_children; c++)
589 609 vdev_free(vd->vdev_child[c]);
590 610
591 611 ASSERT(vd->vdev_child == NULL);
592 612 ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
593 613
594 614 /*
595 615 * Discard allocation state.
596 616 */
597 617 if (vd->vdev_mg != NULL) {
598 618 vdev_metaslab_fini(vd);
599 619 metaslab_group_destroy(vd->vdev_mg);
600 620 }
601 621
602 622 ASSERT0(vd->vdev_stat.vs_space);
603 623 ASSERT0(vd->vdev_stat.vs_dspace);
604 624 ASSERT0(vd->vdev_stat.vs_alloc);
605 625
606 626 /*
607 627 * Remove this vdev from its parent's child list.
608 628 */
609 629 vdev_remove_child(vd->vdev_parent, vd);
610 630
611 631 ASSERT(vd->vdev_parent == NULL);
612 632
613 633 /*
614 634 * Clean up vdev structure.
615 635 */
616 636 vdev_queue_fini(vd);
617 637 vdev_cache_fini(vd);
618 638
619 639 if (vd->vdev_path)
620 640 spa_strfree(vd->vdev_path);
621 641 if (vd->vdev_devid)
622 642 spa_strfree(vd->vdev_devid);
623 643 if (vd->vdev_physpath)
624 644 spa_strfree(vd->vdev_physpath);
625 645 if (vd->vdev_fru)
626 646 spa_strfree(vd->vdev_fru);
627 647
628 648 if (vd->vdev_isspare)
629 649 spa_spare_remove(vd);
630 650 if (vd->vdev_isl2cache)
631 651 spa_l2cache_remove(vd);
632 652
633 653 txg_list_destroy(&vd->vdev_ms_list);
634 654 txg_list_destroy(&vd->vdev_dtl_list);
635 655
636 656 mutex_enter(&vd->vdev_dtl_lock);
637 657 space_map_close(vd->vdev_dtl_sm);
638 658 for (int t = 0; t < DTL_TYPES; t++) {
639 659 range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
640 660 range_tree_destroy(vd->vdev_dtl[t]);
641 661 }
642 662 mutex_exit(&vd->vdev_dtl_lock);
643 663
644 664 mutex_destroy(&vd->vdev_dtl_lock);
645 665 mutex_destroy(&vd->vdev_stat_lock);
646 666 mutex_destroy(&vd->vdev_probe_lock);
647 667
648 668 if (vd == spa->spa_root_vdev)
649 669 spa->spa_root_vdev = NULL;
650 670
651 671 kmem_free(vd, sizeof (vdev_t));
652 672 }
653 673
654 674 /*
655 675 * Transfer top-level vdev state from svd to tvd.
656 676 */
657 677 static void
658 678 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
659 679 {
660 680 spa_t *spa = svd->vdev_spa;
661 681 metaslab_t *msp;
662 682 vdev_t *vd;
663 683 int t;
664 684
665 685 ASSERT(tvd == tvd->vdev_top);
666 686
667 687 tvd->vdev_ms_array = svd->vdev_ms_array;
668 688 tvd->vdev_ms_shift = svd->vdev_ms_shift;
669 689 tvd->vdev_ms_count = svd->vdev_ms_count;
670 690
671 691 svd->vdev_ms_array = 0;
672 692 svd->vdev_ms_shift = 0;
673 693 svd->vdev_ms_count = 0;
674 694
675 695 if (tvd->vdev_mg)
676 696 ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
677 697 tvd->vdev_mg = svd->vdev_mg;
678 698 tvd->vdev_ms = svd->vdev_ms;
679 699
680 700 svd->vdev_mg = NULL;
681 701 svd->vdev_ms = NULL;
682 702
683 703 if (tvd->vdev_mg != NULL)
684 704 tvd->vdev_mg->mg_vd = tvd;
685 705
686 706 tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
687 707 tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
688 708 tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
689 709
690 710 svd->vdev_stat.vs_alloc = 0;
691 711 svd->vdev_stat.vs_space = 0;
692 712 svd->vdev_stat.vs_dspace = 0;
693 713
694 714 for (t = 0; t < TXG_SIZE; t++) {
695 715 while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
696 716 (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
697 717 while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
698 718 (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
699 719 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
700 720 (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
701 721 }
702 722
703 723 if (list_link_active(&svd->vdev_config_dirty_node)) {
704 724 vdev_config_clean(svd);
705 725 vdev_config_dirty(tvd);
706 726 }
707 727
708 728 if (list_link_active(&svd->vdev_state_dirty_node)) {
709 729 vdev_state_clean(svd);
710 730 vdev_state_dirty(tvd);
711 731 }
712 732
713 733 tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
714 734 svd->vdev_deflate_ratio = 0;
715 735
716 736 tvd->vdev_islog = svd->vdev_islog;
717 737 svd->vdev_islog = 0;
718 738 }
719 739
720 740 static void
721 741 vdev_top_update(vdev_t *tvd, vdev_t *vd)
722 742 {
723 743 if (vd == NULL)
724 744 return;
725 745
726 746 vd->vdev_top = tvd;
727 747
728 748 for (int c = 0; c < vd->vdev_children; c++)
729 749 vdev_top_update(tvd, vd->vdev_child[c]);
730 750 }
731 751
732 752 /*
733 753 * Add a mirror/replacing vdev above an existing vdev.
734 754 */
735 755 vdev_t *
736 756 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
737 757 {
738 758 spa_t *spa = cvd->vdev_spa;
739 759 vdev_t *pvd = cvd->vdev_parent;
740 760 vdev_t *mvd;
741 761
742 762 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
743 763
744 764 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
745 765
746 766 mvd->vdev_asize = cvd->vdev_asize;
747 767 mvd->vdev_min_asize = cvd->vdev_min_asize;
748 768 mvd->vdev_max_asize = cvd->vdev_max_asize;
749 769 mvd->vdev_ashift = cvd->vdev_ashift;
750 770 mvd->vdev_state = cvd->vdev_state;
751 771 mvd->vdev_crtxg = cvd->vdev_crtxg;
752 772
753 773 vdev_remove_child(pvd, cvd);
754 774 vdev_add_child(pvd, mvd);
755 775 cvd->vdev_id = mvd->vdev_children;
756 776 vdev_add_child(mvd, cvd);
757 777 vdev_top_update(cvd->vdev_top, cvd->vdev_top);
758 778
759 779 if (mvd == mvd->vdev_top)
760 780 vdev_top_transfer(cvd, mvd);
761 781
762 782 return (mvd);
763 783 }
764 784
765 785 /*
766 786 * Remove a 1-way mirror/replacing vdev from the tree.
767 787 */
768 788 void
769 789 vdev_remove_parent(vdev_t *cvd)
770 790 {
771 791 vdev_t *mvd = cvd->vdev_parent;
772 792 vdev_t *pvd = mvd->vdev_parent;
773 793
774 794 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
775 795
776 796 ASSERT(mvd->vdev_children == 1);
777 797 ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
778 798 mvd->vdev_ops == &vdev_replacing_ops ||
779 799 mvd->vdev_ops == &vdev_spare_ops);
780 800 cvd->vdev_ashift = mvd->vdev_ashift;
781 801
782 802 vdev_remove_child(mvd, cvd);
783 803 vdev_remove_child(pvd, mvd);
784 804
785 805 /*
786 806 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
787 807 * Otherwise, we could have detached an offline device, and when we
788 808 * go to import the pool we'll think we have two top-level vdevs,
789 809 * instead of a different version of the same top-level vdev.
790 810 */
791 811 if (mvd->vdev_top == mvd) {
792 812 uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
793 813 cvd->vdev_orig_guid = cvd->vdev_guid;
794 814 cvd->vdev_guid += guid_delta;
795 815 cvd->vdev_guid_sum += guid_delta;
796 816 }
797 817 cvd->vdev_id = mvd->vdev_id;
798 818 vdev_add_child(pvd, cvd);
799 819 vdev_top_update(cvd->vdev_top, cvd->vdev_top);
800 820
801 821 if (cvd == cvd->vdev_top)
802 822 vdev_top_transfer(mvd, cvd);
803 823
804 824 ASSERT(mvd->vdev_children == 0);
805 825 vdev_free(mvd);
806 826 }
807 827
808 828 int
809 829 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
810 830 {
811 831 spa_t *spa = vd->vdev_spa;
812 832 objset_t *mos = spa->spa_meta_objset;
813 833 uint64_t m;
814 834 uint64_t oldc = vd->vdev_ms_count;
815 835 uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
816 836 metaslab_t **mspp;
817 837 int error;
818 838
819 839 ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
820 840
821 841 /*
822 842 * This vdev is not being allocated from yet or is a hole.
823 843 */
824 844 if (vd->vdev_ms_shift == 0)
825 845 return (0);
826 846
827 847 ASSERT(!vd->vdev_ishole);
828 848
829 849 /*
830 850 * Compute the raidz-deflation ratio. Note, we hard-code
831 851 * in 128k (1 << 17) because it is the "typical" blocksize.
832 852 * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
833 853 * otherwise it would inconsistently account for existing bp's.
834 854 */
835 855 vd->vdev_deflate_ratio = (1 << 17) /
836 856 (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
837 857
838 858 ASSERT(oldc <= newc);
839 859
840 860 mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
841 861
842 862 if (oldc != 0) {
843 863 bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
844 864 kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
845 865 }
846 866
847 867 vd->vdev_ms = mspp;
848 868 vd->vdev_ms_count = newc;
849 869
850 870 for (m = oldc; m < newc; m++) {
851 871 uint64_t object = 0;
852 872
853 873 if (txg == 0) {
854 874 error = dmu_read(mos, vd->vdev_ms_array,
855 875 m * sizeof (uint64_t), sizeof (uint64_t), &object,
856 876 DMU_READ_PREFETCH);
857 877 if (error)
858 878 return (error);
859 879 }
860 880
861 881 error = metaslab_init(vd->vdev_mg, m, object, txg,
862 882 &(vd->vdev_ms[m]));
863 883 if (error)
864 884 return (error);
865 885 }
866 886
867 887 if (txg == 0)
868 888 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
869 889
870 890 /*
871 891 * If the vdev is being removed we don't activate
872 892 * the metaslabs since we want to ensure that no new
873 893 * allocations are performed on this device.
874 894 */
875 895 if (oldc == 0 && !vd->vdev_removing)
876 896 metaslab_group_activate(vd->vdev_mg);
877 897
878 898 if (txg == 0)
879 899 spa_config_exit(spa, SCL_ALLOC, FTAG);
880 900
881 901 return (0);
882 902 }
883 903
884 904 void
885 905 vdev_metaslab_fini(vdev_t *vd)
886 906 {
887 907 uint64_t m;
888 908 uint64_t count = vd->vdev_ms_count;
889 909
890 910 if (vd->vdev_ms != NULL) {
891 911 metaslab_group_passivate(vd->vdev_mg);
892 912 for (m = 0; m < count; m++) {
893 913 metaslab_t *msp = vd->vdev_ms[m];
894 914
895 915 if (msp != NULL)
896 916 metaslab_fini(msp);
897 917 }
898 918 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
899 919 vd->vdev_ms = NULL;
900 920 }
901 921 }
902 922
903 923 typedef struct vdev_probe_stats {
904 924 boolean_t vps_readable;
905 925 boolean_t vps_writeable;
906 926 int vps_flags;
907 927 } vdev_probe_stats_t;
908 928
909 929 static void
910 930 vdev_probe_done(zio_t *zio)
911 931 {
912 932 spa_t *spa = zio->io_spa;
913 933 vdev_t *vd = zio->io_vd;
914 934 vdev_probe_stats_t *vps = zio->io_private;
915 935
916 936 ASSERT(vd->vdev_probe_zio != NULL);
917 937
918 938 if (zio->io_type == ZIO_TYPE_READ) {
919 939 if (zio->io_error == 0)
920 940 vps->vps_readable = 1;
921 941 if (zio->io_error == 0 && spa_writeable(spa)) {
922 942 zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
923 943 zio->io_offset, zio->io_size, zio->io_data,
924 944 ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
925 945 ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
926 946 } else {
927 947 zio_buf_free(zio->io_data, zio->io_size);
928 948 }
929 949 } else if (zio->io_type == ZIO_TYPE_WRITE) {
930 950 if (zio->io_error == 0)
931 951 vps->vps_writeable = 1;
932 952 zio_buf_free(zio->io_data, zio->io_size);
933 953 } else if (zio->io_type == ZIO_TYPE_NULL) {
934 954 zio_t *pio;
935 955
936 956 vd->vdev_cant_read |= !vps->vps_readable;
937 957 vd->vdev_cant_write |= !vps->vps_writeable;
938 958
939 959 if (vdev_readable(vd) &&
940 960 (vdev_writeable(vd) || !spa_writeable(spa))) {
941 961 zio->io_error = 0;
942 962 } else {
943 963 ASSERT(zio->io_error != 0);
944 964 zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
945 965 spa, vd, NULL, 0, 0);
946 966 zio->io_error = SET_ERROR(ENXIO);
947 967 }
948 968
949 969 mutex_enter(&vd->vdev_probe_lock);
950 970 ASSERT(vd->vdev_probe_zio == zio);
951 971 vd->vdev_probe_zio = NULL;
952 972 mutex_exit(&vd->vdev_probe_lock);
953 973
954 974 while ((pio = zio_walk_parents(zio)) != NULL)
955 975 if (!vdev_accessible(vd, pio))
956 976 pio->io_error = SET_ERROR(ENXIO);
957 977
958 978 kmem_free(vps, sizeof (*vps));
959 979 }
960 980 }
961 981
962 982 /*
963 983 * Determine whether this device is accessible.
964 984 *
965 985 * Read and write to several known locations: the pad regions of each
966 986 * vdev label but the first, which we leave alone in case it contains
967 987 * a VTOC.
968 988 */
969 989 zio_t *
970 990 vdev_probe(vdev_t *vd, zio_t *zio)
971 991 {
972 992 spa_t *spa = vd->vdev_spa;
973 993 vdev_probe_stats_t *vps = NULL;
974 994 zio_t *pio;
975 995
976 996 ASSERT(vd->vdev_ops->vdev_op_leaf);
977 997
978 998 /*
979 999 * Don't probe the probe.
980 1000 */
981 1001 if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
982 1002 return (NULL);
983 1003
984 1004 /*
985 1005 * To prevent 'probe storms' when a device fails, we create
986 1006 * just one probe i/o at a time. All zios that want to probe
987 1007 * this vdev will become parents of the probe io.
988 1008 */
989 1009 mutex_enter(&vd->vdev_probe_lock);
990 1010
991 1011 if ((pio = vd->vdev_probe_zio) == NULL) {
992 1012 vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
993 1013
994 1014 vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
995 1015 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
996 1016 ZIO_FLAG_TRYHARD;
997 1017
998 1018 if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
999 1019 /*
1000 1020 * vdev_cant_read and vdev_cant_write can only
1001 1021 * transition from TRUE to FALSE when we have the
1002 1022 * SCL_ZIO lock as writer; otherwise they can only
1003 1023 * transition from FALSE to TRUE. This ensures that
1004 1024 * any zio looking at these values can assume that
1005 1025 * failures persist for the life of the I/O. That's
1006 1026 * important because when a device has intermittent
1007 1027 * connectivity problems, we want to ensure that
1008 1028 * they're ascribed to the device (ENXIO) and not
1009 1029 * the zio (EIO).
1010 1030 *
1011 1031 * Since we hold SCL_ZIO as writer here, clear both
1012 1032 * values so the probe can reevaluate from first
1013 1033 * principles.
1014 1034 */
1015 1035 vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
1016 1036 vd->vdev_cant_read = B_FALSE;
1017 1037 vd->vdev_cant_write = B_FALSE;
1018 1038 }
1019 1039
1020 1040 vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
1021 1041 vdev_probe_done, vps,
1022 1042 vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
1023 1043
1024 1044 /*
1025 1045 * We can't change the vdev state in this context, so we
1026 1046 * kick off an async task to do it on our behalf.
1027 1047 */
1028 1048 if (zio != NULL) {
1029 1049 vd->vdev_probe_wanted = B_TRUE;
1030 1050 spa_async_request(spa, SPA_ASYNC_PROBE);
1031 1051 }
1032 1052 }
1033 1053
1034 1054 if (zio != NULL)
1035 1055 zio_add_child(zio, pio);
1036 1056
1037 1057 mutex_exit(&vd->vdev_probe_lock);
1038 1058
1039 1059 if (vps == NULL) {
1040 1060 ASSERT(zio != NULL);
1041 1061 return (NULL);
1042 1062 }
1043 1063
1044 1064 for (int l = 1; l < VDEV_LABELS; l++) {
1045 1065 zio_nowait(zio_read_phys(pio, vd,
1046 1066 vdev_label_offset(vd->vdev_psize, l,
1047 1067 offsetof(vdev_label_t, vl_pad2)),
1048 1068 VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
1049 1069 ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
1050 1070 ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
1051 1071 }
1052 1072
1053 1073 if (zio == NULL)
1054 1074 return (pio);
1055 1075
1056 1076 zio_nowait(pio);
1057 1077 return (NULL);
1058 1078 }
1059 1079
1060 1080 static void
1061 1081 vdev_open_child(void *arg)
1062 1082 {
1063 1083 vdev_t *vd = arg;
1064 1084
1065 1085 vd->vdev_open_thread = curthread;
1066 1086 vd->vdev_open_error = vdev_open(vd);
1067 1087 vd->vdev_open_thread = NULL;
1068 1088 }
1069 1089
1070 1090 boolean_t
1071 1091 vdev_uses_zvols(vdev_t *vd)
1072 1092 {
1073 1093 if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
1074 1094 strlen(ZVOL_DIR)) == 0)
1075 1095 return (B_TRUE);
1076 1096 for (int c = 0; c < vd->vdev_children; c++)
1077 1097 if (vdev_uses_zvols(vd->vdev_child[c]))
1078 1098 return (B_TRUE);
1079 1099 return (B_FALSE);
1080 1100 }
1081 1101
1082 1102 void
1083 1103 vdev_open_children(vdev_t *vd)
1084 1104 {
1085 1105 taskq_t *tq;
1086 1106 int children = vd->vdev_children;
1087 1107
1088 1108 /*
1089 1109 * in order to handle pools on top of zvols, do the opens
1090 1110 * in a single thread so that the same thread holds the
1091 1111 * spa_namespace_lock
1092 1112 */
1093 1113 if (vdev_uses_zvols(vd)) {
1094 1114 for (int c = 0; c < children; c++)
1095 1115 vd->vdev_child[c]->vdev_open_error =
1096 1116 vdev_open(vd->vdev_child[c]);
1097 1117 return;
1098 1118 }
1099 1119 tq = taskq_create("vdev_open", children, minclsyspri,
1100 1120 children, children, TASKQ_PREPOPULATE);
1101 1121
1102 1122 for (int c = 0; c < children; c++)
1103 1123 VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
1104 1124 TQ_SLEEP) != NULL);
1105 1125
1106 1126 taskq_destroy(tq);
1107 1127 }
1108 1128
1109 1129 /*
1110 1130 * Prepare a virtual device for access.
1111 1131 */
1112 1132 int
1113 1133 vdev_open(vdev_t *vd)
1114 1134 {
1115 1135 spa_t *spa = vd->vdev_spa;
1116 1136 int error;
1117 1137 uint64_t osize = 0;
1118 1138 uint64_t max_osize = 0;
1119 1139 uint64_t asize, max_asize, psize;
1120 1140 uint64_t ashift = 0;
1121 1141
1122 1142 ASSERT(vd->vdev_open_thread == curthread ||
1123 1143 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1124 1144 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
1125 1145 vd->vdev_state == VDEV_STATE_CANT_OPEN ||
1126 1146 vd->vdev_state == VDEV_STATE_OFFLINE);
1127 1147
1128 1148 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
1129 1149 vd->vdev_cant_read = B_FALSE;
1130 1150 vd->vdev_cant_write = B_FALSE;
1131 1151 vd->vdev_min_asize = vdev_get_min_asize(vd);
1132 1152
1133 1153 /*
1134 1154 * If this vdev is not removed, check its fault status. If it's
1135 1155 * faulted, bail out of the open.
1136 1156 */
1137 1157 if (!vd->vdev_removed && vd->vdev_faulted) {
1138 1158 ASSERT(vd->vdev_children == 0);
1139 1159 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
1140 1160 vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
1141 1161 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1142 1162 vd->vdev_label_aux);
1143 1163 return (SET_ERROR(ENXIO));
1144 1164 } else if (vd->vdev_offline) {
1145 1165 ASSERT(vd->vdev_children == 0);
1146 1166 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
1147 1167 return (SET_ERROR(ENXIO));
1148 1168 }
1149 1169
1150 1170 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);
1151 1171
1152 1172 /*
1153 1173 * Reset the vdev_reopening flag so that we actually close
1154 1174 * the vdev on error.
1155 1175 */
1156 1176 vd->vdev_reopening = B_FALSE;
1157 1177 if (zio_injection_enabled && error == 0)
1158 1178 error = zio_handle_device_injection(vd, NULL, ENXIO);
1159 1179
1160 1180 if (error) {
1161 1181 if (vd->vdev_removed &&
1162 1182 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
1163 1183 vd->vdev_removed = B_FALSE;
1164 1184
1165 1185 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1166 1186 vd->vdev_stat.vs_aux);
1167 1187 return (error);
1168 1188 }
1169 1189
1170 1190 vd->vdev_removed = B_FALSE;
1171 1191
1172 1192 /*
1173 1193 * Recheck the faulted flag now that we have confirmed that
1174 1194 * the vdev is accessible. If we're faulted, bail.
1175 1195 */
1176 1196 if (vd->vdev_faulted) {
1177 1197 ASSERT(vd->vdev_children == 0);
1178 1198 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
1179 1199 vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
1180 1200 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1181 1201 vd->vdev_label_aux);
1182 1202 return (SET_ERROR(ENXIO));
1183 1203 }
1184 1204
1185 1205 if (vd->vdev_degraded) {
1186 1206 ASSERT(vd->vdev_children == 0);
1187 1207 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
1188 1208 VDEV_AUX_ERR_EXCEEDED);
1189 1209 } else {
1190 1210 vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
1191 1211 }
1192 1212
1193 1213 /*
1194 1214 * For hole or missing vdevs we just return success.
1195 1215 */
1196 1216 if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
1197 1217 return (0);
1198 1218
1199 1219 for (int c = 0; c < vd->vdev_children; c++) {
1200 1220 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
1201 1221 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
1202 1222 VDEV_AUX_NONE);
1203 1223 break;
1204 1224 }
1205 1225 }
1206 1226
1207 1227 osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
1208 1228 max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
1209 1229
1210 1230 if (vd->vdev_children == 0) {
1211 1231 if (osize < SPA_MINDEVSIZE) {
1212 1232 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1213 1233 VDEV_AUX_TOO_SMALL);
1214 1234 return (SET_ERROR(EOVERFLOW));
1215 1235 }
1216 1236 psize = osize;
1217 1237 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
1218 1238 max_asize = max_osize - (VDEV_LABEL_START_SIZE +
1219 1239 VDEV_LABEL_END_SIZE);
1220 1240 } else {
1221 1241 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
1222 1242 (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
1223 1243 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1224 1244 VDEV_AUX_TOO_SMALL);
1225 1245 return (SET_ERROR(EOVERFLOW));
1226 1246 }
1227 1247 psize = 0;
1228 1248 asize = osize;
1229 1249 max_asize = max_osize;
1230 1250 }
1231 1251
1232 1252 vd->vdev_psize = psize;
1233 1253
1234 1254 /*
1235 1255 * Make sure the allocatable size hasn't shrunk.
1236 1256 */
1237 1257 if (asize < vd->vdev_min_asize) {
1238 1258 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1239 1259 VDEV_AUX_BAD_LABEL);
1240 1260 return (SET_ERROR(EINVAL));
1241 1261 }
1242 1262
1243 1263 if (vd->vdev_asize == 0) {
1244 1264 /*
1245 1265 * This is the first-ever open, so use the computed values.
1246 1266 * For testing purposes, a higher ashift can be requested.
1247 1267 */
1248 1268 vd->vdev_asize = asize;
1249 1269 vd->vdev_max_asize = max_asize;
1250 1270 vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
1251 1271 } else {
1252 1272 /*
1253 1273 * Detect if the alignment requirement has increased.
1254 1274 * We don't want to make the pool unavailable, just
1255 1275 * issue a warning instead.
1256 1276 */
1257 1277 if (ashift > vd->vdev_top->vdev_ashift &&
1258 1278 vd->vdev_ops->vdev_op_leaf) {
1259 1279 cmn_err(CE_WARN,
1260 1280 "Disk, '%s', has a block alignment that is "
1261 1281 "larger than the pool's alignment\n",
1262 1282 vd->vdev_path);
1263 1283 }
1264 1284 vd->vdev_max_asize = max_asize;
1265 1285 }
1266 1286
1267 1287 /*
1268 1288 * If all children are healthy and the asize has increased,
1269 1289 * then we've experienced dynamic LUN growth. If automatic
1270 1290 * expansion is enabled then use the additional space.
1271 1291 */
1272 1292 if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
1273 1293 (vd->vdev_expanding || spa->spa_autoexpand))
1274 1294 vd->vdev_asize = asize;
1275 1295
1276 1296 vdev_set_min_asize(vd);
1277 1297
1278 1298 /*
1279 1299 * Ensure we can issue some IO before declaring the
1280 1300 * vdev open for business.
1281 1301 */
1282 1302 if (vd->vdev_ops->vdev_op_leaf &&
1283 1303 (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
1284 1304 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1285 1305 VDEV_AUX_ERR_EXCEEDED);
1286 1306 return (error);
1287 1307 }
1288 1308
1289 1309 /*
1290 1310 * If a leaf vdev has a DTL, and seems healthy, then kick off a
1291 1311 * resilver. But don't do this if we are doing a reopen for a scrub,
1292 1312 * since this would just restart the scrub we are already doing.
1293 1313 */
1294 1314 if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
1295 1315 vdev_resilver_needed(vd, NULL, NULL))
1296 1316 spa_async_request(spa, SPA_ASYNC_RESILVER);
1297 1317
1298 1318 return (0);
1299 1319 }
1300 1320
1301 1321 /*
1302 1322 * Called once the vdevs are all opened, this routine validates the label
1303 1323 * contents. This needs to be done before vdev_load() so that we don't
1304 1324 * inadvertently do repair I/Os to the wrong device.
1305 1325 *
1306 1326 * If 'strict' is false ignore the spa guid check. This is necessary because
1307 1327 * if the machine crashed during a re-guid the new guid might have been written
1308 1328 * to all of the vdev labels, but not the cached config. The strict check
1309 1329 * will be performed when the pool is opened again using the mos config.
1310 1330 *
1311 1331 * This function will only return failure if one of the vdevs indicates that it
1312 1332 * has since been destroyed or exported. This is only possible if
1313 1333 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
1314 1334 * will be updated but the function will return 0.
1315 1335 */
1316 1336 int
1317 1337 vdev_validate(vdev_t *vd, boolean_t strict)
1318 1338 {
1319 1339 spa_t *spa = vd->vdev_spa;
1320 1340 nvlist_t *label;
1321 1341 uint64_t guid = 0, top_guid;
1322 1342 uint64_t state;
1323 1343
1324 1344 for (int c = 0; c < vd->vdev_children; c++)
1325 1345 if (vdev_validate(vd->vdev_child[c], strict) != 0)
1326 1346 return (SET_ERROR(EBADF));
1327 1347
1328 1348 /*
1329 1349 * If the device has already failed, or was marked offline, don't do
1330 1350 * any further validation. Otherwise, label I/O will fail and we will
1331 1351 * overwrite the previous state.
1332 1352 */
1333 1353 if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
1334 1354 uint64_t aux_guid = 0;
1335 1355 nvlist_t *nvl;
1336 1356 uint64_t txg = spa_last_synced_txg(spa) != 0 ?
1337 1357 spa_last_synced_txg(spa) : -1ULL;
1338 1358
1339 1359 if ((label = vdev_label_read_config(vd, txg)) == NULL) {
1340 1360 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1341 1361 VDEV_AUX_BAD_LABEL);
1342 1362 return (0);
1343 1363 }
1344 1364
1345 1365 /*
1346 1366 * Determine if this vdev has been split off into another
1347 1367 * pool. If so, then refuse to open it.
1348 1368 */
1349 1369 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
1350 1370 &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
1351 1371 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1352 1372 VDEV_AUX_SPLIT_POOL);
1353 1373 nvlist_free(label);
1354 1374 return (0);
1355 1375 }
1356 1376
1357 1377 if (strict && (nvlist_lookup_uint64(label,
1358 1378 ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
1359 1379 guid != spa_guid(spa))) {
1360 1380 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1361 1381 VDEV_AUX_CORRUPT_DATA);
1362 1382 nvlist_free(label);
1363 1383 return (0);
1364 1384 }
1365 1385
1366 1386 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
1367 1387 != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
1368 1388 &aux_guid) != 0)
1369 1389 aux_guid = 0;
1370 1390
1371 1391 /*
1372 1392 * If this vdev just became a top-level vdev because its
1373 1393 * sibling was detached, it will have adopted the parent's
1374 1394 * vdev guid -- but the label may or may not be on disk yet.
1375 1395 * Fortunately, either version of the label will have the
1376 1396 * same top guid, so if we're a top-level vdev, we can
1377 1397 * safely compare to that instead.
1378 1398 *
1379 1399 * If we split this vdev off instead, then we also check the
1380 1400 * original pool's guid. We don't want to consider the vdev
1381 1401 * corrupt if it is partway through a split operation.
1382 1402 */
1383 1403 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
1384 1404 &guid) != 0 ||
1385 1405 nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
1386 1406 &top_guid) != 0 ||
1387 1407 ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
1388 1408 (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
1389 1409 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1390 1410 VDEV_AUX_CORRUPT_DATA);
1391 1411 nvlist_free(label);
1392 1412 return (0);
1393 1413 }
1394 1414
1395 1415 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
1396 1416 &state) != 0) {
1397 1417 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1398 1418 VDEV_AUX_CORRUPT_DATA);
1399 1419 nvlist_free(label);
1400 1420 return (0);
1401 1421 }
1402 1422
1403 1423 nvlist_free(label);
1404 1424
1405 1425 /*
1406 1426 * If this is a verbatim import, no need to check the
1407 1427 * state of the pool.
1408 1428 */
1409 1429 if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
1410 1430 spa_load_state(spa) == SPA_LOAD_OPEN &&
1411 1431 state != POOL_STATE_ACTIVE)
1412 1432 return (SET_ERROR(EBADF));
1413 1433
1414 1434 /*
1415 1435 * If we were able to open and validate a vdev that was
1416 1436 * previously marked permanently unavailable, clear that state
1417 1437 * now.
1418 1438 */
1419 1439 if (vd->vdev_not_present)
1420 1440 vd->vdev_not_present = 0;
1421 1441 }
1422 1442
1423 1443 return (0);
1424 1444 }
1425 1445
1426 1446 /*
1427 1447 * Close a virtual device.
1428 1448 */
1429 1449 void
1430 1450 vdev_close(vdev_t *vd)
1431 1451 {
1432 1452 spa_t *spa = vd->vdev_spa;
1433 1453 vdev_t *pvd = vd->vdev_parent;
1434 1454
1435 1455 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1436 1456
1437 1457 /*
1438 1458 * If our parent is reopening, then we are as well, unless we are
1439 1459 * going offline.
1440 1460 */
1441 1461 if (pvd != NULL && pvd->vdev_reopening)
1442 1462 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
1443 1463
1444 1464 vd->vdev_ops->vdev_op_close(vd);
1445 1465
1446 1466 vdev_cache_purge(vd);
1447 1467
1448 1468 /*
1449 1469 * We record the previous state before we close it, so that if we are
1450 1470 * doing a reopen(), we don't generate FMA ereports if we notice that
1451 1471 * it's still faulted.
1452 1472 */
1453 1473 vd->vdev_prevstate = vd->vdev_state;
1454 1474
1455 1475 if (vd->vdev_offline)
1456 1476 vd->vdev_state = VDEV_STATE_OFFLINE;
1457 1477 else
1458 1478 vd->vdev_state = VDEV_STATE_CLOSED;
1459 1479 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
1460 1480 }
1461 1481
1462 1482 void
1463 1483 vdev_hold(vdev_t *vd)
1464 1484 {
1465 1485 spa_t *spa = vd->vdev_spa;
1466 1486
1467 1487 ASSERT(spa_is_root(spa));
1468 1488 if (spa->spa_state == POOL_STATE_UNINITIALIZED)
1469 1489 return;
1470 1490
1471 1491 for (int c = 0; c < vd->vdev_children; c++)
1472 1492 vdev_hold(vd->vdev_child[c]);
1473 1493
1474 1494 if (vd->vdev_ops->vdev_op_leaf)
1475 1495 vd->vdev_ops->vdev_op_hold(vd);
1476 1496 }
1477 1497
1478 1498 void
1479 1499 vdev_rele(vdev_t *vd)
1480 1500 {
1481 1501 spa_t *spa = vd->vdev_spa;
1482 1502
1483 1503 ASSERT(spa_is_root(spa));
1484 1504 for (int c = 0; c < vd->vdev_children; c++)
1485 1505 vdev_rele(vd->vdev_child[c]);
1486 1506
1487 1507 if (vd->vdev_ops->vdev_op_leaf)
1488 1508 vd->vdev_ops->vdev_op_rele(vd);
1489 1509 }
1490 1510
1491 1511 /*
1492 1512 * Reopen all interior vdevs and any unopened leaves. We don't actually
1493 1513 * reopen leaf vdevs which had previously been opened as they might deadlock
1494 1514 * on the spa_config_lock. Instead we only obtain the leaf's physical size.
1495 1515 * If the leaf has never been opened then open it, as usual.
1496 1516 */
1497 1517 void
1498 1518 vdev_reopen(vdev_t *vd)
1499 1519 {
1500 1520 spa_t *spa = vd->vdev_spa;
1501 1521
1502 1522 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1503 1523
1504 1524 /* set the reopening flag unless we're taking the vdev offline */
1505 1525 vd->vdev_reopening = !vd->vdev_offline;
1506 1526 vdev_close(vd);
1507 1527 (void) vdev_open(vd);
1508 1528
1509 1529 /*
1510 1530 * Call vdev_validate() here to make sure we have the same device.
1511 1531 * Otherwise, a device with an invalid label could be successfully
1512 1532 * opened in response to vdev_reopen().
1513 1533 */
1514 1534 if (vd->vdev_aux) {
1515 1535 (void) vdev_validate_aux(vd);
1516 1536 if (vdev_readable(vd) && vdev_writeable(vd) &&
1517 1537 vd->vdev_aux == &spa->spa_l2cache &&
1518 1538 !l2arc_vdev_present(vd))
1519 1539 l2arc_add_vdev(spa, vd);
1520 1540 } else {
1521 1541 (void) vdev_validate(vd, B_TRUE);
1522 1542 }
1523 1543
1524 1544 /*
1525 1545 * Reassess parent vdev's health.
1526 1546 */
1527 1547 vdev_propagate_state(vd);
1528 1548 }
1529 1549
1530 1550 int
1531 1551 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
1532 1552 {
1533 1553 int error;
1534 1554
1535 1555 /*
1536 1556 * Normally, partial opens (e.g. of a mirror) are allowed.
1537 1557 * For a create, however, we want to fail the request if
1538 1558 * there are any components we can't open.
1539 1559 */
1540 1560 error = vdev_open(vd);
1541 1561
1542 1562 if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
1543 1563 vdev_close(vd);
1544 1564 return (error ? error : ENXIO);
1545 1565 }
1546 1566
1547 1567 /*
1548 1568 * Recursively load DTLs and initialize all labels.
1549 1569 */
1550 1570 if ((error = vdev_dtl_load(vd)) != 0 ||
1551 1571 (error = vdev_label_init(vd, txg, isreplacing ?
1552 1572 VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
1553 1573 vdev_close(vd);
1554 1574 return (error);
1555 1575 }
1556 1576
1557 1577 return (0);
1558 1578 }
1559 1579
1560 1580 void
1561 1581 vdev_metaslab_set_size(vdev_t *vd)
1562 1582 {
1563 1583 /*
1564 1584 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
1565 1585 */
1566 1586 vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
1567 1587 vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
1568 1588 }
1569 1589
1570 1590 void
1571 1591 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
1572 1592 {
1573 1593 ASSERT(vd == vd->vdev_top);
1574 1594 ASSERT(!vd->vdev_ishole);
1575 1595 ASSERT(ISP2(flags));
1576 1596 ASSERT(spa_writeable(vd->vdev_spa));
1577 1597
1578 1598 if (flags & VDD_METASLAB)
1579 1599 (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
1580 1600
1581 1601 if (flags & VDD_DTL)
1582 1602 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
1583 1603
1584 1604 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
1585 1605 }
1586 1606
1587 1607 void
1588 1608 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
1589 1609 {
1590 1610 for (int c = 0; c < vd->vdev_children; c++)
1591 1611 vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
1592 1612
1593 1613 if (vd->vdev_ops->vdev_op_leaf)
1594 1614 vdev_dirty(vd->vdev_top, flags, vd, txg);
1595 1615 }
1596 1616
1597 1617 /*
1598 1618 * DTLs.
1599 1619 *
1600 1620 * A vdev's DTL (dirty time log) is the set of transaction groups for which
1601 1621 * the vdev has less than perfect replication. There are four kinds of DTL:
1602 1622 *
1603 1623 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
1604 1624 *
1605 1625 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
1606 1626 *
1607 1627 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
1608 1628 * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
1609 1629 * txgs that was scrubbed.
1610 1630 *
1611 1631 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
1612 1632 * persistent errors or just some device being offline.
1613 1633 * Unlike the other three, the DTL_OUTAGE map is not generally
1614 1634 * maintained; it's only computed when needed, typically to
1615 1635 * determine whether a device can be detached.
1616 1636 *
1617 1637 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
1618 1638 * either has the data or it doesn't.
1619 1639 *
1620 1640 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
1621 1641 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
1622 1642 * if any child is less than fully replicated, then so is its parent.
1623 1643 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
1624 1644 * comprising only those txgs which appear in 'maxfaults' or more children;
1625 1645 * those are the txgs we don't have enough replication to read. For example,
1626 1646 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
1627 1647 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
1628 1648 * two child DTL_MISSING maps.
1629 1649 *
1630 1650 * It should be clear from the above that to compute the DTLs and outage maps
1631 1651 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
1632 1652 * Therefore, that is all we keep on disk. When loading the pool, or after
1633 1653 * a configuration change, we generate all other DTLs from first principles.
1634 1654 */
1635 1655 void
1636 1656 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1637 1657 {
1638 1658 range_tree_t *rt = vd->vdev_dtl[t];
1639 1659
1640 1660 ASSERT(t < DTL_TYPES);
1641 1661 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1642 1662 ASSERT(spa_writeable(vd->vdev_spa));
1643 1663
1644 1664 mutex_enter(rt->rt_lock);
1645 1665 if (!range_tree_contains(rt, txg, size))
1646 1666 range_tree_add(rt, txg, size);
1647 1667 mutex_exit(rt->rt_lock);
1648 1668 }
1649 1669
1650 1670 boolean_t
1651 1671 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1652 1672 {
1653 1673 range_tree_t *rt = vd->vdev_dtl[t];
1654 1674 boolean_t dirty = B_FALSE;
1655 1675
1656 1676 ASSERT(t < DTL_TYPES);
1657 1677 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1658 1678
1659 1679 mutex_enter(rt->rt_lock);
1660 1680 if (range_tree_space(rt) != 0)
1661 1681 dirty = range_tree_contains(rt, txg, size);
1662 1682 mutex_exit(rt->rt_lock);
1663 1683
1664 1684 return (dirty);
1665 1685 }
1666 1686
1667 1687 boolean_t
1668 1688 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
1669 1689 {
1670 1690 range_tree_t *rt = vd->vdev_dtl[t];
1671 1691 boolean_t empty;
1672 1692
1673 1693 mutex_enter(rt->rt_lock);
1674 1694 empty = (range_tree_space(rt) == 0);
1675 1695 mutex_exit(rt->rt_lock);
1676 1696
1677 1697 return (empty);
1678 1698 }
1679 1699
1680 1700 /*
1681 1701 * Returns the lowest txg in the DTL range.
1682 1702 */
1683 1703 static uint64_t
1684 1704 vdev_dtl_min(vdev_t *vd)
1685 1705 {
1686 1706 range_seg_t *rs;
1687 1707
1688 1708 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1689 1709 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
1690 1710 ASSERT0(vd->vdev_children);
1691 1711
1692 1712 rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
1693 1713 return (rs->rs_start - 1);
1694 1714 }
1695 1715
1696 1716 /*
1697 1717 * Returns the highest txg in the DTL.
1698 1718 */
1699 1719 static uint64_t
1700 1720 vdev_dtl_max(vdev_t *vd)
1701 1721 {
1702 1722 range_seg_t *rs;
1703 1723
1704 1724 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1705 1725 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
1706 1726 ASSERT0(vd->vdev_children);
1707 1727
1708 1728 rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
1709 1729 return (rs->rs_end);
1710 1730 }
1711 1731
1712 1732 /*
1713 1733 * Determine if a resilvering vdev should remove any DTL entries from
1714 1734 * its range. If the vdev was resilvering for the entire duration of the
1715 1735 * scan then it should excise that range from its DTLs. Otherwise, this
1716 1736 * vdev is considered partially resilvered and should leave its DTL
1717 1737 * entries intact. The comment in vdev_dtl_reassess() describes how we
1718 1738 * excise the DTLs.
1719 1739 */
1720 1740 static boolean_t
1721 1741 vdev_dtl_should_excise(vdev_t *vd)
1722 1742 {
1723 1743 spa_t *spa = vd->vdev_spa;
1724 1744 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1725 1745
1726 1746 ASSERT0(scn->scn_phys.scn_errors);
1727 1747 ASSERT0(vd->vdev_children);
1728 1748
1729 1749 if (vd->vdev_resilver_txg == 0 ||
1730 1750 range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
1731 1751 return (B_TRUE);
1732 1752
1733 1753 /*
1734 1754 	 * When a resilver is initiated, the scan will assign the scn_max_txg
1735 1755 * value to the highest txg value that exists in all DTLs. If this
1736 1756 * device's max DTL is not part of this scan (i.e. it is not in
1737 1757 	 * the range (scn_min_txg, scn_max_txg]), then it is not eligible
1738 1758 * for excision.
1739 1759 */
1740 1760 if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
1741 1761 ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
1742 1762 ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
1743 1763 ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
1744 1764 return (B_TRUE);
1745 1765 }
1746 1766 return (B_FALSE);
1747 1767 }
1748 1768
1749 1769 /*
1750 1770 * Reassess DTLs after a config change or scrub completion.
1751 1771 */
1752 1772 void
1753 1773 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
1754 1774 {
1755 1775 spa_t *spa = vd->vdev_spa;
1756 1776 avl_tree_t reftree;
1757 1777 int minref;
1758 1778
1759 1779 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1760 1780
1761 1781 for (int c = 0; c < vd->vdev_children; c++)
1762 1782 vdev_dtl_reassess(vd->vdev_child[c], txg,
1763 1783 scrub_txg, scrub_done);
1764 1784
1765 1785 if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
1766 1786 return;
1767 1787
1768 1788 if (vd->vdev_ops->vdev_op_leaf) {
1769 1789 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1770 1790
1771 1791 mutex_enter(&vd->vdev_dtl_lock);
1772 1792
1773 1793 /*
1774 1794 * If we've completed a scan cleanly then determine
1775 1795 * if this vdev should remove any DTLs. We only want to
1776 1796 * excise regions on vdevs that were available during
1777 1797 * the entire duration of this scan.
1778 1798 */
1779 1799 if (scrub_txg != 0 &&
1780 1800 (spa->spa_scrub_started ||
1781 1801 (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
1782 1802 vdev_dtl_should_excise(vd)) {
1783 1803 /*
1784 1804 * We completed a scrub up to scrub_txg. If we
1785 1805 * did it without rebooting, then the scrub dtl
1786 1806 * will be valid, so excise the old region and
1787 1807 			 * fold in the scrub dtl. Otherwise (e.g. if there
1788 1808 			 * was an error), leave the dtl as-is.
1789 1809 *
1790 1810 			 * There's a little trick here: to excise the beginning
1791 1811 * of the DTL_MISSING map, we put it into a reference
1792 1812 * tree and then add a segment with refcnt -1 that
1793 1813 * covers the range [0, scrub_txg). This means
1794 1814 * that each txg in that range has refcnt -1 or 0.
1795 1815 * We then add DTL_SCRUB with a refcnt of 2, so that
1796 1816 * entries in the range [0, scrub_txg) will have a
1797 1817 * positive refcnt -- either 1 or 2. We then convert
1798 1818 * the reference tree into the new DTL_MISSING map.
1799 1819 */
1800 1820 space_reftree_create(&reftree);
1801 1821 space_reftree_add_map(&reftree,
1802 1822 vd->vdev_dtl[DTL_MISSING], 1);
1803 1823 space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
1804 1824 space_reftree_add_map(&reftree,
1805 1825 vd->vdev_dtl[DTL_SCRUB], 2);
1806 1826 space_reftree_generate_map(&reftree,
1807 1827 vd->vdev_dtl[DTL_MISSING], 1);
1808 1828 space_reftree_destroy(&reftree);
1809 1829 }
1810 1830 range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
1811 1831 range_tree_walk(vd->vdev_dtl[DTL_MISSING],
1812 1832 range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
1813 1833 if (scrub_done)
1814 1834 range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
1815 1835 range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
1816 1836 if (!vdev_readable(vd))
1817 1837 range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
1818 1838 else
1819 1839 range_tree_walk(vd->vdev_dtl[DTL_MISSING],
1820 1840 range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
1821 1841
1822 1842 /*
1823 1843 * If the vdev was resilvering and no longer has any
1824 1844 * DTLs then reset its resilvering flag.
1825 1845 */
1826 1846 if (vd->vdev_resilver_txg != 0 &&
1827 1847 range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
1828 1848 range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0)
1829 1849 vd->vdev_resilver_txg = 0;
1830 1850
1831 1851 mutex_exit(&vd->vdev_dtl_lock);
1832 1852
1833 1853 if (txg != 0)
1834 1854 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
1835 1855 return;
1836 1856 }
1837 1857
1838 1858 mutex_enter(&vd->vdev_dtl_lock);
1839 1859 for (int t = 0; t < DTL_TYPES; t++) {
1840 1860 /* account for child's outage in parent's missing map */
1841 1861 int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
1842 1862 if (t == DTL_SCRUB)
1843 1863 continue; /* leaf vdevs only */
1844 1864 if (t == DTL_PARTIAL)
1845 1865 minref = 1; /* i.e. non-zero */
1846 1866 else if (vd->vdev_nparity != 0)
1847 1867 minref = vd->vdev_nparity + 1; /* RAID-Z */
1848 1868 else
1849 1869 minref = vd->vdev_children; /* any kind of mirror */
1850 1870 space_reftree_create(&reftree);
1851 1871 for (int c = 0; c < vd->vdev_children; c++) {
1852 1872 vdev_t *cvd = vd->vdev_child[c];
1853 1873 mutex_enter(&cvd->vdev_dtl_lock);
1854 1874 space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
1855 1875 mutex_exit(&cvd->vdev_dtl_lock);
1856 1876 }
1857 1877 space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
1858 1878 space_reftree_destroy(&reftree);
1859 1879 }
1860 1880 mutex_exit(&vd->vdev_dtl_lock);
1861 1881 }
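
The reference-tree arithmetic described in the comment inside vdev_dtl_reassess() (+1 for the old DTL_MISSING, -1 over [0, scrub_txg), +2 for DTL_SCRUB, keep refcnt >= 1) can be sanity-checked with the standalone sketch below. It is an illustration only; the arrays stand in for range trees and the names and contents are invented.

/*
 * Illustrative sketch only.  Models the refcount trick used to excise
 * the scrubbed portion of DTL_MISSING:
 *	+1 for txgs in the old DTL_MISSING
 *	-1 for txgs in [0, scrub_txg)
 *	+2 for txgs in DTL_SCRUB (txgs the scrub could not repair)
 * The new DTL_MISSING is every txg whose refcount ends up >= 1.
 */
#include <stdio.h>

#define	MAX_TXG		12

int
main(void)
{
	int missing[MAX_TXG] = { 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0 };
	int scrub[MAX_TXG]   = { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
	int scrub_txg = 6;			/* scrub covered txgs [0, 6) */

	for (int t = 0; t < MAX_TXG; t++) {
		int refcnt = 0;

		if (missing[t])
			refcnt += 1;		/* old DTL_MISSING */
		if (t < scrub_txg)
			refcnt -= 1;		/* scrubbed range */
		if (scrub[t])
			refcnt += 2;		/* still unrepaired */

		/*
		 * txgs 2 and 4 drop out (scrubbed and repaired); txg 3
		 * stays (unrepaired); txgs 7 and 8 stay (never scrubbed).
		 */
		if (refcnt >= 1)
			printf("txg %d stays in DTL_MISSING (refcnt %d)\n",
			    t, refcnt);
	}
	return (0);
}
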
1862 1882
1863 1883 int
1864 1884 vdev_dtl_load(vdev_t *vd)
1865 1885 {
1866 1886 spa_t *spa = vd->vdev_spa;
1867 1887 objset_t *mos = spa->spa_meta_objset;
1868 1888 int error = 0;
1869 1889
1870 1890 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
1871 1891 ASSERT(!vd->vdev_ishole);
1872 1892
1873 1893 error = space_map_open(&vd->vdev_dtl_sm, mos,
1874 1894 vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
1875 1895 if (error)
1876 1896 return (error);
1877 1897 ASSERT(vd->vdev_dtl_sm != NULL);
1878 1898
1879 1899 mutex_enter(&vd->vdev_dtl_lock);
1880 1900
1881 1901 /*
1882 1902 * Now that we've opened the space_map we need to update
1883 1903 * the in-core DTL.
1884 1904 */
1885 1905 space_map_update(vd->vdev_dtl_sm);
1886 1906
1887 1907 error = space_map_load(vd->vdev_dtl_sm,
1888 1908 vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
1889 1909 mutex_exit(&vd->vdev_dtl_lock);
1890 1910
1891 1911 return (error);
1892 1912 }
1893 1913
1894 1914 for (int c = 0; c < vd->vdev_children; c++) {
1895 1915 error = vdev_dtl_load(vd->vdev_child[c]);
1896 1916 if (error != 0)
1897 1917 break;
1898 1918 }
1899 1919
1900 1920 return (error);
1901 1921 }
1902 1922
1903 1923 void
1904 1924 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
1905 1925 {
1906 1926 spa_t *spa = vd->vdev_spa;
1907 1927 range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
1908 1928 objset_t *mos = spa->spa_meta_objset;
1909 1929 range_tree_t *rtsync;
1910 1930 kmutex_t rtlock;
1911 1931 dmu_tx_t *tx;
1912 1932 uint64_t object = space_map_object(vd->vdev_dtl_sm);
1913 1933
1914 1934 ASSERT(!vd->vdev_ishole);
1915 1935 ASSERT(vd->vdev_ops->vdev_op_leaf);
1916 1936
1917 1937 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1918 1938
1919 1939 if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
1920 1940 mutex_enter(&vd->vdev_dtl_lock);
1921 1941 space_map_free(vd->vdev_dtl_sm, tx);
1922 1942 space_map_close(vd->vdev_dtl_sm);
1923 1943 vd->vdev_dtl_sm = NULL;
1924 1944 mutex_exit(&vd->vdev_dtl_lock);
1925 1945 dmu_tx_commit(tx);
1926 1946 return;
1927 1947 }
1928 1948
1929 1949 if (vd->vdev_dtl_sm == NULL) {
1930 1950 uint64_t new_object;
1931 1951
1932 1952 new_object = space_map_alloc(mos, tx);
1933 1953 VERIFY3U(new_object, !=, 0);
1934 1954
1935 1955 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
1936 1956 0, -1ULL, 0, &vd->vdev_dtl_lock));
1937 1957 ASSERT(vd->vdev_dtl_sm != NULL);
1938 1958 }
1939 1959
1940 1960 mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);
1941 1961
1942 1962 rtsync = range_tree_create(NULL, NULL, &rtlock);
1943 1963
1944 1964 mutex_enter(&rtlock);
1945 1965
1946 1966 mutex_enter(&vd->vdev_dtl_lock);
1947 1967 range_tree_walk(rt, range_tree_add, rtsync);
1948 1968 mutex_exit(&vd->vdev_dtl_lock);
1949 1969
1950 1970 space_map_truncate(vd->vdev_dtl_sm, tx);
1951 1971 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
1952 1972 range_tree_vacate(rtsync, NULL, NULL);
1953 1973
1954 1974 range_tree_destroy(rtsync);
1955 1975
1956 1976 mutex_exit(&rtlock);
1957 1977 mutex_destroy(&rtlock);
1958 1978
1959 1979 /*
1960 1980 * If the object for the space map has changed then dirty
1961 1981 * the top level so that we update the config.
1962 1982 */
1963 1983 if (object != space_map_object(vd->vdev_dtl_sm)) {
1964 1984 zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
1965 1985 "new object %llu", txg, spa_name(spa), object,
1966 1986 space_map_object(vd->vdev_dtl_sm));
1967 1987 vdev_config_dirty(vd->vdev_top);
1968 1988 }
1969 1989
1970 1990 dmu_tx_commit(tx);
1971 1991
1972 1992 mutex_enter(&vd->vdev_dtl_lock);
1973 1993 space_map_update(vd->vdev_dtl_sm);
1974 1994 mutex_exit(&vd->vdev_dtl_lock);
1975 1995 }
1976 1996
1977 1997 /*
1978 1998 * Determine whether the specified vdev can be offlined/detached/removed
1979 1999 * without losing data.
1980 2000 */
1981 2001 boolean_t
1982 2002 vdev_dtl_required(vdev_t *vd)
1983 2003 {
1984 2004 spa_t *spa = vd->vdev_spa;
1985 2005 vdev_t *tvd = vd->vdev_top;
1986 2006 uint8_t cant_read = vd->vdev_cant_read;
1987 2007 boolean_t required;
1988 2008
1989 2009 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1990 2010
1991 2011 if (vd == spa->spa_root_vdev || vd == tvd)
1992 2012 return (B_TRUE);
1993 2013
1994 2014 /*
1995 2015 * Temporarily mark the device as unreadable, and then determine
1996 2016 * whether this results in any DTL outages in the top-level vdev.
1997 2017 * If not, we can safely offline/detach/remove the device.
1998 2018 */
1999 2019 vd->vdev_cant_read = B_TRUE;
2000 2020 vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
2001 2021 required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
2002 2022 vd->vdev_cant_read = cant_read;
2003 2023 vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
2004 2024
2005 2025 if (!required && zio_injection_enabled)
2006 2026 required = !!zio_handle_device_injection(vd, NULL, ECHILD);
2007 2027
2008 2028 return (required);
2009 2029 }
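
The probe performed by vdev_dtl_required() (temporarily mark the device unreadable, recompute the outage state, record the answer, then restore the flag) is sketched below against a toy mirror rule in which an outage exists only when no child is readable. Illustration only; the real computation is vdev_dtl_reassess() on DTL_OUTAGE, and every name here is invented.

/*
 * Illustrative sketch only.  Models the "what if" probe in
 * vdev_dtl_required(): pretend the child is unreadable, see whether
 * that creates an outage, then restore the real state.
 */
#include <stdio.h>

#define	NCHILDREN	3

static int cant_read[NCHILDREN];	/* toy stand-in for vdev_cant_read */

/* Toy mirror rule: an outage exists only if no child is readable. */
static int
outage(void)
{
	for (int c = 0; c < NCHILDREN; c++)
		if (!cant_read[c])
			return (0);
	return (1);
}

static int
child_required(int c)
{
	int saved = cant_read[c];
	int required;

	cant_read[c] = 1;		/* temporarily mark it unreadable */
	required = outage();		/* would its loss cause an outage? */
	cant_read[c] = saved;		/* restore the real state */

	return (required);
}

int
main(void)
{
	cant_read[1] = 1;
	cant_read[2] = 1;		/* two of three children already gone */

	/* Only child 0 is required: losing it would leave no readable copy. */
	for (int c = 0; c < NCHILDREN; c++)
		printf("child %d required: %s\n", c,
		    child_required(c) ? "yes" : "no");
	return (0);
}
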
2010 2030
2011 2031 /*
2012 2032 * Determine if resilver is needed, and if so the txg range.
2013 2033 */
2014 2034 boolean_t
2015 2035 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
2016 2036 {
2017 2037 boolean_t needed = B_FALSE;
2018 2038 uint64_t thismin = UINT64_MAX;
2019 2039 uint64_t thismax = 0;
2020 2040
2021 2041 if (vd->vdev_children == 0) {
2022 2042 mutex_enter(&vd->vdev_dtl_lock);
2023 2043 if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
2024 2044 vdev_writeable(vd)) {
2025 2045
2026 2046 thismin = vdev_dtl_min(vd);
2027 2047 thismax = vdev_dtl_max(vd);
2028 2048 needed = B_TRUE;
2029 2049 }
2030 2050 mutex_exit(&vd->vdev_dtl_lock);
2031 2051 } else {
2032 2052 for (int c = 0; c < vd->vdev_children; c++) {
2033 2053 vdev_t *cvd = vd->vdev_child[c];
2034 2054 uint64_t cmin, cmax;
2035 2055
2036 2056 if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
2037 2057 thismin = MIN(thismin, cmin);
2038 2058 thismax = MAX(thismax, cmax);
2039 2059 needed = B_TRUE;
2040 2060 }
2041 2061 }
2042 2062 }
2043 2063
2044 2064 if (needed && minp) {
2045 2065 *minp = thismin;
2046 2066 *maxp = thismax;
2047 2067 }
2048 2068 return (needed);
2049 2069 }
2050 2070
2051 2071 void
2052 2072 vdev_load(vdev_t *vd)
2053 2073 {
2054 2074 /*
2055 2075 * Recursively load all children.
2056 2076 */
2057 2077 for (int c = 0; c < vd->vdev_children; c++)
2058 2078 vdev_load(vd->vdev_child[c]);
2059 2079
2060 2080 /*
2061 2081 * If this is a top-level vdev, initialize its metaslabs.
2062 2082 */
2063 2083 if (vd == vd->vdev_top && !vd->vdev_ishole &&
2064 2084 (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
2065 2085 vdev_metaslab_init(vd, 0) != 0))
2066 2086 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2067 2087 VDEV_AUX_CORRUPT_DATA);
2068 2088
2069 2089 /*
2070 2090 * If this is a leaf vdev, load its DTL.
2071 2091 */
2072 2092 if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
2073 2093 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2074 2094 VDEV_AUX_CORRUPT_DATA);
2075 2095 }
2076 2096
2077 2097 /*
2078 2098 * The special vdev case is used for hot spares and l2cache devices. Its
2079 2099  * sole purpose is to set the vdev state for the associated vdev. To do this,
2080 2100 * we make sure that we can open the underlying device, then try to read the
2081 2101 * label, and make sure that the label is sane and that it hasn't been
2082 2102 * repurposed to another pool.
2083 2103 */
2084 2104 int
2085 2105 vdev_validate_aux(vdev_t *vd)
2086 2106 {
2087 2107 nvlist_t *label;
2088 2108 uint64_t guid, version;
2089 2109 uint64_t state;
2090 2110
2091 2111 if (!vdev_readable(vd))
2092 2112 return (0);
2093 2113
2094 2114 if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
2095 2115 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2096 2116 VDEV_AUX_CORRUPT_DATA);
2097 2117 return (-1);
2098 2118 }
2099 2119
2100 2120 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
2101 2121 !SPA_VERSION_IS_SUPPORTED(version) ||
2102 2122 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
2103 2123 guid != vd->vdev_guid ||
2104 2124 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
2105 2125 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2106 2126 VDEV_AUX_CORRUPT_DATA);
2107 2127 nvlist_free(label);
2108 2128 return (-1);
2109 2129 }
2110 2130
2111 2131 /*
2112 2132 * We don't actually check the pool state here. If it's in fact in
2113 2133 * use by another pool, we update this fact on the fly when requested.
2114 2134 */
2115 2135 nvlist_free(label);
2116 2136 return (0);
2117 2137 }
2118 2138
2119 2139 void
2120 2140 vdev_remove(vdev_t *vd, uint64_t txg)
2121 2141 {
2122 2142 spa_t *spa = vd->vdev_spa;
2123 2143 objset_t *mos = spa->spa_meta_objset;
2124 2144 dmu_tx_t *tx;
2125 2145
2126 2146 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2127 2147
2128 2148 if (vd->vdev_ms != NULL) {
2129 2149 metaslab_group_t *mg = vd->vdev_mg;
2130 2150
2131 2151 metaslab_group_histogram_verify(mg);
2132 2152 metaslab_class_histogram_verify(mg->mg_class);
2133 2153
2134 2154 for (int m = 0; m < vd->vdev_ms_count; m++) {
2135 2155 metaslab_t *msp = vd->vdev_ms[m];
2136 2156
2137 2157 if (msp == NULL || msp->ms_sm == NULL)
2138 2158 continue;
2139 2159
2140 2160 mutex_enter(&msp->ms_lock);
2141 2161 /*
2142 2162 * If the metaslab was not loaded when the vdev
2143 2163 * was removed then the histogram accounting may
2144 2164 * not be accurate. Update the histogram information
2145 2165 			 * here to ensure that the metaslab group
2146 2166 * and metaslab class are up-to-date.
2147 2167 */
2148 2168 metaslab_group_histogram_remove(mg, msp);
2149 2169
2150 2170 VERIFY0(space_map_allocated(msp->ms_sm));
2151 2171 space_map_free(msp->ms_sm, tx);
2152 2172 space_map_close(msp->ms_sm);
2153 2173 msp->ms_sm = NULL;
2154 2174 mutex_exit(&msp->ms_lock);
2155 2175 }
2156 2176
2157 2177 metaslab_group_histogram_verify(mg);
2158 2178 metaslab_class_histogram_verify(mg->mg_class);
2159 2179 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
2160 2180 ASSERT0(mg->mg_histogram[i]);
2161 2181
2162 2182 }
2163 2183
2164 2184 if (vd->vdev_ms_array) {
2165 2185 (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
2166 2186 vd->vdev_ms_array = 0;
2167 2187 }
2168 2188 dmu_tx_commit(tx);
2169 2189 }
2170 2190
2171 2191 void
2172 2192 vdev_sync_done(vdev_t *vd, uint64_t txg)
2173 2193 {
2174 2194 metaslab_t *msp;
2175 2195 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
2176 2196
2177 2197 ASSERT(!vd->vdev_ishole);
2178 2198
2179 2199 while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
2180 2200 metaslab_sync_done(msp, txg);
2181 2201
2182 2202 if (reassess)
2183 2203 metaslab_sync_reassess(vd->vdev_mg);
2184 2204 }
2185 2205
2186 2206 void
2187 2207 vdev_sync(vdev_t *vd, uint64_t txg)
2188 2208 {
2189 2209 spa_t *spa = vd->vdev_spa;
2190 2210 vdev_t *lvd;
2191 2211 metaslab_t *msp;
2192 2212 dmu_tx_t *tx;
2193 2213
2194 2214 ASSERT(!vd->vdev_ishole);
2195 2215
2196 2216 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
2197 2217 ASSERT(vd == vd->vdev_top);
2198 2218 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2199 2219 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
2200 2220 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
2201 2221 ASSERT(vd->vdev_ms_array != 0);
2202 2222 vdev_config_dirty(vd);
2203 2223 dmu_tx_commit(tx);
2204 2224 }
2205 2225
2206 2226 /*
2207 2227 * Remove the metadata associated with this vdev once it's empty.
2208 2228 */
2209 2229 if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
2210 2230 vdev_remove(vd, txg);
2211 2231
2212 2232 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
2213 2233 metaslab_sync(msp, txg);
2214 2234 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
2215 2235 }
2216 2236
2217 2237 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
2218 2238 vdev_dtl_sync(lvd, txg);
2219 2239
2220 2240 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
2221 2241 }
2222 2242
2223 2243 uint64_t
2224 2244 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
2225 2245 {
2226 2246 return (vd->vdev_ops->vdev_op_asize(vd, psize));
2227 2247 }
2228 2248
2229 2249 /*
2230 2250 * Mark the given vdev faulted. A faulted vdev behaves as if the device could
2231 2251 * not be opened, and no I/O is attempted.
2232 2252 */
2233 2253 int
2234 2254 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
2235 2255 {
2236 2256 vdev_t *vd, *tvd;
2237 2257
2238 2258 spa_vdev_state_enter(spa, SCL_NONE);
2239 2259
2240 2260 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2241 2261 return (spa_vdev_state_exit(spa, NULL, ENODEV));
2242 2262
2243 2263 if (!vd->vdev_ops->vdev_op_leaf)
2244 2264 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2245 2265
2246 2266 tvd = vd->vdev_top;
2247 2267
2248 2268 /*
2249 2269 * We don't directly use the aux state here, but if we do a
2250 2270 * vdev_reopen(), we need this value to be present to remember why we
2251 2271 * were faulted.
2252 2272 */
2253 2273 vd->vdev_label_aux = aux;
2254 2274
2255 2275 /*
2256 2276 * Faulted state takes precedence over degraded.
2257 2277 */
2258 2278 vd->vdev_delayed_close = B_FALSE;
2259 2279 vd->vdev_faulted = 1ULL;
2260 2280 vd->vdev_degraded = 0ULL;
2261 2281 vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
2262 2282
2263 2283 /*
2264 2284 * If this device has the only valid copy of the data, then
2265 2285 * back off and simply mark the vdev as degraded instead.
2266 2286 */
2267 2287 if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
2268 2288 vd->vdev_degraded = 1ULL;
2269 2289 vd->vdev_faulted = 0ULL;
2270 2290
2271 2291 /*
2272 2292 * If we reopen the device and it's not dead, only then do we
2273 2293 * mark it degraded.
2274 2294 */
2275 2295 vdev_reopen(tvd);
2276 2296
2277 2297 if (vdev_readable(vd))
2278 2298 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
2279 2299 }
2280 2300
2281 2301 return (spa_vdev_state_exit(spa, vd, 0));
2282 2302 }
2283 2303
2284 2304 /*
2285 2305 * Mark the given vdev degraded. A degraded vdev is purely an indication to the
2286 2306 * user that something is wrong. The vdev continues to operate as normal as far
2287 2307 * as I/O is concerned.
2288 2308 */
2289 2309 int
2290 2310 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
2291 2311 {
2292 2312 vdev_t *vd;
2293 2313
2294 2314 spa_vdev_state_enter(spa, SCL_NONE);
2295 2315
2296 2316 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2297 2317 return (spa_vdev_state_exit(spa, NULL, ENODEV));
2298 2318
2299 2319 if (!vd->vdev_ops->vdev_op_leaf)
2300 2320 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2301 2321
2302 2322 /*
2303 2323 * If the vdev is already faulted, then don't do anything.
2304 2324 */
2305 2325 if (vd->vdev_faulted || vd->vdev_degraded)
2306 2326 return (spa_vdev_state_exit(spa, NULL, 0));
2307 2327
2308 2328 vd->vdev_degraded = 1ULL;
2309 2329 if (!vdev_is_dead(vd))
2310 2330 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
2311 2331 aux);
2312 2332
2313 2333 return (spa_vdev_state_exit(spa, vd, 0));
2314 2334 }
2315 2335
2316 2336 /*
2317 2337 * Online the given vdev.
2318 2338 *
2319 2339 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached
2320 2340 * spare device should be detached when the device finishes resilvering.
2321 2341 * Second, the online should be treated like a 'test' online case, so no FMA
2322 2342 * events are generated if the device fails to open.
2323 2343 */
2324 2344 int
2325 2345 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
2326 2346 {
2327 2347 vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
2328 2348
2329 2349 spa_vdev_state_enter(spa, SCL_NONE);
2330 2350
2331 2351 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2332 2352 return (spa_vdev_state_exit(spa, NULL, ENODEV));
2333 2353
2334 2354 if (!vd->vdev_ops->vdev_op_leaf)
2335 2355 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2336 2356
2337 2357 tvd = vd->vdev_top;
2338 2358 vd->vdev_offline = B_FALSE;
2339 2359 vd->vdev_tmpoffline = B_FALSE;
2340 2360 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
2341 2361 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
2342 2362
2343 2363 /* XXX - L2ARC 1.0 does not support expansion */
2344 2364 if (!vd->vdev_aux) {
2345 2365 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2346 2366 pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
2347 2367 }
2348 2368
2349 2369 vdev_reopen(tvd);
2350 2370 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
2351 2371
2352 2372 if (!vd->vdev_aux) {
2353 2373 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2354 2374 pvd->vdev_expanding = B_FALSE;
2355 2375 }
2356 2376
2357 2377 if (newstate)
2358 2378 *newstate = vd->vdev_state;
2359 2379 if ((flags & ZFS_ONLINE_UNSPARE) &&
2360 2380 !vdev_is_dead(vd) && vd->vdev_parent &&
2361 2381 vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2362 2382 vd->vdev_parent->vdev_child[0] == vd)
2363 2383 vd->vdev_unspare = B_TRUE;
2364 2384
2365 2385 if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
2366 2386
2367 2387 /* XXX - L2ARC 1.0 does not support expansion */
2368 2388 if (vd->vdev_aux)
2369 2389 return (spa_vdev_state_exit(spa, vd, ENOTSUP));
2370 2390 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2371 2391 }
2372 2392 return (spa_vdev_state_exit(spa, vd, 0));
2373 2393 }
2374 2394
2375 2395 static int
2376 2396 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
2377 2397 {
2378 2398 vdev_t *vd, *tvd;
2379 2399 int error = 0;
2380 2400 uint64_t generation;
2381 2401 metaslab_group_t *mg;
2382 2402
2383 2403 top:
2384 2404 spa_vdev_state_enter(spa, SCL_ALLOC);
2385 2405
2386 2406 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2387 2407 return (spa_vdev_state_exit(spa, NULL, ENODEV));
2388 2408
2389 2409 if (!vd->vdev_ops->vdev_op_leaf)
2390 2410 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2391 2411
2392 2412 tvd = vd->vdev_top;
2393 2413 mg = tvd->vdev_mg;
2394 2414 generation = spa->spa_config_generation + 1;
2395 2415
2396 2416 /*
2397 2417 * If the device isn't already offline, try to offline it.
2398 2418 */
2399 2419 if (!vd->vdev_offline) {
2400 2420 /*
2401 2421 * If this device has the only valid copy of some data,
2402 2422 * don't allow it to be offlined. Log devices are always
2403 2423 * expendable.
2404 2424 */
2405 2425 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
2406 2426 vdev_dtl_required(vd))
2407 2427 return (spa_vdev_state_exit(spa, NULL, EBUSY));
2408 2428
2409 2429 /*
2410 2430 * If the top-level is a slog and it has had allocations
2411 2431 * then proceed. We check that the vdev's metaslab group
2412 2432 * is not NULL since it's possible that we may have just
2413 2433 * added this vdev but not yet initialized its metaslabs.
2414 2434 */
2415 2435 if (tvd->vdev_islog && mg != NULL) {
2416 2436 /*
2417 2437 * Prevent any future allocations.
2418 2438 */
2419 2439 metaslab_group_passivate(mg);
2420 2440 (void) spa_vdev_state_exit(spa, vd, 0);
2421 2441
2422 2442 error = spa_offline_log(spa);
2423 2443
2424 2444 spa_vdev_state_enter(spa, SCL_ALLOC);
2425 2445
2426 2446 /*
2427 2447 * Check to see if the config has changed.
2428 2448 */
2429 2449 if (error || generation != spa->spa_config_generation) {
2430 2450 metaslab_group_activate(mg);
2431 2451 if (error)
2432 2452 return (spa_vdev_state_exit(spa,
2433 2453 vd, error));
2434 2454 (void) spa_vdev_state_exit(spa, vd, 0);
2435 2455 goto top;
2436 2456 }
2437 2457 ASSERT0(tvd->vdev_stat.vs_alloc);
2438 2458 }
2439 2459
2440 2460 /*
2441 2461 * Offline this device and reopen its top-level vdev.
2442 2462 * If the top-level vdev is a log device then just offline
2443 2463 * it. Otherwise, if this action results in the top-level
2444 2464 * vdev becoming unusable, undo it and fail the request.
2445 2465 */
2446 2466 vd->vdev_offline = B_TRUE;
2447 2467 vdev_reopen(tvd);
2448 2468
2449 2469 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
2450 2470 vdev_is_dead(tvd)) {
2451 2471 vd->vdev_offline = B_FALSE;
2452 2472 vdev_reopen(tvd);
2453 2473 return (spa_vdev_state_exit(spa, NULL, EBUSY));
2454 2474 }
2455 2475
2456 2476 /*
2457 2477 * Add the device back into the metaslab rotor so that
2458 2478 * once we online the device it's open for business.
2459 2479 */
2460 2480 if (tvd->vdev_islog && mg != NULL)
2461 2481 metaslab_group_activate(mg);
2462 2482 }
2463 2483
2464 2484 vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
2465 2485
2466 2486 return (spa_vdev_state_exit(spa, vd, 0));
2467 2487 }
2468 2488
2469 2489 int
2470 2490 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
2471 2491 {
2472 2492 int error;
2473 2493
2474 2494 mutex_enter(&spa->spa_vdev_top_lock);
2475 2495 error = vdev_offline_locked(spa, guid, flags);
2476 2496 mutex_exit(&spa->spa_vdev_top_lock);
2477 2497
2478 2498 return (error);
2479 2499 }
2480 2500
2481 2501 /*
2482 2502 * Clear the error counts associated with this vdev. Unlike vdev_online() and
2483 2503 * vdev_offline(), we assume the spa config is locked. We also clear all
2484 2504 * children. If 'vd' is NULL, then the user wants to clear all vdevs.
2485 2505 */
2486 2506 void
2487 2507 vdev_clear(spa_t *spa, vdev_t *vd)
2488 2508 {
2489 2509 vdev_t *rvd = spa->spa_root_vdev;
2490 2510
2491 2511 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2492 2512
2493 2513 if (vd == NULL)
2494 2514 vd = rvd;
2495 2515
2496 2516 vd->vdev_stat.vs_read_errors = 0;
2497 2517 vd->vdev_stat.vs_write_errors = 0;
2498 2518 vd->vdev_stat.vs_checksum_errors = 0;
2499 2519
2500 2520 for (int c = 0; c < vd->vdev_children; c++)
2501 2521 vdev_clear(spa, vd->vdev_child[c]);
2502 2522
2503 2523 /*
2504 2524 * If we're in the FAULTED state or have experienced failed I/O, then
2505 2525 * clear the persistent state and attempt to reopen the device. We
2506 2526 * also mark the vdev config dirty, so that the new faulted state is
2507 2527 * written out to disk.
2508 2528 */
2509 2529 if (vd->vdev_faulted || vd->vdev_degraded ||
2510 2530 !vdev_readable(vd) || !vdev_writeable(vd)) {
2511 2531
2512 2532 /*
2513 2533 		 * When reopening in response to a clear event, it may be due to
2514 2534 * a fmadm repair request. In this case, if the device is
2515 2535 * still broken, we want to still post the ereport again.
2516 2536 */
2517 2537 vd->vdev_forcefault = B_TRUE;
2518 2538
2519 2539 vd->vdev_faulted = vd->vdev_degraded = 0ULL;
2520 2540 vd->vdev_cant_read = B_FALSE;
2521 2541 vd->vdev_cant_write = B_FALSE;
2522 2542
2523 2543 vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
2524 2544
2525 2545 vd->vdev_forcefault = B_FALSE;
2526 2546
2527 2547 if (vd != rvd && vdev_writeable(vd->vdev_top))
2528 2548 vdev_state_dirty(vd->vdev_top);
2529 2549
2530 2550 if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
2531 2551 spa_async_request(spa, SPA_ASYNC_RESILVER);
2532 2552
2533 2553 spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
2534 2554 }
2535 2555
2536 2556 /*
2537 2557 * When clearing a FMA-diagnosed fault, we always want to
2538 2558 * unspare the device, as we assume that the original spare was
2539 2559 * done in response to the FMA fault.
2540 2560 */
2541 2561 if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
2542 2562 vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2543 2563 vd->vdev_parent->vdev_child[0] == vd)
2544 2564 vd->vdev_unspare = B_TRUE;
2545 2565 }
2546 2566
2547 2567 boolean_t
2548 2568 vdev_is_dead(vdev_t *vd)
2549 2569 {
2550 2570 /*
2551 2571 * Holes and missing devices are always considered "dead".
2552 2572 * This simplifies the code since we don't have to check for
2553 2573 * these types of devices in the various code paths.
2554 2574 * Instead we rely on the fact that we skip over dead devices
2555 2575 * before issuing I/O to them.
2556 2576 */
2557 2577 return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
2558 2578 vd->vdev_ops == &vdev_missing_ops);
2559 2579 }
2560 2580
2561 2581 boolean_t
2562 2582 vdev_readable(vdev_t *vd)
2563 2583 {
2564 2584 return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
2565 2585 }
2566 2586
2567 2587 boolean_t
2568 2588 vdev_writeable(vdev_t *vd)
2569 2589 {
2570 2590 return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
2571 2591 }
2572 2592
2573 2593 boolean_t
2574 2594 vdev_allocatable(vdev_t *vd)
2575 2595 {
2576 2596 uint64_t state = vd->vdev_state;
2577 2597
2578 2598 /*
2579 2599 * We currently allow allocations from vdevs which may be in the
2580 2600 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
2581 2601 * fails to reopen then we'll catch it later when we're holding
2582 2602 * the proper locks. Note that we have to get the vdev state
2583 2603 * in a local variable because although it changes atomically,
2584 2604 * we're asking two separate questions about it.
2585 2605 */
2586 2606 return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
2587 2607 !vd->vdev_cant_write && !vd->vdev_ishole);
2588 2608 }
2589 2609
2590 2610 boolean_t
2591 2611 vdev_accessible(vdev_t *vd, zio_t *zio)
2592 2612 {
2593 2613 ASSERT(zio->io_vd == vd);
2594 2614
2595 2615 if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
2596 2616 return (B_FALSE);
2597 2617
2598 2618 if (zio->io_type == ZIO_TYPE_READ)
2599 2619 return (!vd->vdev_cant_read);
2600 2620
2601 2621 if (zio->io_type == ZIO_TYPE_WRITE)
2602 2622 return (!vd->vdev_cant_write);
2603 2623
2604 2624 return (B_TRUE);
2605 2625 }
2606 2626
2607 2627 /*
2608 2628 * Get statistics for the given vdev.
2609 2629 */
2610 2630 void
2611 2631 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
2612 2632 {
2613 2633 spa_t *spa = vd->vdev_spa;
2614 2634 vdev_t *rvd = spa->spa_root_vdev;
2615 2635
2616 2636 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
2617 2637
2618 2638 mutex_enter(&vd->vdev_stat_lock);
2619 2639 bcopy(&vd->vdev_stat, vs, sizeof (*vs));
2620 2640 vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
2621 2641 vs->vs_state = vd->vdev_state;
2622 2642 vs->vs_rsize = vdev_get_min_asize(vd);
2623 2643 if (vd->vdev_ops->vdev_op_leaf)
2624 2644 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
2625 2645 vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
2626 2646 if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
2627 2647 vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
2628 2648 }
2629 2649
2630 2650 /*
2631 2651 * If we're getting stats on the root vdev, aggregate the I/O counts
2632 2652 * over all top-level vdevs (i.e. the direct children of the root).
2633 2653 */
2634 2654 if (vd == rvd) {
2635 2655 for (int c = 0; c < rvd->vdev_children; c++) {
2636 2656 vdev_t *cvd = rvd->vdev_child[c];
2637 2657 vdev_stat_t *cvs = &cvd->vdev_stat;
2638 2658
2639 2659 for (int t = 0; t < ZIO_TYPES; t++) {
2640 2660 vs->vs_ops[t] += cvs->vs_ops[t];
2641 2661 vs->vs_bytes[t] += cvs->vs_bytes[t];
2642 2662 }
2643 2663 cvs->vs_scan_removing = cvd->vdev_removing;
2644 2664 }
2645 2665 }
2646 2666 mutex_exit(&vd->vdev_stat_lock);
2647 2667 }
2648 2668
2649 2669 void
2650 2670 vdev_clear_stats(vdev_t *vd)
2651 2671 {
2652 2672 mutex_enter(&vd->vdev_stat_lock);
2653 2673 vd->vdev_stat.vs_space = 0;
2654 2674 vd->vdev_stat.vs_dspace = 0;
2655 2675 vd->vdev_stat.vs_alloc = 0;
2656 2676 mutex_exit(&vd->vdev_stat_lock);
2657 2677 }
2658 2678
2659 2679 void
2660 2680 vdev_scan_stat_init(vdev_t *vd)
2661 2681 {
2662 2682 vdev_stat_t *vs = &vd->vdev_stat;
2663 2683
2664 2684 for (int c = 0; c < vd->vdev_children; c++)
2665 2685 vdev_scan_stat_init(vd->vdev_child[c]);
2666 2686
2667 2687 mutex_enter(&vd->vdev_stat_lock);
2668 2688 vs->vs_scan_processed = 0;
2669 2689 mutex_exit(&vd->vdev_stat_lock);
2670 2690 }
2671 2691
2672 2692 void
2673 2693 vdev_stat_update(zio_t *zio, uint64_t psize)
2674 2694 {
2675 2695 spa_t *spa = zio->io_spa;
2676 2696 vdev_t *rvd = spa->spa_root_vdev;
2677 2697 vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
2678 2698 vdev_t *pvd;
2679 2699 uint64_t txg = zio->io_txg;
2680 2700 vdev_stat_t *vs = &vd->vdev_stat;
2681 2701 zio_type_t type = zio->io_type;
2682 2702 int flags = zio->io_flags;
2683 2703
2684 2704 /*
2685 2705 * If this i/o is a gang leader, it didn't do any actual work.
2686 2706 */
2687 2707 if (zio->io_gang_tree)
2688 2708 return;
2689 2709
2690 2710 if (zio->io_error == 0) {
2691 2711 /*
2692 2712 * If this is a root i/o, don't count it -- we've already
2693 2713 * counted the top-level vdevs, and vdev_get_stats() will
2694 2714 * aggregate them when asked. This reduces contention on
2695 2715 * the root vdev_stat_lock and implicitly handles blocks
2696 2716 * that compress away to holes, for which there is no i/o.
2697 2717 * (Holes never create vdev children, so all the counters
2698 2718 * remain zero, which is what we want.)
2699 2719 *
2700 2720 * Note: this only applies to successful i/o (io_error == 0)
2701 2721 * because unlike i/o counts, errors are not additive.
2702 2722 * When reading a ditto block, for example, failure of
2703 2723 * one top-level vdev does not imply a root-level error.
2704 2724 */
2705 2725 if (vd == rvd)
2706 2726 return;
2707 2727
2708 2728 ASSERT(vd == zio->io_vd);
2709 2729
2710 2730 if (flags & ZIO_FLAG_IO_BYPASS)
2711 2731 return;
2712 2732
2713 2733 mutex_enter(&vd->vdev_stat_lock);
2714 2734
2715 2735 if (flags & ZIO_FLAG_IO_REPAIR) {
2716 2736 if (flags & ZIO_FLAG_SCAN_THREAD) {
2717 2737 dsl_scan_phys_t *scn_phys =
2718 2738 &spa->spa_dsl_pool->dp_scan->scn_phys;
2719 2739 uint64_t *processed = &scn_phys->scn_processed;
2720 2740
2721 2741 /* XXX cleanup? */
2722 2742 if (vd->vdev_ops->vdev_op_leaf)
2723 2743 atomic_add_64(processed, psize);
2724 2744 vs->vs_scan_processed += psize;
2725 2745 }
2726 2746
2727 2747 if (flags & ZIO_FLAG_SELF_HEAL)
2728 2748 vs->vs_self_healed += psize;
2729 2749 }
2730 2750
2731 2751 vs->vs_ops[type]++;
2732 2752 vs->vs_bytes[type] += psize;
2733 2753
2734 2754 mutex_exit(&vd->vdev_stat_lock);
2735 2755 return;
2736 2756 }
2737 2757
2738 2758 if (flags & ZIO_FLAG_SPECULATIVE)
2739 2759 return;
2740 2760
2741 2761 /*
2742 2762 * If this is an I/O error that is going to be retried, then ignore the
2743 2763 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
2744 2764 * hard errors, when in reality they can happen for any number of
2745 2765 * innocuous reasons (bus resets, MPxIO link failure, etc).
2746 2766 */
2747 2767 if (zio->io_error == EIO &&
2748 2768 !(zio->io_flags & ZIO_FLAG_IO_RETRY))
2749 2769 return;
2750 2770
2751 2771 /*
2752 2772 	 * Intent log writes won't propagate their error to the root
2753 2773 * I/O so don't mark these types of failures as pool-level
2754 2774 * errors.
2755 2775 */
2756 2776 if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
2757 2777 return;
2758 2778
2759 2779 mutex_enter(&vd->vdev_stat_lock);
2760 2780 if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
2761 2781 if (zio->io_error == ECKSUM)
2762 2782 vs->vs_checksum_errors++;
2763 2783 else
2764 2784 vs->vs_read_errors++;
2765 2785 }
2766 2786 if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
2767 2787 vs->vs_write_errors++;
2768 2788 mutex_exit(&vd->vdev_stat_lock);
2769 2789
2770 2790 if (type == ZIO_TYPE_WRITE && txg != 0 &&
2771 2791 (!(flags & ZIO_FLAG_IO_REPAIR) ||
2772 2792 (flags & ZIO_FLAG_SCAN_THREAD) ||
2773 2793 spa->spa_claiming)) {
2774 2794 /*
2775 2795 * This is either a normal write (not a repair), or it's
2776 2796 * a repair induced by the scrub thread, or it's a repair
2777 2797 * made by zil_claim() during spa_load() in the first txg.
2778 2798 * In the normal case, we commit the DTL change in the same
2779 2799 * txg as the block was born. In the scrub-induced repair
2780 2800 * case, we know that scrubs run in first-pass syncing context,
2781 2801 * so we commit the DTL change in spa_syncing_txg(spa).
2782 2802 * In the zil_claim() case, we commit in spa_first_txg(spa).
2783 2803 *
2784 2804 * We currently do not make DTL entries for failed spontaneous
2785 2805 * self-healing writes triggered by normal (non-scrubbing)
2786 2806 * reads, because we have no transactional context in which to
2787 2807 * do so -- and it's not clear that it'd be desirable anyway.
2788 2808 */
2789 2809 if (vd->vdev_ops->vdev_op_leaf) {
2790 2810 uint64_t commit_txg = txg;
2791 2811 if (flags & ZIO_FLAG_SCAN_THREAD) {
2792 2812 ASSERT(flags & ZIO_FLAG_IO_REPAIR);
2793 2813 ASSERT(spa_sync_pass(spa) == 1);
2794 2814 vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
2795 2815 commit_txg = spa_syncing_txg(spa);
2796 2816 } else if (spa->spa_claiming) {
2797 2817 ASSERT(flags & ZIO_FLAG_IO_REPAIR);
2798 2818 commit_txg = spa_first_txg(spa);
2799 2819 }
2800 2820 ASSERT(commit_txg >= spa_syncing_txg(spa));
2801 2821 if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
2802 2822 return;
2803 2823 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2804 2824 vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
2805 2825 vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
2806 2826 }
2807 2827 if (vd != rvd)
2808 2828 vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
2809 2829 }
2810 2830 }
2811 2831
2812 2832 /*
2813 2833 * Update the in-core space usage stats for this vdev, its metaslab class,
2814 2834 * and the root vdev.
2815 2835 */
2816 2836 void
2817 2837 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
2818 2838 int64_t space_delta)
2819 2839 {
2820 2840 int64_t dspace_delta = space_delta;
2821 2841 spa_t *spa = vd->vdev_spa;
2822 2842 vdev_t *rvd = spa->spa_root_vdev;
2823 2843 metaslab_group_t *mg = vd->vdev_mg;
2824 2844 metaslab_class_t *mc = mg ? mg->mg_class : NULL;
2825 2845
2826 2846 ASSERT(vd == vd->vdev_top);
2827 2847
2828 2848 /*
2829 2849 	 * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
2830 2850 * factor. We must calculate this here and not at the root vdev
2831 2851 * because the root vdev's psize-to-asize is simply the max of its
2832 2852 	 * children's, thus not accurate enough for us.
2833 2853 */
2834 2854 ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
2835 2855 ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
2836 2856 dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
2837 2857 vd->vdev_deflate_ratio;
2838 2858
2839 2859 mutex_enter(&vd->vdev_stat_lock);
2840 2860 vd->vdev_stat.vs_alloc += alloc_delta;
2841 2861 vd->vdev_stat.vs_space += space_delta;
2842 2862 vd->vdev_stat.vs_dspace += dspace_delta;
2843 2863 mutex_exit(&vd->vdev_stat_lock);
2844 2864
2845 2865 if (mc == spa_normal_class(spa)) {
2846 2866 mutex_enter(&rvd->vdev_stat_lock);
2847 2867 rvd->vdev_stat.vs_alloc += alloc_delta;
2848 2868 rvd->vdev_stat.vs_space += space_delta;
2849 2869 rvd->vdev_stat.vs_dspace += dspace_delta;
2850 2870 mutex_exit(&rvd->vdev_stat_lock);
2851 2871 }
2852 2872
2853 2873 if (mc != NULL) {
2854 2874 ASSERT(rvd == vd->vdev_parent);
2855 2875 ASSERT(vd->vdev_ms_count != 0);
2856 2876
2857 2877 metaslab_class_space_update(mc,
2858 2878 alloc_delta, defer_delta, space_delta, dspace_delta);
2859 2879 }
2860 2880 }
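
The deflate step in vdev_space_update() reduces an allocated-size (asize) delta to 512-byte units and scales it by the per-vdev deflate ratio to undo the RAID-Z expansion. The arithmetic-only sketch below uses an invented ratio of 409, roughly what a 4+1 RAID-Z1 layout would produce; it does not model how vdev_deflate_ratio is actually initialized.

/*
 * Illustrative sketch only.  Shows the deflate arithmetic from
 * vdev_space_update(): convert an asize delta into 512-byte
 * (SPA_MINBLOCKSIZE) units and scale by the vdev's deflate ratio to
 * get the deflated (psize-equivalent) delta.  The ratio is invented.
 */
#include <stdio.h>
#include <stdint.h>

#define	SPA_MINBLOCKSHIFT	9	/* 512-byte units */

int
main(void)
{
	int64_t space_delta = 163840;	/* 160K of asize allocated */
	int64_t deflate_ratio = 409;	/* hypothetical, ~512 * psize/asize */
	int64_t dspace_delta;

	dspace_delta = (space_delta >> SPA_MINBLOCKSHIFT) * deflate_ratio;

	/* Prints: asize delta 163840 -> dspace delta 130880 (~128K) */
	printf("asize delta %lld -> dspace delta %lld\n",
	    (long long)space_delta, (long long)dspace_delta);
	return (0);
}
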
2861 2881
2862 2882 /*
2863 2883 * Mark a top-level vdev's config as dirty, placing it on the dirty list
2864 2884 * so that it will be written out next time the vdev configuration is synced.
2865 2885 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
2866 2886 */
2867 2887 void
2868 2888 vdev_config_dirty(vdev_t *vd)
2869 2889 {
2870 2890 spa_t *spa = vd->vdev_spa;
2871 2891 vdev_t *rvd = spa->spa_root_vdev;
2872 2892 int c;
2873 2893
2874 2894 ASSERT(spa_writeable(spa));
2875 2895
2876 2896 /*
2877 2897 * If this is an aux vdev (as with l2cache and spare devices), then we
2878 2898 * update the vdev config manually and set the sync flag.
2879 2899 */
2880 2900 if (vd->vdev_aux != NULL) {
2881 2901 spa_aux_vdev_t *sav = vd->vdev_aux;
2882 2902 nvlist_t **aux;
2883 2903 uint_t naux;
2884 2904
2885 2905 for (c = 0; c < sav->sav_count; c++) {
2886 2906 if (sav->sav_vdevs[c] == vd)
2887 2907 break;
2888 2908 }
2889 2909
2890 2910 if (c == sav->sav_count) {
2891 2911 /*
2892 2912 * We're being removed. There's nothing more to do.
2893 2913 */
2894 2914 ASSERT(sav->sav_sync == B_TRUE);
2895 2915 return;
2896 2916 }
2897 2917
2898 2918 sav->sav_sync = B_TRUE;
2899 2919
2900 2920 if (nvlist_lookup_nvlist_array(sav->sav_config,
2901 2921 ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
2902 2922 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
2903 2923 ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
2904 2924 }
2905 2925
2906 2926 ASSERT(c < naux);
2907 2927
2908 2928 /*
2909 2929 		 * Setting the nvlist in the middle of the array is a little
2910 2930 * sketchy, but it will work.
2911 2931 */
2912 2932 nvlist_free(aux[c]);
2913 2933 aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
2914 2934
2915 2935 return;
2916 2936 }
2917 2937
2918 2938 /*
2919 2939 * The dirty list is protected by the SCL_CONFIG lock. The caller
2920 2940 * must either hold SCL_CONFIG as writer, or must be the sync thread
2921 2941 * (which holds SCL_CONFIG as reader). There's only one sync thread,
2922 2942 * so this is sufficient to ensure mutual exclusion.
2923 2943 */
2924 2944 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
2925 2945 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2926 2946 spa_config_held(spa, SCL_CONFIG, RW_READER)));
2927 2947
2928 2948 if (vd == rvd) {
2929 2949 for (c = 0; c < rvd->vdev_children; c++)
2930 2950 vdev_config_dirty(rvd->vdev_child[c]);
2931 2951 } else {
2932 2952 ASSERT(vd == vd->vdev_top);
2933 2953
2934 2954 if (!list_link_active(&vd->vdev_config_dirty_node) &&
2935 2955 !vd->vdev_ishole)
2936 2956 list_insert_head(&spa->spa_config_dirty_list, vd);
2937 2957 }
2938 2958 }
2939 2959
2940 2960 void
2941 2961 vdev_config_clean(vdev_t *vd)
2942 2962 {
2943 2963 spa_t *spa = vd->vdev_spa;
2944 2964
2945 2965 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
2946 2966 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2947 2967 spa_config_held(spa, SCL_CONFIG, RW_READER)));
2948 2968
2949 2969 ASSERT(list_link_active(&vd->vdev_config_dirty_node));
2950 2970 list_remove(&spa->spa_config_dirty_list, vd);
2951 2971 }
2952 2972
2953 2973 /*
2954 2974 * Mark a top-level vdev's state as dirty, so that the next pass of
2955 2975 * spa_sync() can convert this into vdev_config_dirty(). We distinguish
2956 2976 * the state changes from larger config changes because they require
2957 2977 * much less locking, and are often needed for administrative actions.
2958 2978 */
2959 2979 void
2960 2980 vdev_state_dirty(vdev_t *vd)
2961 2981 {
2962 2982 spa_t *spa = vd->vdev_spa;
2963 2983
2964 2984 ASSERT(spa_writeable(spa));
2965 2985 ASSERT(vd == vd->vdev_top);
2966 2986
2967 2987 /*
2968 2988 * The state list is protected by the SCL_STATE lock. The caller
2969 2989 * must either hold SCL_STATE as writer, or must be the sync thread
2970 2990 * (which holds SCL_STATE as reader). There's only one sync thread,
2971 2991 * so this is sufficient to ensure mutual exclusion.
2972 2992 */
2973 2993 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
2974 2994 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2975 2995 spa_config_held(spa, SCL_STATE, RW_READER)));
2976 2996
2977 2997 if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
2978 2998 list_insert_head(&spa->spa_state_dirty_list, vd);
2979 2999 }
2980 3000
2981 3001 void
2982 3002 vdev_state_clean(vdev_t *vd)
2983 3003 {
2984 3004 spa_t *spa = vd->vdev_spa;
2985 3005
2986 3006 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
2987 3007 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2988 3008 spa_config_held(spa, SCL_STATE, RW_READER)));
2989 3009
2990 3010 ASSERT(list_link_active(&vd->vdev_state_dirty_node));
2991 3011 list_remove(&spa->spa_state_dirty_list, vd);
2992 3012 }
2993 3013
2994 3014 /*
2995 3015 * Propagate vdev state up from children to parent.
2996 3016 */
2997 3017 void
2998 3018 vdev_propagate_state(vdev_t *vd)
2999 3019 {
3000 3020 spa_t *spa = vd->vdev_spa;
3001 3021 vdev_t *rvd = spa->spa_root_vdev;
3002 3022 int degraded = 0, faulted = 0;
3003 3023 int corrupted = 0;
3004 3024 vdev_t *child;
3005 3025
3006 3026 if (vd->vdev_children > 0) {
3007 3027 for (int c = 0; c < vd->vdev_children; c++) {
3008 3028 child = vd->vdev_child[c];
3009 3029
3010 3030 /*
3011 3031 * Don't factor holes into the decision.
3012 3032 */
3013 3033 if (child->vdev_ishole)
3014 3034 continue;
3015 3035
3016 3036 if (!vdev_readable(child) ||
3017 3037 (!vdev_writeable(child) && spa_writeable(spa))) {
3018 3038 /*
3019 3039 * Root special: if there is a top-level log
3020 3040 * device, treat the root vdev as if it were
3021 3041 * degraded.
3022 3042 */
3023 3043 if (child->vdev_islog && vd == rvd)
3024 3044 degraded++;
3025 3045 else
3026 3046 faulted++;
3027 3047 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
3028 3048 degraded++;
3029 3049 }
3030 3050
3031 3051 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
3032 3052 corrupted++;
3033 3053 }
3034 3054
3035 3055 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
3036 3056
3037 3057 /*
3038 3058 * Root special: if there is a top-level vdev that cannot be
3039 3059 * opened due to corrupted metadata, then propagate the root
3040 3060 * vdev's aux state as 'corrupt' rather than 'insufficient
3041 3061 * replicas'.
3042 3062 */
3043 3063 if (corrupted && vd == rvd &&
3044 3064 rvd->vdev_state == VDEV_STATE_CANT_OPEN)
3045 3065 vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
3046 3066 VDEV_AUX_CORRUPT_DATA);
3047 3067 }
3048 3068
3049 3069 if (vd->vdev_parent)
3050 3070 vdev_propagate_state(vd->vdev_parent);
3051 3071 }
3052 3072
3053 3073 /*
3054 3074 * Set a vdev's state. If this is during an open, we don't update the parent
3055 3075 * state, because we're in the process of opening children depth-first.
3056 3076 * Otherwise, we propagate the change to the parent.
3057 3077 *
3058 3078 * If this routine places a device in a faulted state, an appropriate ereport is
3059 3079 * generated.
3060 3080 */
3061 3081 void
3062 3082 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
3063 3083 {
3064 3084 uint64_t save_state;
3065 3085 spa_t *spa = vd->vdev_spa;
3066 3086
3067 3087 if (state == vd->vdev_state) {
3068 3088 vd->vdev_stat.vs_aux = aux;
3069 3089 return;
3070 3090 }
3071 3091
3072 3092 save_state = vd->vdev_state;
3073 3093
3074 3094 vd->vdev_state = state;
3075 3095 vd->vdev_stat.vs_aux = aux;
3076 3096
3077 3097 /*
3078 3098 * If we are setting the vdev state to anything but an open state, then
3079 3099 * always close the underlying device unless the device has requested
3080 3100 * a delayed close (i.e. we're about to remove or fault the device).
3081 3101 * Otherwise, we keep accessible but invalid devices open forever.
3082 3102 * We don't call vdev_close() itself, because that implies some extra
3083 3103 * checks (offline, etc) that we don't want here. This is limited to
3084 3104 * leaf devices, because otherwise closing the device will affect other
3085 3105 * children.
3086 3106 */
3087 3107 if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
3088 3108 vd->vdev_ops->vdev_op_leaf)
3089 3109 vd->vdev_ops->vdev_op_close(vd);
3090 3110
3091 3111 /*
3092 3112 * If we have brought this vdev back into service, we need
3093 3113 * to notify fmd so that it can gracefully repair any outstanding
3094 3114 * cases due to a missing device. We do this in all cases, even those
3095 3115 * that probably don't correlate to a repaired fault. This is sure to
3096 3116 * catch all cases, and we let the zfs-retire agent sort it out. If
3097 3117 * this is a transient state it's OK, as the retire agent will
3098 3118 * double-check the state of the vdev before repairing it.
3099 3119 */
3100 3120 if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
3101 3121 vd->vdev_prevstate != state)
3102 3122 zfs_post_state_change(spa, vd);
3103 3123
3104 3124 if (vd->vdev_removed &&
3105 3125 state == VDEV_STATE_CANT_OPEN &&
3106 3126 (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
3107 3127 /*
3108 3128 * If the previous state is set to VDEV_STATE_REMOVED, then this
3109 3129 * device was previously marked removed and someone attempted to
3110 3130 * reopen it. If this failed due to a nonexistent device, then
3111 3131 * keep the device in the REMOVED state. We also let this be if
3112 3132 * it is one of our special test online cases, which is only
3113 3133 * attempting to online the device and shouldn't generate an FMA
3114 3134 * fault.
3115 3135 */
3116 3136 vd->vdev_state = VDEV_STATE_REMOVED;
3117 3137 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
3118 3138 } else if (state == VDEV_STATE_REMOVED) {
3119 3139 vd->vdev_removed = B_TRUE;
3120 3140 } else if (state == VDEV_STATE_CANT_OPEN) {
3121 3141 /*
3122 3142 * If we fail to open a vdev during an import or recovery, we
3123 3143 * mark it as "not available", which signifies that it was
3124 3144 * never there to begin with. Failure to open such a device
3125 3145 * is not considered an error.
3126 3146 */
3127 3147 if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
3128 3148 spa_load_state(spa) == SPA_LOAD_RECOVER) &&
3129 3149 vd->vdev_ops->vdev_op_leaf)
3130 3150 vd->vdev_not_present = 1;
3131 3151
3132 3152 /*
3133 3153 * Post the appropriate ereport. If the 'prevstate' field is
3134 3154 * set to something other than VDEV_STATE_UNKNOWN, it indicates
3135 3155 * that this is part of a vdev_reopen(). In this case, we don't
3136 3156 * want to post the ereport if the device was already in the
3137 3157 * CANT_OPEN state beforehand.
3138 3158 *
3139 3159 * If the 'checkremove' flag is set, then this is an attempt to
3140 3160 * online the device in response to an insertion event. If we
3141 3161 * hit this case, then we have detected an insertion event for a
3142 3162 * faulted or offline device that wasn't in the removed state.
3143 3163 * In this scenario, we don't post an ereport because we are
3144 3164 * about to replace the device, or attempt an online with
3145 3165 * vdev_forcefault, which will generate the fault for us.
3146 3166 */
3147 3167 if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
3148 3168 !vd->vdev_not_present && !vd->vdev_checkremove &&
3149 3169 vd != spa->spa_root_vdev) {
3150 3170 const char *class;
3151 3171
3152 3172 switch (aux) {
3153 3173 case VDEV_AUX_OPEN_FAILED:
3154 3174 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
3155 3175 break;
3156 3176 case VDEV_AUX_CORRUPT_DATA:
3157 3177 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
3158 3178 break;
3159 3179 case VDEV_AUX_NO_REPLICAS:
3160 3180 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
3161 3181 break;
3162 3182 case VDEV_AUX_BAD_GUID_SUM:
3163 3183 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
3164 3184 break;
3165 3185 case VDEV_AUX_TOO_SMALL:
3166 3186 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
3167 3187 break;
3168 3188 case VDEV_AUX_BAD_LABEL:
3169 3189 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
3170 3190 break;
3171 3191 default:
3172 3192 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
3173 3193 }
3174 3194
3175 3195 zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
3176 3196 }
3177 3197
3178 3198 /* Erase any notion of persistent removed state */
3179 3199 vd->vdev_removed = B_FALSE;
3180 3200 } else {
3181 3201 vd->vdev_removed = B_FALSE;
3182 3202 }
3183 3203
3184 3204 if (!isopen && vd->vdev_parent)
3185 3205 vdev_propagate_state(vd->vdev_parent);
3186 3206 }
3187 3207
3188 3208 /*
3189 3209 * Check the vdev configuration to ensure that it's capable of supporting
3190 3210 * a root pool. Currently, we do not support RAID-Z or partial configuration.
3191 3211 * In addition, only a single top-level vdev is allowed and none of the leaves
3192 3212 * can be wholedisks.
3193 3213 */
3194 3214 boolean_t
3195 3215 vdev_is_bootable(vdev_t *vd)
3196 3216 {
3197 3217 if (!vd->vdev_ops->vdev_op_leaf) {
3198 3218 char *vdev_type = vd->vdev_ops->vdev_op_type;
3199 3219
3200 3220 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
3201 3221 vd->vdev_children > 1) {
3202 3222 return (B_FALSE);
3203 3223 } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
3204 3224 strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
3205 3225 return (B_FALSE);
3206 3226 }
3207 3227 }
3208 3228
3209 3229 for (int c = 0; c < vd->vdev_children; c++) {
3210 3230 if (!vdev_is_bootable(vd->vdev_child[c]))
3211 3231 return (B_FALSE);
3212 3232 }
3213 3233 return (B_TRUE);
3214 3234 }
3215 3235
3216 3236 /*
3217 3237 * Load the state from the original vdev tree (ovd) which
3218 3238 * we've retrieved from the MOS config object. If the original
3219 3239 * vdev was offline or faulted then we transfer that state to the
3220 3240 * device in the current vdev tree (nvd).
3221 3241 */
3222 3242 void
3223 3243 vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
3224 3244 {
3225 3245 spa_t *spa = nvd->vdev_spa;
3226 3246
3227 3247 ASSERT(nvd->vdev_top->vdev_islog);
3228 3248 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
3229 3249 ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
3230 3250
3231 3251 for (int c = 0; c < nvd->vdev_children; c++)
3232 3252 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
3233 3253
3234 3254 if (nvd->vdev_ops->vdev_op_leaf) {
3235 3255 /*
3236 3256 * Restore the persistent vdev state
3237 3257 */
3238 3258 nvd->vdev_offline = ovd->vdev_offline;
3239 3259 nvd->vdev_faulted = ovd->vdev_faulted;
3240 3260 nvd->vdev_degraded = ovd->vdev_degraded;
3241 3261 nvd->vdev_removed = ovd->vdev_removed;
3242 3262 }
3243 3263 }
3244 3264
3245 3265 /*
3246 3266 * Determine if a log device has valid content. If the vdev was
3247 3267 * removed or faulted in the MOS config then we know that
3248 3268 * the content on the log device has already been written to the pool.
3249 3269 */
3250 3270 boolean_t
3251 3271 vdev_log_state_valid(vdev_t *vd)
3252 3272 {
3253 3273 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
3254 3274 !vd->vdev_removed)
3255 3275 return (B_TRUE);
3256 3276
3257 3277 for (int c = 0; c < vd->vdev_children; c++)
3258 3278 if (vdev_log_state_valid(vd->vdev_child[c]))
3259 3279 return (B_TRUE);
3260 3280
3261 3281 return (B_FALSE);
3262 3282 }
3263 3283
3264 3284 /*
3265 3285 * Expand a vdev if possible.
3266 3286 */
3267 3287 void
3268 3288 vdev_expand(vdev_t *vd, uint64_t txg)
3269 3289 {
3270 3290 ASSERT(vd->vdev_top == vd);
3271 3291 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3272 3292
3273 3293 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
3274 3294 VERIFY(vdev_metaslab_init(vd, txg) == 0);
3275 3295 vdev_config_dirty(vd);
3276 3296 }
3277 3297 }
3278 3298
3279 3299 /*
3280 3300 * Split a vdev.
3281 3301 */
3282 3302 void
3283 3303 vdev_split(vdev_t *vd)
3284 3304 {
3285 3305 vdev_t *cvd, *pvd = vd->vdev_parent;
3286 3306
3287 3307 vdev_remove_child(pvd, vd);
3288 3308 vdev_compact_children(pvd);
3289 3309
3290 3310 cvd = pvd->vdev_child[0];
3291 3311 if (pvd->vdev_children == 1) {
3292 3312 vdev_remove_parent(cvd);
3293 3313 cvd->vdev_splitting = B_TRUE;
3294 3314 }
3295 3315 vdev_propagate_state(cvd);
3296 3316 }
3297 3317
3298 3318 void
3299 3319 vdev_deadman(vdev_t *vd)
3300 3320 {
3301 3321 for (int c = 0; c < vd->vdev_children; c++) {
3302 3322 vdev_t *cvd = vd->vdev_child[c];
3303 3323
3304 3324 vdev_deadman(cvd);
3305 3325 }
3306 3326
3307 3327 if (vd->vdev_ops->vdev_op_leaf) {
3308 3328 vdev_queue_t *vq = &vd->vdev_queue;
3309 3329
3310 3330 mutex_enter(&vq->vq_lock);
3311 3331 if (avl_numnodes(&vq->vq_active_tree) > 0) {
3312 3332 spa_t *spa = vd->vdev_spa;
3313 3333 zio_t *fio;
3314 3334 uint64_t delta;
3315 3335
3316 3336 /*
3317 3337 			 * Look at the head of all the pending queues;
3318 3338 			 * if any I/O has been outstanding for longer than
3319 3339 			 * the spa_deadman_synctime, we panic the system.
3320 3340 */
3321 3341 fio = avl_first(&vq->vq_active_tree);
3322 3342 delta = gethrtime() - fio->io_timestamp;
3323 3343 if (delta > spa_deadman_synctime(spa)) {
3324 3344 zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
3325 3345 "delta %lluns, last io %lluns",
3326 3346 fio->io_timestamp, delta,
3327 3347 vq->vq_io_complete_ts);
3328 3348 fm_panic("I/O to pool '%s' appears to be "
3329 3349 "hung.", spa_name(spa));
3330 3350 }
3331 3351 }
3332 3352 mutex_exit(&vq->vq_lock);
3333 3353 }
3334 3354 }
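
The deadman check in vdev_deadman() is, at its core, a comparison of the age of the oldest outstanding I/O against a threshold. The standalone sketch below shows that comparison using a monotonic clock in place of gethrtime(); the threshold value and all names are invented for illustration.

/*
 * Illustrative sketch only.  Reduces the deadman check to its core:
 * how long has the oldest outstanding request been pending, and does
 * that exceed a threshold?  The threshold value is invented.
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define	NANOSEC		1000000000ULL
#define	DEADMAN_NSEC	(1000ULL * NANOSEC)	/* hypothetical: 1000 seconds */

static uint64_t
now_ns(void)
{
	struct timespec ts;

	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * NANOSEC + (uint64_t)ts.tv_nsec);
}

int
main(void)
{
	uint64_t io_timestamp = now_ns();	/* oldest active I/O started */
	uint64_t delta = now_ns() - io_timestamp;

	if (delta > DEADMAN_NSEC)
		printf("SLOW IO: pending for %llu ns\n",
		    (unsigned long long)delta);
	else
		printf("oldest I/O pending for %llu ns, under the %llu ns "
		    "threshold\n", (unsigned long long)delta,
		    (unsigned long long)DEADMAN_NSEC);
	return (0);
}
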
3144 lines elided