1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2013 Saso Kiselkov. All rights reserved.
27 */
28
29 /*
30 * SPA: Storage Pool Allocator
31 *
32 * This file contains all the routines used when modifying on-disk SPA state.
33 * This includes opening, importing, destroying, exporting a pool, and syncing a
34 * pool.
35 */
36
37 #include <sys/zfs_context.h>
38 #include <sys/fm/fs/zfs.h>
39 #include <sys/spa_impl.h>
40 #include <sys/zio.h>
41 #include <sys/zio_checksum.h>
42 #include <sys/dmu.h>
43 #include <sys/dmu_tx.h>
44 #include <sys/zap.h>
45 #include <sys/zil.h>
46 #include <sys/ddt.h>
47 #include <sys/vdev_impl.h>
48 #include <sys/metaslab.h>
49 #include <sys/metaslab_impl.h>
50 #include <sys/uberblock_impl.h>
51 #include <sys/txg.h>
52 #include <sys/avl.h>
53 #include <sys/dmu_traverse.h>
54 #include <sys/dmu_objset.h>
55 #include <sys/unique.h>
56 #include <sys/dsl_pool.h>
57 #include <sys/dsl_dataset.h>
58 #include <sys/dsl_dir.h>
59 #include <sys/dsl_prop.h>
60 #include <sys/dsl_synctask.h>
61 #include <sys/fs/zfs.h>
62 #include <sys/arc.h>
63 #include <sys/callb.h>
64 #include <sys/systeminfo.h>
65 #include <sys/spa_boot.h>
66 #include <sys/zfs_ioctl.h>
67 #include <sys/dsl_scan.h>
68 #include <sys/zfeature.h>
69 #include <sys/dsl_destroy.h>
70
71 #ifdef _KERNEL
72 #include <sys/bootprops.h>
73 #include <sys/callb.h>
74 #include <sys/cpupart.h>
75 #include <sys/pool.h>
76 #include <sys/sysdc.h>
77 #include <sys/zone.h>
78 #endif /* _KERNEL */
79
80 #include "zfs_prop.h"
81 #include "zfs_comutil.h"
82
83 /*
84 * The interval, in seconds, at which failed configuration cache file writes
85 * should be retried.
86 */
87 static int zfs_ccw_retry_interval = 300;
88
89 typedef enum zti_modes {
90 ZTI_MODE_FIXED, /* value is # of threads (min 1) */
91 ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
92 ZTI_MODE_NULL, /* don't create a taskq */
93 ZTI_NMODES
94 } zti_modes_t;
95
96 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
97 #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
98 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
99
100 #define ZTI_N(n) ZTI_P(n, 1)
101 #define ZTI_ONE ZTI_N(1)
102
103 typedef struct zio_taskq_info {
104 zti_modes_t zti_mode;
105 uint_t zti_value;
106 uint_t zti_count;
107 } zio_taskq_info_t;
108
109 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
110 "issue", "issue_high", "intr", "intr_high"
111 };
112
113 /*
114 * This table defines the taskq settings for each ZFS I/O type. When
115 * initializing a pool, we use this table to create an appropriately sized
116 * taskq. Some operations are low volume and therefore have a small, static
117 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
118 * macros. Other operations process a large amount of data; the ZTI_BATCH
119 * macro causes us to create a taskq oriented for throughput. Some operations
120  * are so high frequency and short-lived that the taskq itself can become a
121 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
122 * additional degree of parallelism specified by the number of threads per-
123 * taskq and the number of taskqs; when dispatching an event in this case, the
124 * particular taskq is chosen at random.
125 *
126 * The different taskq priorities are to handle the different contexts (issue
127 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
128 * need to be handled with minimum delay.
129 */
130 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
131 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
132 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
133 { ZTI_N(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, /* READ */
134 { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */
135 { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
136 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
137 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
138 };
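
/*
 * Illustrative expansion only (not compiled): the WRITE row above expands to
 * the following zio_taskq_info_t initializers:
 *
 *	{ ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }
 *   => { { ZTI_MODE_BATCH, 0, 1 }, { ZTI_MODE_FIXED, 5, 1 },
 *	  { ZTI_MODE_FIXED, 8, 1 }, { ZTI_MODE_FIXED, 5, 1 } }
 *
 * Likewise, the ZTI_P(12, 8) entry for FREE/ISSUE yields
 * { ZTI_MODE_FIXED, 12, 8 }: eight discrete taskqs of twelve threads each,
 * with spa_taskq_dispatch_ent() choosing one of the eight at dispatch time
 * to spread lock contention.
 */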
139
140 static void spa_sync_version(void *arg, dmu_tx_t *tx);
141 static void spa_sync_props(void *arg, dmu_tx_t *tx);
142 static boolean_t spa_has_active_shared_spare(spa_t *spa);
143 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
144 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
145 char **ereport);
146 static void spa_vdev_resilver_done(spa_t *spa);
147
148 uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
149 id_t zio_taskq_psrset_bind = PS_NONE;
150 boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
151 uint_t zio_taskq_basedc = 80; /* base duty cycle */
152
153 boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
154 extern int zfs_sync_pass_deferred_free;
155
156 /*
157 * This (illegal) pool name is used when temporarily importing a spa_t in order
158 * to get the vdev stats associated with the imported devices.
159 */
160 #define TRYIMPORT_NAME "$import"
161
162 /*
163 * ==========================================================================
164 * SPA properties routines
165 * ==========================================================================
166 */
167
168 /*
169 * Add a (source=src, propname=propval) list to an nvlist.
170 */
171 static void
172 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
173 uint64_t intval, zprop_source_t src)
174 {
175 const char *propname = zpool_prop_to_name(prop);
176 nvlist_t *propval;
177
178 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
179 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
180
181 if (strval != NULL)
182 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
183 else
184 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
185
186 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
187 nvlist_free(propval);
188 }
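
/*
 * For illustration only: a call such as
 *
 *	spa_prop_add_list(nvl, ZPOOL_PROP_CAPACITY, NULL, 42, ZPROP_SRC_NONE);
 *
 * (42 is an arbitrary example value) leaves a nested nvlist in 'nvl':
 *
 *	"capacity" = {
 *		"source" = ZPROP_SRC_NONE
 *		"value" = 42
 *	}
 *
 * String-valued properties store "value" as a string rather than a uint64.
 */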
189
190 /*
191 * Get property values from the spa configuration.
192 */
193 static void
194 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
195 {
196 vdev_t *rvd = spa->spa_root_vdev;
197 dsl_pool_t *pool = spa->spa_dsl_pool;
198 uint64_t size;
199 uint64_t alloc;
200 uint64_t space;
201 uint64_t cap, version;
202 zprop_source_t src = ZPROP_SRC_NONE;
203 spa_config_dirent_t *dp;
204
205 ASSERT(MUTEX_HELD(&spa->spa_props_lock));
206
207 if (rvd != NULL) {
208 alloc = metaslab_class_get_alloc(spa_normal_class(spa));
209 size = metaslab_class_get_space(spa_normal_class(spa));
210 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
211 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
212 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
213 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
214 size - alloc, src);
215
216 space = 0;
217 for (int c = 0; c < rvd->vdev_children; c++) {
218 vdev_t *tvd = rvd->vdev_child[c];
219 space += tvd->vdev_max_asize - tvd->vdev_asize;
220 }
221 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
222 src);
223
224 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
225 (spa_mode(spa) == FREAD), src);
226
227 cap = (size == 0) ? 0 : (alloc * 100 / size);
228 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
229
230 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
231 ddt_get_pool_dedup_ratio(spa), src);
232
233 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
234 rvd->vdev_state, src);
235
236 version = spa_version(spa);
237 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
238 src = ZPROP_SRC_DEFAULT;
239 else
240 src = ZPROP_SRC_LOCAL;
241 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
242 }
243
244 if (pool != NULL) {
245 dsl_dir_t *freedir = pool->dp_free_dir;
246
247 /*
248		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
249		 * when opening pools created before it, freedir will be NULL.
250 */
251 if (freedir != NULL) {
252 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
253 freedir->dd_phys->dd_used_bytes, src);
254 } else {
255 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
256 NULL, 0, src);
257 }
258 }
259
260 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
261
262 if (spa->spa_comment != NULL) {
263 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
264 0, ZPROP_SRC_LOCAL);
265 }
266
267 if (spa->spa_root != NULL)
268 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
269 0, ZPROP_SRC_LOCAL);
270
271 if ((dp = list_head(&spa->spa_config_list)) != NULL) {
272 if (dp->scd_path == NULL) {
273 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
274 "none", 0, ZPROP_SRC_LOCAL);
275 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
276 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
277 dp->scd_path, 0, ZPROP_SRC_LOCAL);
278 }
279 }
280 }
281
282 /*
283 * Get zpool property values.
284 */
285 int
286 spa_prop_get(spa_t *spa, nvlist_t **nvp)
287 {
288 objset_t *mos = spa->spa_meta_objset;
289 zap_cursor_t zc;
290 zap_attribute_t za;
291 int err;
292
293 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
294
295 mutex_enter(&spa->spa_props_lock);
296
297 /*
298 * Get properties from the spa config.
299 */
300 spa_prop_get_config(spa, nvp);
301
302	/* If no pool property object, no more props to get. */
303 if (mos == NULL || spa->spa_pool_props_object == 0) {
304 mutex_exit(&spa->spa_props_lock);
305 return (0);
306 }
307
308 /*
309 * Get properties from the MOS pool property object.
310 */
311 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
312 (err = zap_cursor_retrieve(&zc, &za)) == 0;
313 zap_cursor_advance(&zc)) {
314 uint64_t intval = 0;
315 char *strval = NULL;
316 zprop_source_t src = ZPROP_SRC_DEFAULT;
317 zpool_prop_t prop;
318
319 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
320 continue;
321
322 switch (za.za_integer_length) {
323 case 8:
324 /* integer property */
325 if (za.za_first_integer !=
326 zpool_prop_default_numeric(prop))
327 src = ZPROP_SRC_LOCAL;
328
329 if (prop == ZPOOL_PROP_BOOTFS) {
330 dsl_pool_t *dp;
331 dsl_dataset_t *ds = NULL;
332
333 dp = spa_get_dsl(spa);
334 dsl_pool_config_enter(dp, FTAG);
335 if (err = dsl_dataset_hold_obj(dp,
336 za.za_first_integer, FTAG, &ds)) {
337 dsl_pool_config_exit(dp, FTAG);
338 break;
339 }
340
341 strval = kmem_alloc(
342 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
343 KM_SLEEP);
344 dsl_dataset_name(ds, strval);
345 dsl_dataset_rele(ds, FTAG);
346 dsl_pool_config_exit(dp, FTAG);
347 } else {
348 strval = NULL;
349 intval = za.za_first_integer;
350 }
351
352 spa_prop_add_list(*nvp, prop, strval, intval, src);
353
354 if (strval != NULL)
355 kmem_free(strval,
356 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
357
358 break;
359
360 case 1:
361 /* string property */
362 strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
363 err = zap_lookup(mos, spa->spa_pool_props_object,
364 za.za_name, 1, za.za_num_integers, strval);
365 if (err) {
366 kmem_free(strval, za.za_num_integers);
367 break;
368 }
369 spa_prop_add_list(*nvp, prop, strval, 0, src);
370 kmem_free(strval, za.za_num_integers);
371 break;
372
373 default:
374 break;
375 }
376 }
377 zap_cursor_fini(&zc);
378 mutex_exit(&spa->spa_props_lock);
379 out:
380 if (err && err != ENOENT) {
381 nvlist_free(*nvp);
382 *nvp = NULL;
383 return (err);
384 }
385
386 return (0);
387 }
388
389 /*
390 * Validate the given pool properties nvlist and modify the list
391 * for the property values to be set.
392 */
393 static int
394 spa_prop_validate(spa_t *spa, nvlist_t *props)
395 {
396 nvpair_t *elem;
397 int error = 0, reset_bootfs = 0;
398 uint64_t objnum = 0;
399 boolean_t has_feature = B_FALSE;
400
401 elem = NULL;
402 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
403 uint64_t intval;
404 char *strval, *slash, *check, *fname;
405 const char *propname = nvpair_name(elem);
406 zpool_prop_t prop = zpool_name_to_prop(propname);
407
408 switch (prop) {
409 case ZPROP_INVAL:
410 if (!zpool_prop_feature(propname)) {
411 error = SET_ERROR(EINVAL);
412 break;
413 }
414
415 /*
416 * Sanitize the input.
417 */
418 if (nvpair_type(elem) != DATA_TYPE_UINT64) {
419 error = SET_ERROR(EINVAL);
420 break;
421 }
422
423 if (nvpair_value_uint64(elem, &intval) != 0) {
424 error = SET_ERROR(EINVAL);
425 break;
426 }
427
428 if (intval != 0) {
429 error = SET_ERROR(EINVAL);
430 break;
431 }
432
433 fname = strchr(propname, '@') + 1;
434 if (zfeature_lookup_name(fname, NULL) != 0) {
435 error = SET_ERROR(EINVAL);
436 break;
437 }
438
439 has_feature = B_TRUE;
440 break;
441
442 case ZPOOL_PROP_VERSION:
443 error = nvpair_value_uint64(elem, &intval);
444 if (!error &&
445 (intval < spa_version(spa) ||
446 intval > SPA_VERSION_BEFORE_FEATURES ||
447 has_feature))
448 error = SET_ERROR(EINVAL);
449 break;
450
451 case ZPOOL_PROP_DELEGATION:
452 case ZPOOL_PROP_AUTOREPLACE:
453 case ZPOOL_PROP_LISTSNAPS:
454 case ZPOOL_PROP_AUTOEXPAND:
455 error = nvpair_value_uint64(elem, &intval);
456 if (!error && intval > 1)
457 error = SET_ERROR(EINVAL);
458 break;
459
460 case ZPOOL_PROP_BOOTFS:
461 /*
462 * If the pool version is less than SPA_VERSION_BOOTFS,
463 * or the pool is still being created (version == 0),
464 * the bootfs property cannot be set.
465 */
466 if (spa_version(spa) < SPA_VERSION_BOOTFS) {
467 error = SET_ERROR(ENOTSUP);
468 break;
469 }
470
471 /*
472 * Make sure the vdev config is bootable
473 */
474 if (!vdev_is_bootable(spa->spa_root_vdev)) {
475 error = SET_ERROR(ENOTSUP);
476 break;
477 }
478
479 reset_bootfs = 1;
480
481 error = nvpair_value_string(elem, &strval);
482
483 if (!error) {
484 objset_t *os;
485 uint64_t compress;
486
487 if (strval == NULL || strval[0] == '\0') {
488 objnum = zpool_prop_default_numeric(
489 ZPOOL_PROP_BOOTFS);
490 break;
491 }
492
493 if (error = dmu_objset_hold(strval, FTAG, &os))
494 break;
495
496 /* Must be ZPL and not gzip compressed. */
497
498 if (dmu_objset_type(os) != DMU_OST_ZFS) {
499 error = SET_ERROR(ENOTSUP);
500 } else if ((error =
501 dsl_prop_get_int_ds(dmu_objset_ds(os),
502 zfs_prop_to_name(ZFS_PROP_COMPRESSION),
503 &compress)) == 0 &&
504 !BOOTFS_COMPRESS_VALID(compress)) {
505 error = SET_ERROR(ENOTSUP);
506 } else {
507 objnum = dmu_objset_id(os);
508 }
509 dmu_objset_rele(os, FTAG);
510 }
511 break;
512
513 case ZPOOL_PROP_FAILUREMODE:
514 error = nvpair_value_uint64(elem, &intval);
515 if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
516 intval > ZIO_FAILURE_MODE_PANIC))
517 error = SET_ERROR(EINVAL);
518
519 /*
520 * This is a special case which only occurs when
521 * the pool has completely failed. This allows
522 * the user to change the in-core failmode property
523 * without syncing it out to disk (I/Os might
524 * currently be blocked). We do this by returning
525 * EIO to the caller (spa_prop_set) to trick it
526 * into thinking we encountered a property validation
527 * error.
528 */
529 if (!error && spa_suspended(spa)) {
530 spa->spa_failmode = intval;
531 error = SET_ERROR(EIO);
532 }
533 break;
534
535 case ZPOOL_PROP_CACHEFILE:
536 if ((error = nvpair_value_string(elem, &strval)) != 0)
537 break;
538
539 if (strval[0] == '\0')
540 break;
541
542 if (strcmp(strval, "none") == 0)
543 break;
544
545 if (strval[0] != '/') {
546 error = SET_ERROR(EINVAL);
547 break;
548 }
549
550 slash = strrchr(strval, '/');
551 ASSERT(slash != NULL);
552
553 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
554 strcmp(slash, "/..") == 0)
555 error = SET_ERROR(EINVAL);
556 break;
557
558 case ZPOOL_PROP_COMMENT:
559 if ((error = nvpair_value_string(elem, &strval)) != 0)
560 break;
561 for (check = strval; *check != '\0'; check++) {
562 /*
563 * The kernel doesn't have an easy isprint()
564 * check. For this kernel check, we merely
565 * check ASCII apart from DEL. Fix this if
566 * there is an easy-to-use kernel isprint().
567 */
568 if (*check >= 0x7f) {
569 error = SET_ERROR(EINVAL);
570 break;
571 }
573 }
574 if (strlen(strval) > ZPROP_MAX_COMMENT)
575				error = SET_ERROR(E2BIG);
576 break;
577
578 case ZPOOL_PROP_DEDUPDITTO:
579 if (spa_version(spa) < SPA_VERSION_DEDUP)
580 error = SET_ERROR(ENOTSUP);
581 else
582 error = nvpair_value_uint64(elem, &intval);
583 if (error == 0 &&
584 intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
585 error = SET_ERROR(EINVAL);
586 break;
587 }
588
589 if (error)
590 break;
591 }
592
593 if (!error && reset_bootfs) {
594 error = nvlist_remove(props,
595 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
596
597 if (!error) {
598 error = nvlist_add_uint64(props,
599 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
600 }
601 }
602
603 return (error);
604 }
605
606 void
607 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
608 {
609 char *cachefile;
610 spa_config_dirent_t *dp;
611
612 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
613 &cachefile) != 0)
614 return;
615
616 dp = kmem_alloc(sizeof (spa_config_dirent_t),
617 KM_SLEEP);
618
619 if (cachefile[0] == '\0')
620 dp->scd_path = spa_strdup(spa_config_path);
621 else if (strcmp(cachefile, "none") == 0)
622 dp->scd_path = NULL;
623 else
624 dp->scd_path = spa_strdup(cachefile);
625
626 list_insert_head(&spa->spa_config_list, dp);
627 if (need_sync)
628 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
629 }
630
631 int
632 spa_prop_set(spa_t *spa, nvlist_t *nvp)
633 {
634 int error;
635 nvpair_t *elem = NULL;
636 boolean_t need_sync = B_FALSE;
637
638 if ((error = spa_prop_validate(spa, nvp)) != 0)
639 return (error);
640
641 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
642 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
643
644 if (prop == ZPOOL_PROP_CACHEFILE ||
645 prop == ZPOOL_PROP_ALTROOT ||
646 prop == ZPOOL_PROP_READONLY)
647 continue;
648
649 if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
650 uint64_t ver;
651
652 if (prop == ZPOOL_PROP_VERSION) {
653 VERIFY(nvpair_value_uint64(elem, &ver) == 0);
654 } else {
655 ASSERT(zpool_prop_feature(nvpair_name(elem)));
656 ver = SPA_VERSION_FEATURES;
657 need_sync = B_TRUE;
658 }
659
660 /* Save time if the version is already set. */
661 if (ver == spa_version(spa))
662 continue;
663
664 /*
665 * In addition to the pool directory object, we might
666 * create the pool properties object, the features for
667 * read object, the features for write object, or the
668 * feature descriptions object.
669 */
670 error = dsl_sync_task(spa->spa_name, NULL,
671 spa_sync_version, &ver, 6);
672 if (error)
673 return (error);
674 continue;
675 }
676
677 need_sync = B_TRUE;
678 break;
679 }
680
681 if (need_sync) {
682 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
683 nvp, 6));
684 }
685
686 return (0);
687 }
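
/*
 * Hypothetical caller sketch (the in-tree entry point is
 * zfs_ioc_pool_set_props()): setting a single numeric pool property looks
 * roughly like this:
 *
 *	nvlist_t *props = fnvlist_alloc();
 *	fnvlist_add_uint64(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1);
 *	error = spa_prop_set(spa, props);
 *	fnvlist_free(props);
 */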
688
689 /*
690 * If the bootfs property value is dsobj, clear it.
691 */
692 void
693 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
694 {
695 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
696 VERIFY(zap_remove(spa->spa_meta_objset,
697 spa->spa_pool_props_object,
698 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
699 spa->spa_bootfs = 0;
700 }
701 }
702
703 /*ARGSUSED*/
704 static int
705 spa_change_guid_check(void *arg, dmu_tx_t *tx)
706 {
707 uint64_t *newguid = arg;
708 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
709 vdev_t *rvd = spa->spa_root_vdev;
710 uint64_t vdev_state;
711
712 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
713 vdev_state = rvd->vdev_state;
714 spa_config_exit(spa, SCL_STATE, FTAG);
715
716 if (vdev_state != VDEV_STATE_HEALTHY)
717 return (SET_ERROR(ENXIO));
718
719 ASSERT3U(spa_guid(spa), !=, *newguid);
720
721 return (0);
722 }
723
724 static void
725 spa_change_guid_sync(void *arg, dmu_tx_t *tx)
726 {
727 uint64_t *newguid = arg;
728 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
729 uint64_t oldguid;
730 vdev_t *rvd = spa->spa_root_vdev;
731
732 oldguid = spa_guid(spa);
733
734 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
735 rvd->vdev_guid = *newguid;
736 rvd->vdev_guid_sum += (*newguid - oldguid);
737 vdev_config_dirty(rvd);
738 spa_config_exit(spa, SCL_STATE, FTAG);
739
740 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
741 oldguid, *newguid);
742 }
743
744 /*
745 * Change the GUID for the pool. This is done so that we can later
746 * re-import a pool built from a clone of our own vdevs. We will modify
747 * the root vdev's guid, our own pool guid, and then mark all of our
748 * vdevs dirty. Note that we must make sure that all our vdevs are
749 * online when we do this, or else any vdevs that weren't present
750 * would be orphaned from our pool. We are also going to issue a
751 * sysevent to update any watchers.
752 */
753 int
754 spa_change_guid(spa_t *spa)
755 {
756 int error;
757 uint64_t guid;
758
759 mutex_enter(&spa->spa_vdev_top_lock);
760 mutex_enter(&spa_namespace_lock);
761 guid = spa_generate_guid(NULL);
762
763 error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
764 spa_change_guid_sync, &guid, 5);
765
766 if (error == 0) {
767 spa_config_sync(spa, B_FALSE, B_TRUE);
768 spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
769 }
770
771 mutex_exit(&spa_namespace_lock);
772 mutex_exit(&spa->spa_vdev_top_lock);
773
774 return (error);
775 }
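
/*
 * Usage note (sketch): from user context this path is reached via
 * "zpool reguid <pool>", which issues ZFS_IOC_POOL_REGUID and lands in
 * zfs_ioc_pool_reguid() -> spa_change_guid().
 */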
776
777 /*
778 * ==========================================================================
779 * SPA state manipulation (open/create/destroy/import/export)
780 * ==========================================================================
781 */
782
783 static int
784 spa_error_entry_compare(const void *a, const void *b)
785 {
786 spa_error_entry_t *sa = (spa_error_entry_t *)a;
787 spa_error_entry_t *sb = (spa_error_entry_t *)b;
788 int ret;
789
790 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
791 sizeof (zbookmark_t));
792
793 if (ret < 0)
794 return (-1);
795 else if (ret > 0)
796 return (1);
797 else
798 return (0);
799 }
800
801 /*
802 * Utility function which retrieves copies of the current logs and
803 * re-initializes them in the process.
804 */
805 void
806 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
807 {
808 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
809
810 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
811 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
812
813 avl_create(&spa->spa_errlist_scrub,
814 spa_error_entry_compare, sizeof (spa_error_entry_t),
815 offsetof(spa_error_entry_t, se_avl));
816 avl_create(&spa->spa_errlist_last,
817 spa_error_entry_compare, sizeof (spa_error_entry_t),
818 offsetof(spa_error_entry_t, se_avl));
819 }
820
821 static void
822 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
823 {
824 const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
825 enum zti_modes mode = ztip->zti_mode;
826 uint_t value = ztip->zti_value;
827 uint_t count = ztip->zti_count;
828 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
829 char name[32];
830 uint_t flags = 0;
831 boolean_t batch = B_FALSE;
832
833 if (mode == ZTI_MODE_NULL) {
834 tqs->stqs_count = 0;
835 tqs->stqs_taskq = NULL;
836 return;
837 }
838
839 ASSERT3U(count, >, 0);
840
841 tqs->stqs_count = count;
842 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
843
844 switch (mode) {
845 case ZTI_MODE_FIXED:
846 ASSERT3U(value, >=, 1);
847 value = MAX(value, 1);
848 break;
849
850 case ZTI_MODE_BATCH:
851 batch = B_TRUE;
852 flags |= TASKQ_THREADS_CPU_PCT;
853 value = zio_taskq_batch_pct;
854 break;
855
856 default:
857 panic("unrecognized mode for %s_%s taskq (%u:%u) in "
858 "spa_activate()",
859 zio_type_name[t], zio_taskq_types[q], mode, value);
860 break;
861 }
862
863 for (uint_t i = 0; i < count; i++) {
864 taskq_t *tq;
865
866 if (count > 1) {
867 (void) snprintf(name, sizeof (name), "%s_%s_%u",
868 zio_type_name[t], zio_taskq_types[q], i);
869 } else {
870 (void) snprintf(name, sizeof (name), "%s_%s",
871 zio_type_name[t], zio_taskq_types[q]);
872 }
873
874 if (zio_taskq_sysdc && spa->spa_proc != &p0) {
875 if (batch)
876 flags |= TASKQ_DC_BATCH;
877
878 tq = taskq_create_sysdc(name, value, 50, INT_MAX,
879 spa->spa_proc, zio_taskq_basedc, flags);
880 } else {
881 pri_t pri = maxclsyspri;
882 /*
883 * The write issue taskq can be extremely CPU
884 * intensive. Run it at slightly lower priority
885 * than the other taskqs.
886 */
887 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
888 pri--;
889
890 tq = taskq_create_proc(name, value, pri, 50,
891 INT_MAX, spa->spa_proc, flags);
892 }
893
894 tqs->stqs_taskq[i] = tq;
895 }
896 }
897
898 static void
899 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
900 {
901 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
902
903 if (tqs->stqs_taskq == NULL) {
904 ASSERT0(tqs->stqs_count);
905 return;
906 }
907
908 for (uint_t i = 0; i < tqs->stqs_count; i++) {
909 ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
910 taskq_destroy(tqs->stqs_taskq[i]);
911 }
912
913 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
914 tqs->stqs_taskq = NULL;
915 }
916
917 /*
918 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
919 * Note that a type may have multiple discrete taskqs to avoid lock contention
920 * on the taskq itself. In that case we choose which taskq at random by using
921 * the low bits of gethrtime().
922 */
923 void
924 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
925 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
926 {
927 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
928 taskq_t *tq;
929
930 ASSERT3P(tqs->stqs_taskq, !=, NULL);
931 ASSERT3U(tqs->stqs_count, !=, 0);
932
933 if (tqs->stqs_count == 1) {
934 tq = tqs->stqs_taskq[0];
935 } else {
936 tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
937 }
938
939 taskq_dispatch_ent(tq, func, arg, flags, ent);
940 }
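
/*
 * Typical dispatch sketch (simplified from zio_taskq_dispatch() in zio.c):
 *
 *	spa_taskq_dispatch_ent(spa, ZIO_TYPE_WRITE, ZIO_TASKQ_ISSUE,
 *	    (task_func_t *)zio_execute, zio, 0, &zio->io_tqent);
 */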
941
942 static void
943 spa_create_zio_taskqs(spa_t *spa)
944 {
945 for (int t = 0; t < ZIO_TYPES; t++) {
946 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
947 spa_taskqs_init(spa, t, q);
948 }
949 }
950 }
951
952 #ifdef _KERNEL
953 static void
954 spa_thread(void *arg)
955 {
956 callb_cpr_t cprinfo;
957
958 spa_t *spa = arg;
959 user_t *pu = PTOU(curproc);
960
961 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
962 spa->spa_name);
963
964 ASSERT(curproc != &p0);
965 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
966 "zpool-%s", spa->spa_name);
967 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
968
969 /* bind this thread to the requested psrset */
970 if (zio_taskq_psrset_bind != PS_NONE) {
971 pool_lock();
972 mutex_enter(&cpu_lock);
973 mutex_enter(&pidlock);
974 mutex_enter(&curproc->p_lock);
975
976 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
977 0, NULL, NULL) == 0) {
978 curthread->t_bind_pset = zio_taskq_psrset_bind;
979 } else {
980 cmn_err(CE_WARN,
981 "Couldn't bind process for zfs pool \"%s\" to "
982 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
983 }
984
985 mutex_exit(&curproc->p_lock);
986 mutex_exit(&pidlock);
987 mutex_exit(&cpu_lock);
988 pool_unlock();
989 }
990
991 if (zio_taskq_sysdc) {
992 sysdc_thread_enter(curthread, 100, 0);
993 }
994
995 spa->spa_proc = curproc;
996 spa->spa_did = curthread->t_did;
997
998 spa_create_zio_taskqs(spa);
999
1000 mutex_enter(&spa->spa_proc_lock);
1001 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
1002
1003 spa->spa_proc_state = SPA_PROC_ACTIVE;
1004 cv_broadcast(&spa->spa_proc_cv);
1005
1006 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1007 while (spa->spa_proc_state == SPA_PROC_ACTIVE)
1008 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1009 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
1010
1011 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
1012 spa->spa_proc_state = SPA_PROC_GONE;
1013 spa->spa_proc = &p0;
1014 cv_broadcast(&spa->spa_proc_cv);
1015 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
1016
1017 mutex_enter(&curproc->p_lock);
1018 lwp_exit();
1019 }
1020 #endif
1021
1022 /*
1023 * Activate an uninitialized pool.
1024 */
1025 static void
1026 spa_activate(spa_t *spa, int mode)
1027 {
1028 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
1029
1030 spa->spa_state = POOL_STATE_ACTIVE;
1031 spa->spa_mode = mode;
1032
1033 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
1034 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
1035
1036 /* Try to create a covering process */
1037 mutex_enter(&spa->spa_proc_lock);
1038 ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
1039 ASSERT(spa->spa_proc == &p0);
1040 spa->spa_did = 0;
1041
1042 /* Only create a process if we're going to be around a while. */
1043 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
1044 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
1045 NULL, 0) == 0) {
1046 spa->spa_proc_state = SPA_PROC_CREATED;
1047 while (spa->spa_proc_state == SPA_PROC_CREATED) {
1048 cv_wait(&spa->spa_proc_cv,
1049 &spa->spa_proc_lock);
1050 }
1051 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1052 ASSERT(spa->spa_proc != &p0);
1053 ASSERT(spa->spa_did != 0);
1054 } else {
1055 #ifdef _KERNEL
1056 cmn_err(CE_WARN,
1057 "Couldn't create process for zfs pool \"%s\"\n",
1058 spa->spa_name);
1059 #endif
1060 }
1061 }
1062 mutex_exit(&spa->spa_proc_lock);
1063
1064 /* If we didn't create a process, we need to create our taskqs. */
1065 if (spa->spa_proc == &p0) {
1066 spa_create_zio_taskqs(spa);
1067 }
1068
1069 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1070 offsetof(vdev_t, vdev_config_dirty_node));
1071 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1072 offsetof(vdev_t, vdev_state_dirty_node));
1073
1074 txg_list_create(&spa->spa_vdev_txg_list,
1075 offsetof(struct vdev, vdev_txg_node));
1076
1077 avl_create(&spa->spa_errlist_scrub,
1078 spa_error_entry_compare, sizeof (spa_error_entry_t),
1079 offsetof(spa_error_entry_t, se_avl));
1080 avl_create(&spa->spa_errlist_last,
1081 spa_error_entry_compare, sizeof (spa_error_entry_t),
1082 offsetof(spa_error_entry_t, se_avl));
1083 }
1084
1085 /*
1086 * Opposite of spa_activate().
1087 */
1088 static void
1089 spa_deactivate(spa_t *spa)
1090 {
1091 ASSERT(spa->spa_sync_on == B_FALSE);
1092 ASSERT(spa->spa_dsl_pool == NULL);
1093 ASSERT(spa->spa_root_vdev == NULL);
1094 ASSERT(spa->spa_async_zio_root == NULL);
1095 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
1096
1097 txg_list_destroy(&spa->spa_vdev_txg_list);
1098
1099 list_destroy(&spa->spa_config_dirty_list);
1100 list_destroy(&spa->spa_state_dirty_list);
1101
1102 for (int t = 0; t < ZIO_TYPES; t++) {
1103 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1104 spa_taskqs_fini(spa, t, q);
1105 }
1106 }
1107
1108 metaslab_class_destroy(spa->spa_normal_class);
1109 spa->spa_normal_class = NULL;
1110
1111 metaslab_class_destroy(spa->spa_log_class);
1112 spa->spa_log_class = NULL;
1113
1114 /*
1115 * If this was part of an import or the open otherwise failed, we may
1116 * still have errors left in the queues. Empty them just in case.
1117 */
1118 spa_errlog_drain(spa);
1119
1120 avl_destroy(&spa->spa_errlist_scrub);
1121 avl_destroy(&spa->spa_errlist_last);
1122
1123 spa->spa_state = POOL_STATE_UNINITIALIZED;
1124
1125 mutex_enter(&spa->spa_proc_lock);
1126 if (spa->spa_proc_state != SPA_PROC_NONE) {
1127 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1128 spa->spa_proc_state = SPA_PROC_DEACTIVATE;
1129 cv_broadcast(&spa->spa_proc_cv);
1130 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
1131 ASSERT(spa->spa_proc != &p0);
1132 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1133 }
1134 ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
1135 spa->spa_proc_state = SPA_PROC_NONE;
1136 }
1137 ASSERT(spa->spa_proc == &p0);
1138 mutex_exit(&spa->spa_proc_lock);
1139
1140 /*
1141 * We want to make sure spa_thread() has actually exited the ZFS
1142 * module, so that the module can't be unloaded out from underneath
1143 * it.
1144 */
1145 if (spa->spa_did != 0) {
1146 thread_join(spa->spa_did);
1147 spa->spa_did = 0;
1148 }
1149 }
1150
1151 /*
1152 * Verify a pool configuration, and construct the vdev tree appropriately. This
1153 * will create all the necessary vdevs in the appropriate layout, with each vdev
1154 * in the CLOSED state. This will prep the pool before open/creation/import.
1155 * All vdev validation is done by the vdev_alloc() routine.
1156 */
1157 static int
1158 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1159 uint_t id, int atype)
1160 {
1161 nvlist_t **child;
1162 uint_t children;
1163 int error;
1164
1165 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1166 return (error);
1167
1168 if ((*vdp)->vdev_ops->vdev_op_leaf)
1169 return (0);
1170
1171 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1172 &child, &children);
1173
1174 if (error == ENOENT)
1175 return (0);
1176
1177 if (error) {
1178 vdev_free(*vdp);
1179 *vdp = NULL;
1180 return (SET_ERROR(EINVAL));
1181 }
1182
1183 for (int c = 0; c < children; c++) {
1184 vdev_t *vd;
1185 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1186 atype)) != 0) {
1187 vdev_free(*vdp);
1188 *vdp = NULL;
1189 return (error);
1190 }
1191 }
1192
1193 ASSERT(*vdp != NULL);
1194
1195 return (0);
1196 }
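
/*
 * Illustration only (hypothetical device paths): a two-way mirror's
 * ZPOOL_CONFIG_VDEV_TREE nvlist has roughly this shape, which
 * spa_config_parse() walks recursively:
 *
 *	type = "root"
 *	children[0] = {
 *		type = "mirror"
 *		children[0] = { type = "disk", path = "/dev/dsk/c0t0d0s0" }
 *		children[1] = { type = "disk", path = "/dev/dsk/c0t1d0s0" }
 *	}
 */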
1197
1198 /*
1199 * Opposite of spa_load().
1200 */
1201 static void
1202 spa_unload(spa_t *spa)
1203 {
1204 int i;
1205
1206 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1207
1208 /*
1209 * Stop async tasks.
1210 */
1211 spa_async_suspend(spa);
1212
1213 /*
1214 * Stop syncing.
1215 */
1216 if (spa->spa_sync_on) {
1217 txg_sync_stop(spa->spa_dsl_pool);
1218 spa->spa_sync_on = B_FALSE;
1219 }
1220
1221 /*
1222 * Wait for any outstanding async I/O to complete.
1223 */
1224 if (spa->spa_async_zio_root != NULL) {
1225 (void) zio_wait(spa->spa_async_zio_root);
1226 spa->spa_async_zio_root = NULL;
1227 }
1228
1229 bpobj_close(&spa->spa_deferred_bpobj);
1230
1231 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1232
1233 /*
1234 * Close all vdevs.
1235 */
1236 if (spa->spa_root_vdev)
1237 vdev_free(spa->spa_root_vdev);
1238 ASSERT(spa->spa_root_vdev == NULL);
1239
1240 /*
1241 * Close the dsl pool.
1242 */
1243 if (spa->spa_dsl_pool) {
1244 dsl_pool_close(spa->spa_dsl_pool);
1245 spa->spa_dsl_pool = NULL;
1246 spa->spa_meta_objset = NULL;
1247 }
1248
1249 ddt_unload(spa);
1250
1252 /*
1253 * Drop and purge level 2 cache
1254 */
1255 spa_l2cache_drop(spa);
1256
1257 for (i = 0; i < spa->spa_spares.sav_count; i++)
1258 vdev_free(spa->spa_spares.sav_vdevs[i]);
1259 if (spa->spa_spares.sav_vdevs) {
1260 kmem_free(spa->spa_spares.sav_vdevs,
1261 spa->spa_spares.sav_count * sizeof (void *));
1262 spa->spa_spares.sav_vdevs = NULL;
1263 }
1264 if (spa->spa_spares.sav_config) {
1265 nvlist_free(spa->spa_spares.sav_config);
1266 spa->spa_spares.sav_config = NULL;
1267 }
1268 spa->spa_spares.sav_count = 0;
1269
1270 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
1271 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
1272 vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1273 }
1274 if (spa->spa_l2cache.sav_vdevs) {
1275 kmem_free(spa->spa_l2cache.sav_vdevs,
1276 spa->spa_l2cache.sav_count * sizeof (void *));
1277 spa->spa_l2cache.sav_vdevs = NULL;
1278 }
1279 if (spa->spa_l2cache.sav_config) {
1280 nvlist_free(spa->spa_l2cache.sav_config);
1281 spa->spa_l2cache.sav_config = NULL;
1282 }
1283 spa->spa_l2cache.sav_count = 0;
1284
1285 spa->spa_async_suspended = 0;
1286
1287 if (spa->spa_comment != NULL) {
1288 spa_strfree(spa->spa_comment);
1289 spa->spa_comment = NULL;
1290 }
1291
1292 spa_config_exit(spa, SCL_ALL, FTAG);
1293 }
1294
1295 /*
1296 * Load (or re-load) the current list of vdevs describing the active spares for
1297 * this pool. When this is called, we have some form of basic information in
1298 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
1299 * then re-generate a more complete list including status information.
1300 */
1301 static void
1302 spa_load_spares(spa_t *spa)
1303 {
1304 nvlist_t **spares;
1305 uint_t nspares;
1306 int i;
1307 vdev_t *vd, *tvd;
1308
1309 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1310
1311 /*
1312 * First, close and free any existing spare vdevs.
1313 */
1314 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1315 vd = spa->spa_spares.sav_vdevs[i];
1316
1317 /* Undo the call to spa_activate() below */
1318 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1319 B_FALSE)) != NULL && tvd->vdev_isspare)
1320 spa_spare_remove(tvd);
1321 vdev_close(vd);
1322 vdev_free(vd);
1323 }
1324
1325 if (spa->spa_spares.sav_vdevs)
1326 kmem_free(spa->spa_spares.sav_vdevs,
1327 spa->spa_spares.sav_count * sizeof (void *));
1328
1329 if (spa->spa_spares.sav_config == NULL)
1330 nspares = 0;
1331 else
1332 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1333 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1334
1335 spa->spa_spares.sav_count = (int)nspares;
1336 spa->spa_spares.sav_vdevs = NULL;
1337
1338 if (nspares == 0)
1339 return;
1340
1341 /*
1342 * Construct the array of vdevs, opening them to get status in the
1343	 * process. For each spare, there are potentially two different vdev_t
1344 * structures associated with it: one in the list of spares (used only
1345 * for basic validation purposes) and one in the active vdev
1346 * configuration (if it's spared in). During this phase we open and
1347 * validate each vdev on the spare list. If the vdev also exists in the
1348 * active configuration, then we also mark this vdev as an active spare.
1349 */
1350 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
1351 KM_SLEEP);
1352 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1353 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1354 VDEV_ALLOC_SPARE) == 0);
1355 ASSERT(vd != NULL);
1356
1357 spa->spa_spares.sav_vdevs[i] = vd;
1358
1359 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1360 B_FALSE)) != NULL) {
1361 if (!tvd->vdev_isspare)
1362 spa_spare_add(tvd);
1363
1364 /*
1365 * We only mark the spare active if we were successfully
1366 * able to load the vdev. Otherwise, importing a pool
1367 * with a bad active spare would result in strange
1368			 * behavior, because multiple pools would think the spare
1369 * is actively in use.
1370 *
1371 * There is a vulnerability here to an equally bizarre
1372 * circumstance, where a dead active spare is later
1373 * brought back to life (onlined or otherwise). Given
1374 * the rarity of this scenario, and the extra complexity
1375 * it adds, we ignore the possibility.
1376 */
1377 if (!vdev_is_dead(tvd))
1378 spa_spare_activate(tvd);
1379 }
1380
1381 vd->vdev_top = vd;
1382 vd->vdev_aux = &spa->spa_spares;
1383
1384 if (vdev_open(vd) != 0)
1385 continue;
1386
1387 if (vdev_validate_aux(vd) == 0)
1388 spa_spare_add(vd);
1389 }
1390
1391 /*
1392 * Recompute the stashed list of spares, with status information
1393 * this time.
1394 */
1395 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1396 DATA_TYPE_NVLIST_ARRAY) == 0);
1397
1398 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1399 KM_SLEEP);
1400 for (i = 0; i < spa->spa_spares.sav_count; i++)
1401 spares[i] = vdev_config_generate(spa,
1402 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1403 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1404 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1405 for (i = 0; i < spa->spa_spares.sav_count; i++)
1406 nvlist_free(spares[i]);
1407 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1408 }
1409
1410 /*
1411 * Load (or re-load) the current list of vdevs describing the active l2cache for
1412 * this pool. When this is called, we have some form of basic information in
1413 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
1414 * then re-generate a more complete list including status information.
1415 * Devices which are already active have their details maintained, and are
1416 * not re-opened.
1417 */
1418 static void
1419 spa_load_l2cache(spa_t *spa)
1420 {
1421 nvlist_t **l2cache;
1422 uint_t nl2cache;
1423 int i, j, oldnvdevs;
1424 uint64_t guid;
1425 vdev_t *vd, **oldvdevs, **newvdevs;
1426 spa_aux_vdev_t *sav = &spa->spa_l2cache;
1427
1428 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1429
1430 if (sav->sav_config != NULL) {
1431 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1432 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1433 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1434 } else {
1435 nl2cache = 0;
1436 newvdevs = NULL;
1437 }
1438
1439 oldvdevs = sav->sav_vdevs;
1440 oldnvdevs = sav->sav_count;
1441 sav->sav_vdevs = NULL;
1442 sav->sav_count = 0;
1443
1444 /*
1445 * Process new nvlist of vdevs.
1446 */
1447 for (i = 0; i < nl2cache; i++) {
1448 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1449 &guid) == 0);
1450
1451 newvdevs[i] = NULL;
1452 for (j = 0; j < oldnvdevs; j++) {
1453 vd = oldvdevs[j];
1454 if (vd != NULL && guid == vd->vdev_guid) {
1455 /*
1456 * Retain previous vdev for add/remove ops.
1457 */
1458 newvdevs[i] = vd;
1459 oldvdevs[j] = NULL;
1460 break;
1461 }
1462 }
1463
1464 if (newvdevs[i] == NULL) {
1465 /*
1466 * Create new vdev
1467 */
1468 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1469 VDEV_ALLOC_L2CACHE) == 0);
1470 ASSERT(vd != NULL);
1471 newvdevs[i] = vd;
1472
1473 /*
1474 * Commit this vdev as an l2cache device,
1475 * even if it fails to open.
1476 */
1477 spa_l2cache_add(vd);
1478
1479 vd->vdev_top = vd;
1480 vd->vdev_aux = sav;
1481
1482 spa_l2cache_activate(vd);
1483
1484 if (vdev_open(vd) != 0)
1485 continue;
1486
1487 (void) vdev_validate_aux(vd);
1488
1489 if (!vdev_is_dead(vd))
1490 l2arc_add_vdev(spa, vd);
1491 }
1492 }
1493
1494 /*
1495 * Purge vdevs that were dropped
1496 */
1497 for (i = 0; i < oldnvdevs; i++) {
1498 uint64_t pool;
1499
1500 vd = oldvdevs[i];
1501 if (vd != NULL) {
1502 ASSERT(vd->vdev_isl2cache);
1503
1504 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1505 pool != 0ULL && l2arc_vdev_present(vd))
1506 l2arc_remove_vdev(vd);
1507 vdev_clear_stats(vd);
1508 vdev_free(vd);
1509 }
1510 }
1511
1512 if (oldvdevs)
1513 kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1514
1515 if (sav->sav_config == NULL)
1516 goto out;
1517
1518 sav->sav_vdevs = newvdevs;
1519 sav->sav_count = (int)nl2cache;
1520
1521 /*
1522 * Recompute the stashed list of l2cache devices, with status
1523 * information this time.
1524 */
1525 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1526 DATA_TYPE_NVLIST_ARRAY) == 0);
1527
1528 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1529 for (i = 0; i < sav->sav_count; i++)
1530 l2cache[i] = vdev_config_generate(spa,
1531 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1532 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1533 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1534 out:
1535 for (i = 0; i < sav->sav_count; i++)
1536 nvlist_free(l2cache[i]);
1537 if (sav->sav_count)
1538 kmem_free(l2cache, sav->sav_count * sizeof (void *));
1539 }
1540
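/*
 * Read a packed nvlist object out of the MOS: the object's bonus buffer
 * holds a single uint64_t giving the packed size, and the object data holds
 * the packed nvlist itself, which nvlist_unpack() decodes.
 */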
1541 static int
1542 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1543 {
1544 dmu_buf_t *db;
1545 char *packed = NULL;
1546 size_t nvsize = 0;
1547 int error;
1548 *value = NULL;
1549
1550 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
1551 nvsize = *(uint64_t *)db->db_data;
1552 dmu_buf_rele(db, FTAG);
1553
1554 packed = kmem_alloc(nvsize, KM_SLEEP);
1555 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1556 DMU_READ_PREFETCH);
1557 if (error == 0)
1558 error = nvlist_unpack(packed, nvsize, value, 0);
1559 kmem_free(packed, nvsize);
1560
1561 return (error);
1562 }
1563
1564 /*
1565 * Checks to see if the given vdev could not be opened, in which case we post a
1566 * sysevent to notify the autoreplace code that the device has been removed.
1567 */
1568 static void
1569 spa_check_removed(vdev_t *vd)
1570 {
1571 for (int c = 0; c < vd->vdev_children; c++)
1572 spa_check_removed(vd->vdev_child[c]);
1573
1574 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
1575 !vd->vdev_ishole) {
1576 zfs_post_autoreplace(vd->vdev_spa, vd);
1577 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
1578 }
1579 }
1580
1581 /*
1582 * Validate the current config against the MOS config
1583 */
1584 static boolean_t
1585 spa_config_valid(spa_t *spa, nvlist_t *config)
1586 {
1587 vdev_t *mrvd, *rvd = spa->spa_root_vdev;
1588 nvlist_t *nv;
1589
1590 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
1591
1592 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1593 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
1594
1595 ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
1596
1597 /*
1598 * If we're doing a normal import, then build up any additional
1599 * diagnostic information about missing devices in this config.
1600 * We'll pass this up to the user for further processing.
1601 */
1602 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1603 nvlist_t **child, *nv;
1604 uint64_t idx = 0;
1605
1606 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1607 KM_SLEEP);
1608 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1609
1610 for (int c = 0; c < rvd->vdev_children; c++) {
1611 vdev_t *tvd = rvd->vdev_child[c];
1612 vdev_t *mtvd = mrvd->vdev_child[c];
1613
1614 if (tvd->vdev_ops == &vdev_missing_ops &&
1615 mtvd->vdev_ops != &vdev_missing_ops &&
1616 mtvd->vdev_islog)
1617 child[idx++] = vdev_config_generate(spa, mtvd,
1618 B_FALSE, 0);
1619 }
1620
1621 if (idx) {
1622 VERIFY(nvlist_add_nvlist_array(nv,
1623 ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
1624 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
1625 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
1626
1627 for (int i = 0; i < idx; i++)
1628 nvlist_free(child[i]);
1629 }
1630 nvlist_free(nv);
1631 kmem_free(child, rvd->vdev_children * sizeof (char **));
1632 }
1633
1634 /*
1635 * Compare the root vdev tree with the information we have
1636 * from the MOS config (mrvd). Check each top-level vdev
1637 * with the corresponding MOS config top-level (mtvd).
1638 */
1639 for (int c = 0; c < rvd->vdev_children; c++) {
1640 vdev_t *tvd = rvd->vdev_child[c];
1641 vdev_t *mtvd = mrvd->vdev_child[c];
1642
1643 /*
1644 * Resolve any "missing" vdevs in the current configuration.
1645 * If we find that the MOS config has more accurate information
1646 * about the top-level vdev then use that vdev instead.
1647 */
1648 if (tvd->vdev_ops == &vdev_missing_ops &&
1649 mtvd->vdev_ops != &vdev_missing_ops) {
1650
1651 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
1652 continue;
1653
1654 /*
1655 * Device specific actions.
1656 */
1657 if (mtvd->vdev_islog) {
1658 spa_set_log_state(spa, SPA_LOG_CLEAR);
1659 } else {
1660 /*
1661 * XXX - once we have 'readonly' pool
1662 * support we should be able to handle
1663 * missing data devices by transitioning
1664 * the pool to readonly.
1665 */
1666 continue;
1667 }
1668
1669 /*
1670 * Swap the missing vdev with the data we were
1671 * able to obtain from the MOS config.
1672 */
1673 vdev_remove_child(rvd, tvd);
1674 vdev_remove_child(mrvd, mtvd);
1675
1676 vdev_add_child(rvd, mtvd);
1677 vdev_add_child(mrvd, tvd);
1678
1679 spa_config_exit(spa, SCL_ALL, FTAG);
1680 vdev_load(mtvd);
1681 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1682
1683 vdev_reopen(rvd);
1684 } else if (mtvd->vdev_islog) {
1685 /*
1686 * Load the slog device's state from the MOS config
1687 * since it's possible that the label does not
1688 * contain the most up-to-date information.
1689 */
1690 vdev_load_log_state(tvd, mtvd);
1691 vdev_reopen(tvd);
1692 }
1693 }
1694 vdev_free(mrvd);
1695 spa_config_exit(spa, SCL_ALL, FTAG);
1696
1697 /*
1698 * Ensure we were able to validate the config.
1699 */
1700 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
1701 }
1702
1703 /*
1704 * Check for missing log devices
1705 */
1706 static boolean_t
1707 spa_check_logs(spa_t *spa)
1708 {
1709 boolean_t rv = B_FALSE;
1710
1711 switch (spa->spa_log_state) {
1712 case SPA_LOG_MISSING:
1713 /* need to recheck in case slog has been restored */
1714 case SPA_LOG_UNKNOWN:
1715 rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
1716 NULL, DS_FIND_CHILDREN) != 0);
1717 if (rv)
1718 spa_set_log_state(spa, SPA_LOG_MISSING);
1719 break;
1720 }
1721 return (rv);
1722 }
1723
1724 static boolean_t
1725 spa_passivate_log(spa_t *spa)
1726 {
1727 vdev_t *rvd = spa->spa_root_vdev;
1728 boolean_t slog_found = B_FALSE;
1729
1730 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1731
1732 if (!spa_has_slogs(spa))
1733 return (B_FALSE);
1734
1735 for (int c = 0; c < rvd->vdev_children; c++) {
1736 vdev_t *tvd = rvd->vdev_child[c];
1737 metaslab_group_t *mg = tvd->vdev_mg;
1738
1739 if (tvd->vdev_islog) {
1740 metaslab_group_passivate(mg);
1741 slog_found = B_TRUE;
1742 }
1743 }
1744
1745 return (slog_found);
1746 }
1747
1748 static void
1749 spa_activate_log(spa_t *spa)
1750 {
1751 vdev_t *rvd = spa->spa_root_vdev;
1752
1753 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1754
1755 for (int c = 0; c < rvd->vdev_children; c++) {
1756 vdev_t *tvd = rvd->vdev_child[c];
1757 metaslab_group_t *mg = tvd->vdev_mg;
1758
1759 if (tvd->vdev_islog)
1760 metaslab_group_activate(mg);
1761 }
1762 }
1763
1764 int
1765 spa_offline_log(spa_t *spa)
1766 {
1767 int error;
1768
1769 error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
1770 NULL, DS_FIND_CHILDREN);
1771 if (error == 0) {
1772 /*
1773 * We successfully offlined the log device, sync out the
1774 * current txg so that the "stubby" block can be removed
1775 * by zil_sync().
1776 */
1777 txg_wait_synced(spa->spa_dsl_pool, 0);
1778 }
1779 return (error);
1780 }
1781
1782 static void
1783 spa_aux_check_removed(spa_aux_vdev_t *sav)
1784 {
1785 for (int i = 0; i < sav->sav_count; i++)
1786 spa_check_removed(sav->sav_vdevs[i]);
1787 }
1788
1789 void
1790 spa_claim_notify(zio_t *zio)
1791 {
1792 spa_t *spa = zio->io_spa;
1793
1794 if (zio->io_error)
1795 return;
1796
1797 mutex_enter(&spa->spa_props_lock); /* any mutex will do */
1798 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1799 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1800 mutex_exit(&spa->spa_props_lock);
1801 }
1802
1803 typedef struct spa_load_error {
1804 uint64_t sle_meta_count;
1805 uint64_t sle_data_count;
1806 } spa_load_error_t;
1807
1808 static void
1809 spa_load_verify_done(zio_t *zio)
1810 {
1811 blkptr_t *bp = zio->io_bp;
1812 spa_load_error_t *sle = zio->io_private;
1813 dmu_object_type_t type = BP_GET_TYPE(bp);
1814 int error = zio->io_error;
1815
1816 if (error) {
1817 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1818 type != DMU_OT_INTENT_LOG)
1819 atomic_add_64(&sle->sle_meta_count, 1);
1820 else
1821 atomic_add_64(&sle->sle_data_count, 1);
1822 }
1823 zio_data_buf_free(zio->io_data, zio->io_size);
1824 }
1825
1826 /*ARGSUSED*/
1827 static int
1828 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1829 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1830 {
1831 if (bp != NULL) {
1832 zio_t *rio = arg;
1833 size_t size = BP_GET_PSIZE(bp);
1834 void *data = zio_data_buf_alloc(size);
1835
1836 zio_nowait(zio_read(rio, spa, bp, data, size,
1837 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1838 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
1839 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
1840 }
1841 return (0);
1842 }
1843
1844 static int
1845 spa_load_verify(spa_t *spa)
1846 {
1847 zio_t *rio;
1848 spa_load_error_t sle = { 0 };
1849 zpool_rewind_policy_t policy;
1850 boolean_t verify_ok = B_FALSE;
1851 int error;
1852
1853 zpool_get_rewind_policy(spa->spa_config, &policy);
1854
1855 if (policy.zrp_request & ZPOOL_NEVER_REWIND)
1856 return (0);
1857
1858 rio = zio_root(spa, NULL, &sle,
1859 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1860
1861 error = traverse_pool(spa, spa->spa_verify_min_txg,
1862 TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
1863
1864 (void) zio_wait(rio);
1865
1866 spa->spa_load_meta_errors = sle.sle_meta_count;
1867 spa->spa_load_data_errors = sle.sle_data_count;
1868
1869 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
1870 sle.sle_data_count <= policy.zrp_maxdata) {
1871 int64_t loss = 0;
1872
1873 verify_ok = B_TRUE;
1874 spa->spa_load_txg = spa->spa_uberblock.ub_txg;
1875 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
1876
1877 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
1878 VERIFY(nvlist_add_uint64(spa->spa_load_info,
1879 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
1880 VERIFY(nvlist_add_int64(spa->spa_load_info,
1881 ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
1882 VERIFY(nvlist_add_uint64(spa->spa_load_info,
1883 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
1884 } else {
1885 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
1886 }
1887
1888 if (error) {
1889 if (error != ENXIO && error != EIO)
1890 error = SET_ERROR(EIO);
1891 return (error);
1892 }
1893
1894 return (verify_ok ? 0 : EIO);
1895 }
1896
1897 /*
1898 * Find a value in the pool props object.
1899 */
1900 static void
1901 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
1902 {
1903 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
1904 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
1905 }
1906
1907 /*
1908 * Find a value in the pool directory object.
1909 */
1910 static int
1911 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
1912 {
1913 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1914 name, sizeof (uint64_t), 1, val));
1915 }
1916
1917 static int
1918 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
1919 {
1920 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
1921 return (err);
1922 }
1923
1924 /*
1925 * Fix up config after a partly-completed split. This is done with the
1926 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
1927 * pool have that entry in their config, but only the splitting one contains
1928 * a list of all the guids of the vdevs that are being split off.
1929 *
1930 * This function determines what to do with that list: either rejoin
1931 * all the disks to the pool, or complete the splitting process. To attempt
1932 * the rejoin, each disk that is offlined is marked online again, and
1933 * we do a reopen() call. If the vdev label for every disk that was
1934 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
1935 * then we call vdev_split() on each disk, and complete the split.
1936 *
1937 * Otherwise we leave the config alone, with all the vdevs in place in
1938 * the original pool.
1939 */
1940 static void
1941 spa_try_repair(spa_t *spa, nvlist_t *config)
1942 {
1943 uint_t extracted;
1944 uint64_t *glist;
1945 uint_t i, gcount;
1946 nvlist_t *nvl;
1947 vdev_t **vd;
1948 boolean_t attempt_reopen;
1949
1950 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
1951 return;
1952
1953 /* check that the config is complete */
1954 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
1955 &glist, &gcount) != 0)
1956 return;
1957
1958 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
1959
1960 /* attempt to online all the vdevs & validate */
1961 attempt_reopen = B_TRUE;
1962 for (i = 0; i < gcount; i++) {
1963 if (glist[i] == 0) /* vdev is hole */
1964 continue;
1965
1966 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
1967 if (vd[i] == NULL) {
1968 /*
1969 * Don't bother attempting to reopen the disks;
1970 * just do the split.
1971 */
1972 attempt_reopen = B_FALSE;
1973 } else {
1974 /* attempt to re-online it */
1975 vd[i]->vdev_offline = B_FALSE;
1976 }
1977 }
1978
1979 if (attempt_reopen) {
1980 vdev_reopen(spa->spa_root_vdev);
1981
1982 /* check each device to see what state it's in */
1983 for (extracted = 0, i = 0; i < gcount; i++) {
1984 if (vd[i] != NULL &&
1985 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
1986 break;
1987 ++extracted;
1988 }
1989 }
1990
1991 /*
1992 * If every disk has been moved to the new pool, or if we never
1993 * even attempted to look at them, then we split them off for
1994 * good.
1995 */
1996 if (!attempt_reopen || gcount == extracted) {
1997 for (i = 0; i < gcount; i++)
1998 if (vd[i] != NULL)
1999 vdev_split(vd[i]);
2000 vdev_reopen(spa->spa_root_vdev);
2001 }
2002
2003 kmem_free(vd, gcount * sizeof (vdev_t *));
2004 }
2005
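/*
 * Load the pool described by spa->spa_config.  This pulls the pool guid,
 * comment, version and config txg out of the config, rejects imports whose
 * guid collides with an existing pool, and then hands the real work off to
 * spa_load_impl().  On failure (other than EBADF, which indicates that the
 * pool was exported or destroyed) an ereport is posted.
 */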
2006 static int
2007 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
2008 boolean_t mosconfig)
2009 {
2010 nvlist_t *config = spa->spa_config;
2011 char *ereport = FM_EREPORT_ZFS_POOL;
2012 char *comment;
2013 int error;
2014 uint64_t pool_guid;
2015 nvlist_t *nvl;
2016
2017 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
2018 return (SET_ERROR(EINVAL));
2019
2020 ASSERT(spa->spa_comment == NULL);
2021 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
2022 spa->spa_comment = spa_strdup(comment);
2023
2024 /*
2025 * Versioning wasn't explicitly added to the label until later, so if
2026 * it's not present treat it as the initial version.
2027 */
2028 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
2029 &spa->spa_ubsync.ub_version) != 0)
2030 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
2031
2032 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
2033 &spa->spa_config_txg);
2034
2035 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
2036 spa_guid_exists(pool_guid, 0)) {
2037 error = SET_ERROR(EEXIST);
2038 } else {
2039 spa->spa_config_guid = pool_guid;
2040
2041 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
2042 &nvl) == 0) {
2043 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
2044 KM_SLEEP) == 0);
2045 }
2046
2047 nvlist_free(spa->spa_load_info);
2048 spa->spa_load_info = fnvlist_alloc();
2049
2050 gethrestime(&spa->spa_loaded_ts);
2051 error = spa_load_impl(spa, pool_guid, config, state, type,
2052 mosconfig, &ereport);
2053 }
2054
2055 spa->spa_minref = refcount_count(&spa->spa_refcount);
2056 if (error) {
2057 if (error != EEXIST) {
2058 spa->spa_loaded_ts.tv_sec = 0;
2059 spa->spa_loaded_ts.tv_nsec = 0;
2060 }
2061 if (error != EBADF) {
2062 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
2063 }
2064 }
2065 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2066 spa->spa_ena = 0;
2067
2068 return (error);
2069 }
2070
2071 /*
2072 * Load an existing storage pool, using the pool's builtin spa_config as a
2073 * source of configuration information.
2074 */
2075 static int
2076 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
2077 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
2078 char **ereport)
2079 {
2080 int error = 0;
2081 nvlist_t *nvroot = NULL;
2082 nvlist_t *label;
2083 vdev_t *rvd;
2084 uberblock_t *ub = &spa->spa_uberblock;
2085 uint64_t children, config_cache_txg = spa->spa_config_txg;
2086 int orig_mode = spa->spa_mode;
2087 int parse;
2088 uint64_t obj;
2089 boolean_t missing_feat_write = B_FALSE;
2090
2091 /*
2092 * If this is an untrusted config, access the pool in read-only mode.
2093 * This prevents things like resilvering recently removed devices.
2094 */
2095 if (!mosconfig)
2096 spa->spa_mode = FREAD;
2097
2098 ASSERT(MUTEX_HELD(&spa_namespace_lock));
2099
2100 spa->spa_load_state = state;
2101
2102 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
2103 return (SET_ERROR(EINVAL));
2104
2105 parse = (type == SPA_IMPORT_EXISTING ?
2106 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2107
2108 /*
2109 * Create "The Godfather" zio to hold all async IOs
2110 */
2111 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2112 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2113
2114 /*
2115 * Parse the configuration into a vdev tree. We explicitly set the
2116 * value that will be returned by spa_version() since parsing the
2117 * configuration requires knowing the version number.
2118 */
2119 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2120 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
2121 spa_config_exit(spa, SCL_ALL, FTAG);
2122
2123 if (error != 0)
2124 return (error);
2125
2126 ASSERT(spa->spa_root_vdev == rvd);
2127
2128 if (type != SPA_IMPORT_ASSEMBLE) {
2129 ASSERT(spa_guid(spa) == pool_guid);
2130 }
2131
2132 /*
2133 * Try to open all vdevs, loading each label in the process.
2134 */
2135 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2136 error = vdev_open(rvd);
2137 spa_config_exit(spa, SCL_ALL, FTAG);
2138 if (error != 0)
2139 return (error);
2140
2141 /*
2142 * We need to validate the vdev labels against the configuration that
2143 * we have in hand, which is dependent on the setting of mosconfig. If
2144 * mosconfig is true then we're validating the vdev labels based on
2145 * that config. Otherwise, we're validating against the cached config
2146 * (zpool.cache) that was read when we loaded the zfs module, and then
2147 * later we will recursively call spa_load() and validate against
2148 * the vdev config.
2149 *
2150 * If we're assembling a new pool that's been split off from an
2151 * existing pool, the labels haven't yet been updated so we skip
2152 * validation for now.
2153 */
2154 if (type != SPA_IMPORT_ASSEMBLE) {
2155 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2156 error = vdev_validate(rvd, mosconfig);
2157 spa_config_exit(spa, SCL_ALL, FTAG);
2158
2159 if (error != 0)
2160 return (error);
2161
2162 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2163 return (SET_ERROR(ENXIO));
2164 }
2165
2166 /*
2167 * Find the best uberblock.
2168 */
2169 vdev_uberblock_load(rvd, ub, &label);
2170
2171 /*
2172 * If we weren't able to find a single valid uberblock, return failure.
2173 */
2174 if (ub->ub_txg == 0) {
2175 nvlist_free(label);
2176 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2177 }
2178
2179 /*
2180 * If the pool has an unsupported version we can't open it.
2181 */
2182 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2183 nvlist_free(label);
2184 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
2185 }
2186
2187 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2188 nvlist_t *features;
2189
2190 /*
2191 * If we weren't able to find what's necessary for reading the
2192 * MOS in the label, return failure.
2193 */
2194 if (label == NULL || nvlist_lookup_nvlist(label,
2195 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
2196 nvlist_free(label);
2197 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2198 ENXIO));
2199 }
2200
2201 /*
2202 * Update our in-core representation with the definitive values
2203 * from the label.
2204 */
2205 nvlist_free(spa->spa_label_features);
2206 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2207 }
2208
2209 nvlist_free(label);
2210
2211 /*
2212 * Look through entries in the label nvlist's features_for_read. If
2213 * there is a feature listed there which we don't understand then we
2214 * cannot open the pool.
2215 */
2216 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2217 nvlist_t *unsup_feat;
2218
2219 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2220 0);
2221
2222 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2223 NULL); nvp != NULL;
2224 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2225 if (!zfeature_is_supported(nvpair_name(nvp))) {
2226 VERIFY(nvlist_add_string(unsup_feat,
2227 nvpair_name(nvp), "") == 0);
2228 }
2229 }
2230
2231 if (!nvlist_empty(unsup_feat)) {
2232 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2233 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2234 nvlist_free(unsup_feat);
2235 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2236 ENOTSUP));
2237 }
2238
2239 nvlist_free(unsup_feat);
2240 }
2241
2242 /*
2243 * If the vdev guid sum doesn't match the uberblock, we have an
2244 * incomplete configuration. We first check to see if the pool
2245 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
2246 * If it is, defer the vdev_guid_sum check till later so we
2247 * can handle missing vdevs.
2248 */
2249 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
2250 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
2251 rvd->vdev_guid_sum != ub->ub_guid_sum)
2252 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2253
2254 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2255 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2256 spa_try_repair(spa, config);
2257 spa_config_exit(spa, SCL_ALL, FTAG);
2258 nvlist_free(spa->spa_config_splitting);
2259 spa->spa_config_splitting = NULL;
2260 }
2261
2262 /*
2263 * Initialize internal SPA structures.
2264 */
2265 spa->spa_state = POOL_STATE_ACTIVE;
2266 spa->spa_ubsync = spa->spa_uberblock;
2267 spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2268 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2269 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2270 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2271 spa->spa_claim_max_txg = spa->spa_first_txg;
2272 spa->spa_prev_software_version = ub->ub_software_version;
2273
2274 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2275 if (error)
2276 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2277 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2278
2279 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
2280 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2281
2282 if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2283 boolean_t missing_feat_read = B_FALSE;
2284 nvlist_t *unsup_feat, *enabled_feat;
2285
2286 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2287 &spa->spa_feat_for_read_obj) != 0) {
2288 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2289 }
2290
2291 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2292 &spa->spa_feat_for_write_obj) != 0) {
2293 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2294 }
2295
2296 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2297 &spa->spa_feat_desc_obj) != 0) {
2298 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2299 }
2300
2301 enabled_feat = fnvlist_alloc();
2302 unsup_feat = fnvlist_alloc();
2303
2304 if (!feature_is_supported(spa->spa_meta_objset,
2305 spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
2306 unsup_feat, enabled_feat))
2307 missing_feat_read = B_TRUE;
2308
2309 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
2310 if (!feature_is_supported(spa->spa_meta_objset,
2311 spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
2312 unsup_feat, enabled_feat)) {
2313 missing_feat_write = B_TRUE;
2314 }
2315 }
2316
2317 fnvlist_add_nvlist(spa->spa_load_info,
2318 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
2319
2320 if (!nvlist_empty(unsup_feat)) {
2321 fnvlist_add_nvlist(spa->spa_load_info,
2322 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
2323 }
2324
2325 fnvlist_free(enabled_feat);
2326 fnvlist_free(unsup_feat);
2327
2328 if (!missing_feat_read) {
2329 fnvlist_add_boolean(spa->spa_load_info,
2330 ZPOOL_CONFIG_CAN_RDONLY);
2331 }
2332
2333 /*
2334 * If the state is SPA_LOAD_TRYIMPORT, our objective is
2335 * twofold: to determine whether the pool is available for
2336 * import in read-write mode and (if it is not) whether the
2337 * pool is available for import in read-only mode. If the pool
2338 * is available for import in read-write mode, it is displayed
2339 * as available in userland; if it is not available for import
2340 * in read-only mode, it is displayed as unavailable in
2341 * userland. If the pool is available for import in read-only
2342 * mode but not read-write mode, it is displayed as unavailable
2343 * in userland with a special note that the pool is actually
2344 * available for open in read-only mode.
2345 *
2346 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
2347 * missing a feature for write, we must first determine whether
2348 * the pool can be opened read-only before returning to
2349 * userland in order to know whether to display the
2350 * abovementioned note.
2351 */
2352 if (missing_feat_read || (missing_feat_write &&
2353 spa_writeable(spa))) {
2354 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2355 ENOTSUP));
2356 }
2357 }
2358
2359 spa->spa_is_initializing = B_TRUE;
2360 error = dsl_pool_open(spa->spa_dsl_pool);
2361 spa->spa_is_initializing = B_FALSE;
2362 if (error != 0)
2363 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2364
2365 if (!mosconfig) {
2366 uint64_t hostid;
2367 nvlist_t *policy = NULL, *nvconfig;
2368
2369 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2370 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2371
2372 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2373 ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2374 char *hostname;
2375 unsigned long myhostid = 0;
2376
2377 VERIFY(nvlist_lookup_string(nvconfig,
2378 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2379
2380 #ifdef _KERNEL
2381 myhostid = zone_get_hostid(NULL);
2382 #else /* _KERNEL */
2383 /*
2384 * We're emulating the system's hostid in userland, so
2385 * we can't use zone_get_hostid().
2386 */
2387 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
2388 #endif /* _KERNEL */
2389 if (hostid != 0 && myhostid != 0 &&
2390 hostid != myhostid) {
2391 nvlist_free(nvconfig);
2392 cmn_err(CE_WARN, "pool '%s' could not be "
2393 "loaded as it was last accessed by "
2394 "another system (host: %s hostid: 0x%lx). "
2395 "See: http://illumos.org/msg/ZFS-8000-EY",
2396 spa_name(spa), hostname,
2397 (unsigned long)hostid);
2398 return (SET_ERROR(EBADF));
2399 }
2400 }
2401 if (nvlist_lookup_nvlist(spa->spa_config,
2402 ZPOOL_REWIND_POLICY, &policy) == 0)
2403 VERIFY(nvlist_add_nvlist(nvconfig,
2404 ZPOOL_REWIND_POLICY, policy) == 0);
2405
2406 spa_config_set(spa, nvconfig);
2407 spa_unload(spa);
2408 spa_deactivate(spa);
2409 spa_activate(spa, orig_mode);
2410
2411 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
2412 }
2413
2414 /* Grab the secret checksum salt from the MOS. */
2415 if (spa_dir_prop(spa, DMU_POOL_CHECKSUM_SALT,
2416 &spa->spa_cksum_salt_obj) == 0) {
2417 if (zap_lookup(spa->spa_meta_objset, spa->spa_cksum_salt_obj,
2418 DMU_POOL_CHECKSUM_SALT, 1,
2419 sizeof (spa->spa_cksum_salt.zcs_bytes),
2420 spa->spa_cksum_salt.zcs_bytes) != 0) {
2421 /*
2422 * MOS format is broken, the salt object is there but
2423 * is missing the actual salt value.
2424 */
2425 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2426 }
2427 } else {
2428 /* Generate a new salt for subsequent use */
2429 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
2430 sizeof (spa->spa_cksum_salt.zcs_bytes));
2431 }
2432
2433 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
2434 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2435 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2436 if (error != 0)
2437 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2438
2439 /*
2440 * Load the bit that tells us to use the new accounting function
2441 * (raid-z deflation). If we have an older pool, this will not
2442 * be present.
2443 */
2444 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
2445 if (error != 0 && error != ENOENT)
2446 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2447
2448 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
2449 &spa->spa_creation_version);
2450 if (error != 0 && error != ENOENT)
2451 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2452
2453 /*
2454 * Load the persistent error log. If we have an older pool, this will
2455 * not be present.
2456 */
2457 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
2458 if (error != 0 && error != ENOENT)
2459 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2460
2461 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
2462 &spa->spa_errlog_scrub);
2463 if (error != 0 && error != ENOENT)
2464 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2465
2466 /*
2467 * Load the history object. If we have an older pool, this
2468 * will not be present.
2469 */
2470 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
2471 if (error != 0 && error != ENOENT)
2472 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2473
2474 /*
2475 * If we're assembling the pool from the split-off vdevs of
2476 * an existing pool, we don't want to attach the spares & cache
2477 * devices.
2478 */
2479
2480 /*
2481 * Load any hot spares for this pool.
2482 */
2483 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
2484 if (error != 0 && error != ENOENT)
2485 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2486 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2487 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
2488 if (load_nvlist(spa, spa->spa_spares.sav_object,
2489 &spa->spa_spares.sav_config) != 0)
2490 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2491
2492 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2493 spa_load_spares(spa);
2494 spa_config_exit(spa, SCL_ALL, FTAG);
2495 } else if (error == 0) {
2496 spa->spa_spares.sav_sync = B_TRUE;
2497 }
2498
2499 /*
2500 * Load any level 2 ARC devices for this pool.
2501 */
2502 error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
2503 &spa->spa_l2cache.sav_object);
2504 if (error != 0 && error != ENOENT)
2505 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2506 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2507 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
2508 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
2509 &spa->spa_l2cache.sav_config) != 0)
2510 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2511
2512 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2513 spa_load_l2cache(spa);
2514 spa_config_exit(spa, SCL_ALL, FTAG);
2515 } else if (error == 0) {
2516 spa->spa_l2cache.sav_sync = B_TRUE;
2517 }
2518
2519 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2520
2521 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
2522 if (error && error != ENOENT)
2523 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2524
2525 if (error == 0) {
2526 uint64_t autoreplace;
2527
2528 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
2529 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
2530 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
2531 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
2532 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
2533 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
2534 &spa->spa_dedup_ditto);
2535
2536 spa->spa_autoreplace = (autoreplace != 0);
2537 }
2538
2539 /*
2540 * If the 'autoreplace' property is set, then post a resource notifying
2541 * the ZFS DE that it should not issue any faults for unopenable
2542 * devices. We also iterate over the vdevs, and post a sysevent for any
2543 * unopenable vdevs so that the normal autoreplace handler can take
2544 * over.
2545 */
2546 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
2547 spa_check_removed(spa->spa_root_vdev);
2548 /*
2549 * For the import case, this is done in spa_import(), because
2550 * at this point we're using the spare definitions from
2551 * the MOS config, not necessarily from the userland config.
2552 */
2553 if (state != SPA_LOAD_IMPORT) {
2554 spa_aux_check_removed(&spa->spa_spares);
2555 spa_aux_check_removed(&spa->spa_l2cache);
2556 }
2557 }
2558
2559 /*
2560 * Load the vdev state for all toplevel vdevs.
2561 */
2562 vdev_load(rvd);
2563
2564 /*
2565 * Propagate the leaf DTLs we just loaded all the way up the tree.
2566 */
2567 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2568 vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
2569 spa_config_exit(spa, SCL_ALL, FTAG);
2570
2571 /*
2572 * Load the DDTs (dedup tables).
2573 */
2574 error = ddt_load(spa);
2575 if (error != 0)
2576 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2577
2578 spa_update_dspace(spa);
2579
2580 /*
2581 * Validate the config, using the MOS config to fill in any
2582 * information which might be missing. If we fail to validate
2583 * the config then declare the pool unfit for use. If we're
2584 * assembling a pool from a split, the log is not transferred
2585 * over.
2586 */
2587 if (type != SPA_IMPORT_ASSEMBLE) {
2588 nvlist_t *nvconfig;
2589
2590 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2591 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2592
2593 if (!spa_config_valid(spa, nvconfig)) {
2594 nvlist_free(nvconfig);
2595 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2596 ENXIO));
2597 }
2598 nvlist_free(nvconfig);
2599
2600 /*
2601 * Now that we've validated the config, check the state of the
2602 * root vdev. If it can't be opened, it indicates one or
2603 * more toplevel vdevs are faulted.
2604 */
2605 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2606 return (SET_ERROR(ENXIO));
2607
2608 if (spa_check_logs(spa)) {
2609 *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2610 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2611 }
2612 }
2613
2614 if (missing_feat_write) {
2615 ASSERT(state == SPA_LOAD_TRYIMPORT);
2616
2617 /*
2618 * At this point, we know that we can open the pool in
2619 * read-only mode but not read-write mode. We now have enough
2620 * information and can return to userland.
2621 */
2622 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
2623 }
2624
2625 /*
2626 * We've successfully opened the pool, verify that we're ready
2627 * to start pushing transactions.
2628 */
2629 if (state != SPA_LOAD_TRYIMPORT) {
2630 if ((error = spa_load_verify(spa)) != 0)
2631 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2632 error));
2633 }
2634
2635 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2636 spa->spa_load_max_txg == UINT64_MAX)) {
2637 dmu_tx_t *tx;
2638 int need_update = B_FALSE;
2639
2640 ASSERT(state != SPA_LOAD_TRYIMPORT);
2641
2642 /*
2643 * Claim log blocks that haven't been committed yet.
2644 * This must all happen in a single txg.
2645 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2646 * invoked from zil_claim_log_block()'s i/o done callback.
2647 * Price of rollback is that we abandon the log.
2648 */
2649 spa->spa_claiming = B_TRUE;
2650
2651 tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2652 spa_first_txg(spa));
2653 (void) dmu_objset_find(spa_name(spa),
2654 zil_claim, tx, DS_FIND_CHILDREN);
2655 dmu_tx_commit(tx);
2656
2657 spa->spa_claiming = B_FALSE;
2658
2659 spa_set_log_state(spa, SPA_LOG_GOOD);
2660 spa->spa_sync_on = B_TRUE;
2661 txg_sync_start(spa->spa_dsl_pool);
2662
2663 /*
2664 * Wait for all claims to sync. We sync up to the highest
2665 * claimed log block birth time so that claimed log blocks
2666 * don't appear to be from the future. spa_claim_max_txg
2667 * will have been set for us by either zil_check_log_chain()
2668 * (invoked from spa_check_logs()) or zil_claim() above.
2669 */
2670 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2671
2672 /*
2673 * If the config cache is stale, or we have uninitialized
2674 * metaslabs (see spa_vdev_add()), then update the config.
2675 *
2676 * If this is a verbatim import, trust the current
2677 * in-core spa_config and update the disk labels.
2678 */
2679 if (config_cache_txg != spa->spa_config_txg ||
2680 state == SPA_LOAD_IMPORT ||
2681 state == SPA_LOAD_RECOVER ||
2682 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2683 need_update = B_TRUE;
2684
2685 for (int c = 0; c < rvd->vdev_children; c++)
2686 if (rvd->vdev_child[c]->vdev_ms_array == 0)
2687 need_update = B_TRUE;
2688
2689 /*
2690 * Update the config cache asynchronously in case we're the
2691 * root pool, in which case the config cache isn't writable yet.
2692 */
2693 if (need_update)
2694 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2695
2696 /*
2697 * Check all DTLs to see if anything needs resilvering.
2698 */
2699 if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2700 vdev_resilver_needed(rvd, NULL, NULL))
2701 spa_async_request(spa, SPA_ASYNC_RESILVER);
2702
2703 /*
2704 * Log the fact that we booted up (so that we can detect if
2705 * we rebooted in the middle of an operation).
2706 */
2707 spa_history_log_version(spa, "open");
2708
2709 /*
2710 * Delete any inconsistent datasets.
2711 */
2712 (void) dmu_objset_find(spa_name(spa),
2713 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2714
2715 /*
2716 * Clean up any stale temporary dataset userrefs.
2717 */
2718 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
2719 }
2720
2721 return (0);
2722 }
2723
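/*
 * Unload and reactivate the pool, then retry spa_load() with
 * spa_load_max_txg lowered by one so that the next attempt settles on an
 * older uberblock.  Used by spa_load_best() when walking back through the
 * rewind window.
 */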
2724 static int
2725 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2726 {
2727 int mode = spa->spa_mode;
2728
2729 spa_unload(spa);
2730 spa_deactivate(spa);
2731
2732 spa->spa_load_max_txg--;
2733
2734 spa_activate(spa, mode);
2735 spa_async_suspend(spa);
2736
2737 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2738 }
2739
2740 /*
2741 * If spa_load() fails this function will try loading prior txg's. If
2742 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
2743 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
2744 * function will not rewind the pool and will return the same error as
2745 * spa_load().
2746 */
2747 static int
2748 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2749 uint64_t max_request, int rewind_flags)
2750 {
2751 nvlist_t *loadinfo = NULL;
2752 nvlist_t *config = NULL;
2753 int load_error, rewind_error;
2754 uint64_t safe_rewind_txg;
2755 uint64_t min_txg;
2756
2757 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2758 spa->spa_load_max_txg = spa->spa_load_txg;
2759 spa_set_log_state(spa, SPA_LOG_CLEAR);
2760 } else {
2761 spa->spa_load_max_txg = max_request;
2762 }
2763
2764 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2765 mosconfig);
2766 if (load_error == 0)
2767 return (0);
2768
2769 if (spa->spa_root_vdev != NULL)
2770 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2771
2772 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2773 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2774
2775 if (rewind_flags & ZPOOL_NEVER_REWIND) {
2776 nvlist_free(config);
2777 return (load_error);
2778 }
2779
2780 if (state == SPA_LOAD_RECOVER) {
2781 /* Price of rolling back is discarding txgs, including log */
2782 spa_set_log_state(spa, SPA_LOG_CLEAR);
2783 } else {
2784 /*
2785 * If we aren't rolling back save the load info from our first
2786 * import attempt so that we can restore it after attempting
2787 * to rewind.
2788 */
2789 loadinfo = spa->spa_load_info;
2790 spa->spa_load_info = fnvlist_alloc();
2791 }
2792
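	/*
	 * Walk back through progressively older uberblocks.  By default the
	 * rewind window stops TXG_DEFER_SIZE txgs short of the last synced
	 * uberblock (e.g. with TXG_DEFER_SIZE of 2, a pool last synced at
	 * txg 100 may rewind no further than txg 98); extreme rewind allows
	 * going all the way back to TXG_INITIAL.
	 */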
2793 spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2794 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2795 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2796 TXG_INITIAL : safe_rewind_txg;
2797
2798 /*
2799 * Continue as long as we're finding errors, we're still within
2800 * the acceptable rewind range, and we're still finding uberblocks.
2801 */
2802 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2803 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2804 if (spa->spa_load_max_txg < safe_rewind_txg)
2805 spa->spa_extreme_rewind = B_TRUE;
2806 rewind_error = spa_load_retry(spa, state, mosconfig);
2807 }
2808
2809 spa->spa_extreme_rewind = B_FALSE;
2810 spa->spa_load_max_txg = UINT64_MAX;
2811
2812 if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2813 spa_config_set(spa, config);
2814
2815 if (state == SPA_LOAD_RECOVER) {
2816 ASSERT3P(loadinfo, ==, NULL);
2817 return (rewind_error);
2818 } else {
2819 /* Store the rewind info as part of the initial load info */
2820 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
2821 spa->spa_load_info);
2822
2823 /* Restore the initial load info */
2824 fnvlist_free(spa->spa_load_info);
2825 spa->spa_load_info = loadinfo;
2826
2827 return (load_error);
2828 }
2829 }
2830
2831 /*
2832 * Pool Open/Import
2833 *
2834 * The import case is identical to an open except that the configuration is
2835 * sent down from userland, instead of grabbed from the configuration cache.
2836 * For the case of an open, the pool configuration will exist in the
2837 * POOL_STATE_UNINITIALIZED state.
2838 *
2839 * The stats information (gen/count/ustats) is used to gather vdev statistics
2840 * at the same time we open the pool, without having to keep around the spa_t
2841 * in some ambiguous state.
2842 */
2843 static int
2844 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2845 nvlist_t **config)
2846 {
2847 spa_t *spa;
2848 spa_load_state_t state = SPA_LOAD_OPEN;
2849 int error;
2850 int locked = B_FALSE;
2851
2852 *spapp = NULL;
2853
2854 /*
2855 * As disgusting as this is, we need to support recursive calls to this
2856 * function because dsl_dir_open() is called during spa_load(), and ends
2857 * up calling spa_open() again. The real fix is to figure out how to
2858 * avoid dsl_dir_open() calling this in the first place.
2859 */
2860 if (mutex_owner(&spa_namespace_lock) != curthread) {
2861 mutex_enter(&spa_namespace_lock);
2862 locked = B_TRUE;
2863 }
2864
2865 if ((spa = spa_lookup(pool)) == NULL) {
2866 if (locked)
2867 mutex_exit(&spa_namespace_lock);
2868 return (SET_ERROR(ENOENT));
2869 }
2870
2871 if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
2872 zpool_rewind_policy_t policy;
2873
2874 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
2875 &policy);
2876 if (policy.zrp_request & ZPOOL_DO_REWIND)
2877 state = SPA_LOAD_RECOVER;
2878
2879 spa_activate(spa, spa_mode_global);
2880
2881 if (state != SPA_LOAD_RECOVER)
2882 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
2883
2884 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
2885 policy.zrp_request);
2886
2887 if (error == EBADF) {
2888 /*
2889 * If vdev_validate() returns failure (indicated by
2890 * EBADF), it means that one of the vdev labels indicates
2891 * that the pool has been exported or destroyed.  If
2892 * this is the case, the config cache is out of sync and
2893 * we should remove the pool from the namespace.
2894 */
2895 spa_unload(spa);
2896 spa_deactivate(spa);
2897 spa_config_sync(spa, B_TRUE, B_TRUE);
2898 spa_remove(spa);
2899 if (locked)
2900 mutex_exit(&spa_namespace_lock);
2901 return (SET_ERROR(ENOENT));
2902 }
2903
2904 if (error) {
2905 /*
2906 * We can't open the pool, but we still have useful
2907 * information: the state of each vdev after the
2908 * attempted vdev_open(). Return this to the user.
2909 */
2910 if (config != NULL && spa->spa_config) {
2911 VERIFY(nvlist_dup(spa->spa_config, config,
2912 KM_SLEEP) == 0);
2913 VERIFY(nvlist_add_nvlist(*config,
2914 ZPOOL_CONFIG_LOAD_INFO,
2915 spa->spa_load_info) == 0);
2916 }
2917 spa_unload(spa);
2918 spa_deactivate(spa);
2919 spa->spa_last_open_failed = error;
2920 if (locked)
2921 mutex_exit(&spa_namespace_lock);
2922 *spapp = NULL;
2923 return (error);
2924 }
2925 }
2926
2927 spa_open_ref(spa, tag);
2928
2929 if (config != NULL)
2930 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2931
2932 /*
2933 * If we've recovered the pool, pass back any information we
2934 * gathered while doing the load.
2935 */
2936 if (state == SPA_LOAD_RECOVER) {
2937 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
2938 spa->spa_load_info) == 0);
2939 }
2940
2941 if (locked) {
2942 spa->spa_last_open_failed = 0;
2943 spa->spa_last_ubsync_txg = 0;
2944 spa->spa_load_txg = 0;
2945 mutex_exit(&spa_namespace_lock);
2946 }
2947
2948 *spapp = spa;
2949
2950 return (0);
2951 }
2952
2953 int
2954 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
2955 nvlist_t **config)
2956 {
2957 return (spa_open_common(name, spapp, tag, policy, config));
2958 }
2959
2960 int
2961 spa_open(const char *name, spa_t **spapp, void *tag)
2962 {
2963 return (spa_open_common(name, spapp, tag, NULL, NULL));
2964 }
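
/*
 * A minimal usage sketch (hypothetical caller; "tank" is just an example
 * pool name): open the pool by name, use it, then drop the reference.
 *
 *	spa_t *spa;
 *
 *	if (spa_open("tank", &spa, FTAG) == 0) {
 *		... examine or operate on the pool ...
 *		spa_close(spa, FTAG);
 *	}
 */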
2965
2966 /*
2967 * Lookup the given spa_t, incrementing the inject count in the process,
2968 * preventing it from being exported or destroyed.
2969 */
2970 spa_t *
2971 spa_inject_addref(char *name)
2972 {
2973 spa_t *spa;
2974
2975 mutex_enter(&spa_namespace_lock);
2976 if ((spa = spa_lookup(name)) == NULL) {
2977 mutex_exit(&spa_namespace_lock);
2978 return (NULL);
2979 }
2980 spa->spa_inject_ref++;
2981 mutex_exit(&spa_namespace_lock);
2982
2983 return (spa);
2984 }
2985
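/*
 * Release a reference taken by spa_inject_addref().
 */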
2986 void
2987 spa_inject_delref(spa_t *spa)
2988 {
2989 mutex_enter(&spa_namespace_lock);
2990 spa->spa_inject_ref--;
2991 mutex_exit(&spa_namespace_lock);
2992 }
2993
2994 /*
2995 * Add spare device information to the nvlist.
2996 */
2997 static void
2998 spa_add_spares(spa_t *spa, nvlist_t *config)
2999 {
3000 nvlist_t **spares;
3001 uint_t i, nspares;
3002 nvlist_t *nvroot;
3003 uint64_t guid;
3004 vdev_stat_t *vs;
3005 uint_t vsc;
3006 uint64_t pool;
3007
3008 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3009
3010 if (spa->spa_spares.sav_count == 0)
3011 return;
3012
3013 VERIFY(nvlist_lookup_nvlist(config,
3014 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3015 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
3016 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
3017 if (nspares != 0) {
3018 VERIFY(nvlist_add_nvlist_array(nvroot,
3019 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3020 VERIFY(nvlist_lookup_nvlist_array(nvroot,
3021 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
3022
3023 /*
3024 * Go through and find any spares which have since been
3025 * repurposed as an active spare. If this is the case, update
3026 * their status appropriately.
3027 */
3028 for (i = 0; i < nspares; i++) {
3029 VERIFY(nvlist_lookup_uint64(spares[i],
3030 ZPOOL_CONFIG_GUID, &guid) == 0);
3031 if (spa_spare_exists(guid, &pool, NULL) &&
3032 pool != 0ULL) {
3033 VERIFY(nvlist_lookup_uint64_array(
3034 spares[i], ZPOOL_CONFIG_VDEV_STATS,
3035 (uint64_t **)&vs, &vsc) == 0);
3036 vs->vs_state = VDEV_STATE_CANT_OPEN;
3037 vs->vs_aux = VDEV_AUX_SPARED;
3038 }
3039 }
3040 }
3041 }
3042
3043 /*
3044 * Add l2cache device information to the nvlist, including vdev stats.
3045 */
3046 static void
3047 spa_add_l2cache(spa_t *spa, nvlist_t *config)
3048 {
3049 nvlist_t **l2cache;
3050 uint_t i, j, nl2cache;
3051 nvlist_t *nvroot;
3052 uint64_t guid;
3053 vdev_t *vd;
3054 vdev_stat_t *vs;
3055 uint_t vsc;
3056
3057 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3058
3059 if (spa->spa_l2cache.sav_count == 0)
3060 return;
3061
3062 VERIFY(nvlist_lookup_nvlist(config,
3063 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3064 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3065 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3066 if (nl2cache != 0) {
3067 VERIFY(nvlist_add_nvlist_array(nvroot,
3068 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3069 VERIFY(nvlist_lookup_nvlist_array(nvroot,
3070 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3071
3072 /*
3073 * Update level 2 cache device stats.
3074 */
3075
3076 for (i = 0; i < nl2cache; i++) {
3077 VERIFY(nvlist_lookup_uint64(l2cache[i],
3078 ZPOOL_CONFIG_GUID, &guid) == 0);
3079
3080 vd = NULL;
3081 for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
3082 if (guid ==
3083 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
3084 vd = spa->spa_l2cache.sav_vdevs[j];
3085 break;
3086 }
3087 }
3088 ASSERT(vd != NULL);
3089
3090 VERIFY(nvlist_lookup_uint64_array(l2cache[i],
3091 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
3092 == 0);
3093 vdev_get_stats(vd, vs);
3094 }
3095 }
3096 }
3097
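/*
 * Add the pool's feature reference counts (from the for_read and for_write
 * feature ZAP objects) to the config nvlist under ZPOOL_CONFIG_FEATURE_STATS.
 */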
3098 static void
3099 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
3100 {
3101 nvlist_t *features;
3102 zap_cursor_t zc;
3103 zap_attribute_t za;
3104
3105 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3106 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3107
3108 if (spa->spa_feat_for_read_obj != 0) {
3109 for (zap_cursor_init(&zc, spa->spa_meta_objset,
3110 spa->spa_feat_for_read_obj);
3111 zap_cursor_retrieve(&zc, &za) == 0;
3112 zap_cursor_advance(&zc)) {
3113 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3114 za.za_num_integers == 1);
3115 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3116 za.za_first_integer));
3117 }
3118 zap_cursor_fini(&zc);
3119 }
3120
3121 if (spa->spa_feat_for_write_obj != 0) {
3122 for (zap_cursor_init(&zc, spa->spa_meta_objset,
3123 spa->spa_feat_for_write_obj);
3124 zap_cursor_retrieve(&zc, &za) == 0;
3125 zap_cursor_advance(&zc)) {
3126 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3127 za.za_num_integers == 1);
3128 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3129 za.za_first_integer));
3130 }
3131 zap_cursor_fini(&zc);
3132 }
3133
3134 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
3135 features) == 0);
3136 nvlist_free(features);
3137 }
3138
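/*
 * Gather the pool's configuration and statistics for userland.  This opens
 * the pool if possible, generates a config nvlist annotated with load times,
 * error counts, spare, l2cache and feature information, and also returns the
 * pool's alternate root, even for pools that failed to open.
 */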
3139 int
3140 spa_get_stats(const char *name, nvlist_t **config,
3141 char *altroot, size_t buflen)
3142 {
3143 int error;
3144 spa_t *spa;
3145
3146 *config = NULL;
3147 error = spa_open_common(name, &spa, FTAG, NULL, config);
3148
3149 if (spa != NULL) {
3150 /*
3151 * This still leaves a window of inconsistency where the spares
3152 * or l2cache devices could change and the config would be
3153 * self-inconsistent.
3154 */
3155 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3156
3157 if (*config != NULL) {
3158 uint64_t loadtimes[2];
3159
3160 loadtimes[0] = spa->spa_loaded_ts.tv_sec;
3161 loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
3162 VERIFY(nvlist_add_uint64_array(*config,
3163 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
3164
3165 VERIFY(nvlist_add_uint64(*config,
3166 ZPOOL_CONFIG_ERRCOUNT,
3167 spa_get_errlog_size(spa)) == 0);
3168
3169 if (spa_suspended(spa))
3170 VERIFY(nvlist_add_uint64(*config,
3171 ZPOOL_CONFIG_SUSPENDED,
3172 spa->spa_failmode) == 0);
3173
3174 spa_add_spares(spa, *config);
3175 spa_add_l2cache(spa, *config);
3176 spa_add_feature_stats(spa, *config);
3177 }
3178 }
3179
3180 /*
3181 * We want to get the alternate root even for faulted pools, so we cheat
3182 * and call spa_lookup() directly.
3183 */
3184 if (altroot) {
3185 if (spa == NULL) {
3186 mutex_enter(&spa_namespace_lock);
3187 spa = spa_lookup(name);
3188 if (spa)
3189 spa_altroot(spa, altroot, buflen);
3190 else
3191 altroot[0] = '\0';
3192 spa = NULL;
3193 mutex_exit(&spa_namespace_lock);
3194 } else {
3195 spa_altroot(spa, altroot, buflen);
3196 }
3197 }
3198
3199 if (spa != NULL) {
3200 spa_config_exit(spa, SCL_CONFIG, FTAG);
3201 spa_close(spa, FTAG);
3202 }
3203
3204 return (error);
3205 }
3206
3207 /*
3208 * Validate that the auxiliary device array is well formed. We must have an
3209 * array of nvlists, each of which describes a valid leaf vdev. If this is an
3210 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
3211 * specified, as long as they are well-formed.
3212 */
3213 static int
3214 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
3215 spa_aux_vdev_t *sav, const char *config, uint64_t version,
3216 vdev_labeltype_t label)
3217 {
3218 nvlist_t **dev;
3219 uint_t i, ndev;
3220 vdev_t *vd;
3221 int error;
3222
3223 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3224
3225 /*
3226 * It's acceptable to have no devs specified.
3227 */
3228 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
3229 return (0);
3230
3231 if (ndev == 0)
3232 return (SET_ERROR(EINVAL));
3233
3234 /*
3235 * Make sure the pool is formatted with a version that supports this
3236 * device type.
3237 */
3238 if (spa_version(spa) < version)
3239 return (SET_ERROR(ENOTSUP));
3240
3241 /*
3242 * Set the pending device list so we correctly handle device in-use
3243 * checking.
3244 */
3245 sav->sav_pending = dev;
3246 sav->sav_npending = ndev;
3247
3248 for (i = 0; i < ndev; i++) {
3249 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
3250 mode)) != 0)
3251 goto out;
3252
3253 if (!vd->vdev_ops->vdev_op_leaf) {
3254 vdev_free(vd);
3255 error = SET_ERROR(EINVAL);
3256 goto out;
3257 }
3258
3259 /*
3260 * The L2ARC currently only supports disk devices in kernel
3261 * context.  For user-level testing, we allow any device type.
3262 */
3263 #ifdef _KERNEL
3264 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
3265 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
3266 error = SET_ERROR(ENOTBLK);
3267 vdev_free(vd);
3268 goto out;
3269 }
3270 #endif
3271 vd->vdev_top = vd;
3272
3273 if ((error = vdev_open(vd)) == 0 &&
3274 (error = vdev_label_init(vd, crtxg, label)) == 0) {
3275 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
3276 vd->vdev_guid) == 0);
3277 }
3278
3279 vdev_free(vd);
3280
3281 if (error &&
3282 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
3283 goto out;
3284 else
3285 error = 0;
3286 }
3287
3288 out:
3289 sav->sav_pending = NULL;
3290 sav->sav_npending = 0;
3291 return (error);
3292 }
3293
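/*
 * Validate both auxiliary device classes (spares and l2cache) in nvroot.
 */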
3294 static int
3295 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
3296 {
3297 int error;
3298
3299 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3300
3301 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3302 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
3303 VDEV_LABEL_SPARE)) != 0) {
3304 return (error);
3305 }
3306
3307 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3308 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
3309 VDEV_LABEL_L2CACHE));
3310 }
3311
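/*
 * Install the given devices as the named aux vdev list (spares or l2cache)
 * in sav_config, appending them to any devices that are already present.
 */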
3312 static void
3313 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
3314 const char *config)
3315 {
3316 int i;
3317
3318 if (sav->sav_config != NULL) {
3319 nvlist_t **olddevs;
3320 uint_t oldndevs;
3321 nvlist_t **newdevs;
3322
3323 /*
3324 * Generate new dev list by concatenating with the
3325 * current dev list.
3326 */
3327 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
3328 &olddevs, &oldndevs) == 0);
3329
3330 newdevs = kmem_alloc(sizeof (void *) *
3331 (ndevs + oldndevs), KM_SLEEP);
3332 for (i = 0; i < oldndevs; i++)
3333 VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
3334 KM_SLEEP) == 0);
3335 for (i = 0; i < ndevs; i++)
3336 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
3337 KM_SLEEP) == 0);
3338
3339 VERIFY(nvlist_remove(sav->sav_config, config,
3340 DATA_TYPE_NVLIST_ARRAY) == 0);
3341
3342 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3343 config, newdevs, ndevs + oldndevs) == 0);
3344 for (i = 0; i < oldndevs + ndevs; i++)
3345 nvlist_free(newdevs[i]);
3346 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
3347 } else {
3348 /*
3349 * Generate a new dev list.
3350 */
3351 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
3352 KM_SLEEP) == 0);
3353 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
3354 devs, ndevs) == 0);
3355 }
3356 }
3357
3358 /*
3359 * Stop and drop level 2 ARC devices
3360 */
3361 void
3362 spa_l2cache_drop(spa_t *spa)
3363 {
3364 vdev_t *vd;
3365 int i;
3366 spa_aux_vdev_t *sav = &spa->spa_l2cache;
3367
3368 for (i = 0; i < sav->sav_count; i++) {
3369 uint64_t pool;
3370
3371 vd = sav->sav_vdevs[i];
3372 ASSERT(vd != NULL);
3373
3374 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
3375 pool != 0ULL && l2arc_vdev_present(vd))
3376 l2arc_remove_vdev(vd);
3377 }
3378 }
3379
3380 /*
3381 * Pool Creation
3382 */
3383 int
3384 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
3385 nvlist_t *zplprops)
3386 {
3387 spa_t *spa;
3388 char *altroot = NULL;
3389 vdev_t *rvd;
3390 dsl_pool_t *dp;
3391 dmu_tx_t *tx;
3392 int error = 0;
3393 uint64_t txg = TXG_INITIAL;
3394 nvlist_t **spares, **l2cache;
3395 uint_t nspares, nl2cache;
3396 uint64_t version, obj;
3397 boolean_t has_features;
3398
3399 /*
3400 * If this pool already exists, return failure.
3401 */
3402 mutex_enter(&spa_namespace_lock);
3403 if (spa_lookup(pool) != NULL) {
3404 mutex_exit(&spa_namespace_lock);
3405 return (SET_ERROR(EEXIST));
3406 }
3407
3408 /*
3409 * Allocate a new spa_t structure.
3410 */
3411 (void) nvlist_lookup_string(props,
3412 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3413 spa = spa_add(pool, NULL, altroot);
3414 spa_activate(spa, spa_mode_global);
3415
3416 if (props && (error = spa_prop_validate(spa, props))) {
3417 spa_deactivate(spa);
3418 spa_remove(spa);
3419 mutex_exit(&spa_namespace_lock);
3420 return (error);
3421 }
3422
3423 has_features = B_FALSE;
3424 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
3425 elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
3426 if (zpool_prop_feature(nvpair_name(elem)))
3427 has_features = B_TRUE;
3428 }
3429
3430 if (has_features || nvlist_lookup_uint64(props,
3431 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
3432 version = SPA_VERSION;
3433 }
3434 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
3435
3436 spa->spa_first_txg = txg;
3437 spa->spa_uberblock.ub_txg = txg - 1;
3438 spa->spa_uberblock.ub_version = version;
3439 spa->spa_ubsync = spa->spa_uberblock;
3440
3441 /*
3442 * Create "The Godfather" zio to hold all async IOs
3443 */
3444 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
3445 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
3446
3447 /*
3448 * Create the root vdev.
3449 */
3450 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3451
3452 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
3453
3454 ASSERT(error != 0 || rvd != NULL);
3455 ASSERT(error != 0 || spa->spa_root_vdev == rvd);
3456
3457 if (error == 0 && !zfs_allocatable_devs(nvroot))
3458 error = SET_ERROR(EINVAL);
3459
3460 if (error == 0 &&
3461 (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
3462 (error = spa_validate_aux(spa, nvroot, txg,
3463 VDEV_ALLOC_ADD)) == 0) {
3464 for (int c = 0; c < rvd->vdev_children; c++) {
3465 vdev_metaslab_set_size(rvd->vdev_child[c]);
3466 vdev_expand(rvd->vdev_child[c], txg);
3467 }
3468 }
3469
3470 spa_config_exit(spa, SCL_ALL, FTAG);
3471
3472 if (error != 0) {
3473 spa_unload(spa);
3474 spa_deactivate(spa);
3475 spa_remove(spa);
3476 mutex_exit(&spa_namespace_lock);
3477 return (error);
3478 }
3479
3480 /*
3481 * Get the list of spares, if specified.
3482 */
3483 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3484 &spares, &nspares) == 0) {
3485 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
3486 KM_SLEEP) == 0);
3487 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3488 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3489 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3490 spa_load_spares(spa);
3491 spa_config_exit(spa, SCL_ALL, FTAG);
3492 spa->spa_spares.sav_sync = B_TRUE;
3493 }
3494
3495 /*
3496 * Get the list of level 2 cache devices, if specified.
3497 */
3498 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3499 &l2cache, &nl2cache) == 0) {
3500 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3501 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3502 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3503 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3504 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3505 spa_load_l2cache(spa);
3506 spa_config_exit(spa, SCL_ALL, FTAG);
3507 spa->spa_l2cache.sav_sync = B_TRUE;
3508 }
3509
3510 spa->spa_is_initializing = B_TRUE;
3511 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3512 spa->spa_meta_objset = dp->dp_meta_objset;
3513 spa->spa_is_initializing = B_FALSE;
3514
3515 /*
3516 * Create DDTs (dedup tables).
3517 */
3518 ddt_create(spa);
3519
3520 spa_update_dspace(spa);
3521
3522 tx = dmu_tx_create_assigned(dp, txg);
3523
3524 /*
3525 * Create the pool config object.
3526 */
3527 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3528 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3529 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3530
3531 if (zap_add(spa->spa_meta_objset,
3532 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3533 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3534 cmn_err(CE_PANIC, "failed to add pool config");
3535 }
3536
3537 if (spa_version(spa) >= SPA_VERSION_FEATURES)
3538 spa_feature_create_zap_objects(spa, tx);
3539
3540 if (zap_add(spa->spa_meta_objset,
3541 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3542 sizeof (uint64_t), 1, &version, tx) != 0) {
3543 cmn_err(CE_PANIC, "failed to add pool version");
3544 }
3545
3546 /* Newly created pools with the right version are always deflated. */
3547 if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3548 spa->spa_deflate = TRUE;
3549 if (zap_add(spa->spa_meta_objset,
3550 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3551 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3552 cmn_err(CE_PANIC, "failed to add deflate");
3553 }
3554 }
3555
3556 /*
3557 * Create the deferred-free bpobj. Turn off compression
3558 * because sync-to-convergence takes longer if the blocksize
3559 * keeps changing.
3560 */
3561 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
3562 dmu_object_set_compress(spa->spa_meta_objset, obj,
3563 ZIO_COMPRESS_OFF, tx);
3564 if (zap_add(spa->spa_meta_objset,
3565 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
3566 sizeof (uint64_t), 1, &obj, tx) != 0) {
3567 cmn_err(CE_PANIC, "failed to add bpobj");
3568 }
3569 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
3570 spa->spa_meta_objset, obj));
3571
3572 /*
3573 * Create the pool's history object.
3574 */
3575 if (version >= SPA_VERSION_ZPOOL_HISTORY)
3576 spa_history_create_obj(spa, tx);
3577
3578 /*
3579 * Generate some random noise for salted checksums to operate on. As
3580 * soon as a salted checksum is used for the first time we will
3581 * generate the persistent MOS object to hold the salt (see
3582 * spa_activate_salted_cksum).
3583 */
3584 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
3585 sizeof (spa->spa_cksum_salt.zcs_bytes));
3586
3587 /*
3588 * Set pool properties.
3589 */
3590 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
3591 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3592 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
3593 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
3594
3595 if (props != NULL) {
3596 spa_configfile_set(spa, props, B_FALSE);
3597 spa_sync_props(props, tx);
3598 }
3599
3600 dmu_tx_commit(tx);
3601
3602 spa->spa_sync_on = B_TRUE;
3603 txg_sync_start(spa->spa_dsl_pool);
3604
3605 /*
3606 * We explicitly wait for the first transaction to complete so that our
3607 * bean counters are appropriately updated.
3608 */
3609 txg_wait_synced(spa->spa_dsl_pool, txg);
3610
3611 spa_config_sync(spa, B_FALSE, B_TRUE);
3612
3613 spa_history_log_version(spa, "create");
3614
3615 spa->spa_minref = refcount_count(&spa->spa_refcount);
3616
3617 mutex_exit(&spa_namespace_lock);
3618
3619 return (0);
3620 }
3621
3622 #ifdef _KERNEL
3623 /*
3624 * Get the root pool information from the root disk, then import the root pool
3625 * during the system boot up time.
3626 */
3627 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
3628
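/*
 * Read the pool label from the given boot device and wrap its top-level
 * vdev in a synthetic root vdev, producing a config that spa_config_parse()
 * can consume when importing the root pool.
 */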
3629 static nvlist_t *
3630 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
3631 {
3632 nvlist_t *config;
3633 nvlist_t *nvtop, *nvroot;
3634 uint64_t pgid;
3635
3636 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
3637 return (NULL);
3638
3639 /*
3640 * Add this top-level vdev to the child array.
3641 */
3642 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3643 &nvtop) == 0);
3644 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3645 &pgid) == 0);
3646 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
3647
3648 /*
3649 * Put this pool's top-level vdevs into a root vdev.
3650 */
3651 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3652 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3653 VDEV_TYPE_ROOT) == 0);
3654 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3655 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3656 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3657 &nvtop, 1) == 0);
3658
3659 /*
3660 * Replace the existing vdev_tree with the new root vdev in
3661 * this pool's configuration (remove the old, add the new).
3662 */
3663 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3664 nvlist_free(nvroot);
3665 return (config);
3666 }
3667
3668 /*
3669 * Walk the vdev tree and see if we can find a device with "better"
3670 * configuration. A configuration is "better" if the label on that
3671 * device has a more recent txg.
3672 */
3673 static void
3674 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3675 {
3676 for (int c = 0; c < vd->vdev_children; c++)
3677 spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3678
3679 if (vd->vdev_ops->vdev_op_leaf) {
3680 nvlist_t *label;
3681 uint64_t label_txg;
3682
3683 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
3684 &label) != 0)
3685 return;
3686
3687 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
3688 &label_txg) == 0);
3689
3690 /*
3691 * Do we have a better boot device?
3692 */
3693 if (label_txg > *txg) {
3694 *txg = label_txg;
3695 *avd = vd;
3696 }
3697 nvlist_free(label);
3698 }
3699 }
3700
3701 /*
3702 * Import a root pool.
3703 *
3704 * For x86, devpath_list will consist of the devid and/or physpath name of
3705 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3706 * The GRUB "findroot" command will return the vdev we should boot.
3707 *
3708 * For SPARC, devpath_list consists of the physpath name of the booting
3709 * device, whether the root pool is a single-device pool or a mirrored pool.
3710 * e.g.
3711 * "/pci@1f,0/ide@d/disk@0,0:a"
3712 */
3713 int
3714 spa_import_rootpool(char *devpath, char *devid)
3715 {
3716 spa_t *spa;
3717 vdev_t *rvd, *bvd, *avd = NULL;
3718 nvlist_t *config, *nvtop;
3719 uint64_t guid, txg;
3720 char *pname;
3721 int error;
3722
3723 /*
3724 * Read the label from the boot device and generate a configuration.
3725 */
3726 config = spa_generate_rootconf(devpath, devid, &guid);
3727 #if defined(_OBP) && defined(_KERNEL)
3728 if (config == NULL) {
3729 if (strstr(devpath, "/iscsi/ssd") != NULL) {
3730 /* iscsi boot */
3731 get_iscsi_bootpath_phy(devpath);
3732 config = spa_generate_rootconf(devpath, devid, &guid);
3733 }
3734 }
3735 #endif
3736 if (config == NULL) {
3737 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
3738 devpath);
3739 return (SET_ERROR(EIO));
3740 }
3741
3742 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3743 &pname) == 0);
3744 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3745
3746 mutex_enter(&spa_namespace_lock);
3747 if ((spa = spa_lookup(pname)) != NULL) {
3748 /*
3749 * Remove the existing root pool from the namespace so that we
3750 * can replace it with the correct config we just read in.
3751 */
3752 spa_remove(spa);
3753 }
3754
3755 spa = spa_add(pname, config, NULL);
3756 spa->spa_is_root = B_TRUE;
3757 spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3758
3759 /*
3760 * Build up a vdev tree based on the boot device's label config.
3761 */
3762 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3763 &nvtop) == 0);
3764 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3765 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3766 VDEV_ALLOC_ROOTPOOL);
3767 spa_config_exit(spa, SCL_ALL, FTAG);
3768 if (error) {
3769 mutex_exit(&spa_namespace_lock);
3770 nvlist_free(config);
3771 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
3772 pname);
3773 return (error);
3774 }
3775
3776 /*
3777 * Get the boot vdev.
3778 */
3779 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3780 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
3781 (u_longlong_t)guid);
3782 error = SET_ERROR(ENOENT);
3783 goto out;
3784 }
3785
3786 /*
3787 * Determine if there is a better boot device.
3788 */
3789 avd = bvd;
3790 spa_alt_rootvdev(rvd, &avd, &txg);
3791 if (avd != bvd) {
3792 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
3793 "try booting from '%s'", avd->vdev_path);
3794 error = SET_ERROR(EINVAL);
3795 goto out;
3796 }
3797
3798 /*
3799 * If the boot device is part of a spare vdev then ensure that
3800 * we're booting off the active spare.
3801 */
3802 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3803 !bvd->vdev_isspare) {
3804 cmn_err(CE_NOTE, "The boot device is currently spared. Please "
3805 "try booting from '%s'",
3806 bvd->vdev_parent->
3807 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
3808 error = SET_ERROR(EINVAL);
3809 goto out;
3810 }
3811
3812 error = 0;
3813 out:
3814 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3815 vdev_free(rvd);
3816 spa_config_exit(spa, SCL_ALL, FTAG);
3817 mutex_exit(&spa_namespace_lock);
3818
3819 nvlist_free(config);
3820 return (error);
3821 }
3822
3823 #endif
3824
3825 /*
3826 * Import a non-root pool into the system.
3827 */
3828 int
3829 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
3830 {
3831 spa_t *spa;
3832 char *altroot = NULL;
3833 spa_load_state_t state = SPA_LOAD_IMPORT;
3834 zpool_rewind_policy_t policy;
3835 uint64_t mode = spa_mode_global;
3836 uint64_t readonly = B_FALSE;
3837 int error;
3838 nvlist_t *nvroot;
3839 nvlist_t **spares, **l2cache;
3840 uint_t nspares, nl2cache;
3841
3842 /*
3843 * If a pool with this name exists, return failure.
3844 */
3845 mutex_enter(&spa_namespace_lock);
3846 if (spa_lookup(pool) != NULL) {
3847 mutex_exit(&spa_namespace_lock);
3848 return (SET_ERROR(EEXIST));
3849 }
3850
3851 /*
3852 * Create and initialize the spa structure.
3853 */
3854 (void) nvlist_lookup_string(props,
3855 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3856 (void) nvlist_lookup_uint64(props,
3857 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
3858 if (readonly)
3859 mode = FREAD;
3860 spa = spa_add(pool, config, altroot);
3861 spa->spa_import_flags = flags;
3862
3863 /*
3864 * Verbatim import - Take a pool and insert it into the namespace
3865 * as if it had been loaded at boot.
3866 */
3867 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
3868 if (props != NULL)
3869 spa_configfile_set(spa, props, B_FALSE);
3870
3871 spa_config_sync(spa, B_FALSE, B_TRUE);
3872
3873 mutex_exit(&spa_namespace_lock);
3874 return (0);
3875 }
3876
3877 spa_activate(spa, mode);
3878
3879 /*
3880 * Don't start async tasks until we know everything is healthy.
3881 */
3882 spa_async_suspend(spa);
3883
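/*
 * Honor any rewind policy supplied in the config; a rewind request
 * means we load the pool in recovery mode.
 */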
3884 zpool_get_rewind_policy(config, &policy);
3885 if (policy.zrp_request & ZPOOL_DO_REWIND)
3886 state = SPA_LOAD_RECOVER;
3887
3888 /*
3889 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
3890 * because the user-supplied config is actually the one to trust when
3891 * doing an import.
3892 */
3893 if (state != SPA_LOAD_RECOVER)
3894 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
3895
3896 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
3897 policy.zrp_request);
3898
3899 /*
3900 * Propagate anything learned while loading the pool and pass it
3901 * back to caller (i.e. rewind info, missing devices, etc).
3902 */
3903 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
3904 spa->spa_load_info) == 0);
3905
3906 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3907 /*
3908 * Toss any existing sparelist, as it is no longer valid and
3909 * conflicts with spa_has_spare().
3910 */
3911 if (spa->spa_spares.sav_config) {
3912 nvlist_free(spa->spa_spares.sav_config);
3913 spa->spa_spares.sav_config = NULL;
3914 spa_load_spares(spa);
3915 }
3916 if (spa->spa_l2cache.sav_config) {
3917 nvlist_free(spa->spa_l2cache.sav_config);
3918 spa->spa_l2cache.sav_config = NULL;
3919 spa_load_l2cache(spa);
3920 }
3921
3922 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3923 &nvroot) == 0);
3924 if (error == 0)
3925 error = spa_validate_aux(spa, nvroot, -1ULL,
3926 VDEV_ALLOC_SPARE);
3927 if (error == 0)
3928 error = spa_validate_aux(spa, nvroot, -1ULL,
3929 VDEV_ALLOC_L2CACHE);
3930 spa_config_exit(spa, SCL_ALL, FTAG);
3931
3932 if (props != NULL)
3933 spa_configfile_set(spa, props, B_FALSE);
3934
3935 if (error != 0 || (props && spa_writeable(spa) &&
3936 (error = spa_prop_set(spa, props)))) {
3937 spa_unload(spa);
3938 spa_deactivate(spa);
3939 spa_remove(spa);
3940 mutex_exit(&spa_namespace_lock);
3941 return (error);
3942 }
3943
3944 spa_async_resume(spa);
3945
3946 /*
3947 * Override any spares and level 2 cache devices as specified by
3948 * the user, as these may have correct device names/devids, etc.
3949 */
3950 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3951 &spares, &nspares) == 0) {
3952 if (spa->spa_spares.sav_config)
3953 VERIFY(nvlist_remove(spa->spa_spares.sav_config,
3954 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
3955 else
3956 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
3957 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3958 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3959 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3960 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3961 spa_load_spares(spa);
3962 spa_config_exit(spa, SCL_ALL, FTAG);
3963 spa->spa_spares.sav_sync = B_TRUE;
3964 }
3965 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3966 &l2cache, &nl2cache) == 0) {
3967 if (spa->spa_l2cache.sav_config)
3968 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
3969 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
3970 else
3971 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3972 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3973 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3974 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3975 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3976 spa_load_l2cache(spa);
3977 spa_config_exit(spa, SCL_ALL, FTAG);
3978 spa->spa_l2cache.sav_sync = B_TRUE;
3979 }
3980
3981 /*
3982 * Check for any removed devices.
3983 */
3984 if (spa->spa_autoreplace) {
3985 spa_aux_check_removed(&spa->spa_spares);
3986 spa_aux_check_removed(&spa->spa_l2cache);
3987 }
3988
3989 if (spa_writeable(spa)) {
3990 /*
3991 * Update the config cache to include the newly-imported pool.
3992 */
3993 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3994 }
3995
3996 /*
3997 * It's possible that the pool was expanded while it was exported.
3998 * We kick off an async task to handle this for us.
3999 */
4000 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
4001
4002 mutex_exit(&spa_namespace_lock);
4003 spa_history_log_version(spa, "import");
4004
4005 return (0);
4006 }
4007
4008 nvlist_t *
4009 spa_tryimport(nvlist_t *tryconfig)
4010 {
4011 nvlist_t *config = NULL;
4012 char *poolname;
4013 spa_t *spa;
4014 uint64_t state;
4015 int error;
4016
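/*
 * The supplied config must contain at least a pool name and state;
 * otherwise there is nothing to try importing.
 */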
4017 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
4018 return (NULL);
4019
4020 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
4021 return (NULL);
4022
4023 /*
4024 * Create and initialize the spa structure.
4025 */
4026 mutex_enter(&spa_namespace_lock);
4027 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
4028 spa_activate(spa, FREAD);
4029
4030 /*
4031 * Pass off the heavy lifting to spa_load().
4032 * Pass TRUE for mosconfig because the user-supplied config
4033 * is actually the one to trust when doing an import.
4034 */
4035 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
4036
4037 /*
4038 * If 'tryconfig' was at least parsable, return the current config.
4039 */
4040 if (spa->spa_root_vdev != NULL) {
4041 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4042 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
4043 poolname) == 0);
4044 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4045 state) == 0);
4046 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
4047 spa->spa_uberblock.ub_timestamp) == 0);
4048 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4049 spa->spa_load_info) == 0);
4050
4051 /*
4052 * If the bootfs property exists on this pool then we
4053 * copy it out so that external consumers can tell which
4054 * pools are bootable.
4055 */
4056 if ((!error || error == EEXIST) && spa->spa_bootfs) {
4057 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4058
4059 /*
4060 * We have to play games with the name since the
4061 * pool was opened as TRYIMPORT_NAME.
4062 */
4063 if (dsl_dsobj_to_dsname(spa_name(spa),
4064 spa->spa_bootfs, tmpname) == 0) {
4065 char *cp;
4066 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4067
4068 cp = strchr(tmpname, '/');
4069 if (cp == NULL) {
4070 (void) strlcpy(dsname, tmpname,
4071 MAXPATHLEN);
4072 } else {
4073 (void) snprintf(dsname, MAXPATHLEN,
4074 "%s/%s", poolname, ++cp);
4075 }
4076 VERIFY(nvlist_add_string(config,
4077 ZPOOL_CONFIG_BOOTFS, dsname) == 0);
4078 kmem_free(dsname, MAXPATHLEN);
4079 }
4080 kmem_free(tmpname, MAXPATHLEN);
4081 }
4082
4083 /*
4084 * Add the list of hot spares and level 2 cache devices.
4085 */
4086 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4087 spa_add_spares(spa, config);
4088 spa_add_l2cache(spa, config);
4089 spa_config_exit(spa, SCL_CONFIG, FTAG);
4090 }
4091
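/*
 * Tear down the temporary spa; a tryimport never leaves a pool in
 * the namespace.
 */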
4092 spa_unload(spa);
4093 spa_deactivate(spa);
4094 spa_remove(spa);
4095 mutex_exit(&spa_namespace_lock);
4096
4097 return (config);
4098 }
4099
4100 /*
4101 * Pool export/destroy
4102 *
4103 * The act of destroying or exporting a pool is very simple. We make sure there
4104 * is no more pending I/O and any references to the pool are gone. Then, we
4105 * update the pool state and sync all the labels to disk, removing the
4106 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
4107 * we don't sync the labels or remove the configuration from the cache.
4108 */
4109 static int
4110 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
4111 boolean_t force, boolean_t hardforce)
4112 {
4113 spa_t *spa;
4114
4115 if (oldconfig)
4116 *oldconfig = NULL;
4117
4118 if (!(spa_mode_global & FWRITE))
4119 return (SET_ERROR(EROFS));
4120
4121 mutex_enter(&spa_namespace_lock);
4122 if ((spa = spa_lookup(pool)) == NULL) {
4123 mutex_exit(&spa_namespace_lock);
4124 return (SET_ERROR(ENOENT));
4125 }
4126
4127 /*
4128 * Put a hold on the pool, drop the namespace lock, stop async tasks,
4129 * reacquire the namespace lock, and see if we can export.
4130 */
4131 spa_open_ref(spa, FTAG);
4132 mutex_exit(&spa_namespace_lock);
4133 spa_async_suspend(spa);
4134 mutex_enter(&spa_namespace_lock);
4135 spa_close(spa, FTAG);
4136
4137 /*
4138 * The pool will be in core if it's openable,
4139 * in which case we can modify its state.
4140 */
4141 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
4142 /*
4143 * Objsets may be open only because they're dirty, so we
4144 * have to force the pool to sync before checking spa_refcnt.
4145 */
4146 txg_wait_synced(spa->spa_dsl_pool, 0);
4147
4148 /*
4149 * A pool cannot be exported or destroyed if there are active
4150 * references. If we are resetting a pool, allow references by
4151 * fault injection handlers.
4152 */
4153 if (!spa_refcount_zero(spa) ||
4154 (spa->spa_inject_ref != 0 &&
4155 new_state != POOL_STATE_UNINITIALIZED)) {
4156 spa_async_resume(spa);
4157 mutex_exit(&spa_namespace_lock);
4158 return (SET_ERROR(EBUSY));
4159 }
4160
4161 /*
4162 * A pool cannot be exported if it has an active shared spare.
4163 * This is to prevent other pools from stealing the active spare
4164 * from an exported pool. The user may still force the export
4165 * if desired.
4166 */
4167 if (!force && new_state == POOL_STATE_EXPORTED &&
4168 spa_has_active_shared_spare(spa)) {
4169 spa_async_resume(spa);
4170 mutex_exit(&spa_namespace_lock);
4171 return (SET_ERROR(EXDEV));
4172 }
4173
4174 /*
4175 * We want this to be reflected on every label,
4176 * so mark them all dirty. spa_unload() will do the
4177 * final sync that pushes these changes out.
4178 */
4179 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
4180 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4181 spa->spa_state = new_state;
4182 spa->spa_final_txg = spa_last_synced_txg(spa) +
4183 TXG_DEFER_SIZE + 1;
4184 vdev_config_dirty(spa->spa_root_vdev);
4185 spa_config_exit(spa, SCL_ALL, FTAG);
4186 }
4187 }
4188
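/*
 * Post a sysevent to notify listeners that the pool is going away.
 */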
4189 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
4190
4191 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4192 spa_unload(spa);
4193 spa_deactivate(spa);
4194 }
4195
4196 if (oldconfig && spa->spa_config)
4197 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
4198
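/*
 * For export and destroy, remove the pool's entry from the
 * configuration cache (unless hard forcing) and drop the pool from
 * the namespace.
 */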
4199 if (new_state != POOL_STATE_UNINITIALIZED) {
4200 if (!hardforce)
4201 spa_config_sync(spa, B_TRUE, B_TRUE);
4202 spa_remove(spa);
4203 }
4204 mutex_exit(&spa_namespace_lock);
4205
4206 return (0);
4207 }
4208
4209 /*
4210 * Destroy a storage pool.
4211 */
4212 int
4213 spa_destroy(char *pool)
4214 {
4215 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
4216 B_FALSE, B_FALSE));
4217 }
4218
4219 /*
4220 * Export a storage pool.
4221 */
4222 int
4223 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
4224 boolean_t hardforce)
4225 {
4226 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
4227 force, hardforce));
4228 }
4229
4230 /*
4231 * Similar to spa_export(), this unloads the spa_t without actually removing it
4232 * from the namespace in any way.
4233 */
4234 int
4235 spa_reset(char *pool)
4236 {
4237 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
4238 B_FALSE, B_FALSE));
4239 }
4240
4241 /*
4242 * ==========================================================================
4243 * Device manipulation
4244 * ==========================================================================
4245 */
4246
4247 /*
4248 * Add a device to a storage pool.
4249 */
4250 int
4251 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
4252 {
4253 uint64_t txg, id;
4254 int error;
4255 vdev_t *rvd = spa->spa_root_vdev;
4256 vdev_t *vd, *tvd;
4257 nvlist_t **spares, **l2cache;
4258 uint_t nspares, nl2cache;
4259
4260 ASSERT(spa_writeable(spa));
4261
4262 txg = spa_vdev_enter(spa);
4263
4264 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
4265 VDEV_ALLOC_ADD)) != 0)
4266 return (spa_vdev_exit(spa, NULL, txg, error));
4267
4268 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
4269
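/*
 * See if the caller's nvlist includes any hot spares or level 2
 * cache devices.
 */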
4270 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
4271 &nspares) != 0)
4272 nspares = 0;
4273
4274 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
4275 &nl2cache) != 0)
4276 nl2cache = 0;
4277
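/*
 * The request must include at least one new vdev, spare, or cache
 * device.
 */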
4278 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
4279 return (spa_vdev_exit(spa, vd, txg, EINVAL));
4280
4281 if (vd->vdev_children != 0 &&
4282 (error = vdev_create(vd, txg, B_FALSE)) != 0)
4283 return (spa_vdev_exit(spa, vd, txg, error));
4284
4285 /*
4286 * We must validate the spares and l2cache devices after checking the
4287 * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
4288 */
4289 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
4290 return (spa_vdev_exit(spa, vd, txg, error));
4291
4292 /*
4293 * Transfer each new top-level vdev from vd to rvd.
4294 */
4295 for (int c = 0; c < vd->vdev_children; c++) {
4296
4297 /*
4298 * Set the vdev id to the first hole, if one exists.
4299 */
4300 for (id = 0; id < rvd->vdev_children; id++) {
4301 if (rvd->vdev_child[id]->vdev_ishole) {
4302 vdev_free(rvd->vdev_child[id]);
4303 break;
4304 }
4305 }
4306 tvd = vd->vdev_child[c];
4307 vdev_remove_child(vd, tvd);
4308 tvd->vdev_id = id;
4309 vdev_add_child(rvd, tvd);
4310 vdev_config_dirty(tvd);
4311 }
4312
4313 if (nspares != 0) {
4314 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
4315 ZPOOL_CONFIG_SPARES);
4316 spa_load_spares(spa);
4317 spa->spa_spares.sav_sync = B_TRUE;
4318 }
4319
4320 if (nl2cache != 0) {
4321 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
4322 ZPOOL_CONFIG_L2CACHE);
4323 spa_load_l2cache(spa);
4324 spa->spa_l2cache.sav_sync = B_TRUE;
4325 }
4326
4327 /*
4328 * We have to be careful when adding new vdevs to an existing pool.
4329 * If other threads start allocating from these vdevs before we
4330 * sync the config cache, and we lose power, then upon reboot we may
4331 * fail to open the pool because there are DVAs that the config cache
4332 * can't translate. Therefore, we first add the vdevs without
4333 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
4334 * and then let spa_config_update() initialize the new metaslabs.
4335 *
4336 * spa_load() checks for added-but-not-initialized vdevs, so that
4337 * if we lose power at any point in this sequence, the remaining
4338 * steps will be completed the next time we load the pool.
4339 */
4340 (void) spa_vdev_exit(spa, vd, txg, 0);
4341
4342 mutex_enter(&spa_namespace_lock);
4343 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4344 mutex_exit(&spa_namespace_lock);
4345
4346 return (0);
4347 }
4348
4349 /*
4350 * Attach a device to a mirror. The arguments are the path to any device
4351 * in the mirror, and the nvroot for the new device. If the path specifies
4352 * a device that is not mirrored, we automatically insert the mirror vdev.
4353 *
4354 * If 'replacing' is specified, the new device is intended to replace the
4355 * existing device; in this case the two devices are made into their own
4356 * mirror using the 'replacing' vdev, which is functionally identical to
4357 * the mirror vdev (it actually reuses all the same ops) but has a few
4358 * extra rules: you can't attach to it after it's been created, and upon
4359 * completion of resilvering, the first disk (the one being replaced)
4360 * is automatically detached.
4361 */
4362 int
4363 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
4364 {
4365 uint64_t txg, dtl_max_txg;
4366 vdev_t *rvd = spa->spa_root_vdev;
4367 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
4368 vdev_ops_t *pvops;
4369 char *oldvdpath, *newvdpath;
4370 int newvd_isspare;
4371 int error;
4372
4373 ASSERT(spa_writeable(spa));
4374
4375 txg = spa_vdev_enter(spa);
4376
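/*
 * Look up the device we are attaching to; it must exist and be a
 * leaf vdev.
 */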
4377 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
4378
4379 if (oldvd == NULL)
4380 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4381
4382 if (!oldvd->vdev_ops->vdev_op_leaf)
4383 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4384
4385 pvd = oldvd->vdev_parent;
4386
4387 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
4388 VDEV_ALLOC_ATTACH)) != 0)
4389 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4390
4391 if (newrootvd->vdev_children != 1)
4392 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4393
4394 newvd = newrootvd->vdev_child[0];
4395
4396 if (!newvd->vdev_ops->vdev_op_leaf)
4397 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4398
4399 if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
4400 return (spa_vdev_exit(spa, newrootvd, txg, error));
4401
4402 /*
4403 * Spares can't replace logs
4404 */
4405 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
4406 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4407
4408 if (!replacing) {
4409 /*
4410 * For attach, the only allowable parent is a mirror or the root
4411 * vdev.
4412 */
4413 if (pvd->vdev_ops != &vdev_mirror_ops &&
4414 pvd->vdev_ops != &vdev_root_ops)
4415 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4416
4417 pvops = &vdev_mirror_ops;
4418 } else {
4419 /*
4420 * Active hot spares can only be replaced by inactive hot
4421 * spares.
4422 */
4423 if (pvd->vdev_ops == &vdev_spare_ops &&
4424 oldvd->vdev_isspare &&
4425 !spa_has_spare(spa, newvd->vdev_guid))
4426 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4427
4428 /*
4429 * If the source is a hot spare, and the parent isn't already a
4430 * spare, then we want to create a new hot spare. Otherwise, we
4431 * want to create a replacing vdev. The user is not allowed to
4432 * attach to a spared vdev child unless the 'isspare' state is
4433 * the same (spare replaces spare, non-spare replaces
4434 * non-spare).
4435 */
4436 if (pvd->vdev_ops == &vdev_replacing_ops &&
4437 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
4438 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4439 } else if (pvd->vdev_ops == &vdev_spare_ops &&
4440 newvd->vdev_isspare != oldvd->vdev_isspare) {
4441 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4442 }
4443
4444 if (newvd->vdev_isspare)
4445 pvops = &vdev_spare_ops;
4446 else
4447 pvops = &vdev_replacing_ops;
4448 }
4449
4450 /*
4451 * Make sure the new device is big enough.
4452 */
4453 if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
4454 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
4455
4456 /*
4457 * The new device cannot have a higher alignment requirement
4458 * than the top-level vdev.
4459 */
4460 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
4461 return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
4462
4463 /*
4464 * If this is an in-place replacement, update oldvd's path and devid
4465 * to make it distinguishable from newvd, and unopenable from now on.
4466 */
4467 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
4468 spa_strfree(oldvd->vdev_path);
4469 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
4470 KM_SLEEP);
4471 (void) sprintf(oldvd->vdev_path, "%s/%s",
4472 newvd->vdev_path, "old");
4473 if (oldvd->vdev_devid != NULL) {
4474 spa_strfree(oldvd->vdev_devid);
4475 oldvd->vdev_devid = NULL;
4476 }
4477 }
4478
4479 /* mark the device being resilvered */
4480 newvd->vdev_resilver_txg = txg;
4481
4482 /*
4483 * If the parent is not a mirror, or if we're replacing, insert the new
4484 * mirror/replacing/spare vdev above oldvd.
4485 */
4486 if (pvd->vdev_ops != pvops)
4487 pvd = vdev_add_parent(oldvd, pvops);
4488
4489 ASSERT(pvd->vdev_top->vdev_parent == rvd);
4490 ASSERT(pvd->vdev_ops == pvops);
4491 ASSERT(oldvd->vdev_parent == pvd);
4492
4493 /*
4494 * Extract the new device from its root and add it to pvd.
4495 */
4496 vdev_remove_child(newrootvd, newvd);
4497 newvd->vdev_id = pvd->vdev_children;
4498 newvd->vdev_crtxg = oldvd->vdev_crtxg;
4499 vdev_add_child(pvd, newvd);
4500
4501 tvd = newvd->vdev_top;
4502 ASSERT(pvd->vdev_top == tvd);
4503 ASSERT(tvd->vdev_parent == rvd);
4504
4505 vdev_config_dirty(tvd);
4506
4507 /*
4508 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
4509 * for any dmu_sync-ed blocks. It will propagate upward when
4510 * spa_vdev_exit() calls vdev_dtl_reassess().
4511 */
4512 dtl_max_txg = txg + TXG_CONCURRENT_STATES;
4513
4514 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
4515 dtl_max_txg - TXG_INITIAL);
4516
4517 if (newvd->vdev_isspare) {
4518 spa_spare_activate(newvd);
4519 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
4520 }
4521
4522 oldvdpath = spa_strdup(oldvd->vdev_path);
4523 newvdpath = spa_strdup(newvd->vdev_path);
4524 newvd_isspare = newvd->vdev_isspare;
4525
4526 /*
4527 * Mark newvd's DTL dirty in this txg.
4528 */
4529 vdev_dirty(tvd, VDD_DTL, newvd, txg);
4530
4531 /*
4532 * Schedule the resilver to restart in the future. We do this to
4533 * ensure that dmu_sync-ed blocks have been stitched into the
4534 * respective datasets.
4535 */
4536 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
4537
4538 /*
4539 * Commit the config
4540 */
4541 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
4542
4543 spa_history_log_internal(spa, "vdev attach", NULL,
4544 "%s vdev=%s %s vdev=%s",
4545 replacing && newvd_isspare ? "spare in" :
4546 replacing ? "replace" : "attach", newvdpath,
4547 replacing ? "for" : "to", oldvdpath);
4548
4549 spa_strfree(oldvdpath);
4550 spa_strfree(newvdpath);
4551
4552 if (spa->spa_bootfs)
4553 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
4554
4555 return (0);
4556 }
4557
4558 /*
4559 * Detach a device from a mirror, replacing, or spare vdev.
4560 *
4561 * If 'replace_done' is specified, only detach if the parent
4562 * is a replacing or a spare vdev.
4563 */
4564 int
4565 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
4566 {
4567 uint64_t txg;
4568 int error;
4569 vdev_t *rvd = spa->spa_root_vdev;
4570 vdev_t *vd, *pvd, *cvd, *tvd;
4571 boolean_t unspare = B_FALSE;
4572 uint64_t unspare_guid = 0;
4573 char *vdpath;
4574
4575 ASSERT(spa_writeable(spa));
4576
4577 txg = spa_vdev_enter(spa);
4578
4579 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4580
4581 if (vd == NULL)
4582 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4583
4584 if (!vd->vdev_ops->vdev_op_leaf)
4585 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4586
4587 pvd = vd->vdev_parent;
4588
4589 /*
4590 * If the parent/child relationship is not as expected, don't do it.
4591 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
4592 * vdev that's replacing B with C. The user's intent in replacing
4593 * is to go from M(A,B) to M(A,C). If the user decides to cancel
4594 * the replace by detaching C, the expected behavior is to end up
4595 * M(A,B). But suppose that right after deciding to detach C,
4596 * the replacement of B completes. We would have M(A,C), and then
4597 * ask to detach C, which would leave us with just A -- not what
4598 * the user wanted. To prevent this, we make sure that the
4599 * parent/child relationship hasn't changed -- in this example,
4600 * that C's parent is still the replacing vdev R.
4601 */
4602 if (pvd->vdev_guid != pguid && pguid != 0)
4603 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4604
4605 /*
4606 * A replace_done request requires a 'replacing' or 'spare' parent vdev.
4607 */
4608 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
4609 pvd->vdev_ops != &vdev_spare_ops)
4610 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4611
4612 ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
4613 spa_version(spa) >= SPA_VERSION_SPARES);
4614
4615 /*
4616 * Only mirror, replacing, and spare vdevs support detach.
4617 */
4618 if (pvd->vdev_ops != &vdev_replacing_ops &&
4619 pvd->vdev_ops != &vdev_mirror_ops &&
4620 pvd->vdev_ops != &vdev_spare_ops)
4621 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4622
4623 /*
4624 * If this device has the only valid copy of some data,
4625 * we cannot safely detach it.
4626 */
4627 if (vdev_dtl_required(vd))
4628 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4629
4630 ASSERT(pvd->vdev_children >= 2);
4631
4632 /*
4633 * If we are detaching the second disk from a replacing vdev, then
4634 * check to see if we changed the original vdev's path to have "/old"
4635 * at the end in spa_vdev_attach(). If so, undo that change now.
4636 */
4637 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
4638 vd->vdev_path != NULL) {
4639 size_t len = strlen(vd->vdev_path);
4640
4641 for (int c = 0; c < pvd->vdev_children; c++) {
4642 cvd = pvd->vdev_child[c];
4643
4644 if (cvd == vd || cvd->vdev_path == NULL)
4645 continue;
4646
4647 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
4648 strcmp(cvd->vdev_path + len, "/old") == 0) {
4649 spa_strfree(cvd->vdev_path);
4650 cvd->vdev_path = spa_strdup(vd->vdev_path);
4651 break;
4652 }
4653 }
4654 }
4655
4656 /*
4657 * If we are detaching the original disk from a spare, then it implies
4658 * that the spare should become a real disk, and be removed from the
4659 * active spare list for the pool.
4660 */
4661 if (pvd->vdev_ops == &vdev_spare_ops &&
4662 vd->vdev_id == 0 &&
4663 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
4664 unspare = B_TRUE;
4665
4666 /*
4667 * Erase the disk labels so the disk can be used for other things.
4668 * This must be done after all other error cases are handled,
4669 * but before we disembowel vd (so we can still do I/O to it).
4670 * But if we can't do it, don't treat the error as fatal --
4671 * it may be that the unwritability of the disk is the reason
4672 * it's being detached!
4673 */
4674 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4675
4676 /*
4677 * Remove vd from its parent and compact the parent's children.
4678 */
4679 vdev_remove_child(pvd, vd);
4680 vdev_compact_children(pvd);
4681
4682 /*
4683 * Remember one of the remaining children so we can get tvd below.
4684 */
4685 cvd = pvd->vdev_child[pvd->vdev_children - 1];
4686
4687 /*
4688 * If we need to remove the remaining child from the list of hot spares,
4689 * do it now, marking the vdev as no longer a spare in the process.
4690 * We must do this before vdev_remove_parent(), because that can
4691 * change the GUID if it creates a new toplevel GUID. For a similar
4692 * reason, we must remove the spare now, in the same txg as the detach;
4693 * otherwise someone could attach a new sibling, change the GUID, and
4694 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
4695 */
4696 if (unspare) {
4697 ASSERT(cvd->vdev_isspare);
4698 spa_spare_remove(cvd);
4699 unspare_guid = cvd->vdev_guid;
4700 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
4701 cvd->vdev_unspare = B_TRUE;
4702 }
4703
4704 /*
4705 * If the parent mirror/replacing vdev only has one child,
4706 * the parent is no longer needed. Remove it from the tree.
4707 */
4708 if (pvd->vdev_children == 1) {
4709 if (pvd->vdev_ops == &vdev_spare_ops)
4710 cvd->vdev_unspare = B_FALSE;
4711 vdev_remove_parent(cvd);
4712 }
4713
4715 /*
4716 * We don't set tvd until now because the parent we just removed
4717 * may have been the previous top-level vdev.
4718 */
4719 tvd = cvd->vdev_top;
4720 ASSERT(tvd->vdev_parent == rvd);
4721
4722 /*
4723 * Reevaluate the parent vdev state.
4724 */
4725 vdev_propagate_state(cvd);
4726
4727 /*
4728 * If the 'autoexpand' property is set on the pool then automatically
4729 * try to expand the size of the pool. For example if the device we
4730 * just detached was smaller than the others, it may be possible to
4731 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
4732 * first so that we can obtain the updated sizes of the leaf vdevs.
4733 */
4734 if (spa->spa_autoexpand) {
4735 vdev_reopen(tvd);
4736 vdev_expand(tvd, txg);
4737 }
4738
4739 vdev_config_dirty(tvd);
4740
4741 /*
4742 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
4743 * vd->vdev_detached is set and free vd's DTL object in syncing context.
4744 * But first make sure we're not on any *other* txg's DTL list, to
4745 * prevent vd from being accessed after it's freed.
4746 */
4747 vdpath = spa_strdup(vd->vdev_path);
4748 for (int t = 0; t < TXG_SIZE; t++)
4749 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
4750 vd->vdev_detached = B_TRUE;
4751 vdev_dirty(tvd, VDD_DTL, vd, txg);
4752
4753 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
4754
4755 /* hang on to the spa before we release the lock */
4756 spa_open_ref(spa, FTAG);
4757
4758 error = spa_vdev_exit(spa, vd, txg, 0);
4759
4760 spa_history_log_internal(spa, "detach", NULL,
4761 "vdev=%s", vdpath);
4762 spa_strfree(vdpath);
4763
4764 /*
4765 * If this was the removal of the original device in a hot spare vdev,
4766 * then we want to go through and remove the device from the hot spare
4767 * list of every other pool.
4768 */
4769 if (unspare) {
4770 spa_t *altspa = NULL;
4771
4772 mutex_enter(&spa_namespace_lock);
4773 while ((altspa = spa_next(altspa)) != NULL) {
4774 if (altspa->spa_state != POOL_STATE_ACTIVE ||
4775 altspa == spa)
4776 continue;
4777
4778 spa_open_ref(altspa, FTAG);
4779 mutex_exit(&spa_namespace_lock);
4780 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
4781 mutex_enter(&spa_namespace_lock);
4782 spa_close(altspa, FTAG);
4783 }
4784 mutex_exit(&spa_namespace_lock);
4785
4786 /* search the rest of the vdevs for spares to remove */
4787 spa_vdev_resilver_done(spa);
4788 }
4789
4790 /* all done with the spa; OK to release */
4791 mutex_enter(&spa_namespace_lock);
4792 spa_close(spa, FTAG);
4793 mutex_exit(&spa_namespace_lock);
4794
4795 return (error);
4796 }
4797
4798 /*
4799 * Split a set of devices from their mirrors, and create a new pool from them.
4800 */
4801 int
4802 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
4803 nvlist_t *props, boolean_t exp)
4804 {
4805 int error = 0;
4806 uint64_t txg, *glist;
4807 spa_t *newspa;
4808 uint_t c, children, lastlog;
4809 nvlist_t **child, *nvl, *tmp;
4810 dmu_tx_t *tx;
4811 char *altroot = NULL;
4812 vdev_t *rvd, **vml = NULL; /* vdev modify list */
4813 boolean_t activate_slog;
4814
4815 ASSERT(spa_writeable(spa));
4816
4817 txg = spa_vdev_enter(spa);
4818
4819 /* clear the log and flush everything up to now */
4820 activate_slog = spa_passivate_log(spa);
4821 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4822 error = spa_offline_log(spa);
4823 txg = spa_vdev_config_enter(spa);
4824
4825 if (activate_slog)
4826 spa_activate_log(spa);
4827
4828 if (error != 0)
4829 return (spa_vdev_exit(spa, NULL, txg, error));
4830
4831 /* check new spa name before going any further */
4832 if (spa_lookup(newname) != NULL)
4833 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
4834
4835 /*
4836 * scan through all the children to ensure they're all mirrors
4837 */
4838 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
4839 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
4840 &children) != 0)
4841 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4842
4843 /* first, check to ensure we've got the right child count */
4844 rvd = spa->spa_root_vdev;
4845 lastlog = 0;
4846 for (c = 0; c < rvd->vdev_children; c++) {
4847 vdev_t *vd = rvd->vdev_child[c];
4848
4849 /* don't count the holes & logs as children */
4850 if (vd->vdev_islog || vd->vdev_ishole) {
4851 if (lastlog == 0)
4852 lastlog = c;
4853 continue;
4854 }
4855
4856 lastlog = 0;
4857 }
4858 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
4859 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4860
4861 /* next, ensure no spare or cache devices are part of the split */
4862 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
4863 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
4864 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4865
4866 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
4867 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
4868
4869 /* then, loop over each vdev and validate it */
4870 for (c = 0; c < children; c++) {
4871 uint64_t is_hole = 0;
4872
4873 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
4874 &is_hole);
4875
4876 if (is_hole != 0) {
4877 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
4878 spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
4879 continue;
4880 } else {
4881 error = SET_ERROR(EINVAL);
4882 break;
4883 }
4884 }
4885
4886 /* which disk is going to be split? */
4887 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
4888 &glist[c]) != 0) {
4889 error = SET_ERROR(EINVAL);
4890 break;
4891 }
4892
4893 /* look it up in the spa */
4894 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
4895 if (vml[c] == NULL) {
4896 error = SET_ERROR(ENODEV);
4897 break;
4898 }
4899
4900 /* make sure there's nothing stopping the split */
4901 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
4902 vml[c]->vdev_islog ||
4903 vml[c]->vdev_ishole ||
4904 vml[c]->vdev_isspare ||
4905 vml[c]->vdev_isl2cache ||
4906 !vdev_writeable(vml[c]) ||
4907 vml[c]->vdev_children != 0 ||
4908 vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
4909 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
4910 error = SET_ERROR(EINVAL);
4911 break;
4912 }
4913
4914 if (vdev_dtl_required(vml[c])) {
4915 error = SET_ERROR(EBUSY);
4916 break;
4917 }
4918
4919 /* we need certain info from the top level */
4920 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
4921 vml[c]->vdev_top->vdev_ms_array) == 0);
4922 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
4923 vml[c]->vdev_top->vdev_ms_shift) == 0);
4924 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
4925 vml[c]->vdev_top->vdev_asize) == 0);
4926 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
4927 vml[c]->vdev_top->vdev_ashift) == 0);
4928 }
4929
4930 if (error != 0) {
4931 kmem_free(vml, children * sizeof (vdev_t *));
4932 kmem_free(glist, children * sizeof (uint64_t));
4933 return (spa_vdev_exit(spa, NULL, txg, error));
4934 }
4935
4936 /* stop writers from using the disks */
4937 for (c = 0; c < children; c++) {
4938 if (vml[c] != NULL)
4939 vml[c]->vdev_offline = B_TRUE;
4940 }
4941 vdev_reopen(spa->spa_root_vdev);
4942
4943 /*
4944 * Temporarily record the splitting vdevs in the spa config. This
4945 * will disappear once the config is regenerated.
4946 */
4947 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
4948 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
4949 glist, children) == 0);
4950 kmem_free(glist, children * sizeof (uint64_t));
4951
4952 mutex_enter(&spa->spa_props_lock);
4953 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
4954 nvl) == 0);
4955 mutex_exit(&spa->spa_props_lock);
4956 spa->spa_config_splitting = nvl;
4957 vdev_config_dirty(spa->spa_root_vdev);
4958
4959 /* configure and create the new pool */
4960 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
4961 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4962 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
4963 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
4964 spa_version(spa)) == 0);
4965 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
4966 spa->spa_config_txg) == 0);
4967 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
4968 spa_generate_guid(NULL)) == 0);
4969 (void) nvlist_lookup_string(props,
4970 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4971
4972 /* add the new pool to the namespace */
4973 newspa = spa_add(newname, config, altroot);
4974 newspa->spa_config_txg = spa->spa_config_txg;
4975 spa_set_log_state(newspa, SPA_LOG_CLEAR);
4976
4977 /* release the spa config lock, retaining the namespace lock */
4978 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4979
4980 if (zio_injection_enabled)
4981 zio_handle_panic_injection(spa, FTAG, 1);
4982
4983 spa_activate(newspa, spa_mode_global);
4984 spa_async_suspend(newspa);
4985
4986 /* create the new pool from the disks of the original pool */
4987 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
4988 if (error)
4989 goto out;
4990
4991 /* if that worked, generate a real config for the new pool */
4992 if (newspa->spa_root_vdev != NULL) {
4993 VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
4994 NV_UNIQUE_NAME, KM_SLEEP) == 0);
4995 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
4996 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
4997 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
4998 B_TRUE));
4999 }
5000
5001 /* set the props */
5002 if (props != NULL) {
5003 spa_configfile_set(newspa, props, B_FALSE);
5004 error = spa_prop_set(newspa, props);
5005 if (error)
5006 goto out;
5007 }
5008
5009 /* flush everything */
5010 txg = spa_vdev_config_enter(newspa);
5011 vdev_config_dirty(newspa->spa_root_vdev);
5012 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
5013
5014 if (zio_injection_enabled)
5015 zio_handle_panic_injection(spa, FTAG, 2);
5016
5017 spa_async_resume(newspa);
5018
5019 /* finally, update the original pool's config */
5020 txg = spa_vdev_config_enter(spa);
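/*
 * Open a transaction on the MOS so the detach of each split vdev
 * can be recorded in the pool history.
 */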
5021 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
5022 error = dmu_tx_assign(tx, TXG_WAIT);
5023 if (error != 0)
5024 dmu_tx_abort(tx);
5025 for (c = 0; c < children; c++) {
5026 if (vml[c] != NULL) {
5027 vdev_split(vml[c]);
5028 if (error == 0)
5029 spa_history_log_internal(spa, "detach", tx,
5030 "vdev=%s", vml[c]->vdev_path);
5031 vdev_free(vml[c]);
5032 }
5033 }
5034 vdev_config_dirty(spa->spa_root_vdev);
5035 spa->spa_config_splitting = NULL;
5036 nvlist_free(nvl);
5037 if (error == 0)
5038 dmu_tx_commit(tx);
5039 (void) spa_vdev_exit(spa, NULL, txg, 0);
5040
5041 if (zio_injection_enabled)
5042 zio_handle_panic_injection(spa, FTAG, 3);
5043
5044 /* split is complete; log a history record */
5045 spa_history_log_internal(newspa, "split", NULL,
5046 "from pool %s", spa_name(spa));
5047
5048 kmem_free(vml, children * sizeof (vdev_t *));
5049
5050 /* if we're not going to mount the filesystems in userland, export */
5051 if (exp)
5052 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
5053 B_FALSE, B_FALSE);
5054
5055 return (error);
5056
5057 out:
5058 spa_unload(newspa);
5059 spa_deactivate(newspa);
5060 spa_remove(newspa);
5061
5062 txg = spa_vdev_config_enter(spa);
5063
5064 /* re-online all offlined disks */
5065 for (c = 0; c < children; c++) {
5066 if (vml[c] != NULL)
5067 vml[c]->vdev_offline = B_FALSE;
5068 }
5069 vdev_reopen(spa->spa_root_vdev);
5070
5071 nvlist_free(spa->spa_config_splitting);
5072 spa->spa_config_splitting = NULL;
5073 (void) spa_vdev_exit(spa, NULL, txg, error);
5074
5075 kmem_free(vml, children * sizeof (vdev_t *));
5076 return (error);
5077 }
5078
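/*
 * Find the nvlist in the given array whose GUID matches target_guid,
 * or return NULL if there is no such entry.
 */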
5079 static nvlist_t *
5080 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
5081 {
5082 for (int i = 0; i < count; i++) {
5083 uint64_t guid;
5084
5085 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
5086 &guid) == 0);
5087
5088 if (guid == target_guid)
5089 return (nvpp[i]);
5090 }
5091
5092 return (NULL);
5093 }
5094
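/*
 * Rewrite the named aux device array in 'config' with 'dev_to_remove'
 * omitted.
 */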
5095 static void
5096 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
5097 nvlist_t *dev_to_remove)
5098 {
5099 nvlist_t **newdev = NULL;
5100
5101 if (count > 1)
5102 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
5103
5104 for (int i = 0, j = 0; i < count; i++) {
5105 if (dev[i] == dev_to_remove)
5106 continue;
5107 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
5108 }
5109
5110 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
5111 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
5112
5113 for (int i = 0; i < count - 1; i++)
5114 nvlist_free(newdev[i]);
5115
5116 if (count > 1)
5117 kmem_free(newdev, (count - 1) * sizeof (void *));
5118 }
5119
5120 /*
5121 * Evacuate the device.
5122 */
5123 static int
5124 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
5125 {
5126 uint64_t txg;
5127 int error = 0;
5128
5129 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5130 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5131 ASSERT(vd == vd->vdev_top);
5132
5133 /*
5134 * Evacuate the device. We don't hold the config lock as writer
5135 * since we need to do I/O but we do keep the
5136 * spa_namespace_lock held. Once this completes the device
5137 * should no longer have any blocks allocated on it.
5138 */
5139 if (vd->vdev_islog) {
5140 if (vd->vdev_stat.vs_alloc != 0)
5141 error = spa_offline_log(spa);
5142 } else {
5143 error = SET_ERROR(ENOTSUP);
5144 }
5145
5146 if (error)
5147 return (error);
5148
5149 /*
5150 * The evacuation succeeded. Remove any remaining MOS metadata
5151 * associated with this vdev, and wait for these changes to sync.
5152 */
5153 ASSERT0(vd->vdev_stat.vs_alloc);
5154 txg = spa_vdev_config_enter(spa);
5155 vd->vdev_removing = B_TRUE;
5156 vdev_dirty_leaves(vd, VDD_DTL, txg);
5157 vdev_config_dirty(vd);
5158 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5159
5160 return (0);
5161 }
5162
5163 /*
5164 * Complete the removal by cleaning up the namespace.
5165 */
5166 static void
5167 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
5168 {
5169 vdev_t *rvd = spa->spa_root_vdev;
5170 uint64_t id = vd->vdev_id;
5171 boolean_t last_vdev = (id == (rvd->vdev_children - 1));
5172
5173 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5174 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5175 ASSERT(vd == vd->vdev_top);
5176
5177 /*
5178 * Only remove the device if it is empty.
5179 */
5180 if (vd->vdev_stat.vs_alloc != 0)
5181 return;
5182
5183 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
5184
5185 if (list_link_active(&vd->vdev_state_dirty_node))
5186 vdev_state_clean(vd);
5187 if (list_link_active(&vd->vdev_config_dirty_node))
5188 vdev_config_clean(vd);
5189
5190 vdev_free(vd);
5191
5192 if (last_vdev) {
5193 vdev_compact_children(rvd);
5194 } else {
5195 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
5196 vdev_add_child(rvd, vd);
5197 }
5198 vdev_config_dirty(rvd);
5199
5200 /*
5201 * Reassess the health of our root vdev.
5202 */
5203 vdev_reopen(rvd);
5204 }
5205
5206 /*
5207 * Remove a device from the pool.
5208 *
5209 * Removing a device from the vdev namespace requires several steps
5210 * and can take a significant amount of time. As a result we use
5211 * the spa_vdev_config_[enter/exit] functions which allow us to
5212 * grab and release the spa_config_lock while still holding the namespace
5213 * lock. During each step the configuration is synced out.
5214 *
5215 * Currently, this supports removing only hot spares, slogs, and level 2 ARC
5216 * devices.
5217 */
5218 int
5219 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
5220 {
5221 vdev_t *vd;
5222 metaslab_group_t *mg;
5223 nvlist_t **spares, **l2cache, *nv;
5224 uint64_t txg = 0;
5225 uint_t nspares, nl2cache;
5226 int error = 0;
5227 boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
5228
5229 ASSERT(spa_writeable(spa));
5230
5231 if (!locked)
5232 txg = spa_vdev_enter(spa);
5233
5234 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
5235
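/*
 * Figure out what kind of device this guid refers to: a hot spare,
 * a cache device, a log device, or an ordinary top-level vdev.
 */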
5236 if (spa->spa_spares.sav_vdevs != NULL &&
5237 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
5238 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
5239 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
5240 /*
5241 * Only remove the hot spare if it's not currently in use
5242 * in this pool.
5243 */
5244 if (vd == NULL || unspare) {
5245 spa_vdev_remove_aux(spa->spa_spares.sav_config,
5246 ZPOOL_CONFIG_SPARES, spares, nspares, nv);
5247 spa_load_spares(spa);
5248 spa->spa_spares.sav_sync = B_TRUE;
5249 } else {
5250 error = SET_ERROR(EBUSY);
5251 }
5252 } else if (spa->spa_l2cache.sav_vdevs != NULL &&
5253 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
5254 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
5255 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
5256 /*
5257 * Cache devices can always be removed.
5258 */
5259 spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
5260 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
5261 spa_load_l2cache(spa);
5262 spa->spa_l2cache.sav_sync = B_TRUE;
5263 } else if (vd != NULL && vd->vdev_islog) {
5264 ASSERT(!locked);
5265 ASSERT(vd == vd->vdev_top);
5266
5267 /*
5268 * XXX - Once we have bp-rewrite this should
5269 * become the common case.
5270 */
5271
5272 mg = vd->vdev_mg;
5273
5274 /*
5275 * Stop allocating from this vdev.
5276 */
5277 metaslab_group_passivate(mg);
5278
5279 /*
5280 * Wait for the youngest allocations and frees to sync,
5281 * and then wait for the deferral of those frees to finish.
5282 */
5283 spa_vdev_config_exit(spa, NULL,
5284 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
5285
5286 /*
5287 * Attempt to evacuate the vdev.
5288 */
5289 error = spa_vdev_remove_evacuate(spa, vd);
5290
5291 txg = spa_vdev_config_enter(spa);
5292
5293 /*
5294 * If we couldn't evacuate the vdev, unwind.
5295 */
5296 if (error) {
5297 metaslab_group_activate(mg);
5298 return (spa_vdev_exit(spa, NULL, txg, error));
5299 }
5300
5301 /*
5302 * Clean up the vdev namespace.
5303 */
5304 spa_vdev_remove_from_namespace(spa, vd);
5305
5306 } else if (vd != NULL) {
5307 /*
5308 * Normal vdevs cannot be removed (yet).
5309 */
5310 error = SET_ERROR(ENOTSUP);
5311 } else {
5312 /*
5313 * There is no vdev of any kind with the specified guid.
5314 */
5315 error = SET_ERROR(ENOENT);
5316 }
5317
5318 if (!locked)
5319 return (spa_vdev_exit(spa, NULL, txg, error));
5320
5321 return (error);
5322 }
5323
5324 /*
5325 * Find any device that's done replacing, or a vdev marked 'unspare' that's
5326 * currently spared, so we can detach it.
5327 */
5328 static vdev_t *
5329 spa_vdev_resilver_done_hunt(vdev_t *vd)
5330 {
5331 vdev_t *newvd, *oldvd;
5332
5333 for (int c = 0; c < vd->vdev_children; c++) {
5334 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
5335 if (oldvd != NULL)
5336 return (oldvd);
5337 }
5338
5339 /*
5340 * Check for a completed replacement. We always consider the first
5341 * vdev in the list to be the oldest vdev, and the last one to be
5342 * the newest (see spa_vdev_attach() for how that works). In
5343 * the case where the newest vdev is faulted, we will not automatically
5344 * remove it after a resilver completes. This is OK as it will require
5345 * user intervention to determine which disk the admin wishes to keep.
5346 */
5347 if (vd->vdev_ops == &vdev_replacing_ops) {
5348 ASSERT(vd->vdev_children > 1);
5349
5350 newvd = vd->vdev_child[vd->vdev_children - 1];
5351 oldvd = vd->vdev_child[0];
5352
5353 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
5354 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5355 !vdev_dtl_required(oldvd))
5356 return (oldvd);
5357 }
5358
5359 /*
5360 * Check for a completed resilver with the 'unspare' flag set.
5361 */
5362 if (vd->vdev_ops == &vdev_spare_ops) {
5363 vdev_t *first = vd->vdev_child[0];
5364 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
5365
5366 if (last->vdev_unspare) {
5367 oldvd = first;
5368 newvd = last;
5369 } else if (first->vdev_unspare) {
5370 oldvd = last;
5371 newvd = first;
5372 } else {
5373 oldvd = NULL;
5374 }
5375
5376 if (oldvd != NULL &&
5377 vdev_dtl_empty(newvd, DTL_MISSING) &&
5378 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5379 !vdev_dtl_required(oldvd))
5380 return (oldvd);
5381
5382 /*
5383 * If there are more than two spares attached to a disk,
5384 * and those spares are not required, then we want to
5385 * attempt to free them up now so that they can be used
5386 * by other pools. Once we're back down to a single
5387 * disk+spare, we stop removing them.
5388 */
5389 if (vd->vdev_children > 2) {
5390 newvd = vd->vdev_child[1];
5391
5392 if (newvd->vdev_isspare && last->vdev_isspare &&
5393 vdev_dtl_empty(last, DTL_MISSING) &&
5394 vdev_dtl_empty(last, DTL_OUTAGE) &&
5395 !vdev_dtl_required(newvd))
5396 return (newvd);
5397 }
5398 }
5399
5400 return (NULL);
5401 }
5402
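/*
 * Detach any devices whose replacement or spare-in has completed,
 * repeating until no more candidates are found.
 */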
5403 static void
5404 spa_vdev_resilver_done(spa_t *spa)
5405 {
5406 vdev_t *vd, *pvd, *ppvd;
5407 uint64_t guid, sguid, pguid, ppguid;
5408
5409 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5410
5411 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
5412 pvd = vd->vdev_parent;
5413 ppvd = pvd->vdev_parent;
5414 guid = vd->vdev_guid;
5415 pguid = pvd->vdev_guid;
5416 ppguid = ppvd->vdev_guid;
5417 sguid = 0;
5418 /*
5419 * If we have just finished replacing a hot spared device, then
5420 * we need to detach the parent's first child (the original hot
5421 * spare) as well.
5422 */
5423 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
5424 ppvd->vdev_children == 2) {
5425 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
5426 sguid = ppvd->vdev_child[1]->vdev_guid;
5427 }
5428 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
5429
5430 spa_config_exit(spa, SCL_ALL, FTAG);
5431 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
5432 return;
5433 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
5434 return;
5435 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5436 }
5437
5438 spa_config_exit(spa, SCL_ALL, FTAG);
5439 }
5440
5441 /*
5442 * Update the stored path or FRU for this vdev.
5443 */
5444 int
5445 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
5446 boolean_t ispath)
5447 {
5448 vdev_t *vd;
5449 boolean_t sync = B_FALSE;
5450
5451 ASSERT(spa_writeable(spa));
5452
5453 spa_vdev_state_enter(spa, SCL_ALL);
5454
5455 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
5456 return (spa_vdev_state_exit(spa, NULL, ENOENT));
5457
5458 if (!vd->vdev_ops->vdev_op_leaf)
5459 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
5460
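/*
 * Only update the value and request a sync if it actually changed.
 */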
5461 if (ispath) {
5462 if (strcmp(value, vd->vdev_path) != 0) {
5463 spa_strfree(vd->vdev_path);
5464 vd->vdev_path = spa_strdup(value);
5465 sync = B_TRUE;
5466 }
5467 } else {
5468 if (vd->vdev_fru == NULL) {
5469 vd->vdev_fru = spa_strdup(value);
5470 sync = B_TRUE;
5471 } else if (strcmp(value, vd->vdev_fru) != 0) {
5472 spa_strfree(vd->vdev_fru);
5473 vd->vdev_fru = spa_strdup(value);
5474 sync = B_TRUE;
5475 }
5476 }
5477
5478 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
5479 }
5480
5481 int
5482 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
5483 {
5484 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
5485 }
5486
5487 int
5488 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
5489 {
5490 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
5491 }
5492
5493 /*
5494 * ==========================================================================
5495 * SPA Scanning
5496 * ==========================================================================
5497 */
5498
5499 int
5500 spa_scan_stop(spa_t *spa)
5501 {
5502 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5503 if (dsl_scan_resilvering(spa->spa_dsl_pool))
5504 return (SET_ERROR(EBUSY));
5505 return (dsl_scan_cancel(spa->spa_dsl_pool));
5506 }
5507
5508 int
5509 spa_scan(spa_t *spa, pool_scan_func_t func)
5510 {
5511 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5512
5513 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
5514 return (SET_ERROR(ENOTSUP));
5515
5516 /*
5517 * If a resilver was requested, but there is no DTL on a
5518 * writeable leaf device, we have nothing to do.
5519 */
5520 if (func == POOL_SCAN_RESILVER &&
5521 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
5522 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
5523 return (0);
5524 }
5525
5526 return (dsl_scan(spa->spa_dsl_pool, func));
5527 }
5528
5529 /*
5530 * ==========================================================================
5531 * SPA async task processing
5532 * ==========================================================================
5533 */
5534
5535 static void
5536 spa_async_remove(spa_t *spa, vdev_t *vd)
5537 {
5538 if (vd->vdev_remove_wanted) {
5539 vd->vdev_remove_wanted = B_FALSE;
5540 vd->vdev_delayed_close = B_FALSE;
5541 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
5542
5543 /*
5544 * We want to clear the stats, but we don't want to do a full
5545 * vdev_clear() as that will cause us to throw away
5546 * degraded/faulted state as well as attempt to reopen the
5547 * device, all of which is a waste.
5548 */
5549 vd->vdev_stat.vs_read_errors = 0;
5550 vd->vdev_stat.vs_write_errors = 0;
5551 vd->vdev_stat.vs_checksum_errors = 0;
5552
5553 vdev_state_dirty(vd->vdev_top);
5554 }
5555
5556 for (int c = 0; c < vd->vdev_children; c++)
5557 spa_async_remove(spa, vd->vdev_child[c]);
5558 }
5559
5560 static void
5561 spa_async_probe(spa_t *spa, vdev_t *vd)
5562 {
5563 if (vd->vdev_probe_wanted) {
5564 vd->vdev_probe_wanted = B_FALSE;
5565 vdev_reopen(vd); /* vdev_open() does the actual probe */
5566 }
5567
5568 for (int c = 0; c < vd->vdev_children; c++)
5569 spa_async_probe(spa, vd->vdev_child[c]);
5570 }
5571
5572 static void
5573 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
5574 {
5575 sysevent_id_t eid;
5576 nvlist_t *attr;
5577 char *physpath;
5578
5579 if (!spa->spa_autoexpand)
5580 return;
5581
5582 for (int c = 0; c < vd->vdev_children; c++) {
5583 vdev_t *cvd = vd->vdev_child[c];
5584 spa_async_autoexpand(spa, cvd);
5585 }
5586
5587 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
5588 return;
5589
5590 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
5591 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
5592
5593 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5594 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
5595
5596 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
5597 ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
5598
5599 nvlist_free(attr);
5600 kmem_free(physpath, MAXPATHLEN);
5601 }
5602
5603 static void
5604 spa_async_thread(spa_t *spa)
5605 {
5606 int tasks;
5607
5608 ASSERT(spa->spa_sync_on);
5609
5610 mutex_enter(&spa->spa_async_lock);
5611 tasks = spa->spa_async_tasks;
5612 spa->spa_async_tasks = 0;
5613 mutex_exit(&spa->spa_async_lock);
5614
5615 /*
5616 * See if the config needs to be updated.
5617 */
5618 if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
5619 uint64_t old_space, new_space;
5620
5621 mutex_enter(&spa_namespace_lock);
5622 old_space = metaslab_class_get_space(spa_normal_class(spa));
5623 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5624 new_space = metaslab_class_get_space(spa_normal_class(spa));
5625 mutex_exit(&spa_namespace_lock);
5626
5627 /*
5628 * If the pool grew as a result of the config update,
5629 * then log an internal history event.
5630 */
5631 if (new_space != old_space) {
5632 spa_history_log_internal(spa, "vdev online", NULL,
5633 "pool '%s' size: %llu(+%llu)",
5634 spa_name(spa), new_space, new_space - old_space);
5635 }
5636 }
5637
5638 /*
5639 * See if any devices need to be marked REMOVED.
5640 */
5641 if (tasks & SPA_ASYNC_REMOVE) {
5642 spa_vdev_state_enter(spa, SCL_NONE);
5643 spa_async_remove(spa, spa->spa_root_vdev);
5644 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
5645 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
5646 for (int i = 0; i < spa->spa_spares.sav_count; i++)
5647 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
5648 (void) spa_vdev_state_exit(spa, NULL, 0);
5649 }
5650
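/*
 * See if any devices need to be expanded (autoexpand).
 */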
5651 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
5652 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5653 spa_async_autoexpand(spa, spa->spa_root_vdev);
5654 spa_config_exit(spa, SCL_CONFIG, FTAG);
5655 }
5656
5657 /*
5658 * See if any devices need to be probed.
5659 */
5660 if (tasks & SPA_ASYNC_PROBE) {
5661 spa_vdev_state_enter(spa, SCL_NONE);
5662 spa_async_probe(spa, spa->spa_root_vdev);
5663 (void) spa_vdev_state_exit(spa, NULL, 0);
5664 }
5665
5666 /*
5667 * If any devices are done replacing, detach them.
5668 */
5669 if (tasks & SPA_ASYNC_RESILVER_DONE)
5670 spa_vdev_resilver_done(spa);
5671
5672 /*
5673 * Kick off a resilver.
5674 */
5675 if (tasks & SPA_ASYNC_RESILVER)
5676 dsl_resilver_restart(spa->spa_dsl_pool, 0);
5677
5678 /*
5679 * Let the world know that we're done.
5680 */
5681 mutex_enter(&spa->spa_async_lock);
5682 spa->spa_async_thread = NULL;
5683 cv_broadcast(&spa->spa_async_cv);
5684 mutex_exit(&spa->spa_async_lock);
5685 thread_exit();
5686 }
5687
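/*
 * spa_async_suspend() bumps the suspend count and waits for any in-flight
 * async thread to finish; spa_async_resume() drops the count again.  While
 * the count is nonzero, spa_async_dispatch() will not start a new thread.
 */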
5688 void
5689 spa_async_suspend(spa_t *spa)
5690 {
5691 mutex_enter(&spa->spa_async_lock);
5692 spa->spa_async_suspended++;
5693 while (spa->spa_async_thread != NULL)
5694 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
5695 mutex_exit(&spa->spa_async_lock);
5696 }
5697
5698 void
5699 spa_async_resume(spa_t *spa)
5700 {
5701 mutex_enter(&spa->spa_async_lock);
5702 ASSERT(spa->spa_async_suspended != 0);
5703 spa->spa_async_suspended--;
5704 mutex_exit(&spa->spa_async_lock);
5705 }
5706
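/*
 * Determine whether there is any async work worth dispatching.  Non-config
 * tasks are always eligible; a pending config update is held back while we
 * are still within the retry window following a failed configuration cache
 * write (see zfs_ccw_retry_interval).
 */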
5707 static boolean_t
5708 spa_async_tasks_pending(spa_t *spa)
5709 {
5710 uint_t non_config_tasks;
5711 uint_t config_task;
5712 boolean_t config_task_suspended;
5713
5714 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
5715 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
5716 if (spa->spa_ccw_fail_time == 0) {
5717 config_task_suspended = B_FALSE;
5718 } else {
5719 config_task_suspended =
5720 (gethrtime() - spa->spa_ccw_fail_time) <
5721 (zfs_ccw_retry_interval * NANOSEC);
5722 }
5723
5724 return (non_config_tasks || (config_task && !config_task_suspended));
5725 }
5726
5727 static void
5728 spa_async_dispatch(spa_t *spa)
5729 {
5730 mutex_enter(&spa->spa_async_lock);
5731 if (spa_async_tasks_pending(spa) &&
5732 !spa->spa_async_suspended &&
5733 spa->spa_async_thread == NULL &&
5734 rootdir != NULL)
5735 spa->spa_async_thread = thread_create(NULL, 0,
5736 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
5737 mutex_exit(&spa->spa_async_lock);
5738 }
5739
5740 void
5741 spa_async_request(spa_t *spa, int task)
5742 {
5743 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
5744 mutex_enter(&spa->spa_async_lock);
5745 spa->spa_async_tasks |= task;
5746 mutex_exit(&spa->spa_async_lock);
5747 }
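
/*
 * A sketch of the typical request/dispatch pattern (callers elsewhere in
 * ZFS follow this shape):
 *
 *	spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 *	...
 *	spa_async_dispatch(spa);		-- called from spa_sync()
 *
 * Requests only set bits in spa_async_tasks; they are coalesced and acted
 * upon the next time the async thread is dispatched.
 */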
5748
5749 /*
5750 * ==========================================================================
5751 * SPA syncing routines
5752 * ==========================================================================
5753 */
5754
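/*
 * Callbacks used when iterating a bplist of freed blocks during sync:
 * either enqueue each block pointer onto a bpobj (deferring the free) or
 * issue a free zio for it immediately.
 */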
5755 static int
5756 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5757 {
5758 bpobj_t *bpo = arg;
5759 bpobj_enqueue(bpo, bp, tx);
5760 return (0);
5761 }
5762
5763 static int
5764 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5765 {
5766 zio_t *zio = arg;
5767
5768 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5769 zio->io_flags));
5770 return (0);
5771 }
5772
5773 /*
5774 * Note: this simple function is not inlined to make it easier to dtrace the
5775 * amount of time spent syncing frees.
5776 */
5777 static void
5778 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
5779 {
5780 zio_t *zio = zio_root(spa, NULL, NULL, 0);
5781 bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
5782 VERIFY(zio_wait(zio) == 0);
5783 }
5784
5785 /*
5786 * Note: this simple function is not inlined to make it easier to dtrace the
5787 * amount of time spent syncing deferred frees.
5788 */
5789 static void
5790 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
5791 {
5792 zio_t *zio = zio_root(spa, NULL, NULL, 0);
5793 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
5794 spa_free_sync_cb, zio, tx), ==, 0);
5795 VERIFY0(zio_wait(zio));
5796 }
5797
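/*
 * Pack the given nvlist into the packed-nvlist object 'obj' in the MOS and
 * record the packed size in the object's bonus buffer.
 */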
5799 static void
5800 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5801 {
5802 char *packed = NULL;
5803 size_t bufsize;
5804 size_t nvsize = 0;
5805 dmu_buf_t *db;
5806
5807 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5808
5809 /*
5810 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5811 * information. This avoids the dbuf_will_dirty() path and
5812 * saves us a pre-read to get data we don't actually care about.
5813 */
5814 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
5815 packed = kmem_alloc(bufsize, KM_SLEEP);
5816
5817 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5818 KM_SLEEP) == 0);
5819 bzero(packed + nvsize, bufsize - nvsize);
5820
5821 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
5822
5823 kmem_free(packed, bufsize);
5824
5825 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5826 dmu_buf_will_dirty(db, tx);
5827 *(uint64_t *)db->db_data = nvsize;
5828 dmu_buf_rele(db, FTAG);
5829 }
5830
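/*
 * Sync an aux vdev array (spares or l2cache) to its packed-nvlist object in
 * the MOS, allocating the object and its pool-directory entry on first use.
 */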
5831 static void
5832 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5833 const char *config, const char *entry)
5834 {
5835 nvlist_t *nvroot;
5836 nvlist_t **list;
5837 int i;
5838
5839 if (!sav->sav_sync)
5840 return;
5841
5842 /*
5843 * Update the MOS nvlist describing the list of available devices.
5844 * spa_validate_aux() will have already made sure this nvlist is
5845 * valid and the vdevs are labeled appropriately.
5846 */
5847 if (sav->sav_object == 0) {
5848 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
5849 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
5850 sizeof (uint64_t), tx);
5851 VERIFY(zap_update(spa->spa_meta_objset,
5852 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
5853 &sav->sav_object, tx) == 0);
5854 }
5855
5856 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5857 if (sav->sav_count == 0) {
5858 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
5859 } else {
5860 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
5861 for (i = 0; i < sav->sav_count; i++)
5862 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
5863 B_FALSE, VDEV_CONFIG_L2CACHE);
5864 VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
5865 sav->sav_count) == 0);
5866 for (i = 0; i < sav->sav_count; i++)
5867 nvlist_free(list[i]);
5868 kmem_free(list, sav->sav_count * sizeof (void *));
5869 }
5870
5871 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
5872 nvlist_free(nvroot);
5873
5874 sav->sav_sync = B_FALSE;
5875 }
5876
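/*
 * If the vdev configuration is dirty, regenerate the pool config for this
 * txg and write it to the MOS config object.  The new config is held in
 * spa_config_syncing until the txg commits.
 */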
5877 static void
5878 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
5879 {
5880 nvlist_t *config;
5881
5882 if (list_is_empty(&spa->spa_config_dirty_list))
5883 return;
5884
5885 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5886
5887 config = spa_config_generate(spa, spa->spa_root_vdev,
5888 dmu_tx_get_txg(tx), B_FALSE);
5889
5890 /*
5891 * If we're upgrading the spa version then make sure that
5892 * the config object gets updated with the correct version.
5893 */
5894 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
5895 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
5896 spa->spa_uberblock.ub_version);
5897
5898 spa_config_exit(spa, SCL_STATE, FTAG);
5899
5900 if (spa->spa_config_syncing)
5901 nvlist_free(spa->spa_config_syncing);
5902 spa->spa_config_syncing = config;
5903
5904 spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
5905 }
5906
5907 static void
5908 spa_sync_version(void *arg, dmu_tx_t *tx)
5909 {
5910 uint64_t *versionp = arg;
5911 uint64_t version = *versionp;
5912 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
5913
5914 /*
5915 * Setting the version is special cased when first creating the pool.
5916 */
5917 ASSERT(tx->tx_txg != TXG_INITIAL);
5918
5919 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
5920 ASSERT(version >= spa_version(spa));
5921
5922 spa->spa_uberblock.ub_version = version;
5923 vdev_config_dirty(spa->spa_root_vdev);
5924 spa_history_log_internal(spa, "set", tx, "version=%lld", version);
5925 }
5926
5927 /*
5928 * Set zpool properties.
5929 */
5930 static void
5931 spa_sync_props(void *arg, dmu_tx_t *tx)
5932 {
5933 nvlist_t *nvp = arg;
5934 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
5935 objset_t *mos = spa->spa_meta_objset;
5936 nvpair_t *elem = NULL;
5937
5938 mutex_enter(&spa->spa_props_lock);
5939
	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
5941 uint64_t intval;
5942 char *strval, *fname;
5943 zpool_prop_t prop;
5944 const char *propname;
5945 zprop_type_t proptype;
5946 zfeature_info_t *feature;
5947
5948 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
5949 case ZPROP_INVAL:
5950 /*
5951 * We checked this earlier in spa_prop_validate().
5952 */
5953 ASSERT(zpool_prop_feature(nvpair_name(elem)));
5954
5955 fname = strchr(nvpair_name(elem), '@') + 1;
5956 VERIFY0(zfeature_lookup_name(fname, &feature));
5957
5958 spa_feature_enable(spa, feature, tx);
5959 spa_history_log_internal(spa, "set", tx,
5960 "%s=enabled", nvpair_name(elem));
5961 break;
5962
5963 case ZPOOL_PROP_VERSION:
5964 intval = fnvpair_value_uint64(elem);
5965 /*
			 * The version is synced separately, before the other
			 * properties, and should be correct by now.
5968 */
5969 ASSERT3U(spa_version(spa), >=, intval);
5970 break;
5971
5972 case ZPOOL_PROP_ALTROOT:
5973 /*
5974 * 'altroot' is a non-persistent property. It should
5975 * have been set temporarily at creation or import time.
5976 */
5977 ASSERT(spa->spa_root != NULL);
5978 break;
5979
5980 case ZPOOL_PROP_READONLY:
5981 case ZPOOL_PROP_CACHEFILE:
5982 /*
			 * 'readonly' and 'cachefile' are also non-persistent
5984 * properties.
5985 */
5986 break;
5987 case ZPOOL_PROP_COMMENT:
5988 strval = fnvpair_value_string(elem);
5989 if (spa->spa_comment != NULL)
5990 spa_strfree(spa->spa_comment);
5991 spa->spa_comment = spa_strdup(strval);
5992 /*
5993 * We need to dirty the configuration on all the vdevs
5994 * so that their labels get updated. It's unnecessary
5995 * to do this for pool creation since the vdev's
			 * configuration has already been dirtied.
5997 */
5998 if (tx->tx_txg != TXG_INITIAL)
5999 vdev_config_dirty(spa->spa_root_vdev);
6000 spa_history_log_internal(spa, "set", tx,
6001 "%s=%s", nvpair_name(elem), strval);
6002 break;
6003 default:
6004 /*
6005 * Set pool property values in the poolprops mos object.
6006 */
6007 if (spa->spa_pool_props_object == 0) {
6008 spa->spa_pool_props_object =
6009 zap_create_link(mos, DMU_OT_POOL_PROPS,
6010 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
6011 tx);
6012 }
6013
6014 /* normalize the property name */
6015 propname = zpool_prop_to_name(prop);
6016 proptype = zpool_prop_get_type(prop);
6017
6018 if (nvpair_type(elem) == DATA_TYPE_STRING) {
6019 ASSERT(proptype == PROP_TYPE_STRING);
6020 strval = fnvpair_value_string(elem);
6021 VERIFY0(zap_update(mos,
6022 spa->spa_pool_props_object, propname,
6023 1, strlen(strval) + 1, strval, tx));
6024 spa_history_log_internal(spa, "set", tx,
6025 "%s=%s", nvpair_name(elem), strval);
6026 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
6027 intval = fnvpair_value_uint64(elem);
6028
6029 if (proptype == PROP_TYPE_INDEX) {
6030 const char *unused;
6031 VERIFY0(zpool_prop_index_to_string(
6032 prop, intval, &unused));
6033 }
6034 VERIFY0(zap_update(mos,
6035 spa->spa_pool_props_object, propname,
6036 8, 1, &intval, tx));
6037 spa_history_log_internal(spa, "set", tx,
6038 "%s=%lld", nvpair_name(elem), intval);
6039 } else {
6040 ASSERT(0); /* not allowed */
6041 }
6042
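			/*
			 * Also update the in-core copies of properties that
			 * the rest of the SPA consults directly.
			 */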
6043 switch (prop) {
6044 case ZPOOL_PROP_DELEGATION:
6045 spa->spa_delegation = intval;
6046 break;
6047 case ZPOOL_PROP_BOOTFS:
6048 spa->spa_bootfs = intval;
6049 break;
6050 case ZPOOL_PROP_FAILUREMODE:
6051 spa->spa_failmode = intval;
6052 break;
6053 case ZPOOL_PROP_AUTOEXPAND:
6054 spa->spa_autoexpand = intval;
6055 if (tx->tx_txg != TXG_INITIAL)
6056 spa_async_request(spa,
6057 SPA_ASYNC_AUTOEXPAND);
6058 break;
6059 case ZPOOL_PROP_DEDUPDITTO:
6060 spa->spa_dedup_ditto = intval;
6061 break;
6062 default:
6063 break;
6064 }
6065 }
6066
6067 }
6068
6069 mutex_exit(&spa->spa_props_lock);
6070 }
6071
6072 /*
6073 * Perform one-time upgrade on-disk changes. spa_version() does not
6074 * reflect the new version this txg, so there must be no changes this
6075 * txg to anything that the upgrade code depends on after it executes.
6076 * Therefore this must be called after dsl_pool_sync() does the sync
6077 * tasks.
6078 */
6079 static void
6080 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
6081 {
6082 dsl_pool_t *dp = spa->spa_dsl_pool;
6083
6084 ASSERT(spa->spa_sync_pass == 1);
6085
6086 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
6087
6088 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
6089 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
6090 dsl_pool_create_origin(dp, tx);
6091
6092 /* Keeping the origin open increases spa_minref */
6093 spa->spa_minref += 3;
6094 }
6095
6096 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
6097 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
6098 dsl_pool_upgrade_clones(dp, tx);
6099 }
6100
6101 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
6102 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
6103 dsl_pool_upgrade_dir_clones(dp, tx);
6104
6105 /* Keeping the freedir open increases spa_minref */
6106 spa->spa_minref += 3;
6107 }
6108
6109 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6110 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6111 spa_feature_create_zap_objects(spa, tx);
6112 }
6113 rrw_exit(&dp->dp_config_rwlock, FTAG);
6114 }
6115
6116 /*
6117 * Sync the specified transaction group. New blocks may be dirtied as
6118 * part of the process, so we iterate until it converges.
6119 */
6120 void
6121 spa_sync(spa_t *spa, uint64_t txg)
6122 {
6123 dsl_pool_t *dp = spa->spa_dsl_pool;
6124 objset_t *mos = spa->spa_meta_objset;
6125 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
6126 vdev_t *rvd = spa->spa_root_vdev;
6127 vdev_t *vd;
6128 dmu_tx_t *tx;
6129 int error;
6130
6131 VERIFY(spa_writeable(spa));
6132
6133 /*
6134 * Lock out configuration changes.
6135 */
6136 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6137
6138 spa->spa_syncing_txg = txg;
6139 spa->spa_sync_pass = 0;
6140
6141 /*
6142 * If there are any pending vdev state changes, convert them
6143 * into config changes that go out with this transaction group.
6144 */
6145 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6146 while (list_head(&spa->spa_state_dirty_list) != NULL) {
6147 /*
6148 * We need the write lock here because, for aux vdevs,
6149 * calling vdev_config_dirty() modifies sav_config.
6150 * This is ugly and will become unnecessary when we
6151 * eliminate the aux vdev wart by integrating all vdevs
6152 * into the root vdev tree.
6153 */
6154 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6155 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
6156 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
6157 vdev_state_clean(vd);
6158 vdev_config_dirty(vd);
6159 }
6160 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6161 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
6162 }
6163 spa_config_exit(spa, SCL_STATE, FTAG);
6164
6165 tx = dmu_tx_create_assigned(dp, txg);
6166
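	/*
	 * Arm the deadman timer: if this sync has not completed within
	 * spa_deadman_synctime, the deadman cyclic fires so that hung I/O
	 * can be detected and reported.
	 */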
6167 spa->spa_sync_starttime = gethrtime();
6168 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
6169 spa->spa_sync_starttime + spa->spa_deadman_synctime));
6170
6171 /*
6172 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
6173 * set spa_deflate if we have no raid-z vdevs.
6174 */
6175 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
6176 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
6177 int i;
6178
6179 for (i = 0; i < rvd->vdev_children; i++) {
6180 vd = rvd->vdev_child[i];
6181 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
6182 break;
6183 }
6184 if (i == rvd->vdev_children) {
6185 spa->spa_deflate = TRUE;
6186 VERIFY(0 == zap_add(spa->spa_meta_objset,
6187 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
6188 sizeof (uint64_t), 1, &spa->spa_deflate, tx));
6189 }
6190 }
6191
6192 /*
6193 * If anything has changed in this txg, or if someone is waiting
6194 * for this txg to sync (eg, spa_vdev_remove()), push the
6195 * deferred frees from the previous txg. If not, leave them
6196 * alone so that we don't generate work on an otherwise idle
6197 * system.
6198 */
6199 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
6200 !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
6201 !txg_list_empty(&dp->dp_sync_tasks, txg) ||
6202 ((dsl_scan_active(dp->dp_scan) ||
6203 txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
6204 spa_sync_deferred_frees(spa, tx);
6205 }
6206
6207 /*
6208 * Iterate to convergence.
6209 */
6210 do {
6211 int pass = ++spa->spa_sync_pass;
6212
6213 spa_sync_config_object(spa, tx);
6214 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
6215 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
6216 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
6217 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
6218 spa_errlog_sync(spa, txg);
6219 dsl_pool_sync(dp, txg);
6220
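		/*
		 * In early sync passes, free blocks immediately.  In later
		 * passes, queue them onto spa_deferred_bpobj instead so that
		 * the remaining passes can converge without generating more
		 * frees.
		 */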
6221 if (pass < zfs_sync_pass_deferred_free) {
6222 spa_sync_frees(spa, free_bpl, tx);
6223 } else {
6224 bplist_iterate(free_bpl, bpobj_enqueue_cb,
6225 &spa->spa_deferred_bpobj, tx);
6226 }
6227
6228 ddt_sync(spa, txg);
6229 dsl_scan_sync(dp, tx);
6230
		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
		    txg)) != NULL)
			vdev_sync(vd, txg);
6233
6234 if (pass == 1)
6235 spa_sync_upgrades(spa, tx);
6236
6237 } while (dmu_objset_is_dirty(mos, txg));
6238
6239 /*
6240 * Rewrite the vdev configuration (which includes the uberblock)
6241 * to commit the transaction group.
6242 *
6243 * If there are no dirty vdevs, we sync the uberblock to a few
6244 * random top-level vdevs that are known to be visible in the
6245 * config cache (see spa_vdev_add() for a complete description).
6246 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
6247 */
6248 for (;;) {
6249 /*
6250 * We hold SCL_STATE to prevent vdev open/close/etc.
6251 * while we're attempting to write the vdev labels.
6252 */
6253 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6254
6255 if (list_is_empty(&spa->spa_config_dirty_list)) {
6256 vdev_t *svd[SPA_DVAS_PER_BP];
6257 int svdcount = 0;
6258 int children = rvd->vdev_children;
6259 int c0 = spa_get_random(children);
6260
6261 for (int c = 0; c < children; c++) {
6262 vd = rvd->vdev_child[(c0 + c) % children];
6263 if (vd->vdev_ms_array == 0 || vd->vdev_islog)
6264 continue;
6265 svd[svdcount++] = vd;
6266 if (svdcount == SPA_DVAS_PER_BP)
6267 break;
6268 }
6269 error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
6270 if (error != 0)
6271 error = vdev_config_sync(svd, svdcount, txg,
6272 B_TRUE);
6273 } else {
6274 error = vdev_config_sync(rvd->vdev_child,
6275 rvd->vdev_children, txg, B_FALSE);
6276 if (error != 0)
6277 error = vdev_config_sync(rvd->vdev_child,
6278 rvd->vdev_children, txg, B_TRUE);
6279 }
6280
6281 if (error == 0)
6282 spa->spa_last_synced_guid = rvd->vdev_guid;
6283
6284 spa_config_exit(spa, SCL_STATE, FTAG);
6285
6286 if (error == 0)
6287 break;
6288 zio_suspend(spa, NULL);
6289 zio_resume_wait(spa);
6290 }
6291 dmu_tx_commit(tx);
6292
6293 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
6294
6295 /*
6296 * Clear the dirty config list.
6297 */
6298 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
6299 vdev_config_clean(vd);
6300
6301 /*
6302 * Now that the new config has synced transactionally,
6303 * let it become visible to the config cache.
6304 */
6305 if (spa->spa_config_syncing != NULL) {
6306 spa_config_set(spa, spa->spa_config_syncing);
6307 spa->spa_config_txg = txg;
6308 spa->spa_config_syncing = NULL;
6309 }
6310
6311 spa->spa_ubsync = spa->spa_uberblock;
6312
6313 dsl_pool_sync_done(dp, txg);
6314
6315 /*
6316 * Update usable space statistics.
6317 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
	    TXG_CLEAN(txg))) != NULL)
		vdev_sync_done(vd, txg);
6320
6321 spa_update_dspace(spa);
6322
6323 /*
6324 * It had better be the case that we didn't dirty anything
6325 * since vdev_config_sync().
6326 */
6327 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
6328 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
6329 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
6330
6331 spa->spa_sync_pass = 0;
6332
6333 spa_config_exit(spa, SCL_CONFIG, FTAG);
6334
6335 spa_handle_ignored_writes(spa);
6336
6337 /*
6338 * If any async tasks have been requested, kick them off.
6339 */
6340 spa_async_dispatch(spa);
6341 }
6342
6343 /*
6344 * Sync all pools. We don't want to hold the namespace lock across these
6345 * operations, so we take a reference on the spa_t and drop the lock during the
6346 * sync.
6347 */
6348 void
6349 spa_sync_allpools(void)
6350 {
6351 spa_t *spa = NULL;
6352 mutex_enter(&spa_namespace_lock);
6353 while ((spa = spa_next(spa)) != NULL) {
6354 if (spa_state(spa) != POOL_STATE_ACTIVE ||
6355 !spa_writeable(spa) || spa_suspended(spa))
6356 continue;
6357 spa_open_ref(spa, FTAG);
6358 mutex_exit(&spa_namespace_lock);
6359 txg_wait_synced(spa_get_dsl(spa), 0);
6360 mutex_enter(&spa_namespace_lock);
6361 spa_close(spa, FTAG);
6362 }
6363 mutex_exit(&spa_namespace_lock);
6364 }
6365
6366 /*
6367 * ==========================================================================
6368 * Miscellaneous routines
6369 * ==========================================================================
6370 */
6371
6372 /*
6373 * Remove all pools in the system.
6374 */
6375 void
6376 spa_evict_all(void)
6377 {
6378 spa_t *spa;
6379
6380 /*
6381 * Remove all cached state. All pools should be closed now,
6382 * so every spa in the AVL tree should be unreferenced.
6383 */
6384 mutex_enter(&spa_namespace_lock);
6385 while ((spa = spa_next(NULL)) != NULL) {
6386 /*
6387 * Stop async tasks. The async thread may need to detach
6388 * a device that's been replaced, which requires grabbing
6389 * spa_namespace_lock, so we must drop it here.
6390 */
6391 spa_open_ref(spa, FTAG);
6392 mutex_exit(&spa_namespace_lock);
6393 spa_async_suspend(spa);
6394 mutex_enter(&spa_namespace_lock);
6395 spa_close(spa, FTAG);
6396
6397 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
6398 spa_unload(spa);
6399 spa_deactivate(spa);
6400 }
6401 spa_remove(spa);
6402 }
6403 mutex_exit(&spa_namespace_lock);
6404 }
6405
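/*
 * Look up a vdev anywhere in the pool by guid.  If 'aux' is set, the
 * l2cache and spare devices are searched as well.
 */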
6406 vdev_t *
6407 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
6408 {
6409 vdev_t *vd;
6410 int i;
6411
6412 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
6413 return (vd);
6414
6415 if (aux) {
6416 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
6417 vd = spa->spa_l2cache.sav_vdevs[i];
6418 if (vd->vdev_guid == guid)
6419 return (vd);
6420 }
6421
6422 for (i = 0; i < spa->spa_spares.sav_count; i++) {
6423 vd = spa->spa_spares.sav_vdevs[i];
6424 if (vd->vdev_guid == guid)
6425 return (vd);
6426 }
6427 }
6428
6429 return (NULL);
6430 }
6431
6432 void
6433 spa_upgrade(spa_t *spa, uint64_t version)
6434 {
6435 ASSERT(spa_writeable(spa));
6436
6437 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6438
6439 /*
 * This should only be called for a non-faulted pool; a pool whose on-disk
 * version were unsupported (i.e. from the future) could not have been
 * opened, so the asserts below should always hold.
6443 */
6444 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
6445 ASSERT(version >= spa->spa_uberblock.ub_version);
6446
6447 spa->spa_uberblock.ub_version = version;
6448 vdev_config_dirty(spa->spa_root_vdev);
6449
6450 spa_config_exit(spa, SCL_ALL, FTAG);
6451
6452 txg_wait_synced(spa_get_dsl(spa), 0);
6453 }
6454
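/*
 * Return B_TRUE if 'guid' matches one of this pool's spares, including any
 * spares whose addition is still pending.
 */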
6455 boolean_t
6456 spa_has_spare(spa_t *spa, uint64_t guid)
6457 {
6458 int i;
6459 uint64_t spareguid;
6460 spa_aux_vdev_t *sav = &spa->spa_spares;
6461
6462 for (i = 0; i < sav->sav_count; i++)
6463 if (sav->sav_vdevs[i]->vdev_guid == guid)
6464 return (B_TRUE);
6465
6466 for (i = 0; i < sav->sav_npending; i++) {
6467 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
6468 &spareguid) == 0 && spareguid == guid)
6469 return (B_TRUE);
6470 }
6471
6472 return (B_FALSE);
6473 }
6474
6475 /*
6476 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2: once as a spare and
 * once as a replacement.
6478 */
6479 static boolean_t
6480 spa_has_active_shared_spare(spa_t *spa)
6481 {
6482 int i, refcnt;
6483 uint64_t pool;
6484 spa_aux_vdev_t *sav = &spa->spa_spares;
6485
6486 for (i = 0; i < sav->sav_count; i++) {
6487 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
6488 &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
6489 refcnt > 2)
6490 return (B_TRUE);
6491 }
6492
6493 return (B_FALSE);
6494 }
6495
6496 /*
6497 * Post a sysevent corresponding to the given event. The 'name' must be one of
6498 * the event definitions in sys/sysevent/eventdefs.h. The payload will be
6499 * filled in from the spa and (optionally) the vdev. This doesn't do anything
6500 * in the userland libzpool, as we don't want consumers to misinterpret ztest
6501 * or zdb as real changes.
6502 */
6503 void
6504 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
6505 {
6506 #ifdef _KERNEL
6507 sysevent_t *ev;
6508 sysevent_attr_list_t *attr = NULL;
6509 sysevent_value_t value;
6510 sysevent_id_t eid;
6511
6512 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
6513 SE_SLEEP);
6514
6515 value.value_type = SE_DATA_TYPE_STRING;
6516 value.value.sv_string = spa_name(spa);
6517 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
6518 goto done;
6519
6520 value.value_type = SE_DATA_TYPE_UINT64;
6521 value.value.sv_uint64 = spa_guid(spa);
6522 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
6523 goto done;
6524
6525 if (vd) {
6526 value.value_type = SE_DATA_TYPE_UINT64;
6527 value.value.sv_uint64 = vd->vdev_guid;
6528 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
6529 SE_SLEEP) != 0)
6530 goto done;
6531
6532 if (vd->vdev_path) {
6533 value.value_type = SE_DATA_TYPE_STRING;
6534 value.value.sv_string = vd->vdev_path;
6535 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
6536 &value, SE_SLEEP) != 0)
6537 goto done;
6538 }
6539 }
6540
6541 if (sysevent_attach_attributes(ev, attr) != 0)
6542 goto done;
6543 attr = NULL;
6544
6545 (void) log_sysevent(ev, SE_SLEEP, &eid);
6546
6547 done:
6548 if (attr)
6549 sysevent_free_attr(attr);
6550 sysevent_free(ev);
6551 #endif
6552 }