4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
--- old/usr/src/uts/common/fs/zfs/spa.c
+++ new/usr/src/uts/common/fs/zfs/spa.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 25 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
26 26 */
27 27
28 28 /*
29 29 * SPA: Storage Pool Allocator
30 30 *
31 31 * This file contains all the routines used when modifying on-disk SPA state.
32 32 * This includes opening, importing, destroying, exporting a pool, and syncing a
33 33 * pool.
34 34 */
35 35
36 36 #include <sys/zfs_context.h>
37 37 #include <sys/fm/fs/zfs.h>
38 38 #include <sys/spa_impl.h>
39 39 #include <sys/zio.h>
40 40 #include <sys/zio_checksum.h>
41 41 #include <sys/dmu.h>
42 42 #include <sys/dmu_tx.h>
43 43 #include <sys/zap.h>
44 44 #include <sys/zil.h>
45 45 #include <sys/ddt.h>
46 46 #include <sys/vdev_impl.h>
47 47 #include <sys/metaslab.h>
48 48 #include <sys/metaslab_impl.h>
49 49 #include <sys/uberblock_impl.h>
50 50 #include <sys/txg.h>
51 51 #include <sys/avl.h>
52 52 #include <sys/dmu_traverse.h>
53 53 #include <sys/dmu_objset.h>
54 54 #include <sys/unique.h>
55 55 #include <sys/dsl_pool.h>
56 56 #include <sys/dsl_dataset.h>
57 57 #include <sys/dsl_dir.h>
58 58 #include <sys/dsl_prop.h>
59 59 #include <sys/dsl_synctask.h>
60 60 #include <sys/fs/zfs.h>
61 61 #include <sys/arc.h>
62 62 #include <sys/callb.h>
63 63 #include <sys/systeminfo.h>
64 64 #include <sys/spa_boot.h>
65 65 #include <sys/zfs_ioctl.h>
66 66 #include <sys/dsl_scan.h>
67 67 #include <sys/zfeature.h>
68 68 #include <sys/dsl_destroy.h>
69 69
70 70 #ifdef _KERNEL
71 71 #include <sys/bootprops.h>
72 72 #include <sys/callb.h>
73 73 #include <sys/cpupart.h>
74 74 #include <sys/pool.h>
75 75 #include <sys/sysdc.h>
76 76 #include <sys/zone.h>
77 77 #endif /* _KERNEL */
78 78
79 79 #include "zfs_prop.h"
80 80 #include "zfs_comutil.h"
81 81
82 82 /*
83 83 * The interval, in seconds, at which failed configuration cache file writes
84 84 * should be retried.
85 85 */
86 86 static int zfs_ccw_retry_interval = 300;
87 87
88 88 typedef enum zti_modes {
89 89 ZTI_MODE_FIXED, /* value is # of threads (min 1) */
90 - ZTI_MODE_ONLINE_PERCENT, /* value is % of online CPUs */
91 90 ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
92 91 ZTI_MODE_NULL, /* don't create a taskq */
93 92 ZTI_NMODES
94 93 } zti_modes_t;
95 94
96 95 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
97 -#define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
98 96 #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
99 97 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
100 98
101 99 #define ZTI_N(n) ZTI_P(n, 1)
102 100 #define ZTI_ONE ZTI_N(1)
103 101
104 102 typedef struct zio_taskq_info {
105 103 zti_modes_t zti_mode;
106 104 uint_t zti_value;
107 105 uint_t zti_count;
108 106 } zio_taskq_info_t;
109 107
110 108 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
111 109 "issue", "issue_high", "intr", "intr_high"
112 110 };
113 111
114 112 /*
115 113 * This table defines the taskq settings for each ZFS I/O type. When
116 114 * initializing a pool, we use this table to create an appropriately sized
117 115 * taskq. Some operations are low volume and therefore have a small, static
118 116 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
119 117 * macros. Other operations process a large amount of data; the ZTI_BATCH
120 118 * macro causes us to create a taskq oriented for throughput. Some operations
 121  119   * are so high frequency and short-lived that the taskq itself can become a
122 120 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
123 121 * additional degree of parallelism specified by the number of threads per-
124 122 * taskq and the number of taskqs; when dispatching an event in this case, the
125 123 * particular taskq is chosen at random.
126 124 *
127 125 * The different taskq priorities are to handle the different contexts (issue
128 126 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
129 127 * need to be handled with minimum delay.
130 128 */
131 129 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
132 130 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
133 131 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
134 132 { ZTI_N(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, /* READ */
135 133 { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */
136 134 { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
137 135 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
138 136 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
139 137 };
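
A note on reading the table above (an illustration, not part of this webrev): the FREE row's ZTI_P(12, 8) entry asks for eight discrete issue taskqs of twelve threads each, and each dispatch picks one of the eight pseudo-randomly from the low bits of a high-resolution timestamp, as spa_taskq_dispatch_ent() does further down in this file. The following minimal user-level sketch shows only that selection step, with clock_gettime() standing in for the kernel's gethrtime():

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define	FREE_ISSUE_THREADS	12	/* ZTI_P(12, 8): threads per taskq */
#define	FREE_ISSUE_COUNT	8	/* ZTI_P(12, 8): number of taskqs */

/* User-level stand-in for gethrtime(): nanoseconds from a monotonic clock. */
static uint64_t
hrtime_now(void)
{
	struct timespec ts;

	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec);
}

int
main(void)
{
	/* The low bits of the timestamp spread dispatches across the taskqs. */
	unsigned idx = (unsigned)(hrtime_now() % FREE_ISSUE_COUNT);

	(void) printf("dispatch to issue taskq %u of %d (%d threads each)\n",
	    idx, FREE_ISSUE_COUNT, FREE_ISSUE_THREADS);
	return (0);
}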
140 138
141 139 static void spa_sync_version(void *arg, dmu_tx_t *tx);
142 140 static void spa_sync_props(void *arg, dmu_tx_t *tx);
143 141 static boolean_t spa_has_active_shared_spare(spa_t *spa);
144 142 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
145 143 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
146 144 char **ereport);
147 145 static void spa_vdev_resilver_done(spa_t *spa);
148 146
149 -uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */
      147 +uint_t		zio_taskq_batch_pct = 75;	/* 75% of online CPUs */
150 148 id_t zio_taskq_psrset_bind = PS_NONE;
151 149 boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
152 150 uint_t zio_taskq_basedc = 80; /* base duty cycle */
153 151
154 152 boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
155 153 extern int zfs_sync_pass_deferred_free;
156 154
157 155 /*
158 156 * This (illegal) pool name is used when temporarily importing a spa_t in order
159 157 * to get the vdev stats associated with the imported devices.
160 158 */
161 159 #define TRYIMPORT_NAME "$import"
162 160
163 161 /*
164 162 * ==========================================================================
165 163 * SPA properties routines
166 164 * ==========================================================================
167 165 */
168 166
169 167 /*
170 168 * Add a (source=src, propname=propval) list to an nvlist.
171 169 */
172 170 static void
173 171 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
174 172 uint64_t intval, zprop_source_t src)
175 173 {
176 174 const char *propname = zpool_prop_to_name(prop);
177 175 nvlist_t *propval;
178 176
179 177 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
180 178 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
181 179
182 180 if (strval != NULL)
183 181 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
184 182 else
185 183 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
186 184
187 185 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
188 186 nvlist_free(propval);
189 187 }
190 188
191 189 /*
192 190 * Get property values from the spa configuration.
193 191 */
194 192 static void
195 193 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
196 194 {
197 195 vdev_t *rvd = spa->spa_root_vdev;
198 196 dsl_pool_t *pool = spa->spa_dsl_pool;
199 197 uint64_t size;
200 198 uint64_t alloc;
201 199 uint64_t space;
202 200 uint64_t cap, version;
203 201 zprop_source_t src = ZPROP_SRC_NONE;
204 202 spa_config_dirent_t *dp;
205 203
206 204 ASSERT(MUTEX_HELD(&spa->spa_props_lock));
207 205
208 206 if (rvd != NULL) {
209 207 alloc = metaslab_class_get_alloc(spa_normal_class(spa));
210 208 size = metaslab_class_get_space(spa_normal_class(spa));
211 209 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
212 210 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
213 211 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
214 212 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
215 213 size - alloc, src);
216 214
217 215 space = 0;
218 216 for (int c = 0; c < rvd->vdev_children; c++) {
219 217 vdev_t *tvd = rvd->vdev_child[c];
220 218 space += tvd->vdev_max_asize - tvd->vdev_asize;
221 219 }
222 220 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
223 221 src);
224 222
225 223 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
226 224 (spa_mode(spa) == FREAD), src);
227 225
228 226 cap = (size == 0) ? 0 : (alloc * 100 / size);
229 227 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
230 228
231 229 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
232 230 ddt_get_pool_dedup_ratio(spa), src);
233 231
234 232 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
235 233 rvd->vdev_state, src);
236 234
237 235 version = spa_version(spa);
238 236 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
239 237 src = ZPROP_SRC_DEFAULT;
240 238 else
241 239 src = ZPROP_SRC_LOCAL;
242 240 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
243 241 }
244 242
245 243 if (pool != NULL) {
246 244 dsl_dir_t *freedir = pool->dp_free_dir;
247 245
248 246 /*
 249  247 			 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
 250  248 			 * when opening pools before this version, freedir will be NULL.
251 249 */
252 250 if (freedir != NULL) {
253 251 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
254 252 freedir->dd_phys->dd_used_bytes, src);
255 253 } else {
256 254 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
257 255 NULL, 0, src);
258 256 }
259 257 }
260 258
261 259 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
262 260
263 261 if (spa->spa_comment != NULL) {
264 262 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
265 263 0, ZPROP_SRC_LOCAL);
266 264 }
267 265
268 266 if (spa->spa_root != NULL)
269 267 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
270 268 0, ZPROP_SRC_LOCAL);
271 269
272 270 if ((dp = list_head(&spa->spa_config_list)) != NULL) {
273 271 if (dp->scd_path == NULL) {
274 272 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
275 273 "none", 0, ZPROP_SRC_LOCAL);
276 274 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
277 275 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
278 276 dp->scd_path, 0, ZPROP_SRC_LOCAL);
279 277 }
280 278 }
281 279 }
282 280
283 281 /*
284 282 * Get zpool property values.
285 283 */
286 284 int
287 285 spa_prop_get(spa_t *spa, nvlist_t **nvp)
288 286 {
289 287 objset_t *mos = spa->spa_meta_objset;
290 288 zap_cursor_t zc;
291 289 zap_attribute_t za;
292 290 int err;
293 291
294 292 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
295 293
296 294 mutex_enter(&spa->spa_props_lock);
297 295
298 296 /*
299 297 * Get properties from the spa config.
300 298 */
301 299 spa_prop_get_config(spa, nvp);
302 300
 303  301 	/* If there is no pool property object, there are no more props to get. */
304 302 if (mos == NULL || spa->spa_pool_props_object == 0) {
305 303 mutex_exit(&spa->spa_props_lock);
306 304 return (0);
307 305 }
308 306
309 307 /*
310 308 * Get properties from the MOS pool property object.
311 309 */
312 310 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
313 311 (err = zap_cursor_retrieve(&zc, &za)) == 0;
314 312 zap_cursor_advance(&zc)) {
315 313 uint64_t intval = 0;
316 314 char *strval = NULL;
317 315 zprop_source_t src = ZPROP_SRC_DEFAULT;
318 316 zpool_prop_t prop;
319 317
320 318 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
321 319 continue;
322 320
323 321 switch (za.za_integer_length) {
324 322 case 8:
325 323 /* integer property */
326 324 if (za.za_first_integer !=
327 325 zpool_prop_default_numeric(prop))
328 326 src = ZPROP_SRC_LOCAL;
329 327
330 328 if (prop == ZPOOL_PROP_BOOTFS) {
331 329 dsl_pool_t *dp;
332 330 dsl_dataset_t *ds = NULL;
333 331
334 332 dp = spa_get_dsl(spa);
335 333 dsl_pool_config_enter(dp, FTAG);
336 334 if (err = dsl_dataset_hold_obj(dp,
337 335 za.za_first_integer, FTAG, &ds)) {
338 336 dsl_pool_config_exit(dp, FTAG);
339 337 break;
340 338 }
341 339
342 340 strval = kmem_alloc(
343 341 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
344 342 KM_SLEEP);
345 343 dsl_dataset_name(ds, strval);
346 344 dsl_dataset_rele(ds, FTAG);
347 345 dsl_pool_config_exit(dp, FTAG);
348 346 } else {
349 347 strval = NULL;
350 348 intval = za.za_first_integer;
351 349 }
352 350
353 351 spa_prop_add_list(*nvp, prop, strval, intval, src);
354 352
355 353 if (strval != NULL)
356 354 kmem_free(strval,
357 355 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
358 356
359 357 break;
360 358
361 359 case 1:
362 360 /* string property */
363 361 strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
364 362 err = zap_lookup(mos, spa->spa_pool_props_object,
365 363 za.za_name, 1, za.za_num_integers, strval);
366 364 if (err) {
367 365 kmem_free(strval, za.za_num_integers);
368 366 break;
369 367 }
370 368 spa_prop_add_list(*nvp, prop, strval, 0, src);
371 369 kmem_free(strval, za.za_num_integers);
372 370 break;
373 371
374 372 default:
375 373 break;
376 374 }
377 375 }
378 376 zap_cursor_fini(&zc);
379 377 mutex_exit(&spa->spa_props_lock);
380 378 out:
381 379 if (err && err != ENOENT) {
382 380 nvlist_free(*nvp);
383 381 *nvp = NULL;
384 382 return (err);
385 383 }
386 384
387 385 return (0);
388 386 }
389 387
390 388 /*
391 389 * Validate the given pool properties nvlist and modify the list
392 390 * for the property values to be set.
393 391 */
394 392 static int
395 393 spa_prop_validate(spa_t *spa, nvlist_t *props)
396 394 {
397 395 nvpair_t *elem;
398 396 int error = 0, reset_bootfs = 0;
399 397 uint64_t objnum = 0;
400 398 boolean_t has_feature = B_FALSE;
401 399
402 400 elem = NULL;
403 401 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
404 402 uint64_t intval;
405 403 char *strval, *slash, *check, *fname;
406 404 const char *propname = nvpair_name(elem);
407 405 zpool_prop_t prop = zpool_name_to_prop(propname);
408 406
409 407 switch (prop) {
410 408 case ZPROP_INVAL:
411 409 if (!zpool_prop_feature(propname)) {
412 410 error = SET_ERROR(EINVAL);
413 411 break;
414 412 }
415 413
416 414 /*
417 415 * Sanitize the input.
418 416 */
419 417 if (nvpair_type(elem) != DATA_TYPE_UINT64) {
420 418 error = SET_ERROR(EINVAL);
421 419 break;
422 420 }
423 421
424 422 if (nvpair_value_uint64(elem, &intval) != 0) {
425 423 error = SET_ERROR(EINVAL);
426 424 break;
427 425 }
428 426
429 427 if (intval != 0) {
430 428 error = SET_ERROR(EINVAL);
431 429 break;
432 430 }
433 431
434 432 fname = strchr(propname, '@') + 1;
435 433 if (zfeature_lookup_name(fname, NULL) != 0) {
436 434 error = SET_ERROR(EINVAL);
437 435 break;
438 436 }
439 437
440 438 has_feature = B_TRUE;
441 439 break;
442 440
443 441 case ZPOOL_PROP_VERSION:
444 442 error = nvpair_value_uint64(elem, &intval);
445 443 if (!error &&
446 444 (intval < spa_version(spa) ||
447 445 intval > SPA_VERSION_BEFORE_FEATURES ||
448 446 has_feature))
449 447 error = SET_ERROR(EINVAL);
450 448 break;
451 449
452 450 case ZPOOL_PROP_DELEGATION:
453 451 case ZPOOL_PROP_AUTOREPLACE:
454 452 case ZPOOL_PROP_LISTSNAPS:
455 453 case ZPOOL_PROP_AUTOEXPAND:
456 454 error = nvpair_value_uint64(elem, &intval);
457 455 if (!error && intval > 1)
458 456 error = SET_ERROR(EINVAL);
459 457 break;
460 458
461 459 case ZPOOL_PROP_BOOTFS:
462 460 /*
463 461 * If the pool version is less than SPA_VERSION_BOOTFS,
464 462 * or the pool is still being created (version == 0),
465 463 * the bootfs property cannot be set.
466 464 */
467 465 if (spa_version(spa) < SPA_VERSION_BOOTFS) {
468 466 error = SET_ERROR(ENOTSUP);
469 467 break;
470 468 }
471 469
472 470 /*
473 471 * Make sure the vdev config is bootable
474 472 */
475 473 if (!vdev_is_bootable(spa->spa_root_vdev)) {
476 474 error = SET_ERROR(ENOTSUP);
477 475 break;
478 476 }
479 477
480 478 reset_bootfs = 1;
481 479
482 480 error = nvpair_value_string(elem, &strval);
483 481
484 482 if (!error) {
485 483 objset_t *os;
486 484 uint64_t compress;
487 485
488 486 if (strval == NULL || strval[0] == '\0') {
489 487 objnum = zpool_prop_default_numeric(
490 488 ZPOOL_PROP_BOOTFS);
491 489 break;
492 490 }
493 491
494 492 if (error = dmu_objset_hold(strval, FTAG, &os))
495 493 break;
496 494
497 495 /* Must be ZPL and not gzip compressed. */
498 496
499 497 if (dmu_objset_type(os) != DMU_OST_ZFS) {
500 498 error = SET_ERROR(ENOTSUP);
501 499 } else if ((error =
502 500 dsl_prop_get_int_ds(dmu_objset_ds(os),
503 501 zfs_prop_to_name(ZFS_PROP_COMPRESSION),
504 502 &compress)) == 0 &&
505 503 !BOOTFS_COMPRESS_VALID(compress)) {
506 504 error = SET_ERROR(ENOTSUP);
507 505 } else {
508 506 objnum = dmu_objset_id(os);
509 507 }
510 508 dmu_objset_rele(os, FTAG);
511 509 }
512 510 break;
513 511
514 512 case ZPOOL_PROP_FAILUREMODE:
515 513 error = nvpair_value_uint64(elem, &intval);
516 514 if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
517 515 intval > ZIO_FAILURE_MODE_PANIC))
518 516 error = SET_ERROR(EINVAL);
519 517
520 518 /*
521 519 * This is a special case which only occurs when
522 520 * the pool has completely failed. This allows
523 521 * the user to change the in-core failmode property
524 522 * without syncing it out to disk (I/Os might
525 523 * currently be blocked). We do this by returning
526 524 * EIO to the caller (spa_prop_set) to trick it
527 525 * into thinking we encountered a property validation
528 526 * error.
529 527 */
530 528 if (!error && spa_suspended(spa)) {
531 529 spa->spa_failmode = intval;
532 530 error = SET_ERROR(EIO);
533 531 }
534 532 break;
535 533
536 534 case ZPOOL_PROP_CACHEFILE:
537 535 if ((error = nvpair_value_string(elem, &strval)) != 0)
538 536 break;
539 537
540 538 if (strval[0] == '\0')
541 539 break;
542 540
543 541 if (strcmp(strval, "none") == 0)
544 542 break;
545 543
546 544 if (strval[0] != '/') {
547 545 error = SET_ERROR(EINVAL);
548 546 break;
549 547 }
550 548
551 549 slash = strrchr(strval, '/');
552 550 ASSERT(slash != NULL);
553 551
554 552 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
555 553 strcmp(slash, "/..") == 0)
556 554 error = SET_ERROR(EINVAL);
557 555 break;
558 556
559 557 case ZPOOL_PROP_COMMENT:
560 558 if ((error = nvpair_value_string(elem, &strval)) != 0)
561 559 break;
562 560 for (check = strval; *check != '\0'; check++) {
563 561 /*
564 562 * The kernel doesn't have an easy isprint()
565 563 * check. For this kernel check, we merely
566 564 * check ASCII apart from DEL. Fix this if
567 565 * there is an easy-to-use kernel isprint().
568 566 */
569 567 if (*check >= 0x7f) {
570 568 error = SET_ERROR(EINVAL);
571 569 break;
572 570 }
573 571 check++;
574 572 }
575 573 if (strlen(strval) > ZPROP_MAX_COMMENT)
576 574 error = E2BIG;
577 575 break;
578 576
579 577 case ZPOOL_PROP_DEDUPDITTO:
580 578 if (spa_version(spa) < SPA_VERSION_DEDUP)
581 579 error = SET_ERROR(ENOTSUP);
582 580 else
583 581 error = nvpair_value_uint64(elem, &intval);
584 582 if (error == 0 &&
585 583 intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
586 584 error = SET_ERROR(EINVAL);
587 585 break;
588 586 }
589 587
590 588 if (error)
591 589 break;
592 590 }
593 591
594 592 if (!error && reset_bootfs) {
595 593 error = nvlist_remove(props,
596 594 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
597 595
598 596 if (!error) {
599 597 error = nvlist_add_uint64(props,
600 598 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
601 599 }
602 600 }
603 601
604 602 return (error);
605 603 }
606 604
607 605 void
608 606 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
609 607 {
610 608 char *cachefile;
611 609 spa_config_dirent_t *dp;
612 610
613 611 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
614 612 &cachefile) != 0)
615 613 return;
616 614
617 615 dp = kmem_alloc(sizeof (spa_config_dirent_t),
618 616 KM_SLEEP);
619 617
620 618 if (cachefile[0] == '\0')
621 619 dp->scd_path = spa_strdup(spa_config_path);
622 620 else if (strcmp(cachefile, "none") == 0)
623 621 dp->scd_path = NULL;
624 622 else
625 623 dp->scd_path = spa_strdup(cachefile);
626 624
627 625 list_insert_head(&spa->spa_config_list, dp);
628 626 if (need_sync)
629 627 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
630 628 }
631 629
632 630 int
633 631 spa_prop_set(spa_t *spa, nvlist_t *nvp)
634 632 {
635 633 int error;
636 634 nvpair_t *elem = NULL;
637 635 boolean_t need_sync = B_FALSE;
638 636
639 637 if ((error = spa_prop_validate(spa, nvp)) != 0)
640 638 return (error);
641 639
642 640 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
643 641 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
644 642
645 643 if (prop == ZPOOL_PROP_CACHEFILE ||
646 644 prop == ZPOOL_PROP_ALTROOT ||
647 645 prop == ZPOOL_PROP_READONLY)
648 646 continue;
649 647
650 648 if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
651 649 uint64_t ver;
652 650
653 651 if (prop == ZPOOL_PROP_VERSION) {
654 652 VERIFY(nvpair_value_uint64(elem, &ver) == 0);
655 653 } else {
656 654 ASSERT(zpool_prop_feature(nvpair_name(elem)));
657 655 ver = SPA_VERSION_FEATURES;
658 656 need_sync = B_TRUE;
659 657 }
660 658
661 659 /* Save time if the version is already set. */
662 660 if (ver == spa_version(spa))
663 661 continue;
664 662
665 663 /*
666 664 * In addition to the pool directory object, we might
667 665 * create the pool properties object, the features for
668 666 * read object, the features for write object, or the
669 667 * feature descriptions object.
670 668 */
671 669 error = dsl_sync_task(spa->spa_name, NULL,
672 670 spa_sync_version, &ver, 6);
673 671 if (error)
674 672 return (error);
675 673 continue;
676 674 }
677 675
678 676 need_sync = B_TRUE;
679 677 break;
680 678 }
681 679
682 680 if (need_sync) {
683 681 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
684 682 nvp, 6));
685 683 }
686 684
687 685 return (0);
688 686 }
689 687
690 688 /*
691 689 * If the bootfs property value is dsobj, clear it.
692 690 */
693 691 void
694 692 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
695 693 {
696 694 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
697 695 VERIFY(zap_remove(spa->spa_meta_objset,
698 696 spa->spa_pool_props_object,
699 697 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
700 698 spa->spa_bootfs = 0;
701 699 }
702 700 }
703 701
704 702 /*ARGSUSED*/
705 703 static int
706 704 spa_change_guid_check(void *arg, dmu_tx_t *tx)
707 705 {
708 706 uint64_t *newguid = arg;
709 707 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
710 708 vdev_t *rvd = spa->spa_root_vdev;
711 709 uint64_t vdev_state;
712 710
713 711 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
714 712 vdev_state = rvd->vdev_state;
715 713 spa_config_exit(spa, SCL_STATE, FTAG);
716 714
717 715 if (vdev_state != VDEV_STATE_HEALTHY)
718 716 return (SET_ERROR(ENXIO));
719 717
720 718 ASSERT3U(spa_guid(spa), !=, *newguid);
721 719
722 720 return (0);
723 721 }
724 722
725 723 static void
726 724 spa_change_guid_sync(void *arg, dmu_tx_t *tx)
727 725 {
728 726 uint64_t *newguid = arg;
729 727 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
730 728 uint64_t oldguid;
731 729 vdev_t *rvd = spa->spa_root_vdev;
732 730
733 731 oldguid = spa_guid(spa);
734 732
735 733 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
736 734 rvd->vdev_guid = *newguid;
737 735 rvd->vdev_guid_sum += (*newguid - oldguid);
738 736 vdev_config_dirty(rvd);
739 737 spa_config_exit(spa, SCL_STATE, FTAG);
740 738
741 739 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
742 740 oldguid, *newguid);
743 741 }
744 742
745 743 /*
746 744 * Change the GUID for the pool. This is done so that we can later
747 745 * re-import a pool built from a clone of our own vdevs. We will modify
748 746 * the root vdev's guid, our own pool guid, and then mark all of our
749 747 * vdevs dirty. Note that we must make sure that all our vdevs are
750 748 * online when we do this, or else any vdevs that weren't present
751 749 * would be orphaned from our pool. We are also going to issue a
752 750 * sysevent to update any watchers.
753 751 */
754 752 int
755 753 spa_change_guid(spa_t *spa)
756 754 {
757 755 int error;
758 756 uint64_t guid;
759 757
760 758 mutex_enter(&spa->spa_vdev_top_lock);
761 759 mutex_enter(&spa_namespace_lock);
762 760 guid = spa_generate_guid(NULL);
763 761
764 762 error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
765 763 spa_change_guid_sync, &guid, 5);
766 764
767 765 if (error == 0) {
768 766 spa_config_sync(spa, B_FALSE, B_TRUE);
769 767 spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
770 768 }
771 769
772 770 mutex_exit(&spa_namespace_lock);
773 771 mutex_exit(&spa->spa_vdev_top_lock);
774 772
775 773 return (error);
776 774 }
777 775
778 776 /*
779 777 * ==========================================================================
780 778 * SPA state manipulation (open/create/destroy/import/export)
781 779 * ==========================================================================
782 780 */
783 781
784 782 static int
785 783 spa_error_entry_compare(const void *a, const void *b)
786 784 {
787 785 spa_error_entry_t *sa = (spa_error_entry_t *)a;
788 786 spa_error_entry_t *sb = (spa_error_entry_t *)b;
789 787 int ret;
790 788
791 789 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
792 790 sizeof (zbookmark_t));
793 791
794 792 if (ret < 0)
795 793 return (-1);
796 794 else if (ret > 0)
797 795 return (1);
798 796 else
799 797 return (0);
800 798 }
801 799
802 800 /*
803 801 * Utility function which retrieves copies of the current logs and
804 802 * re-initializes them in the process.
805 803 */
806 804 void
807 805 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
808 806 {
809 807 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
810 808
811 809 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
812 810 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
813 811
814 812 avl_create(&spa->spa_errlist_scrub,
815 813 spa_error_entry_compare, sizeof (spa_error_entry_t),
816 814 offsetof(spa_error_entry_t, se_avl));
817 815 avl_create(&spa->spa_errlist_last,
818 816 spa_error_entry_compare, sizeof (spa_error_entry_t),
819 817 offsetof(spa_error_entry_t, se_avl));
820 818 }
821 819
822 820 static void
823 821 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
824 822 {
825 823 const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
826 824 enum zti_modes mode = ztip->zti_mode;
827 825 uint_t value = ztip->zti_value;
828 826 uint_t count = ztip->zti_count;
829 827 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
830 828 char name[32];
831 829 uint_t flags = 0;
832 830 boolean_t batch = B_FALSE;
833 831
834 832 if (mode == ZTI_MODE_NULL) {
835 833 tqs->stqs_count = 0;
836 834 tqs->stqs_taskq = NULL;
837 835 return;
838 836 }
839 837
840 838 ASSERT3U(count, >, 0);
841 839
842 840 tqs->stqs_count = count;
843 841 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
844 842
845 - for (uint_t i = 0; i < count; i++) {
846 - taskq_t *tq;
843 + switch (mode) {
844 + case ZTI_MODE_FIXED:
845 + ASSERT3U(value, >=, 1);
846 + value = MAX(value, 1);
847 + break;
847 848
848 - switch (mode) {
849 - case ZTI_MODE_FIXED:
850 - ASSERT3U(value, >=, 1);
851 - value = MAX(value, 1);
852 - break;
849 + case ZTI_MODE_BATCH:
850 + batch = B_TRUE;
851 + flags |= TASKQ_THREADS_CPU_PCT;
852 + value = zio_taskq_batch_pct;
853 + break;
853 854
854 - case ZTI_MODE_BATCH:
855 - batch = B_TRUE;
856 - flags |= TASKQ_THREADS_CPU_PCT;
857 - value = zio_taskq_batch_pct;
858 - break;
855 + default:
856 + panic("unrecognized mode for %s_%s taskq (%u:%u) in "
857 + "spa_activate()",
858 + zio_type_name[t], zio_taskq_types[q], mode, value);
859 + break;
860 + }
859 861
860 - case ZTI_MODE_ONLINE_PERCENT:
861 - flags |= TASKQ_THREADS_CPU_PCT;
862 - break;
862 + for (uint_t i = 0; i < count; i++) {
863 + taskq_t *tq;
863 864
864 - default:
865 - panic("unrecognized mode for %s_%s taskq (%u:%u) in "
866 - "spa_activate()",
867 - zio_type_name[t], zio_taskq_types[q], mode, value);
868 - break;
869 - }
870 -
871 865 if (count > 1) {
872 866 (void) snprintf(name, sizeof (name), "%s_%s_%u",
873 867 zio_type_name[t], zio_taskq_types[q], i);
874 868 } else {
875 869 (void) snprintf(name, sizeof (name), "%s_%s",
876 870 zio_type_name[t], zio_taskq_types[q]);
877 871 }
878 872
879 873 if (zio_taskq_sysdc && spa->spa_proc != &p0) {
880 874 if (batch)
881 875 flags |= TASKQ_DC_BATCH;
882 876
883 877 tq = taskq_create_sysdc(name, value, 50, INT_MAX,
884 878 spa->spa_proc, zio_taskq_basedc, flags);
885 879 } else {
886 - tq = taskq_create_proc(name, value, maxclsyspri, 50,
880 + pri_t pri = maxclsyspri;
881 + /*
882 + * The write issue taskq can be extremely CPU
883 + * intensive. Run it at slightly lower priority
884 + * than the other taskqs.
885 + */
886 + if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
887 + pri--;
888 +
889 + tq = taskq_create_proc(name, value, pri, 50,
887 890 INT_MAX, spa->spa_proc, flags);
888 891 }
889 892
890 893 tqs->stqs_taskq[i] = tq;
891 894 }
892 895 }
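
For context on the sizing change (an illustration, not code from this webrev): with ZTI_MODE_BATCH the value handed to taskq_create_proc()/taskq_create_sysdc() is zio_taskq_batch_pct, and the TASKQ_THREADS_CPU_PCT flag makes that value a percentage of online CPUs, so dropping the tunable from 100 to 75 shrinks the batch taskqs to roughly three quarters of the CPU count, while the write issue taskq additionally runs one priority level below maxclsyspri. A rough user-level sketch of the thread-count arithmetic, using a made-up CPU count:

#include <stdio.h>

int
main(void)
{
	unsigned ncpus = 32;		/* hypothetical online CPU count */
	unsigned old_pct = 100;		/* zio_taskq_batch_pct before this change */
	unsigned new_pct = 75;		/* zio_taskq_batch_pct after this change */

	/* TASKQ_THREADS_CPU_PCT sizes the taskq as a percentage of CPUs. */
	unsigned old_threads = (ncpus * old_pct) / 100;
	unsigned new_threads = (ncpus * new_pct) / 100;

	if (new_threads < 1)		/* a taskq always gets at least one thread */
		new_threads = 1;

	(void) printf("batch taskq threads: %u -> %u on %u CPUs\n",
	    old_threads, new_threads, ncpus);
	return (0);
}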
893 896
894 897 static void
895 898 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
896 899 {
897 900 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
898 901
899 902 if (tqs->stqs_taskq == NULL) {
900 903 ASSERT0(tqs->stqs_count);
901 904 return;
902 905 }
903 906
904 907 for (uint_t i = 0; i < tqs->stqs_count; i++) {
905 908 ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
906 909 taskq_destroy(tqs->stqs_taskq[i]);
907 910 }
908 911
909 912 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
910 913 tqs->stqs_taskq = NULL;
911 914 }
912 915
913 916 /*
914 917 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
915 918 * Note that a type may have multiple discrete taskqs to avoid lock contention
916 919 * on the taskq itself. In that case we choose which taskq at random by using
917 920 * the low bits of gethrtime().
918 921 */
919 922 void
920 923 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
921 924 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
922 925 {
923 926 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
924 927 taskq_t *tq;
925 928
926 929 ASSERT3P(tqs->stqs_taskq, !=, NULL);
927 930 ASSERT3U(tqs->stqs_count, !=, 0);
928 931
929 932 if (tqs->stqs_count == 1) {
930 933 tq = tqs->stqs_taskq[0];
931 934 } else {
932 935 tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
933 936 }
934 937
935 938 taskq_dispatch_ent(tq, func, arg, flags, ent);
936 939 }
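
A hedged sketch of the expected caller, modeled on the zio pipeline's dispatch path (the real helper lives in zio.c; its name, flags, and the assumption that the zio_t embeds a taskq_ent_t named io_tqent and that zio_execute() is the dispatched function are illustrative here, and may differ in detail):

/*
 * Sketch only: hand a zio's next pipeline stage to the taskq for its
 * I/O type, reusing the taskq_ent_t embedded in the zio so this path
 * does not need to allocate.
 */
static void
example_zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;

	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute,
	    zio, TQ_SLEEP, &zio->io_tqent);
}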
937 940
938 941 static void
939 942 spa_create_zio_taskqs(spa_t *spa)
940 943 {
941 944 for (int t = 0; t < ZIO_TYPES; t++) {
942 945 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
943 946 spa_taskqs_init(spa, t, q);
944 947 }
945 948 }
946 949 }
947 950
948 951 #ifdef _KERNEL
949 952 static void
950 953 spa_thread(void *arg)
951 954 {
952 955 callb_cpr_t cprinfo;
953 956
954 957 spa_t *spa = arg;
955 958 user_t *pu = PTOU(curproc);
956 959
957 960 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
958 961 spa->spa_name);
959 962
960 963 ASSERT(curproc != &p0);
961 964 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
962 965 "zpool-%s", spa->spa_name);
963 966 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
964 967
965 968 /* bind this thread to the requested psrset */
966 969 if (zio_taskq_psrset_bind != PS_NONE) {
967 970 pool_lock();
968 971 mutex_enter(&cpu_lock);
969 972 mutex_enter(&pidlock);
970 973 mutex_enter(&curproc->p_lock);
971 974
972 975 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
973 976 0, NULL, NULL) == 0) {
974 977 curthread->t_bind_pset = zio_taskq_psrset_bind;
975 978 } else {
976 979 cmn_err(CE_WARN,
977 980 "Couldn't bind process for zfs pool \"%s\" to "
978 981 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
979 982 }
980 983
981 984 mutex_exit(&curproc->p_lock);
982 985 mutex_exit(&pidlock);
983 986 mutex_exit(&cpu_lock);
984 987 pool_unlock();
985 988 }
986 989
987 990 if (zio_taskq_sysdc) {
988 991 sysdc_thread_enter(curthread, 100, 0);
989 992 }
990 993
991 994 spa->spa_proc = curproc;
992 995 spa->spa_did = curthread->t_did;
993 996
994 997 spa_create_zio_taskqs(spa);
995 998
996 999 mutex_enter(&spa->spa_proc_lock);
997 1000 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
998 1001
999 1002 spa->spa_proc_state = SPA_PROC_ACTIVE;
1000 1003 cv_broadcast(&spa->spa_proc_cv);
1001 1004
1002 1005 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1003 1006 while (spa->spa_proc_state == SPA_PROC_ACTIVE)
1004 1007 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1005 1008 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
1006 1009
1007 1010 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
1008 1011 spa->spa_proc_state = SPA_PROC_GONE;
1009 1012 spa->spa_proc = &p0;
1010 1013 cv_broadcast(&spa->spa_proc_cv);
1011 1014 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
1012 1015
1013 1016 mutex_enter(&curproc->p_lock);
1014 1017 lwp_exit();
1015 1018 }
1016 1019 #endif
1017 1020
1018 1021 /*
1019 1022 * Activate an uninitialized pool.
1020 1023 */
1021 1024 static void
1022 1025 spa_activate(spa_t *spa, int mode)
1023 1026 {
1024 1027 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
1025 1028
1026 1029 spa->spa_state = POOL_STATE_ACTIVE;
1027 1030 spa->spa_mode = mode;
1028 1031
1029 1032 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
1030 1033 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
1031 1034
1032 1035 /* Try to create a covering process */
1033 1036 mutex_enter(&spa->spa_proc_lock);
1034 1037 ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
1035 1038 ASSERT(spa->spa_proc == &p0);
1036 1039 spa->spa_did = 0;
1037 1040
1038 1041 /* Only create a process if we're going to be around a while. */
1039 1042 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
1040 1043 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
1041 1044 NULL, 0) == 0) {
1042 1045 spa->spa_proc_state = SPA_PROC_CREATED;
1043 1046 while (spa->spa_proc_state == SPA_PROC_CREATED) {
1044 1047 cv_wait(&spa->spa_proc_cv,
1045 1048 &spa->spa_proc_lock);
1046 1049 }
1047 1050 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1048 1051 ASSERT(spa->spa_proc != &p0);
1049 1052 ASSERT(spa->spa_did != 0);
1050 1053 } else {
1051 1054 #ifdef _KERNEL
1052 1055 cmn_err(CE_WARN,
1053 1056 "Couldn't create process for zfs pool \"%s\"\n",
1054 1057 spa->spa_name);
1055 1058 #endif
1056 1059 }
1057 1060 }
1058 1061 mutex_exit(&spa->spa_proc_lock);
1059 1062
1060 1063 /* If we didn't create a process, we need to create our taskqs. */
1061 1064 if (spa->spa_proc == &p0) {
1062 1065 spa_create_zio_taskqs(spa);
1063 1066 }
1064 1067
1065 1068 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1066 1069 offsetof(vdev_t, vdev_config_dirty_node));
1067 1070 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1068 1071 offsetof(vdev_t, vdev_state_dirty_node));
1069 1072
1070 1073 txg_list_create(&spa->spa_vdev_txg_list,
1071 1074 offsetof(struct vdev, vdev_txg_node));
1072 1075
1073 1076 avl_create(&spa->spa_errlist_scrub,
1074 1077 spa_error_entry_compare, sizeof (spa_error_entry_t),
1075 1078 offsetof(spa_error_entry_t, se_avl));
1076 1079 avl_create(&spa->spa_errlist_last,
1077 1080 spa_error_entry_compare, sizeof (spa_error_entry_t),
1078 1081 offsetof(spa_error_entry_t, se_avl));
1079 1082 }
1080 1083
1081 1084 /*
1082 1085 * Opposite of spa_activate().
1083 1086 */
1084 1087 static void
1085 1088 spa_deactivate(spa_t *spa)
1086 1089 {
1087 1090 ASSERT(spa->spa_sync_on == B_FALSE);
1088 1091 ASSERT(spa->spa_dsl_pool == NULL);
1089 1092 ASSERT(spa->spa_root_vdev == NULL);
1090 1093 ASSERT(spa->spa_async_zio_root == NULL);
1091 1094 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
1092 1095
1093 1096 txg_list_destroy(&spa->spa_vdev_txg_list);
1094 1097
1095 1098 list_destroy(&spa->spa_config_dirty_list);
1096 1099 list_destroy(&spa->spa_state_dirty_list);
1097 1100
1098 1101 for (int t = 0; t < ZIO_TYPES; t++) {
1099 1102 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1100 1103 spa_taskqs_fini(spa, t, q);
1101 1104 }
1102 1105 }
1103 1106
1104 1107 metaslab_class_destroy(spa->spa_normal_class);
1105 1108 spa->spa_normal_class = NULL;
1106 1109
1107 1110 metaslab_class_destroy(spa->spa_log_class);
1108 1111 spa->spa_log_class = NULL;
1109 1112
1110 1113 /*
1111 1114 * If this was part of an import or the open otherwise failed, we may
1112 1115 * still have errors left in the queues. Empty them just in case.
1113 1116 */
1114 1117 spa_errlog_drain(spa);
1115 1118
1116 1119 avl_destroy(&spa->spa_errlist_scrub);
1117 1120 avl_destroy(&spa->spa_errlist_last);
1118 1121
1119 1122 spa->spa_state = POOL_STATE_UNINITIALIZED;
1120 1123
1121 1124 mutex_enter(&spa->spa_proc_lock);
1122 1125 if (spa->spa_proc_state != SPA_PROC_NONE) {
1123 1126 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1124 1127 spa->spa_proc_state = SPA_PROC_DEACTIVATE;
1125 1128 cv_broadcast(&spa->spa_proc_cv);
1126 1129 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
1127 1130 ASSERT(spa->spa_proc != &p0);
1128 1131 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1129 1132 }
1130 1133 ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
1131 1134 spa->spa_proc_state = SPA_PROC_NONE;
1132 1135 }
1133 1136 ASSERT(spa->spa_proc == &p0);
1134 1137 mutex_exit(&spa->spa_proc_lock);
1135 1138
1136 1139 /*
1137 1140 * We want to make sure spa_thread() has actually exited the ZFS
1138 1141 * module, so that the module can't be unloaded out from underneath
1139 1142 * it.
1140 1143 */
1141 1144 if (spa->spa_did != 0) {
1142 1145 thread_join(spa->spa_did);
1143 1146 spa->spa_did = 0;
1144 1147 }
1145 1148 }
1146 1149
1147 1150 /*
1148 1151 * Verify a pool configuration, and construct the vdev tree appropriately. This
1149 1152 * will create all the necessary vdevs in the appropriate layout, with each vdev
1150 1153 * in the CLOSED state. This will prep the pool before open/creation/import.
1151 1154 * All vdev validation is done by the vdev_alloc() routine.
1152 1155 */
1153 1156 static int
1154 1157 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1155 1158 uint_t id, int atype)
1156 1159 {
1157 1160 nvlist_t **child;
1158 1161 uint_t children;
1159 1162 int error;
1160 1163
1161 1164 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1162 1165 return (error);
1163 1166
1164 1167 if ((*vdp)->vdev_ops->vdev_op_leaf)
1165 1168 return (0);
1166 1169
1167 1170 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1168 1171 &child, &children);
1169 1172
1170 1173 if (error == ENOENT)
1171 1174 return (0);
1172 1175
1173 1176 if (error) {
1174 1177 vdev_free(*vdp);
1175 1178 *vdp = NULL;
1176 1179 return (SET_ERROR(EINVAL));
1177 1180 }
1178 1181
1179 1182 for (int c = 0; c < children; c++) {
1180 1183 vdev_t *vd;
1181 1184 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1182 1185 atype)) != 0) {
1183 1186 vdev_free(*vdp);
1184 1187 *vdp = NULL;
1185 1188 return (error);
1186 1189 }
1187 1190 }
1188 1191
1189 1192 ASSERT(*vdp != NULL);
1190 1193
1191 1194 return (0);
1192 1195 }
1193 1196
1194 1197 /*
1195 1198 * Opposite of spa_load().
1196 1199 */
1197 1200 static void
1198 1201 spa_unload(spa_t *spa)
1199 1202 {
1200 1203 int i;
1201 1204
1202 1205 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1203 1206
1204 1207 /*
1205 1208 * Stop async tasks.
1206 1209 */
1207 1210 spa_async_suspend(spa);
1208 1211
1209 1212 /*
1210 1213 * Stop syncing.
1211 1214 */
1212 1215 if (spa->spa_sync_on) {
1213 1216 txg_sync_stop(spa->spa_dsl_pool);
1214 1217 spa->spa_sync_on = B_FALSE;
1215 1218 }
1216 1219
1217 1220 /*
1218 1221 * Wait for any outstanding async I/O to complete.
1219 1222 */
1220 1223 if (spa->spa_async_zio_root != NULL) {
1221 1224 (void) zio_wait(spa->spa_async_zio_root);
1222 1225 spa->spa_async_zio_root = NULL;
1223 1226 }
1224 1227
1225 1228 bpobj_close(&spa->spa_deferred_bpobj);
1226 1229
1227 1230 /*
1228 1231 * Close the dsl pool.
1229 1232 */
1230 1233 if (spa->spa_dsl_pool) {
1231 1234 dsl_pool_close(spa->spa_dsl_pool);
1232 1235 spa->spa_dsl_pool = NULL;
1233 1236 spa->spa_meta_objset = NULL;
1234 1237 }
1235 1238
1236 1239 ddt_unload(spa);
1237 1240
1238 1241 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1239 1242
1240 1243 /*
1241 1244 * Drop and purge level 2 cache
1242 1245 */
1243 1246 spa_l2cache_drop(spa);
1244 1247
1245 1248 /*
1246 1249 * Close all vdevs.
1247 1250 */
1248 1251 if (spa->spa_root_vdev)
1249 1252 vdev_free(spa->spa_root_vdev);
1250 1253 ASSERT(spa->spa_root_vdev == NULL);
1251 1254
1252 1255 for (i = 0; i < spa->spa_spares.sav_count; i++)
1253 1256 vdev_free(spa->spa_spares.sav_vdevs[i]);
1254 1257 if (spa->spa_spares.sav_vdevs) {
1255 1258 kmem_free(spa->spa_spares.sav_vdevs,
1256 1259 spa->spa_spares.sav_count * sizeof (void *));
1257 1260 spa->spa_spares.sav_vdevs = NULL;
1258 1261 }
1259 1262 if (spa->spa_spares.sav_config) {
1260 1263 nvlist_free(spa->spa_spares.sav_config);
1261 1264 spa->spa_spares.sav_config = NULL;
1262 1265 }
1263 1266 spa->spa_spares.sav_count = 0;
1264 1267
1265 1268 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
1266 1269 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
1267 1270 vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1268 1271 }
1269 1272 if (spa->spa_l2cache.sav_vdevs) {
1270 1273 kmem_free(spa->spa_l2cache.sav_vdevs,
1271 1274 spa->spa_l2cache.sav_count * sizeof (void *));
1272 1275 spa->spa_l2cache.sav_vdevs = NULL;
1273 1276 }
1274 1277 if (spa->spa_l2cache.sav_config) {
1275 1278 nvlist_free(spa->spa_l2cache.sav_config);
1276 1279 spa->spa_l2cache.sav_config = NULL;
1277 1280 }
1278 1281 spa->spa_l2cache.sav_count = 0;
1279 1282
1280 1283 spa->spa_async_suspended = 0;
1281 1284
1282 1285 if (spa->spa_comment != NULL) {
1283 1286 spa_strfree(spa->spa_comment);
1284 1287 spa->spa_comment = NULL;
1285 1288 }
1286 1289
1287 1290 spa_config_exit(spa, SCL_ALL, FTAG);
1288 1291 }
1289 1292
1290 1293 /*
1291 1294 * Load (or re-load) the current list of vdevs describing the active spares for
1292 1295 * this pool. When this is called, we have some form of basic information in
1293 1296 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
1294 1297 * then re-generate a more complete list including status information.
1295 1298 */
1296 1299 static void
1297 1300 spa_load_spares(spa_t *spa)
1298 1301 {
1299 1302 nvlist_t **spares;
1300 1303 uint_t nspares;
1301 1304 int i;
1302 1305 vdev_t *vd, *tvd;
1303 1306
1304 1307 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1305 1308
1306 1309 /*
1307 1310 * First, close and free any existing spare vdevs.
1308 1311 */
1309 1312 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1310 1313 vd = spa->spa_spares.sav_vdevs[i];
1311 1314
1312 1315 /* Undo the call to spa_activate() below */
1313 1316 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1314 1317 B_FALSE)) != NULL && tvd->vdev_isspare)
1315 1318 spa_spare_remove(tvd);
1316 1319 vdev_close(vd);
1317 1320 vdev_free(vd);
1318 1321 }
1319 1322
1320 1323 if (spa->spa_spares.sav_vdevs)
1321 1324 kmem_free(spa->spa_spares.sav_vdevs,
1322 1325 spa->spa_spares.sav_count * sizeof (void *));
1323 1326
1324 1327 if (spa->spa_spares.sav_config == NULL)
1325 1328 nspares = 0;
1326 1329 else
1327 1330 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1328 1331 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1329 1332
1330 1333 spa->spa_spares.sav_count = (int)nspares;
1331 1334 spa->spa_spares.sav_vdevs = NULL;
1332 1335
1333 1336 if (nspares == 0)
1334 1337 return;
1335 1338
1336 1339 /*
1337 1340 * Construct the array of vdevs, opening them to get status in the
1338 1341 	 * process. For each spare, there are potentially two different vdev_t
1339 1342 * structures associated with it: one in the list of spares (used only
1340 1343 * for basic validation purposes) and one in the active vdev
1341 1344 * configuration (if it's spared in). During this phase we open and
1342 1345 * validate each vdev on the spare list. If the vdev also exists in the
1343 1346 * active configuration, then we also mark this vdev as an active spare.
1344 1347 */
1345 1348 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
1346 1349 KM_SLEEP);
1347 1350 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1348 1351 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1349 1352 VDEV_ALLOC_SPARE) == 0);
1350 1353 ASSERT(vd != NULL);
1351 1354
1352 1355 spa->spa_spares.sav_vdevs[i] = vd;
1353 1356
1354 1357 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1355 1358 B_FALSE)) != NULL) {
1356 1359 if (!tvd->vdev_isspare)
1357 1360 spa_spare_add(tvd);
1358 1361
1359 1362 /*
1360 1363 * We only mark the spare active if we were successfully
1361 1364 * able to load the vdev. Otherwise, importing a pool
1362 1365 * with a bad active spare would result in strange
1363 1366 * behavior, because multiple pool would think the spare
1364 1367 			 * behavior, because multiple pools would think the spare
1365 1368 *
1366 1369 * There is a vulnerability here to an equally bizarre
1367 1370 * circumstance, where a dead active spare is later
1368 1371 * brought back to life (onlined or otherwise). Given
1369 1372 * the rarity of this scenario, and the extra complexity
1370 1373 * it adds, we ignore the possibility.
1371 1374 */
1372 1375 if (!vdev_is_dead(tvd))
1373 1376 spa_spare_activate(tvd);
1374 1377 }
1375 1378
1376 1379 vd->vdev_top = vd;
1377 1380 vd->vdev_aux = &spa->spa_spares;
1378 1381
1379 1382 if (vdev_open(vd) != 0)
1380 1383 continue;
1381 1384
1382 1385 if (vdev_validate_aux(vd) == 0)
1383 1386 spa_spare_add(vd);
1384 1387 }
1385 1388
1386 1389 /*
1387 1390 * Recompute the stashed list of spares, with status information
1388 1391 * this time.
1389 1392 */
1390 1393 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1391 1394 DATA_TYPE_NVLIST_ARRAY) == 0);
1392 1395
1393 1396 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1394 1397 KM_SLEEP);
1395 1398 for (i = 0; i < spa->spa_spares.sav_count; i++)
1396 1399 spares[i] = vdev_config_generate(spa,
1397 1400 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1398 1401 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1399 1402 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1400 1403 for (i = 0; i < spa->spa_spares.sav_count; i++)
1401 1404 nvlist_free(spares[i]);
1402 1405 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1403 1406 }
1404 1407
1405 1408 /*
1406 1409 * Load (or re-load) the current list of vdevs describing the active l2cache for
1407 1410 * this pool. When this is called, we have some form of basic information in
1408 1411 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
1409 1412 * then re-generate a more complete list including status information.
1410 1413 * Devices which are already active have their details maintained, and are
1411 1414 * not re-opened.
1412 1415 */
1413 1416 static void
1414 1417 spa_load_l2cache(spa_t *spa)
1415 1418 {
1416 1419 nvlist_t **l2cache;
1417 1420 uint_t nl2cache;
1418 1421 int i, j, oldnvdevs;
1419 1422 uint64_t guid;
1420 1423 vdev_t *vd, **oldvdevs, **newvdevs;
1421 1424 spa_aux_vdev_t *sav = &spa->spa_l2cache;
1422 1425
1423 1426 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1424 1427
1425 1428 if (sav->sav_config != NULL) {
1426 1429 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1427 1430 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1428 1431 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1429 1432 } else {
1430 1433 nl2cache = 0;
1431 1434 newvdevs = NULL;
1432 1435 }
1433 1436
1434 1437 oldvdevs = sav->sav_vdevs;
1435 1438 oldnvdevs = sav->sav_count;
1436 1439 sav->sav_vdevs = NULL;
1437 1440 sav->sav_count = 0;
1438 1441
1439 1442 /*
1440 1443 * Process new nvlist of vdevs.
1441 1444 */
1442 1445 for (i = 0; i < nl2cache; i++) {
1443 1446 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1444 1447 &guid) == 0);
1445 1448
1446 1449 newvdevs[i] = NULL;
1447 1450 for (j = 0; j < oldnvdevs; j++) {
1448 1451 vd = oldvdevs[j];
1449 1452 if (vd != NULL && guid == vd->vdev_guid) {
1450 1453 /*
1451 1454 * Retain previous vdev for add/remove ops.
1452 1455 */
1453 1456 newvdevs[i] = vd;
1454 1457 oldvdevs[j] = NULL;
1455 1458 break;
1456 1459 }
1457 1460 }
1458 1461
1459 1462 if (newvdevs[i] == NULL) {
1460 1463 /*
1461 1464 * Create new vdev
1462 1465 */
1463 1466 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1464 1467 VDEV_ALLOC_L2CACHE) == 0);
1465 1468 ASSERT(vd != NULL);
1466 1469 newvdevs[i] = vd;
1467 1470
1468 1471 /*
1469 1472 * Commit this vdev as an l2cache device,
1470 1473 * even if it fails to open.
1471 1474 */
1472 1475 spa_l2cache_add(vd);
1473 1476
1474 1477 vd->vdev_top = vd;
1475 1478 vd->vdev_aux = sav;
1476 1479
1477 1480 spa_l2cache_activate(vd);
1478 1481
1479 1482 if (vdev_open(vd) != 0)
1480 1483 continue;
1481 1484
1482 1485 (void) vdev_validate_aux(vd);
1483 1486
1484 1487 if (!vdev_is_dead(vd))
1485 1488 l2arc_add_vdev(spa, vd);
1486 1489 }
1487 1490 }
1488 1491
1489 1492 /*
1490 1493 * Purge vdevs that were dropped
1491 1494 */
1492 1495 for (i = 0; i < oldnvdevs; i++) {
1493 1496 uint64_t pool;
1494 1497
1495 1498 vd = oldvdevs[i];
1496 1499 if (vd != NULL) {
1497 1500 ASSERT(vd->vdev_isl2cache);
1498 1501
1499 1502 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1500 1503 pool != 0ULL && l2arc_vdev_present(vd))
1501 1504 l2arc_remove_vdev(vd);
1502 1505 vdev_clear_stats(vd);
1503 1506 vdev_free(vd);
1504 1507 }
1505 1508 }
1506 1509
1507 1510 if (oldvdevs)
1508 1511 kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1509 1512
1510 1513 if (sav->sav_config == NULL)
1511 1514 goto out;
1512 1515
1513 1516 sav->sav_vdevs = newvdevs;
1514 1517 sav->sav_count = (int)nl2cache;
1515 1518
1516 1519 /*
1517 1520 * Recompute the stashed list of l2cache devices, with status
1518 1521 * information this time.
1519 1522 */
1520 1523 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1521 1524 DATA_TYPE_NVLIST_ARRAY) == 0);
1522 1525
1523 1526 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1524 1527 for (i = 0; i < sav->sav_count; i++)
1525 1528 l2cache[i] = vdev_config_generate(spa,
1526 1529 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1527 1530 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1528 1531 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1529 1532 out:
1530 1533 for (i = 0; i < sav->sav_count; i++)
1531 1534 nvlist_free(l2cache[i]);
1532 1535 if (sav->sav_count)
1533 1536 kmem_free(l2cache, sav->sav_count * sizeof (void *));
1534 1537 }
1535 1538
1536 1539 static int
1537 1540 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1538 1541 {
1539 1542 dmu_buf_t *db;
1540 1543 char *packed = NULL;
1541 1544 size_t nvsize = 0;
1542 1545 int error;
1543 1546 *value = NULL;
1544 1547
1545 1548 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
1546 1549 nvsize = *(uint64_t *)db->db_data;
1547 1550 dmu_buf_rele(db, FTAG);
1548 1551
1549 1552 packed = kmem_alloc(nvsize, KM_SLEEP);
1550 1553 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1551 1554 DMU_READ_PREFETCH);
1552 1555 if (error == 0)
1553 1556 error = nvlist_unpack(packed, nvsize, value, 0);
1554 1557 kmem_free(packed, nvsize);
1555 1558
1556 1559 return (error);
1557 1560 }
1558 1561
1559 1562 /*
1560 1563 * Checks to see if the given vdev could not be opened, in which case we post a
1561 1564 * sysevent to notify the autoreplace code that the device has been removed.
1562 1565 */
1563 1566 static void
1564 1567 spa_check_removed(vdev_t *vd)
1565 1568 {
1566 1569 for (int c = 0; c < vd->vdev_children; c++)
1567 1570 spa_check_removed(vd->vdev_child[c]);
1568 1571
1569 1572 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
1570 1573 !vd->vdev_ishole) {
1571 1574 zfs_post_autoreplace(vd->vdev_spa, vd);
1572 1575 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
1573 1576 }
1574 1577 }
1575 1578
1576 1579 /*
1577 1580 * Validate the current config against the MOS config
1578 1581 */
1579 1582 static boolean_t
1580 1583 spa_config_valid(spa_t *spa, nvlist_t *config)
1581 1584 {
1582 1585 vdev_t *mrvd, *rvd = spa->spa_root_vdev;
1583 1586 nvlist_t *nv;
1584 1587
1585 1588 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
1586 1589
1587 1590 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1588 1591 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
1589 1592
1590 1593 ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
1591 1594
1592 1595 /*
1593 1596 * If we're doing a normal import, then build up any additional
1594 1597 * diagnostic information about missing devices in this config.
1595 1598 * We'll pass this up to the user for further processing.
1596 1599 */
1597 1600 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1598 1601 nvlist_t **child, *nv;
1599 1602 uint64_t idx = 0;
1600 1603
1601 1604 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1602 1605 KM_SLEEP);
1603 1606 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1604 1607
1605 1608 for (int c = 0; c < rvd->vdev_children; c++) {
1606 1609 vdev_t *tvd = rvd->vdev_child[c];
1607 1610 vdev_t *mtvd = mrvd->vdev_child[c];
1608 1611
1609 1612 if (tvd->vdev_ops == &vdev_missing_ops &&
1610 1613 mtvd->vdev_ops != &vdev_missing_ops &&
1611 1614 mtvd->vdev_islog)
1612 1615 child[idx++] = vdev_config_generate(spa, mtvd,
1613 1616 B_FALSE, 0);
1614 1617 }
1615 1618
1616 1619 if (idx) {
1617 1620 VERIFY(nvlist_add_nvlist_array(nv,
1618 1621 ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
1619 1622 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
1620 1623 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
1621 1624
1622 1625 for (int i = 0; i < idx; i++)
1623 1626 nvlist_free(child[i]);
1624 1627 }
1625 1628 nvlist_free(nv);
1626 1629 kmem_free(child, rvd->vdev_children * sizeof (char **));
1627 1630 }
1628 1631
1629 1632 /*
1630 1633 * Compare the root vdev tree with the information we have
1631 1634 * from the MOS config (mrvd). Check each top-level vdev
1632 1635 * with the corresponding MOS config top-level (mtvd).
1633 1636 */
1634 1637 for (int c = 0; c < rvd->vdev_children; c++) {
1635 1638 vdev_t *tvd = rvd->vdev_child[c];
1636 1639 vdev_t *mtvd = mrvd->vdev_child[c];
1637 1640
1638 1641 /*
1639 1642 * Resolve any "missing" vdevs in the current configuration.
1640 1643 * If we find that the MOS config has more accurate information
1641 1644 * about the top-level vdev then use that vdev instead.
1642 1645 */
1643 1646 if (tvd->vdev_ops == &vdev_missing_ops &&
1644 1647 mtvd->vdev_ops != &vdev_missing_ops) {
1645 1648
1646 1649 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
1647 1650 continue;
1648 1651
1649 1652 /*
1650 1653 * Device specific actions.
1651 1654 */
1652 1655 if (mtvd->vdev_islog) {
1653 1656 spa_set_log_state(spa, SPA_LOG_CLEAR);
1654 1657 } else {
1655 1658 /*
1656 1659 * XXX - once we have 'readonly' pool
1657 1660 * support we should be able to handle
1658 1661 * missing data devices by transitioning
1659 1662 * the pool to readonly.
1660 1663 */
1661 1664 continue;
1662 1665 }
1663 1666
1664 1667 /*
1665 1668 * Swap the missing vdev with the data we were
1666 1669 * able to obtain from the MOS config.
1667 1670 */
1668 1671 vdev_remove_child(rvd, tvd);
1669 1672 vdev_remove_child(mrvd, mtvd);
1670 1673
1671 1674 vdev_add_child(rvd, mtvd);
1672 1675 vdev_add_child(mrvd, tvd);
1673 1676
1674 1677 spa_config_exit(spa, SCL_ALL, FTAG);
1675 1678 vdev_load(mtvd);
1676 1679 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1677 1680
1678 1681 vdev_reopen(rvd);
1679 1682 } else if (mtvd->vdev_islog) {
1680 1683 /*
1681 1684 * Load the slog device's state from the MOS config
1682 1685 * since it's possible that the label does not
1683 1686 * contain the most up-to-date information.
1684 1687 */
1685 1688 vdev_load_log_state(tvd, mtvd);
1686 1689 vdev_reopen(tvd);
1687 1690 }
1688 1691 }
1689 1692 vdev_free(mrvd);
1690 1693 spa_config_exit(spa, SCL_ALL, FTAG);
1691 1694
1692 1695 /*
1693 1696 * Ensure we were able to validate the config.
1694 1697 */
1695 1698 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
1696 1699 }
1697 1700
1698 1701 /*
1699 1702 * Check for missing log devices
1700 1703 */
1701 1704 static boolean_t
1702 1705 spa_check_logs(spa_t *spa)
1703 1706 {
1704 1707 boolean_t rv = B_FALSE;
1705 1708
1706 1709 switch (spa->spa_log_state) {
1707 1710 case SPA_LOG_MISSING:
1708 1711 /* need to recheck in case slog has been restored */
1709 1712 case SPA_LOG_UNKNOWN:
1710 1713 rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
1711 1714 NULL, DS_FIND_CHILDREN) != 0);
1712 1715 if (rv)
1713 1716 spa_set_log_state(spa, SPA_LOG_MISSING);
1714 1717 break;
1715 1718 }
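	/* SPA_LOG_GOOD and SPA_LOG_CLEAR need no recheck; rv stays B_FALSE. */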
1716 1719 return (rv);
1717 1720 }
1718 1721
1719 1722 static boolean_t
1720 1723 spa_passivate_log(spa_t *spa)
1721 1724 {
1722 1725 vdev_t *rvd = spa->spa_root_vdev;
1723 1726 boolean_t slog_found = B_FALSE;
1724 1727
1725 1728 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1726 1729
1727 1730 if (!spa_has_slogs(spa))
1728 1731 return (B_FALSE);
1729 1732
1730 1733 for (int c = 0; c < rvd->vdev_children; c++) {
1731 1734 vdev_t *tvd = rvd->vdev_child[c];
1732 1735 metaslab_group_t *mg = tvd->vdev_mg;
1733 1736
1734 1737 if (tvd->vdev_islog) {
1735 1738 metaslab_group_passivate(mg);
1736 1739 slog_found = B_TRUE;
1737 1740 }
1738 1741 }
1739 1742
1740 1743 return (slog_found);
1741 1744 }
1742 1745
1743 1746 static void
1744 1747 spa_activate_log(spa_t *spa)
1745 1748 {
1746 1749 vdev_t *rvd = spa->spa_root_vdev;
1747 1750
1748 1751 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1749 1752
1750 1753 for (int c = 0; c < rvd->vdev_children; c++) {
1751 1754 vdev_t *tvd = rvd->vdev_child[c];
1752 1755 metaslab_group_t *mg = tvd->vdev_mg;
1753 1756
1754 1757 if (tvd->vdev_islog)
1755 1758 metaslab_group_activate(mg);
1756 1759 }
1757 1760 }
1758 1761
1759 1762 int
1760 1763 spa_offline_log(spa_t *spa)
1761 1764 {
1762 1765 int error;
1763 1766
1764 1767 error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
1765 1768 NULL, DS_FIND_CHILDREN);
1766 1769 if (error == 0) {
1767 1770 /*
1768 1771 		 * We successfully offlined the log device; sync out the
1769 1772 * current txg so that the "stubby" block can be removed
1770 1773 * by zil_sync().
1771 1774 */
1772 1775 txg_wait_synced(spa->spa_dsl_pool, 0);
1773 1776 }
1774 1777 return (error);
1775 1778 }
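
Taken together, spa_passivate_log(), spa_offline_log() and spa_activate_log() give a caller the pieces needed to evacuate the slog class: stop allocating from it, flush the outstanding intent-log blocks, and (on failure) resume allocations. The sketch below shows one plausible sequencing under the locking rules asserted above; the function name is invented for illustration, and the in-tree log-removal path is more involved.

/*
 * Illustrative only: evacuate the slog devices, reactivating them if
 * the offline attempt fails.  Not the actual removal code path.
 */
static int
spa_evacuate_log_sketch(spa_t *spa)
{
	int error;

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
	if (!spa_passivate_log(spa)) {
		/* No slogs in this pool; nothing to do. */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (0);
	}
	spa_config_exit(spa, SCL_ALLOC, FTAG);

	/*
	 * spa_offline_log() ends up in txg_wait_synced(), so the
	 * config locks must not be held across this call.
	 */
	error = spa_offline_log(spa);

	if (error != 0) {
		/* Could not empty the log; resume allocating from it. */
		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
		spa_activate_log(spa);
		spa_config_exit(spa, SCL_ALLOC, FTAG);
	}
	return (error);
}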
1776 1779
1777 1780 static void
1778 1781 spa_aux_check_removed(spa_aux_vdev_t *sav)
1779 1782 {
1780 1783 for (int i = 0; i < sav->sav_count; i++)
1781 1784 spa_check_removed(sav->sav_vdevs[i]);
1782 1785 }
1783 1786
1784 1787 void
1785 1788 spa_claim_notify(zio_t *zio)
1786 1789 {
1787 1790 spa_t *spa = zio->io_spa;
1788 1791
1789 1792 if (zio->io_error)
1790 1793 return;
1791 1794
1792 1795 mutex_enter(&spa->spa_props_lock); /* any mutex will do */
1793 1796 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1794 1797 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1795 1798 mutex_exit(&spa->spa_props_lock);
1796 1799 }
1797 1800
1798 1801 typedef struct spa_load_error {
1799 1802 uint64_t sle_meta_count;
1800 1803 uint64_t sle_data_count;
1801 1804 } spa_load_error_t;
1802 1805
1803 1806 static void
1804 1807 spa_load_verify_done(zio_t *zio)
1805 1808 {
1806 1809 blkptr_t *bp = zio->io_bp;
1807 1810 spa_load_error_t *sle = zio->io_private;
1808 1811 dmu_object_type_t type = BP_GET_TYPE(bp);
1809 1812 int error = zio->io_error;
1810 1813
1811 1814 if (error) {
1812 1815 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1813 1816 type != DMU_OT_INTENT_LOG)
1814 1817 atomic_add_64(&sle->sle_meta_count, 1);
1815 1818 else
1816 1819 atomic_add_64(&sle->sle_data_count, 1);
1817 1820 }
1818 1821 zio_data_buf_free(zio->io_data, zio->io_size);
1819 1822 }
1820 1823
1821 1824 /*ARGSUSED*/
1822 1825 static int
1823 1826 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1824 1827 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1825 1828 {
1826 1829 if (bp != NULL) {
1827 1830 zio_t *rio = arg;
1828 1831 size_t size = BP_GET_PSIZE(bp);
1829 1832 void *data = zio_data_buf_alloc(size);
1830 1833
1831 1834 zio_nowait(zio_read(rio, spa, bp, data, size,
1832 1835 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1833 1836 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
1834 1837 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
1835 1838 }
1836 1839 return (0);
1837 1840 }
1838 1841
1839 1842 static int
1840 1843 spa_load_verify(spa_t *spa)
1841 1844 {
1842 1845 zio_t *rio;
1843 1846 spa_load_error_t sle = { 0 };
1844 1847 zpool_rewind_policy_t policy;
1845 1848 boolean_t verify_ok = B_FALSE;
1846 1849 int error;
1847 1850
1848 1851 zpool_get_rewind_policy(spa->spa_config, &policy);
1849 1852
1850 1853 if (policy.zrp_request & ZPOOL_NEVER_REWIND)
1851 1854 return (0);
1852 1855
1853 1856 rio = zio_root(spa, NULL, &sle,
1854 1857 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1855 1858
1856 1859 error = traverse_pool(spa, spa->spa_verify_min_txg,
1857 1860 TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
1858 1861
1859 1862 (void) zio_wait(rio);
1860 1863
1861 1864 spa->spa_load_meta_errors = sle.sle_meta_count;
1862 1865 spa->spa_load_data_errors = sle.sle_data_count;
1863 1866
1864 1867 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
1865 1868 sle.sle_data_count <= policy.zrp_maxdata) {
1866 1869 int64_t loss = 0;
1867 1870
1868 1871 verify_ok = B_TRUE;
1869 1872 spa->spa_load_txg = spa->spa_uberblock.ub_txg;
1870 1873 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
1871 1874
1872 1875 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
1873 1876 VERIFY(nvlist_add_uint64(spa->spa_load_info,
1874 1877 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
1875 1878 VERIFY(nvlist_add_int64(spa->spa_load_info,
1876 1879 ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
1877 1880 VERIFY(nvlist_add_uint64(spa->spa_load_info,
1878 1881 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
1879 1882 } else {
1880 1883 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
1881 1884 }
1882 1885
1883 1886 if (error) {
1884 1887 if (error != ENXIO && error != EIO)
1885 1888 error = SET_ERROR(EIO);
1886 1889 return (error);
1887 1890 }
1888 1891
1889 1892 return (verify_ok ? 0 : EIO);
1890 1893 }
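
The load time, estimated loss, and data-error count recorded in spa_load_info here travel back to userland under ZPOOL_CONFIG_LOAD_INFO (spa_open_common() below attaches it to the returned config). A minimal userland-style sketch of reading those values back with libnvpair; the helper name is invented for illustration.

#include <stdio.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>		/* ZPOOL_CONFIG_* key names */

/* Print the rewind bookkeeping recorded by spa_load_verify(). */
static void
print_rewind_info(nvlist_t *load_info)
{
	uint64_t load_time, data_errors;
	int64_t loss;

	if (nvlist_lookup_uint64(load_info, ZPOOL_CONFIG_LOAD_TIME,
	    &load_time) == 0)
		(void) printf("rewound to uberblock timestamp %llu\n",
		    (u_longlong_t)load_time);

	if (nvlist_lookup_int64(load_info, ZPOOL_CONFIG_REWIND_TIME,
	    &loss) == 0)
		(void) printf("approximate rewind loss: %lld seconds\n",
		    (longlong_t)loss);

	if (nvlist_lookup_uint64(load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
	    &data_errors) == 0)
		(void) printf("data errors during verify: %llu\n",
		    (u_longlong_t)data_errors);
}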
1891 1894
1892 1895 /*
1893 1896 * Find a value in the pool props object.
1894 1897 */
1895 1898 static void
1896 1899 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
1897 1900 {
1898 1901 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
1899 1902 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
1900 1903 }
1901 1904
1902 1905 /*
1903 1906 * Find a value in the pool directory object.
1904 1907 */
1905 1908 static int
1906 1909 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
1907 1910 {
1908 1911 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1909 1912 name, sizeof (uint64_t), 1, val));
1910 1913 }
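
/*
 * A note on spa_dir_prop(): pool-directory entries that post-date the
 * pool's creation simply do not exist, so callers below treat ENOENT
 * as "not present on this older pool" rather than as corruption (see,
 * for example, the DMU_POOL_DEFLATE and DMU_POOL_HISTORY lookups in
 * spa_load_impl()).
 */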
1911 1914
1912 1915 static int
1913 1916 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
1914 1917 {
1915 1918 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
1916 1919 return (err);
1917 1920 }
1918 1921
1919 1922 /*
1920 1923 * Fix up config after a partly-completed split. This is done with the
1921 1924 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
1922 1925 * pool have that entry in their config, but only the splitting one contains
1923 1926 * a list of all the guids of the vdevs that are being split off.
1924 1927 *
1925 1928 * This function determines what to do with that list: either rejoin
1926 1929 * all the disks to the pool, or complete the splitting process. To attempt
1927 1930 * the rejoin, each disk that is offlined is marked online again, and
1928 1931 * we do a reopen() call. If the vdev label for every disk that was
1929 1932 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
1930 1933 * then we call vdev_split() on each disk, and complete the split.
1931 1934 *
1932 1935 * Otherwise we leave the config alone, with all the vdevs in place in
1933 1936 * the original pool.
1934 1937 */
1935 1938 static void
1936 1939 spa_try_repair(spa_t *spa, nvlist_t *config)
1937 1940 {
1938 1941 uint_t extracted;
1939 1942 uint64_t *glist;
1940 1943 uint_t i, gcount;
1941 1944 nvlist_t *nvl;
1942 1945 vdev_t **vd;
1943 1946 boolean_t attempt_reopen;
1944 1947
1945 1948 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
1946 1949 return;
1947 1950
1948 1951 /* check that the config is complete */
1949 1952 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
1950 1953 &glist, &gcount) != 0)
1951 1954 return;
1952 1955
1953 1956 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
1954 1957
1955 1958 /* attempt to online all the vdevs & validate */
1956 1959 attempt_reopen = B_TRUE;
1957 1960 for (i = 0; i < gcount; i++) {
1958 1961 if (glist[i] == 0) /* vdev is hole */
1959 1962 continue;
1960 1963
1961 1964 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
1962 1965 if (vd[i] == NULL) {
1963 1966 /*
1964 1967 * Don't bother attempting to reopen the disks;
1965 1968 * just do the split.
1966 1969 */
1967 1970 attempt_reopen = B_FALSE;
1968 1971 } else {
1969 1972 /* attempt to re-online it */
1970 1973 vd[i]->vdev_offline = B_FALSE;
1971 1974 }
1972 1975 }
1973 1976
1974 1977 if (attempt_reopen) {
1975 1978 vdev_reopen(spa->spa_root_vdev);
1976 1979
1977 1980 /* check each device to see what state it's in */
1978 1981 for (extracted = 0, i = 0; i < gcount; i++) {
1979 1982 if (vd[i] != NULL &&
1980 1983 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
1981 1984 break;
1982 1985 ++extracted;
1983 1986 }
1984 1987 }
1985 1988
1986 1989 /*
1987 1990 * If every disk has been moved to the new pool, or if we never
1988 1991 * even attempted to look at them, then we split them off for
1989 1992 * good.
1990 1993 */
1991 1994 if (!attempt_reopen || gcount == extracted) {
1992 1995 for (i = 0; i < gcount; i++)
1993 1996 if (vd[i] != NULL)
1994 1997 vdev_split(vd[i]);
1995 1998 vdev_reopen(spa->spa_root_vdev);
1996 1999 }
1997 2000
1998 2001 kmem_free(vd, gcount * sizeof (vdev_t *));
1999 2002 }
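
For reference, the nvlist consumed above carries (at least) a ZPOOL_CONFIG_SPLIT_LIST uint64 array holding the guids of the vdevs being split off, with zero marking a hole. The libnvpair sketch below builds only that member and is purely illustrative; the real splitting nvlist is generated by the pool-split code and contains additional fields.

#include <libnvpair.h>
#include <sys/fs/zfs.h>

/*
 * Illustrative only: build an nvlist shaped like the one that
 * spa_try_repair() looks up under ZPOOL_CONFIG_SPLIT.
 */
static nvlist_t *
make_split_nvlist(uint64_t *guids, uint_t count)
{
	nvlist_t *split = NULL;

	if (nvlist_alloc(&split, NV_UNIQUE_NAME, 0) != 0)
		return (NULL);
	if (nvlist_add_uint64_array(split, ZPOOL_CONFIG_SPLIT_LIST,
	    guids, count) != 0) {
		nvlist_free(split);
		return (NULL);
	}
	return (split);
}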
2000 2003
2001 2004 static int
2002 2005 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
2003 2006 boolean_t mosconfig)
2004 2007 {
2005 2008 nvlist_t *config = spa->spa_config;
2006 2009 char *ereport = FM_EREPORT_ZFS_POOL;
2007 2010 char *comment;
2008 2011 int error;
2009 2012 uint64_t pool_guid;
2010 2013 nvlist_t *nvl;
2011 2014
2012 2015 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
2013 2016 return (SET_ERROR(EINVAL));
2014 2017
2015 2018 ASSERT(spa->spa_comment == NULL);
2016 2019 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
2017 2020 spa->spa_comment = spa_strdup(comment);
2018 2021
2019 2022 /*
2020 2023 * Versioning wasn't explicitly added to the label until later, so if
2021 2024 * it's not present treat it as the initial version.
2022 2025 */
2023 2026 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
2024 2027 &spa->spa_ubsync.ub_version) != 0)
2025 2028 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
2026 2029
2027 2030 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
2028 2031 &spa->spa_config_txg);
2029 2032
2030 2033 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
2031 2034 spa_guid_exists(pool_guid, 0)) {
2032 2035 error = SET_ERROR(EEXIST);
2033 2036 } else {
2034 2037 spa->spa_config_guid = pool_guid;
2035 2038
2036 2039 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
2037 2040 &nvl) == 0) {
2038 2041 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
2039 2042 KM_SLEEP) == 0);
2040 2043 }
2041 2044
2042 2045 nvlist_free(spa->spa_load_info);
2043 2046 spa->spa_load_info = fnvlist_alloc();
2044 2047
2045 2048 gethrestime(&spa->spa_loaded_ts);
2046 2049 error = spa_load_impl(spa, pool_guid, config, state, type,
2047 2050 mosconfig, &ereport);
2048 2051 }
2049 2052
2050 2053 spa->spa_minref = refcount_count(&spa->spa_refcount);
2051 2054 if (error) {
2052 2055 if (error != EEXIST) {
2053 2056 spa->spa_loaded_ts.tv_sec = 0;
2054 2057 spa->spa_loaded_ts.tv_nsec = 0;
2055 2058 }
2056 2059 if (error != EBADF) {
2057 2060 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
2058 2061 }
2059 2062 }
2060 2063 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2061 2064 spa->spa_ena = 0;
2062 2065
2063 2066 return (error);
2064 2067 }
2065 2068
2066 2069 /*
2067 2070 * Load an existing storage pool, using the pool's builtin spa_config as a
2068 2071 * source of configuration information.
2069 2072 */
2070 2073 static int
2071 2074 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
2072 2075 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
2073 2076 char **ereport)
2074 2077 {
2075 2078 int error = 0;
2076 2079 nvlist_t *nvroot = NULL;
2077 2080 nvlist_t *label;
2078 2081 vdev_t *rvd;
2079 2082 uberblock_t *ub = &spa->spa_uberblock;
2080 2083 uint64_t children, config_cache_txg = spa->spa_config_txg;
2081 2084 int orig_mode = spa->spa_mode;
2082 2085 int parse;
2083 2086 uint64_t obj;
2084 2087 boolean_t missing_feat_write = B_FALSE;
2085 2088
2086 2089 /*
2087 2090 * If this is an untrusted config, access the pool in read-only mode.
2088 2091 * This prevents things like resilvering recently removed devices.
2089 2092 */
2090 2093 if (!mosconfig)
2091 2094 spa->spa_mode = FREAD;
2092 2095
2093 2096 ASSERT(MUTEX_HELD(&spa_namespace_lock));
2094 2097
2095 2098 spa->spa_load_state = state;
2096 2099
2097 2100 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
2098 2101 return (SET_ERROR(EINVAL));
2099 2102
2100 2103 parse = (type == SPA_IMPORT_EXISTING ?
2101 2104 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2102 2105
2103 2106 /*
2104 2107 * Create "The Godfather" zio to hold all async IOs
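	 * (spa_unload() later zio_wait()s on this root zio, so any async
	 * I/O that outlives its issuer is reaped before the pool goes away.)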
2105 2108 */
2106 2109 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2107 2110 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2108 2111
2109 2112 /*
2110 2113 * Parse the configuration into a vdev tree. We explicitly set the
2111 2114 * value that will be returned by spa_version() since parsing the
2112 2115 * configuration requires knowing the version number.
2113 2116 */
2114 2117 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2115 2118 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
2116 2119 spa_config_exit(spa, SCL_ALL, FTAG);
2117 2120
2118 2121 if (error != 0)
2119 2122 return (error);
2120 2123
2121 2124 ASSERT(spa->spa_root_vdev == rvd);
2122 2125
2123 2126 if (type != SPA_IMPORT_ASSEMBLE) {
2124 2127 ASSERT(spa_guid(spa) == pool_guid);
2125 2128 }
2126 2129
2127 2130 /*
2128 2131 * Try to open all vdevs, loading each label in the process.
2129 2132 */
2130 2133 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2131 2134 error = vdev_open(rvd);
2132 2135 spa_config_exit(spa, SCL_ALL, FTAG);
2133 2136 if (error != 0)
2134 2137 return (error);
2135 2138
2136 2139 /*
2137 2140 * We need to validate the vdev labels against the configuration that
2138 2141 * we have in hand, which is dependent on the setting of mosconfig. If
2139 2142 * mosconfig is true then we're validating the vdev labels based on
2140 2143 * that config. Otherwise, we're validating against the cached config
2141 2144 * (zpool.cache) that was read when we loaded the zfs module, and then
2142 2145 * later we will recursively call spa_load() and validate against
2143 2146 * the vdev config.
2144 2147 *
2145 2148 * If we're assembling a new pool that's been split off from an
2146 2149 * existing pool, the labels haven't yet been updated so we skip
2147 2150 * validation for now.
2148 2151 */
2149 2152 if (type != SPA_IMPORT_ASSEMBLE) {
2150 2153 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2151 2154 error = vdev_validate(rvd, mosconfig);
2152 2155 spa_config_exit(spa, SCL_ALL, FTAG);
2153 2156
2154 2157 if (error != 0)
2155 2158 return (error);
2156 2159
2157 2160 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2158 2161 return (SET_ERROR(ENXIO));
2159 2162 }
2160 2163
2161 2164 /*
2162 2165 * Find the best uberblock.
2163 2166 */
2164 2167 vdev_uberblock_load(rvd, ub, &label);
2165 2168
2166 2169 /*
2167 2170 * If we weren't able to find a single valid uberblock, return failure.
2168 2171 */
2169 2172 if (ub->ub_txg == 0) {
2170 2173 nvlist_free(label);
2171 2174 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2172 2175 }
2173 2176
2174 2177 /*
2175 2178 * If the pool has an unsupported version we can't open it.
2176 2179 */
2177 2180 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2178 2181 nvlist_free(label);
2179 2182 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
2180 2183 }
2181 2184
2182 2185 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2183 2186 nvlist_t *features;
2184 2187
2185 2188 /*
2186 2189 * If we weren't able to find what's necessary for reading the
2187 2190 * MOS in the label, return failure.
2188 2191 */
2189 2192 if (label == NULL || nvlist_lookup_nvlist(label,
2190 2193 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
2191 2194 nvlist_free(label);
2192 2195 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2193 2196 ENXIO));
2194 2197 }
2195 2198
2196 2199 /*
2197 2200 * Update our in-core representation with the definitive values
2198 2201 * from the label.
2199 2202 */
2200 2203 nvlist_free(spa->spa_label_features);
2201 2204 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2202 2205 }
2203 2206
2204 2207 nvlist_free(label);
2205 2208
2206 2209 /*
2207 2210 * Look through entries in the label nvlist's features_for_read. If
2208 2211 * there is a feature listed there which we don't understand then we
2209 2212 * cannot open a pool.
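	 * Feature names are reverse-DNS guids (for example, a hypothetical
	 * "com.example:shiny_new_format"); any name zfeature_is_supported()
	 * does not recognize is collected in unsup_feat and reported back
	 * to userland via spa_load_info.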
2210 2213 */
2211 2214 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2212 2215 nvlist_t *unsup_feat;
2213 2216
2214 2217 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2215 2218 0);
2216 2219
2217 2220 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2218 2221 NULL); nvp != NULL;
2219 2222 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2220 2223 if (!zfeature_is_supported(nvpair_name(nvp))) {
2221 2224 VERIFY(nvlist_add_string(unsup_feat,
2222 2225 nvpair_name(nvp), "") == 0);
2223 2226 }
2224 2227 }
2225 2228
2226 2229 if (!nvlist_empty(unsup_feat)) {
2227 2230 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2228 2231 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2229 2232 nvlist_free(unsup_feat);
2230 2233 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2231 2234 ENOTSUP));
2232 2235 }
2233 2236
2234 2237 nvlist_free(unsup_feat);
2235 2238 }
2236 2239
2237 2240 /*
2238 2241 * If the vdev guid sum doesn't match the uberblock, we have an
2239 2242 * incomplete configuration. We first check to see if the pool
2240 2243 	 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
2241 2244 	 * If it is, defer the vdev_guid_sum check until later so we
2242 2245 * can handle missing vdevs.
2243 2246 */
2244 2247 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
2245 2248 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
2246 2249 rvd->vdev_guid_sum != ub->ub_guid_sum)
2247 2250 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2248 2251
2249 2252 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2250 2253 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2251 2254 spa_try_repair(spa, config);
2252 2255 spa_config_exit(spa, SCL_ALL, FTAG);
2253 2256 nvlist_free(spa->spa_config_splitting);
2254 2257 spa->spa_config_splitting = NULL;
2255 2258 }
2256 2259
2257 2260 /*
2258 2261 * Initialize internal SPA structures.
2259 2262 */
2260 2263 spa->spa_state = POOL_STATE_ACTIVE;
2261 2264 spa->spa_ubsync = spa->spa_uberblock;
2262 2265 spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2263 2266 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2264 2267 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2265 2268 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2266 2269 spa->spa_claim_max_txg = spa->spa_first_txg;
2267 2270 spa->spa_prev_software_version = ub->ub_software_version;
2268 2271
2269 2272 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2270 2273 if (error)
2271 2274 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2272 2275 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2273 2276
2274 2277 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
2275 2278 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2276 2279
2277 2280 if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2278 2281 boolean_t missing_feat_read = B_FALSE;
2279 2282 nvlist_t *unsup_feat, *enabled_feat;
2280 2283
2281 2284 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2282 2285 &spa->spa_feat_for_read_obj) != 0) {
2283 2286 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2284 2287 }
2285 2288
2286 2289 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2287 2290 &spa->spa_feat_for_write_obj) != 0) {
2288 2291 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2289 2292 }
2290 2293
2291 2294 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2292 2295 &spa->spa_feat_desc_obj) != 0) {
2293 2296 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2294 2297 }
2295 2298
2296 2299 enabled_feat = fnvlist_alloc();
2297 2300 unsup_feat = fnvlist_alloc();
2298 2301
2299 2302 if (!feature_is_supported(spa->spa_meta_objset,
2300 2303 spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
2301 2304 unsup_feat, enabled_feat))
2302 2305 missing_feat_read = B_TRUE;
2303 2306
2304 2307 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
2305 2308 if (!feature_is_supported(spa->spa_meta_objset,
2306 2309 spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
2307 2310 unsup_feat, enabled_feat)) {
2308 2311 missing_feat_write = B_TRUE;
2309 2312 }
2310 2313 }
2311 2314
2312 2315 fnvlist_add_nvlist(spa->spa_load_info,
2313 2316 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
2314 2317
2315 2318 if (!nvlist_empty(unsup_feat)) {
2316 2319 fnvlist_add_nvlist(spa->spa_load_info,
2317 2320 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
2318 2321 }
2319 2322
2320 2323 fnvlist_free(enabled_feat);
2321 2324 fnvlist_free(unsup_feat);
2322 2325
2323 2326 if (!missing_feat_read) {
2324 2327 fnvlist_add_boolean(spa->spa_load_info,
2325 2328 ZPOOL_CONFIG_CAN_RDONLY);
2326 2329 }
2327 2330
2328 2331 /*
2329 2332 * If the state is SPA_LOAD_TRYIMPORT, our objective is
2330 2333 * twofold: to determine whether the pool is available for
2331 2334 * import in read-write mode and (if it is not) whether the
2332 2335 * pool is available for import in read-only mode. If the pool
2333 2336 * is available for import in read-write mode, it is displayed
2334 2337 * as available in userland; if it is not available for import
2335 2338 * in read-only mode, it is displayed as unavailable in
2336 2339 * userland. If the pool is available for import in read-only
2337 2340 * mode but not read-write mode, it is displayed as unavailable
2338 2341 * in userland with a special note that the pool is actually
2339 2342 * available for open in read-only mode.
2340 2343 *
2341 2344 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
2342 2345 * missing a feature for write, we must first determine whether
2343 2346 * the pool can be opened read-only before returning to
2344 2347 * userland in order to know whether to display the
2345 2348 * abovementioned note.
2346 2349 */
2347 2350 if (missing_feat_read || (missing_feat_write &&
2348 2351 spa_writeable(spa))) {
2349 2352 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2350 2353 ENOTSUP));
2351 2354 }
2352 2355 }
2353 2356
2354 2357 spa->spa_is_initializing = B_TRUE;
2355 2358 error = dsl_pool_open(spa->spa_dsl_pool);
2356 2359 spa->spa_is_initializing = B_FALSE;
2357 2360 if (error != 0)
2358 2361 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2359 2362
2360 2363 if (!mosconfig) {
2361 2364 uint64_t hostid;
2362 2365 nvlist_t *policy = NULL, *nvconfig;
2363 2366
2364 2367 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2365 2368 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2366 2369
2367 2370 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2368 2371 ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2369 2372 char *hostname;
2370 2373 unsigned long myhostid = 0;
2371 2374
2372 2375 VERIFY(nvlist_lookup_string(nvconfig,
2373 2376 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2374 2377
2375 2378 #ifdef _KERNEL
2376 2379 myhostid = zone_get_hostid(NULL);
2377 2380 #else /* _KERNEL */
2378 2381 /*
2379 2382 * We're emulating the system's hostid in userland, so
2380 2383 * we can't use zone_get_hostid().
2381 2384 */
2382 2385 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
2383 2386 #endif /* _KERNEL */
2384 2387 if (hostid != 0 && myhostid != 0 &&
2385 2388 hostid != myhostid) {
2386 2389 nvlist_free(nvconfig);
2387 2390 cmn_err(CE_WARN, "pool '%s' could not be "
2388 2391 "loaded as it was last accessed by "
2389 2392 "another system (host: %s hostid: 0x%lx). "
2390 2393 "See: http://illumos.org/msg/ZFS-8000-EY",
2391 2394 spa_name(spa), hostname,
2392 2395 (unsigned long)hostid);
2393 2396 return (SET_ERROR(EBADF));
2394 2397 }
2395 2398 }
2396 2399 if (nvlist_lookup_nvlist(spa->spa_config,
2397 2400 ZPOOL_REWIND_POLICY, &policy) == 0)
2398 2401 VERIFY(nvlist_add_nvlist(nvconfig,
2399 2402 ZPOOL_REWIND_POLICY, policy) == 0);
2400 2403
2401 2404 spa_config_set(spa, nvconfig);
2402 2405 spa_unload(spa);
2403 2406 spa_deactivate(spa);
2404 2407 spa_activate(spa, orig_mode);
2405 2408
2406 2409 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
2407 2410 }
2408 2411
2409 2412 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
2410 2413 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2411 2414 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2412 2415 if (error != 0)
2413 2416 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2414 2417
2415 2418 /*
2416 2419 * Load the bit that tells us to use the new accounting function
2417 2420 * (raid-z deflation). If we have an older pool, this will not
2418 2421 * be present.
2419 2422 */
2420 2423 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
2421 2424 if (error != 0 && error != ENOENT)
2422 2425 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2423 2426
2424 2427 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
2425 2428 &spa->spa_creation_version);
2426 2429 if (error != 0 && error != ENOENT)
2427 2430 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2428 2431
2429 2432 /*
2430 2433 * Load the persistent error log. If we have an older pool, this will
2431 2434 * not be present.
2432 2435 */
2433 2436 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
2434 2437 if (error != 0 && error != ENOENT)
2435 2438 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2436 2439
2437 2440 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
2438 2441 &spa->spa_errlog_scrub);
2439 2442 if (error != 0 && error != ENOENT)
2440 2443 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2441 2444
2442 2445 /*
2443 2446 * Load the history object. If we have an older pool, this
2444 2447 * will not be present.
2445 2448 */
2446 2449 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
2447 2450 if (error != 0 && error != ENOENT)
2448 2451 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2449 2452
2450 2453 /*
2451 2454 * If we're assembling the pool from the split-off vdevs of
2452 2455 * an existing pool, we don't want to attach the spares & cache
2453 2456 * devices.
2454 2457 */
2455 2458
2456 2459 /*
2457 2460 * Load any hot spares for this pool.
2458 2461 */
2459 2462 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
2460 2463 if (error != 0 && error != ENOENT)
2461 2464 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2462 2465 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2463 2466 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
2464 2467 if (load_nvlist(spa, spa->spa_spares.sav_object,
2465 2468 &spa->spa_spares.sav_config) != 0)
2466 2469 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2467 2470
2468 2471 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2469 2472 spa_load_spares(spa);
2470 2473 spa_config_exit(spa, SCL_ALL, FTAG);
2471 2474 } else if (error == 0) {
2472 2475 spa->spa_spares.sav_sync = B_TRUE;
2473 2476 }
2474 2477
2475 2478 /*
2476 2479 * Load any level 2 ARC devices for this pool.
2477 2480 */
2478 2481 error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
2479 2482 &spa->spa_l2cache.sav_object);
2480 2483 if (error != 0 && error != ENOENT)
2481 2484 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2482 2485 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2483 2486 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
2484 2487 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
2485 2488 &spa->spa_l2cache.sav_config) != 0)
2486 2489 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2487 2490
2488 2491 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2489 2492 spa_load_l2cache(spa);
2490 2493 spa_config_exit(spa, SCL_ALL, FTAG);
2491 2494 } else if (error == 0) {
2492 2495 spa->spa_l2cache.sav_sync = B_TRUE;
2493 2496 }
2494 2497
2495 2498 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2496 2499
2497 2500 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
2498 2501 if (error && error != ENOENT)
2499 2502 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2500 2503
2501 2504 if (error == 0) {
2502 2505 uint64_t autoreplace;
2503 2506
2504 2507 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
2505 2508 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
2506 2509 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
2507 2510 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
2508 2511 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
2509 2512 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
2510 2513 &spa->spa_dedup_ditto);
2511 2514
2512 2515 spa->spa_autoreplace = (autoreplace != 0);
2513 2516 }
2514 2517
2515 2518 /*
2516 2519 * If the 'autoreplace' property is set, then post a resource notifying
2517 2520 * the ZFS DE that it should not issue any faults for unopenable
2518 2521 * devices. We also iterate over the vdevs, and post a sysevent for any
2519 2522 * unopenable vdevs so that the normal autoreplace handler can take
2520 2523 * over.
2521 2524 */
2522 2525 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
2523 2526 spa_check_removed(spa->spa_root_vdev);
2524 2527 /*
2525 2528 * For the import case, this is done in spa_import(), because
2526 2529 * at this point we're using the spare definitions from
2527 2530 * the MOS config, not necessarily from the userland config.
2528 2531 */
2529 2532 if (state != SPA_LOAD_IMPORT) {
2530 2533 spa_aux_check_removed(&spa->spa_spares);
2531 2534 spa_aux_check_removed(&spa->spa_l2cache);
2532 2535 }
2533 2536 }
2534 2537
2535 2538 /*
2536 2539 * Load the vdev state for all toplevel vdevs.
2537 2540 */
2538 2541 vdev_load(rvd);
2539 2542
2540 2543 /*
2541 2544 * Propagate the leaf DTLs we just loaded all the way up the tree.
2542 2545 */
2543 2546 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2544 2547 vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
2545 2548 spa_config_exit(spa, SCL_ALL, FTAG);
2546 2549
2547 2550 /*
2548 2551 * Load the DDTs (dedup tables).
2549 2552 */
2550 2553 error = ddt_load(spa);
2551 2554 if (error != 0)
2552 2555 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2553 2556
2554 2557 spa_update_dspace(spa);
2555 2558
2556 2559 /*
2557 2560 * Validate the config, using the MOS config to fill in any
2558 2561 * information which might be missing. If we fail to validate
2559 2562 * the config then declare the pool unfit for use. If we're
2560 2563 * assembling a pool from a split, the log is not transferred
2561 2564 * over.
2562 2565 */
2563 2566 if (type != SPA_IMPORT_ASSEMBLE) {
2564 2567 nvlist_t *nvconfig;
2565 2568
2566 2569 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2567 2570 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2568 2571
2569 2572 if (!spa_config_valid(spa, nvconfig)) {
2570 2573 nvlist_free(nvconfig);
2571 2574 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2572 2575 ENXIO));
2573 2576 }
2574 2577 nvlist_free(nvconfig);
2575 2578
2576 2579 /*
2577 2580 * Now that we've validated the config, check the state of the
2578 2581 * root vdev. If it can't be opened, it indicates one or
2579 2582 * more toplevel vdevs are faulted.
2580 2583 */
2581 2584 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2582 2585 return (SET_ERROR(ENXIO));
2583 2586
2584 2587 if (spa_check_logs(spa)) {
2585 2588 *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2586 2589 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2587 2590 }
2588 2591 }
2589 2592
2590 2593 if (missing_feat_write) {
2591 2594 ASSERT(state == SPA_LOAD_TRYIMPORT);
2592 2595
2593 2596 /*
2594 2597 * At this point, we know that we can open the pool in
2595 2598 * read-only mode but not read-write mode. We now have enough
2596 2599 * information and can return to userland.
2597 2600 */
2598 2601 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
2599 2602 }
2600 2603
2601 2604 /*
2602 2605 	 * We've successfully opened the pool; verify that we're ready
2603 2606 * to start pushing transactions.
2604 2607 */
2605 2608 if (state != SPA_LOAD_TRYIMPORT) {
2606 2609 if (error = spa_load_verify(spa))
2607 2610 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2608 2611 error));
2609 2612 }
2610 2613
2611 2614 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2612 2615 spa->spa_load_max_txg == UINT64_MAX)) {
2613 2616 dmu_tx_t *tx;
2614 2617 int need_update = B_FALSE;
2615 2618
2616 2619 ASSERT(state != SPA_LOAD_TRYIMPORT);
2617 2620
2618 2621 /*
2619 2622 * Claim log blocks that haven't been committed yet.
2620 2623 * This must all happen in a single txg.
2621 2624 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2622 2625 * invoked from zil_claim_log_block()'s i/o done callback.
2623 2626 * Price of rollback is that we abandon the log.
2624 2627 */
2625 2628 spa->spa_claiming = B_TRUE;
2626 2629
2627 2630 tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2628 2631 spa_first_txg(spa));
2629 2632 (void) dmu_objset_find(spa_name(spa),
2630 2633 zil_claim, tx, DS_FIND_CHILDREN);
2631 2634 dmu_tx_commit(tx);
2632 2635
2633 2636 spa->spa_claiming = B_FALSE;
2634 2637
2635 2638 spa_set_log_state(spa, SPA_LOG_GOOD);
2636 2639 spa->spa_sync_on = B_TRUE;
2637 2640 txg_sync_start(spa->spa_dsl_pool);
2638 2641
2639 2642 /*
2640 2643 * Wait for all claims to sync. We sync up to the highest
2641 2644 * claimed log block birth time so that claimed log blocks
2642 2645 * don't appear to be from the future. spa_claim_max_txg
2643 2646 * will have been set for us by either zil_check_log_chain()
2644 2647 * (invoked from spa_check_logs()) or zil_claim() above.
2645 2648 */
2646 2649 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2647 2650
2648 2651 /*
2649 2652 * If the config cache is stale, or we have uninitialized
2650 2653 * metaslabs (see spa_vdev_add()), then update the config.
2651 2654 *
2652 2655 * If this is a verbatim import, trust the current
2653 2656 * in-core spa_config and update the disk labels.
2654 2657 */
2655 2658 if (config_cache_txg != spa->spa_config_txg ||
2656 2659 state == SPA_LOAD_IMPORT ||
2657 2660 state == SPA_LOAD_RECOVER ||
2658 2661 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2659 2662 need_update = B_TRUE;
2660 2663
2661 2664 for (int c = 0; c < rvd->vdev_children; c++)
2662 2665 if (rvd->vdev_child[c]->vdev_ms_array == 0)
2663 2666 need_update = B_TRUE;
2664 2667
2665 2668 /*
2666 2669 		 * Update the config cache asynchronously in case we're the
2667 2670 * root pool, in which case the config cache isn't writable yet.
2668 2671 */
2669 2672 if (need_update)
2670 2673 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2671 2674
2672 2675 /*
2673 2676 * Check all DTLs to see if anything needs resilvering.
2674 2677 */
2675 2678 if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2676 2679 vdev_resilver_needed(rvd, NULL, NULL))
2677 2680 spa_async_request(spa, SPA_ASYNC_RESILVER);
2678 2681
2679 2682 /*
2680 2683 * Log the fact that we booted up (so that we can detect if
2681 2684 * we rebooted in the middle of an operation).
2682 2685 */
2683 2686 spa_history_log_version(spa, "open");
2684 2687
2685 2688 /*
2686 2689 * Delete any inconsistent datasets.
2687 2690 */
2688 2691 (void) dmu_objset_find(spa_name(spa),
2689 2692 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2690 2693
2691 2694 /*
2692 2695 * Clean up any stale temporary dataset userrefs.
2693 2696 */
2694 2697 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
2695 2698 }
2696 2699
2697 2700 return (0);
2698 2701 }
2699 2702
2700 2703 static int
2701 2704 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2702 2705 {
2703 2706 int mode = spa->spa_mode;
2704 2707
2705 2708 spa_unload(spa);
2706 2709 spa_deactivate(spa);
2707 2710
2708 2711 spa->spa_load_max_txg--;
2709 2712
2710 2713 spa_activate(spa, mode);
2711 2714 spa_async_suspend(spa);
2712 2715
2713 2716 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2714 2717 }
2715 2718
2716 2719 /*
2717 2720  * If spa_load() fails, this function will try loading prior txgs. If
2718 2721 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
2719 2722 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
2720 2723 * function will not rewind the pool and will return the same error as
2721 2724 * spa_load().
2722 2725 */
2723 2726 static int
2724 2727 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2725 2728 uint64_t max_request, int rewind_flags)
2726 2729 {
2727 2730 nvlist_t *loadinfo = NULL;
2728 2731 nvlist_t *config = NULL;
2729 2732 int load_error, rewind_error;
2730 2733 uint64_t safe_rewind_txg;
2731 2734 uint64_t min_txg;
2732 2735
2733 2736 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2734 2737 spa->spa_load_max_txg = spa->spa_load_txg;
2735 2738 spa_set_log_state(spa, SPA_LOG_CLEAR);
2736 2739 } else {
2737 2740 spa->spa_load_max_txg = max_request;
2738 2741 }
2739 2742
2740 2743 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2741 2744 mosconfig);
2742 2745 if (load_error == 0)
2743 2746 return (0);
2744 2747
2745 2748 if (spa->spa_root_vdev != NULL)
2746 2749 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2747 2750
2748 2751 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2749 2752 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2750 2753
2751 2754 if (rewind_flags & ZPOOL_NEVER_REWIND) {
2752 2755 nvlist_free(config);
2753 2756 return (load_error);
2754 2757 }
2755 2758
2756 2759 if (state == SPA_LOAD_RECOVER) {
2757 2760 /* Price of rolling back is discarding txgs, including log */
2758 2761 spa_set_log_state(spa, SPA_LOG_CLEAR);
2759 2762 } else {
2760 2763 /*
2761 2764 		 * If we aren't rolling back, save the load info from our first
2762 2765 * import attempt so that we can restore it after attempting
2763 2766 * to rewind.
2764 2767 */
2765 2768 loadinfo = spa->spa_load_info;
2766 2769 spa->spa_load_info = fnvlist_alloc();
2767 2770 }
2768 2771
2769 2772 spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2770 2773 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2771 2774 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2772 2775 TXG_INITIAL : safe_rewind_txg;
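	/*
	 * Worked example: with spa_last_ubsync_txg == 1000 and
	 * TXG_DEFER_SIZE currently 2, safe_rewind_txg is 998.  Unless
	 * ZPOOL_EXTREME_REWIND was requested, the loop below stops
	 * retrying once the uberblock txg drops below 998.
	 */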
2773 2776
2774 2777 /*
2775 2778 * Continue as long as we're finding errors, we're still within
2776 2779 	 * the acceptable rewind range, and we're still finding uberblocks.
2777 2780 */
2778 2781 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2779 2782 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2780 2783 if (spa->spa_load_max_txg < safe_rewind_txg)
2781 2784 spa->spa_extreme_rewind = B_TRUE;
2782 2785 rewind_error = spa_load_retry(spa, state, mosconfig);
2783 2786 }
2784 2787
2785 2788 spa->spa_extreme_rewind = B_FALSE;
2786 2789 spa->spa_load_max_txg = UINT64_MAX;
2787 2790
2788 2791 if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2789 2792 spa_config_set(spa, config);
2790 2793
2791 2794 if (state == SPA_LOAD_RECOVER) {
2792 2795 ASSERT3P(loadinfo, ==, NULL);
2793 2796 return (rewind_error);
2794 2797 } else {
2795 2798 /* Store the rewind info as part of the initial load info */
2796 2799 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
2797 2800 spa->spa_load_info);
2798 2801
2799 2802 /* Restore the initial load info */
2800 2803 fnvlist_free(spa->spa_load_info);
2801 2804 spa->spa_load_info = loadinfo;
2802 2805
2803 2806 return (load_error);
2804 2807 }
2805 2808 }
2806 2809
2807 2810 /*
2808 2811 * Pool Open/Import
2809 2812 *
2810 2813 * The import case is identical to an open except that the configuration is sent
2811 2814  * down from userland, instead of being grabbed from the configuration cache. For the
2812 2815 * case of an open, the pool configuration will exist in the
2813 2816 * POOL_STATE_UNINITIALIZED state.
2814 2817 *
2815 2818  * The stats information (gen/count/ustats) is used to gather vdev statistics at
2816 2819  * the same time we open the pool, without having to keep around the spa_t in
2817 2820  * some ambiguous state.
2818 2821 */
2819 2822 static int
2820 2823 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2821 2824 nvlist_t **config)
2822 2825 {
2823 2826 spa_t *spa;
2824 2827 spa_load_state_t state = SPA_LOAD_OPEN;
2825 2828 int error;
2826 2829 int locked = B_FALSE;
2827 2830
2828 2831 *spapp = NULL;
2829 2832
2830 2833 /*
2831 2834 * As disgusting as this is, we need to support recursive calls to this
2832 2835 * function because dsl_dir_open() is called during spa_load(), and ends
2833 2836 * up calling spa_open() again. The real fix is to figure out how to
2834 2837 * avoid dsl_dir_open() calling this in the first place.
2835 2838 */
2836 2839 if (mutex_owner(&spa_namespace_lock) != curthread) {
2837 2840 mutex_enter(&spa_namespace_lock);
2838 2841 locked = B_TRUE;
2839 2842 }
2840 2843
2841 2844 if ((spa = spa_lookup(pool)) == NULL) {
2842 2845 if (locked)
2843 2846 mutex_exit(&spa_namespace_lock);
2844 2847 return (SET_ERROR(ENOENT));
2845 2848 }
2846 2849
2847 2850 if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
2848 2851 zpool_rewind_policy_t policy;
2849 2852
2850 2853 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
2851 2854 &policy);
2852 2855 if (policy.zrp_request & ZPOOL_DO_REWIND)
2853 2856 state = SPA_LOAD_RECOVER;
2854 2857
2855 2858 spa_activate(spa, spa_mode_global);
2856 2859
2857 2860 if (state != SPA_LOAD_RECOVER)
2858 2861 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
2859 2862
2860 2863 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
2861 2864 policy.zrp_request);
2862 2865
2863 2866 if (error == EBADF) {
2864 2867 /*
2865 2868 * If vdev_validate() returns failure (indicated by
2866 2869 			 * EBADF), then one of the vdev labels indicates that
2867 2870 			 * the pool has been exported or destroyed.  If
2868 2871 * this is the case, the config cache is out of sync and
2869 2872 * we should remove the pool from the namespace.
2870 2873 */
2871 2874 spa_unload(spa);
2872 2875 spa_deactivate(spa);
2873 2876 spa_config_sync(spa, B_TRUE, B_TRUE);
2874 2877 spa_remove(spa);
2875 2878 if (locked)
2876 2879 mutex_exit(&spa_namespace_lock);
2877 2880 return (SET_ERROR(ENOENT));
2878 2881 }
2879 2882
2880 2883 if (error) {
2881 2884 /*
2882 2885 * We can't open the pool, but we still have useful
2883 2886 * information: the state of each vdev after the
2884 2887 * attempted vdev_open(). Return this to the user.
2885 2888 */
2886 2889 if (config != NULL && spa->spa_config) {
2887 2890 VERIFY(nvlist_dup(spa->spa_config, config,
2888 2891 KM_SLEEP) == 0);
2889 2892 VERIFY(nvlist_add_nvlist(*config,
2890 2893 ZPOOL_CONFIG_LOAD_INFO,
2891 2894 spa->spa_load_info) == 0);
2892 2895 }
2893 2896 spa_unload(spa);
2894 2897 spa_deactivate(spa);
2895 2898 spa->spa_last_open_failed = error;
2896 2899 if (locked)
2897 2900 mutex_exit(&spa_namespace_lock);
2898 2901 *spapp = NULL;
2899 2902 return (error);
2900 2903 }
2901 2904 }
2902 2905
2903 2906 spa_open_ref(spa, tag);
2904 2907
2905 2908 if (config != NULL)
2906 2909 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2907 2910
2908 2911 /*
2909 2912 * If we've recovered the pool, pass back any information we
2910 2913 * gathered while doing the load.
2911 2914 */
2912 2915 if (state == SPA_LOAD_RECOVER) {
2913 2916 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
2914 2917 spa->spa_load_info) == 0);
2915 2918 }
2916 2919
2917 2920 if (locked) {
2918 2921 spa->spa_last_open_failed = 0;
2919 2922 spa->spa_last_ubsync_txg = 0;
2920 2923 spa->spa_load_txg = 0;
2921 2924 mutex_exit(&spa_namespace_lock);
2922 2925 }
2923 2926
2924 2927 *spapp = spa;
2925 2928
2926 2929 return (0);
2927 2930 }
2928 2931
2929 2932 int
2930 2933 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
2931 2934 nvlist_t **config)
2932 2935 {
2933 2936 return (spa_open_common(name, spapp, tag, policy, config));
2934 2937 }
2935 2938
2936 2939 int
2937 2940 spa_open(const char *name, spa_t **spapp, void *tag)
2938 2941 {
2939 2942 return (spa_open_common(name, spapp, tag, NULL, NULL));
2940 2943 }
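
Both spa_open() and spa_open_rewind() funnel into spa_open_common() above, and every successful open must eventually be balanced by a spa_close() on the same tag. A minimal sketch of that pattern; the function name is invented and the work between open and close is elided.

static int
do_something_with_pool(const char *name)
{
	spa_t *spa;
	int error;

	if ((error = spa_open(name, &spa, FTAG)) != 0)
		return (error);

	/* ... operate on the pool while the open reference is held ... */

	spa_close(spa, FTAG);
	return (0);
}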
2941 2944
2942 2945 /*
2943 2946  * Look up the given spa_t, incrementing the inject count in the process,
2944 2947 * preventing it from being exported or destroyed.
2945 2948 */
2946 2949 spa_t *
2947 2950 spa_inject_addref(char *name)
2948 2951 {
2949 2952 spa_t *spa;
2950 2953
2951 2954 mutex_enter(&spa_namespace_lock);
2952 2955 if ((spa = spa_lookup(name)) == NULL) {
2953 2956 mutex_exit(&spa_namespace_lock);
2954 2957 return (NULL);
2955 2958 }
2956 2959 spa->spa_inject_ref++;
2957 2960 mutex_exit(&spa_namespace_lock);
2958 2961
2959 2962 return (spa);
2960 2963 }
2961 2964
2962 2965 void
2963 2966 spa_inject_delref(spa_t *spa)
2964 2967 {
2965 2968 mutex_enter(&spa_namespace_lock);
2966 2969 spa->spa_inject_ref--;
2967 2970 mutex_exit(&spa_namespace_lock);
2968 2971 }
2969 2972
2970 2973 /*
2971 2974 * Add spares device information to the nvlist.
2972 2975 */
2973 2976 static void
2974 2977 spa_add_spares(spa_t *spa, nvlist_t *config)
2975 2978 {
2976 2979 nvlist_t **spares;
2977 2980 uint_t i, nspares;
2978 2981 nvlist_t *nvroot;
2979 2982 uint64_t guid;
2980 2983 vdev_stat_t *vs;
2981 2984 uint_t vsc;
2982 2985 uint64_t pool;
2983 2986
2984 2987 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2985 2988
2986 2989 if (spa->spa_spares.sav_count == 0)
2987 2990 return;
2988 2991
2989 2992 VERIFY(nvlist_lookup_nvlist(config,
2990 2993 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2991 2994 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
2992 2995 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2993 2996 if (nspares != 0) {
2994 2997 VERIFY(nvlist_add_nvlist_array(nvroot,
2995 2998 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2996 2999 VERIFY(nvlist_lookup_nvlist_array(nvroot,
2997 3000 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2998 3001
2999 3002 /*
3000 3003 * Go through and find any spares which have since been
3001 3004 * repurposed as an active spare. If this is the case, update
3002 3005 * their status appropriately.
3003 3006 */
3004 3007 for (i = 0; i < nspares; i++) {
3005 3008 VERIFY(nvlist_lookup_uint64(spares[i],
3006 3009 ZPOOL_CONFIG_GUID, &guid) == 0);
3007 3010 if (spa_spare_exists(guid, &pool, NULL) &&
3008 3011 pool != 0ULL) {
3009 3012 VERIFY(nvlist_lookup_uint64_array(
3010 3013 spares[i], ZPOOL_CONFIG_VDEV_STATS,
3011 3014 (uint64_t **)&vs, &vsc) == 0);
3012 3015 vs->vs_state = VDEV_STATE_CANT_OPEN;
3013 3016 vs->vs_aux = VDEV_AUX_SPARED;
3014 3017 }
3015 3018 }
3016 3019 }
3017 3020 }
3018 3021
3019 3022 /*
3020 3023 * Add l2cache device information to the nvlist, including vdev stats.
3021 3024 */
3022 3025 static void
3023 3026 spa_add_l2cache(spa_t *spa, nvlist_t *config)
3024 3027 {
3025 3028 nvlist_t **l2cache;
3026 3029 uint_t i, j, nl2cache;
3027 3030 nvlist_t *nvroot;
3028 3031 uint64_t guid;
3029 3032 vdev_t *vd;
3030 3033 vdev_stat_t *vs;
3031 3034 uint_t vsc;
3032 3035
3033 3036 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3034 3037
3035 3038 if (spa->spa_l2cache.sav_count == 0)
3036 3039 return;
3037 3040
3038 3041 VERIFY(nvlist_lookup_nvlist(config,
3039 3042 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3040 3043 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3041 3044 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3042 3045 if (nl2cache != 0) {
3043 3046 VERIFY(nvlist_add_nvlist_array(nvroot,
3044 3047 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3045 3048 VERIFY(nvlist_lookup_nvlist_array(nvroot,
3046 3049 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3047 3050
3048 3051 /*
3049 3052 * Update level 2 cache device stats.
3050 3053 */
3051 3054
3052 3055 for (i = 0; i < nl2cache; i++) {
3053 3056 VERIFY(nvlist_lookup_uint64(l2cache[i],
3054 3057 ZPOOL_CONFIG_GUID, &guid) == 0);
3055 3058
3056 3059 vd = NULL;
3057 3060 for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
3058 3061 if (guid ==
3059 3062 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
3060 3063 vd = spa->spa_l2cache.sav_vdevs[j];
3061 3064 break;
3062 3065 }
3063 3066 }
3064 3067 ASSERT(vd != NULL);
3065 3068
3066 3069 VERIFY(nvlist_lookup_uint64_array(l2cache[i],
3067 3070 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
3068 3071 == 0);
3069 3072 vdev_get_stats(vd, vs);
3070 3073 }
3071 3074 }
3072 3075 }
3073 3076
3074 3077 static void
3075 3078 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
3076 3079 {
3077 3080 nvlist_t *features;
3078 3081 zap_cursor_t zc;
3079 3082 zap_attribute_t za;
3080 3083
3081 3084 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3082 3085 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3083 3086
3084 3087 if (spa->spa_feat_for_read_obj != 0) {
3085 3088 for (zap_cursor_init(&zc, spa->spa_meta_objset,
3086 3089 spa->spa_feat_for_read_obj);
3087 3090 zap_cursor_retrieve(&zc, &za) == 0;
3088 3091 zap_cursor_advance(&zc)) {
3089 3092 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3090 3093 za.za_num_integers == 1);
3091 3094 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3092 3095 za.za_first_integer));
3093 3096 }
3094 3097 zap_cursor_fini(&zc);
3095 3098 }
3096 3099
3097 3100 if (spa->spa_feat_for_write_obj != 0) {
3098 3101 for (zap_cursor_init(&zc, spa->spa_meta_objset,
3099 3102 spa->spa_feat_for_write_obj);
3100 3103 zap_cursor_retrieve(&zc, &za) == 0;
3101 3104 zap_cursor_advance(&zc)) {
3102 3105 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3103 3106 za.za_num_integers == 1);
3104 3107 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3105 3108 za.za_first_integer));
3106 3109 }
3107 3110 zap_cursor_fini(&zc);
3108 3111 }
3109 3112
3110 3113 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
3111 3114 features) == 0);
3112 3115 nvlist_free(features);
3113 3116 }
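
The feature-stats nvlist assembled here is keyed by feature guid, with each value being that feature's reference count. A userland-style libnvpair sketch of walking it back out of a config returned through spa_get_stats(); the helper name is invented for illustration.

#include <stdio.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>

/* Print each feature guid and its refcount from a pool config. */
static void
print_feature_stats(nvlist_t *config)
{
	nvlist_t *features;
	nvpair_t *nvp;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
	    &features) != 0)
		return;		/* pool predates feature flags */

	for (nvp = nvlist_next_nvpair(features, NULL); nvp != NULL;
	    nvp = nvlist_next_nvpair(features, nvp)) {
		uint64_t refcount;

		if (nvpair_value_uint64(nvp, &refcount) == 0)
			(void) printf("%s\t%llu\n", nvpair_name(nvp),
			    (u_longlong_t)refcount);
	}
}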
3114 3117
3115 3118 int
3116 3119 spa_get_stats(const char *name, nvlist_t **config,
3117 3120 char *altroot, size_t buflen)
3118 3121 {
3119 3122 int error;
3120 3123 spa_t *spa;
3121 3124
3122 3125 *config = NULL;
3123 3126 error = spa_open_common(name, &spa, FTAG, NULL, config);
3124 3127
3125 3128 if (spa != NULL) {
3126 3129 /*
3127 3130 * This still leaves a window of inconsistency where the spares
3128 3131 * or l2cache devices could change and the config would be
3129 3132 * self-inconsistent.
3130 3133 */
3131 3134 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3132 3135
3133 3136 if (*config != NULL) {
3134 3137 uint64_t loadtimes[2];
3135 3138
3136 3139 loadtimes[0] = spa->spa_loaded_ts.tv_sec;
3137 3140 loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
3138 3141 VERIFY(nvlist_add_uint64_array(*config,
3139 3142 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
3140 3143
3141 3144 VERIFY(nvlist_add_uint64(*config,
3142 3145 ZPOOL_CONFIG_ERRCOUNT,
3143 3146 spa_get_errlog_size(spa)) == 0);
3144 3147
3145 3148 if (spa_suspended(spa))
3146 3149 VERIFY(nvlist_add_uint64(*config,
3147 3150 ZPOOL_CONFIG_SUSPENDED,
3148 3151 spa->spa_failmode) == 0);
3149 3152
3150 3153 spa_add_spares(spa, *config);
3151 3154 spa_add_l2cache(spa, *config);
3152 3155 spa_add_feature_stats(spa, *config);
3153 3156 }
3154 3157 }
3155 3158
3156 3159 /*
3157 3160 * We want to get the alternate root even for faulted pools, so we cheat
3158 3161 * and call spa_lookup() directly.
3159 3162 */
3160 3163 if (altroot) {
3161 3164 if (spa == NULL) {
3162 3165 mutex_enter(&spa_namespace_lock);
3163 3166 spa = spa_lookup(name);
3164 3167 if (spa)
3165 3168 spa_altroot(spa, altroot, buflen);
3166 3169 else
3167 3170 altroot[0] = '\0';
3168 3171 spa = NULL;
3169 3172 mutex_exit(&spa_namespace_lock);
3170 3173 } else {
3171 3174 spa_altroot(spa, altroot, buflen);
3172 3175 }
3173 3176 }
3174 3177
3175 3178 if (spa != NULL) {
3176 3179 spa_config_exit(spa, SCL_CONFIG, FTAG);
3177 3180 spa_close(spa, FTAG);
3178 3181 }
3179 3182
3180 3183 return (error);
3181 3184 }
3182 3185
3183 3186 /*
3184 3187 * Validate that the auxiliary device array is well formed. We must have an
3185 3188  * array of nvlists, each of which describes a valid leaf vdev. If this is an
3186 3189 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
3187 3190 * specified, as long as they are well-formed.
3188 3191 */
3189 3192 static int
3190 3193 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
3191 3194 spa_aux_vdev_t *sav, const char *config, uint64_t version,
3192 3195 vdev_labeltype_t label)
3193 3196 {
3194 3197 nvlist_t **dev;
3195 3198 uint_t i, ndev;
3196 3199 vdev_t *vd;
3197 3200 int error;
3198 3201
3199 3202 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3200 3203
3201 3204 /*
3202 3205 * It's acceptable to have no devs specified.
3203 3206 */
3204 3207 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
3205 3208 return (0);
3206 3209
3207 3210 if (ndev == 0)
3208 3211 return (SET_ERROR(EINVAL));
3209 3212
3210 3213 /*
3211 3214 * Make sure the pool is formatted with a version that supports this
3212 3215 * device type.
3213 3216 */
3214 3217 if (spa_version(spa) < version)
3215 3218 return (SET_ERROR(ENOTSUP));
3216 3219
3217 3220 /*
3218 3221 * Set the pending device list so we correctly handle device in-use
3219 3222 * checking.
3220 3223 */
3221 3224 sav->sav_pending = dev;
3222 3225 sav->sav_npending = ndev;
3223 3226
3224 3227 for (i = 0; i < ndev; i++) {
3225 3228 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
3226 3229 mode)) != 0)
3227 3230 goto out;
3228 3231
3229 3232 if (!vd->vdev_ops->vdev_op_leaf) {
3230 3233 vdev_free(vd);
3231 3234 error = SET_ERROR(EINVAL);
3232 3235 goto out;
3233 3236 }
3234 3237
3235 3238 /*
3236 3239 * The L2ARC currently only supports disk devices in
3237 3240 * kernel context. For user-level testing, we allow it.
3238 3241 */
3239 3242 #ifdef _KERNEL
3240 3243 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
3241 3244 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
3242 3245 error = SET_ERROR(ENOTBLK);
3243 3246 vdev_free(vd);
3244 3247 goto out;
3245 3248 }
3246 3249 #endif
3247 3250 vd->vdev_top = vd;
3248 3251
3249 3252 if ((error = vdev_open(vd)) == 0 &&
3250 3253 (error = vdev_label_init(vd, crtxg, label)) == 0) {
3251 3254 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
3252 3255 vd->vdev_guid) == 0);
3253 3256 }
3254 3257
3255 3258 vdev_free(vd);
3256 3259
3257 3260 if (error &&
3258 3261 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
3259 3262 goto out;
3260 3263 else
3261 3264 error = 0;
3262 3265 }
3263 3266
3264 3267 out:
3265 3268 sav->sav_pending = NULL;
3266 3269 sav->sav_npending = 0;
3267 3270 return (error);
3268 3271 }
3269 3272
3270 3273 static int
3271 3274 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
3272 3275 {
3273 3276 int error;
3274 3277
3275 3278 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3276 3279
3277 3280 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3278 3281 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
3279 3282 VDEV_LABEL_SPARE)) != 0) {
3280 3283 return (error);
3281 3284 }
3282 3285
3283 3286 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3284 3287 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
3285 3288 VDEV_LABEL_L2CACHE));
3286 3289 }
3287 3290
3288 3291 static void
3289 3292 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
3290 3293 const char *config)
3291 3294 {
3292 3295 int i;
3293 3296
3294 3297 if (sav->sav_config != NULL) {
3295 3298 nvlist_t **olddevs;
3296 3299 uint_t oldndevs;
3297 3300 nvlist_t **newdevs;
3298 3301
3299 3302 /*
3300 3303		 * Generate new dev list by concatenating with the
3301 3304 * current dev list.
3302 3305 */
3303 3306 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
3304 3307 &olddevs, &oldndevs) == 0);
3305 3308
3306 3309 newdevs = kmem_alloc(sizeof (void *) *
3307 3310 (ndevs + oldndevs), KM_SLEEP);
3308 3311 for (i = 0; i < oldndevs; i++)
3309 3312 VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
3310 3313 KM_SLEEP) == 0);
3311 3314 for (i = 0; i < ndevs; i++)
3312 3315 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
3313 3316 KM_SLEEP) == 0);
3314 3317
3315 3318 VERIFY(nvlist_remove(sav->sav_config, config,
3316 3319 DATA_TYPE_NVLIST_ARRAY) == 0);
3317 3320
3318 3321 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3319 3322 config, newdevs, ndevs + oldndevs) == 0);
3320 3323 for (i = 0; i < oldndevs + ndevs; i++)
3321 3324 nvlist_free(newdevs[i]);
3322 3325 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
3323 3326 } else {
3324 3327 /*
3325 3328 * Generate a new dev list.
3326 3329 */
3327 3330 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
3328 3331 KM_SLEEP) == 0);
3329 3332 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
3330 3333 devs, ndevs) == 0);
3331 3334 }
3332 3335 }
3333 3336
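The merge in spa_set_aux_vdevs() above follows a simple pattern: duplicate the existing entries, append the new ones, then swap the combined array into the config. A minimal user-level sketch of the same shape, using plain C strings instead of nvlists (the helper name merge_dev_names is hypothetical):

#include <stdlib.h>
#include <string.h>

/*
 * Concatenate two arrays of device names into a freshly allocated array,
 * duplicating every entry.  Returns NULL on allocation failure.
 */
static char **
merge_dev_names(char **olddevs, size_t oldndevs, char **devs, size_t ndevs)
{
	char **newdevs = malloc((oldndevs + ndevs) * sizeof (char *));

	if (newdevs == NULL)
		return (NULL);
	for (size_t i = 0; i < oldndevs; i++)
		newdevs[i] = strdup(olddevs[i]);
	for (size_t i = 0; i < ndevs; i++)
		newdevs[oldndevs + i] = strdup(devs[i]);
	return (newdevs);
}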
3334 3337 /*
3335 3338 * Stop and drop level 2 ARC devices
3336 3339 */
3337 3340 void
3338 3341 spa_l2cache_drop(spa_t *spa)
3339 3342 {
3340 3343 vdev_t *vd;
3341 3344 int i;
3342 3345 spa_aux_vdev_t *sav = &spa->spa_l2cache;
3343 3346
3344 3347 for (i = 0; i < sav->sav_count; i++) {
3345 3348 uint64_t pool;
3346 3349
3347 3350 vd = sav->sav_vdevs[i];
3348 3351 ASSERT(vd != NULL);
3349 3352
3350 3353 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
3351 3354 pool != 0ULL && l2arc_vdev_present(vd))
3352 3355 l2arc_remove_vdev(vd);
3353 3356 }
3354 3357 }
3355 3358
3356 3359 /*
3357 3360 * Pool Creation
3358 3361 */
3359 3362 int
3360 3363 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
3361 3364 nvlist_t *zplprops)
3362 3365 {
3363 3366 spa_t *spa;
3364 3367 char *altroot = NULL;
3365 3368 vdev_t *rvd;
3366 3369 dsl_pool_t *dp;
3367 3370 dmu_tx_t *tx;
3368 3371 int error = 0;
3369 3372 uint64_t txg = TXG_INITIAL;
3370 3373 nvlist_t **spares, **l2cache;
3371 3374 uint_t nspares, nl2cache;
3372 3375 uint64_t version, obj;
3373 3376 boolean_t has_features;
3374 3377
3375 3378 /*
3376 3379 * If this pool already exists, return failure.
3377 3380 */
3378 3381 mutex_enter(&spa_namespace_lock);
3379 3382 if (spa_lookup(pool) != NULL) {
3380 3383 mutex_exit(&spa_namespace_lock);
3381 3384 return (SET_ERROR(EEXIST));
3382 3385 }
3383 3386
3384 3387 /*
3385 3388 * Allocate a new spa_t structure.
3386 3389 */
3387 3390 (void) nvlist_lookup_string(props,
3388 3391 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3389 3392 spa = spa_add(pool, NULL, altroot);
3390 3393 spa_activate(spa, spa_mode_global);
3391 3394
3392 3395 if (props && (error = spa_prop_validate(spa, props))) {
3393 3396 spa_deactivate(spa);
3394 3397 spa_remove(spa);
3395 3398 mutex_exit(&spa_namespace_lock);
3396 3399 return (error);
3397 3400 }
3398 3401
3399 3402 has_features = B_FALSE;
3400 3403 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
3401 3404 elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
3402 3405 if (zpool_prop_feature(nvpair_name(elem)))
3403 3406 has_features = B_TRUE;
3404 3407 }
3405 3408
3406 3409 if (has_features || nvlist_lookup_uint64(props,
3407 3410 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
3408 3411 version = SPA_VERSION;
3409 3412 }
3410 3413 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
3411 3414
3412 3415 spa->spa_first_txg = txg;
3413 3416 spa->spa_uberblock.ub_txg = txg - 1;
3414 3417 spa->spa_uberblock.ub_version = version;
3415 3418 spa->spa_ubsync = spa->spa_uberblock;
3416 3419
3417 3420 /*
3418 3421 * Create "The Godfather" zio to hold all async IOs
3419 3422 */
3420 3423 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
3421 3424 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
3422 3425
3423 3426 /*
3424 3427 * Create the root vdev.
3425 3428 */
3426 3429 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3427 3430
3428 3431 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
3429 3432
3430 3433 ASSERT(error != 0 || rvd != NULL);
3431 3434 ASSERT(error != 0 || spa->spa_root_vdev == rvd);
3432 3435
3433 3436 if (error == 0 && !zfs_allocatable_devs(nvroot))
3434 3437 error = SET_ERROR(EINVAL);
3435 3438
3436 3439 if (error == 0 &&
3437 3440 (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
3438 3441 (error = spa_validate_aux(spa, nvroot, txg,
3439 3442 VDEV_ALLOC_ADD)) == 0) {
3440 3443 for (int c = 0; c < rvd->vdev_children; c++) {
3441 3444 vdev_metaslab_set_size(rvd->vdev_child[c]);
3442 3445 vdev_expand(rvd->vdev_child[c], txg);
3443 3446 }
3444 3447 }
3445 3448
3446 3449 spa_config_exit(spa, SCL_ALL, FTAG);
3447 3450
3448 3451 if (error != 0) {
3449 3452 spa_unload(spa);
3450 3453 spa_deactivate(spa);
3451 3454 spa_remove(spa);
3452 3455 mutex_exit(&spa_namespace_lock);
3453 3456 return (error);
3454 3457 }
3455 3458
3456 3459 /*
3457 3460 * Get the list of spares, if specified.
3458 3461 */
3459 3462 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3460 3463 &spares, &nspares) == 0) {
3461 3464 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
3462 3465 KM_SLEEP) == 0);
3463 3466 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3464 3467 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3465 3468 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3466 3469 spa_load_spares(spa);
3467 3470 spa_config_exit(spa, SCL_ALL, FTAG);
3468 3471 spa->spa_spares.sav_sync = B_TRUE;
3469 3472 }
3470 3473
3471 3474 /*
3472 3475 * Get the list of level 2 cache devices, if specified.
3473 3476 */
3474 3477 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3475 3478 &l2cache, &nl2cache) == 0) {
3476 3479 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3477 3480 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3478 3481 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3479 3482 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3480 3483 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3481 3484 spa_load_l2cache(spa);
3482 3485 spa_config_exit(spa, SCL_ALL, FTAG);
3483 3486 spa->spa_l2cache.sav_sync = B_TRUE;
3484 3487 }
3485 3488
3486 3489 spa->spa_is_initializing = B_TRUE;
3487 3490 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3488 3491 spa->spa_meta_objset = dp->dp_meta_objset;
3489 3492 spa->spa_is_initializing = B_FALSE;
3490 3493
3491 3494 /*
3492 3495 * Create DDTs (dedup tables).
3493 3496 */
3494 3497 ddt_create(spa);
3495 3498
3496 3499 spa_update_dspace(spa);
3497 3500
3498 3501 tx = dmu_tx_create_assigned(dp, txg);
3499 3502
3500 3503 /*
3501 3504 * Create the pool config object.
3502 3505 */
3503 3506 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3504 3507 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3505 3508 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3506 3509
3507 3510 if (zap_add(spa->spa_meta_objset,
3508 3511 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3509 3512 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3510 3513 cmn_err(CE_PANIC, "failed to add pool config");
3511 3514 }
3512 3515
3513 3516 if (spa_version(spa) >= SPA_VERSION_FEATURES)
3514 3517 spa_feature_create_zap_objects(spa, tx);
3515 3518
3516 3519 if (zap_add(spa->spa_meta_objset,
3517 3520 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3518 3521 sizeof (uint64_t), 1, &version, tx) != 0) {
3519 3522 cmn_err(CE_PANIC, "failed to add pool version");
3520 3523 }
3521 3524
3522 3525 /* Newly created pools with the right version are always deflated. */
3523 3526 if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3524 3527 spa->spa_deflate = TRUE;
3525 3528 if (zap_add(spa->spa_meta_objset,
3526 3529 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3527 3530 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3528 3531 cmn_err(CE_PANIC, "failed to add deflate");
3529 3532 }
3530 3533 }
3531 3534
3532 3535 /*
3533 3536 * Create the deferred-free bpobj. Turn off compression
3534 3537 * because sync-to-convergence takes longer if the blocksize
3535 3538 * keeps changing.
3536 3539 */
3537 3540 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
3538 3541 dmu_object_set_compress(spa->spa_meta_objset, obj,
3539 3542 ZIO_COMPRESS_OFF, tx);
3540 3543 if (zap_add(spa->spa_meta_objset,
3541 3544 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
3542 3545 sizeof (uint64_t), 1, &obj, tx) != 0) {
3543 3546 cmn_err(CE_PANIC, "failed to add bpobj");
3544 3547 }
3545 3548 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
3546 3549 spa->spa_meta_objset, obj));
3547 3550
3548 3551 /*
3549 3552 * Create the pool's history object.
3550 3553 */
3551 3554 if (version >= SPA_VERSION_ZPOOL_HISTORY)
3552 3555 spa_history_create_obj(spa, tx);
3553 3556
3554 3557 /*
3555 3558 * Set pool properties.
3556 3559 */
3557 3560 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
3558 3561 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3559 3562 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
3560 3563 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
3561 3564
3562 3565 if (props != NULL) {
3563 3566 spa_configfile_set(spa, props, B_FALSE);
3564 3567 spa_sync_props(props, tx);
3565 3568 }
3566 3569
3567 3570 dmu_tx_commit(tx);
3568 3571
3569 3572 spa->spa_sync_on = B_TRUE;
3570 3573 txg_sync_start(spa->spa_dsl_pool);
3571 3574
3572 3575 /*
3573 3576 * We explicitly wait for the first transaction to complete so that our
3574 3577 * bean counters are appropriately updated.
3575 3578 */
3576 3579 txg_wait_synced(spa->spa_dsl_pool, txg);
3577 3580
3578 3581 spa_config_sync(spa, B_FALSE, B_TRUE);
3579 3582
3580 3583 spa_history_log_version(spa, "create");
3581 3584
3582 3585 spa->spa_minref = refcount_count(&spa->spa_refcount);
3583 3586
3584 3587 mutex_exit(&spa_namespace_lock);
3585 3588
3586 3589 return (0);
3587 3590 }
3588 3591
3589 3592 #ifdef _KERNEL
3590 3593 /*
3591 3594 * Get the root pool information from the root disk, then import the root pool
3592 3595 * during the system boot up time.
3593 3596 */
3594 3597 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
3595 3598
3596 3599 static nvlist_t *
3597 3600 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
3598 3601 {
3599 3602 nvlist_t *config;
3600 3603 nvlist_t *nvtop, *nvroot;
3601 3604 uint64_t pgid;
3602 3605
3603 3606 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
3604 3607 return (NULL);
3605 3608
3606 3609 /*
3607 3610 * Add this top-level vdev to the child array.
3608 3611 */
3609 3612 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3610 3613 &nvtop) == 0);
3611 3614 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3612 3615 &pgid) == 0);
3613 3616 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
3614 3617
3615 3618 /*
3616 3619 * Put this pool's top-level vdevs into a root vdev.
3617 3620 */
3618 3621 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3619 3622 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3620 3623 VDEV_TYPE_ROOT) == 0);
3621 3624 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3622 3625 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3623 3626 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3624 3627 &nvtop, 1) == 0);
3625 3628
3626 3629 /*
3627 3630 * Replace the existing vdev_tree with the new root vdev in
3628 3631 * this pool's configuration (remove the old, add the new).
3629 3632 */
3630 3633 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3631 3634 nvlist_free(nvroot);
3632 3635 return (config);
3633 3636 }
3634 3637
3635 3638 /*
3636 3639 * Walk the vdev tree and see if we can find a device with "better"
3637 3640 * configuration. A configuration is "better" if the label on that
3638 3641 * device has a more recent txg.
3639 3642 */
3640 3643 static void
3641 3644 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3642 3645 {
3643 3646 for (int c = 0; c < vd->vdev_children; c++)
3644 3647 spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3645 3648
3646 3649 if (vd->vdev_ops->vdev_op_leaf) {
3647 3650 nvlist_t *label;
3648 3651 uint64_t label_txg;
3649 3652
3650 3653 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
3651 3654 &label) != 0)
3652 3655 return;
3653 3656
3654 3657 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
3655 3658 &label_txg) == 0);
3656 3659
3657 3660 /*
3658 3661 * Do we have a better boot device?
3659 3662 */
3660 3663 if (label_txg > *txg) {
3661 3664 *txg = label_txg;
3662 3665 *avd = vd;
3663 3666 }
3664 3667 nvlist_free(label);
3665 3668 }
3666 3669 }
3667 3670
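spa_alt_rootvdev() above is a recursive walk that visits every leaf and keeps whichever one carries the label with the largest txg. A condensed sketch of that selection over a simplified tree (the node_t type here is hypothetical, not the real vdev_t):

#include <stdint.h>

typedef struct node {
	struct node **children;
	int nchildren;		/* 0 for a leaf */
	uint64_t label_txg;	/* txg read from this leaf's label */
} node_t;

/* Recurse through the tree; the leaf with the newest label wins. */
static void
best_leaf(node_t *n, node_t **best, uint64_t *best_txg)
{
	for (int c = 0; c < n->nchildren; c++)
		best_leaf(n->children[c], best, best_txg);

	if (n->nchildren == 0 && n->label_txg > *best_txg) {
		*best_txg = n->label_txg;
		*best = n;
	}
}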
3668 3671 /*
3669 3672 * Import a root pool.
3670 3673 *
3671 3674  * For x86, devpath_list will consist of the devid and/or physpath name of
3672 3675 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3673 3676 * The GRUB "findroot" command will return the vdev we should boot.
3674 3677 *
3675 3678  * For SPARC, devpath_list consists of the physpath name of the booting device,
3676 3679  * whether the root pool is a single-device pool or a mirrored pool.
3677 3680 * e.g.
3678 3681 * "/pci@1f,0/ide@d/disk@0,0:a"
3679 3682 */
3680 3683 int
3681 3684 spa_import_rootpool(char *devpath, char *devid)
3682 3685 {
3683 3686 spa_t *spa;
3684 3687 vdev_t *rvd, *bvd, *avd = NULL;
3685 3688 nvlist_t *config, *nvtop;
3686 3689 uint64_t guid, txg;
3687 3690 char *pname;
3688 3691 int error;
3689 3692
3690 3693 /*
3691 3694 * Read the label from the boot device and generate a configuration.
3692 3695 */
3693 3696 config = spa_generate_rootconf(devpath, devid, &guid);
3694 3697 #if defined(_OBP) && defined(_KERNEL)
3695 3698 if (config == NULL) {
3696 3699 if (strstr(devpath, "/iscsi/ssd") != NULL) {
3697 3700 /* iscsi boot */
3698 3701 get_iscsi_bootpath_phy(devpath);
3699 3702 config = spa_generate_rootconf(devpath, devid, &guid);
3700 3703 }
3701 3704 }
3702 3705 #endif
3703 3706 if (config == NULL) {
3704 3707 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
3705 3708 devpath);
3706 3709 return (SET_ERROR(EIO));
3707 3710 }
3708 3711
3709 3712 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3710 3713 &pname) == 0);
3711 3714 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3712 3715
3713 3716 mutex_enter(&spa_namespace_lock);
3714 3717 if ((spa = spa_lookup(pname)) != NULL) {
3715 3718 /*
3716 3719 * Remove the existing root pool from the namespace so that we
3717 3720 * can replace it with the correct config we just read in.
3718 3721 */
3719 3722 spa_remove(spa);
3720 3723 }
3721 3724
3722 3725 spa = spa_add(pname, config, NULL);
3723 3726 spa->spa_is_root = B_TRUE;
3724 3727 spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3725 3728
3726 3729 /*
3727 3730 * Build up a vdev tree based on the boot device's label config.
3728 3731 */
3729 3732 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3730 3733 &nvtop) == 0);
3731 3734 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3732 3735 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3733 3736 VDEV_ALLOC_ROOTPOOL);
3734 3737 spa_config_exit(spa, SCL_ALL, FTAG);
3735 3738 if (error) {
3736 3739 mutex_exit(&spa_namespace_lock);
3737 3740 nvlist_free(config);
3738 3741 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
3739 3742 pname);
3740 3743 return (error);
3741 3744 }
3742 3745
3743 3746 /*
3744 3747 * Get the boot vdev.
3745 3748 */
3746 3749 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3747 3750 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
3748 3751 (u_longlong_t)guid);
3749 3752 error = SET_ERROR(ENOENT);
3750 3753 goto out;
3751 3754 }
3752 3755
3753 3756 /*
3754 3757 * Determine if there is a better boot device.
3755 3758 */
3756 3759 avd = bvd;
3757 3760 spa_alt_rootvdev(rvd, &avd, &txg);
3758 3761 if (avd != bvd) {
3759 3762 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
3760 3763 "try booting from '%s'", avd->vdev_path);
3761 3764 error = SET_ERROR(EINVAL);
3762 3765 goto out;
3763 3766 }
3764 3767
3765 3768 /*
3766 3769 * If the boot device is part of a spare vdev then ensure that
3767 3770 * we're booting off the active spare.
3768 3771 */
3769 3772 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3770 3773 !bvd->vdev_isspare) {
3771 3774 cmn_err(CE_NOTE, "The boot device is currently spared. Please "
3772 3775 "try booting from '%s'",
3773 3776 bvd->vdev_parent->
3774 3777 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
3775 3778 error = SET_ERROR(EINVAL);
3776 3779 goto out;
3777 3780 }
3778 3781
3779 3782 error = 0;
3780 3783 out:
3781 3784 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3782 3785 vdev_free(rvd);
3783 3786 spa_config_exit(spa, SCL_ALL, FTAG);
3784 3787 mutex_exit(&spa_namespace_lock);
3785 3788
3786 3789 nvlist_free(config);
3787 3790 return (error);
3788 3791 }
3789 3792
3790 3793 #endif
3791 3794
3792 3795 /*
3793 3796 * Import a non-root pool into the system.
3794 3797 */
3795 3798 int
3796 3799 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
3797 3800 {
3798 3801 spa_t *spa;
3799 3802 char *altroot = NULL;
3800 3803 spa_load_state_t state = SPA_LOAD_IMPORT;
3801 3804 zpool_rewind_policy_t policy;
3802 3805 uint64_t mode = spa_mode_global;
3803 3806 uint64_t readonly = B_FALSE;
3804 3807 int error;
3805 3808 nvlist_t *nvroot;
3806 3809 nvlist_t **spares, **l2cache;
3807 3810 uint_t nspares, nl2cache;
3808 3811
3809 3812 /*
3810 3813 * If a pool with this name exists, return failure.
3811 3814 */
3812 3815 mutex_enter(&spa_namespace_lock);
3813 3816 if (spa_lookup(pool) != NULL) {
3814 3817 mutex_exit(&spa_namespace_lock);
3815 3818 return (SET_ERROR(EEXIST));
3816 3819 }
3817 3820
3818 3821 /*
3819 3822 * Create and initialize the spa structure.
3820 3823 */
3821 3824 (void) nvlist_lookup_string(props,
3822 3825 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3823 3826 (void) nvlist_lookup_uint64(props,
3824 3827 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
3825 3828 if (readonly)
3826 3829 mode = FREAD;
3827 3830 spa = spa_add(pool, config, altroot);
3828 3831 spa->spa_import_flags = flags;
3829 3832
3830 3833 /*
3831 3834 * Verbatim import - Take a pool and insert it into the namespace
3832 3835 * as if it had been loaded at boot.
3833 3836 */
3834 3837 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
3835 3838 if (props != NULL)
3836 3839 spa_configfile_set(spa, props, B_FALSE);
3837 3840
3838 3841 spa_config_sync(spa, B_FALSE, B_TRUE);
3839 3842
3840 3843 mutex_exit(&spa_namespace_lock);
3841 3844 spa_history_log_version(spa, "import");
3842 3845
3843 3846 return (0);
3844 3847 }
3845 3848
3846 3849 spa_activate(spa, mode);
3847 3850
3848 3851 /*
3849 3852 * Don't start async tasks until we know everything is healthy.
3850 3853 */
3851 3854 spa_async_suspend(spa);
3852 3855
3853 3856 zpool_get_rewind_policy(config, &policy);
3854 3857 if (policy.zrp_request & ZPOOL_DO_REWIND)
3855 3858 state = SPA_LOAD_RECOVER;
3856 3859
3857 3860 /*
3858 3861 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
3859 3862 * because the user-supplied config is actually the one to trust when
3860 3863 * doing an import.
3861 3864 */
3862 3865 if (state != SPA_LOAD_RECOVER)
3863 3866 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
3864 3867
3865 3868 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
3866 3869 policy.zrp_request);
3867 3870
3868 3871 /*
3869 3872 * Propagate anything learned while loading the pool and pass it
3870 3873 * back to caller (i.e. rewind info, missing devices, etc).
3871 3874 */
3872 3875 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
3873 3876 spa->spa_load_info) == 0);
3874 3877
3875 3878 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3876 3879 /*
3877 3880 * Toss any existing sparelist, as it doesn't have any validity
3878 3881 * anymore, and conflicts with spa_has_spare().
3879 3882 */
3880 3883 if (spa->spa_spares.sav_config) {
3881 3884 nvlist_free(spa->spa_spares.sav_config);
3882 3885 spa->spa_spares.sav_config = NULL;
3883 3886 spa_load_spares(spa);
3884 3887 }
3885 3888 if (spa->spa_l2cache.sav_config) {
3886 3889 nvlist_free(spa->spa_l2cache.sav_config);
3887 3890 spa->spa_l2cache.sav_config = NULL;
3888 3891 spa_load_l2cache(spa);
3889 3892 }
3890 3893
3891 3894 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3892 3895 &nvroot) == 0);
3893 3896 if (error == 0)
3894 3897 error = spa_validate_aux(spa, nvroot, -1ULL,
3895 3898 VDEV_ALLOC_SPARE);
3896 3899 if (error == 0)
3897 3900 error = spa_validate_aux(spa, nvroot, -1ULL,
3898 3901 VDEV_ALLOC_L2CACHE);
3899 3902 spa_config_exit(spa, SCL_ALL, FTAG);
3900 3903
3901 3904 if (props != NULL)
3902 3905 spa_configfile_set(spa, props, B_FALSE);
3903 3906
3904 3907 if (error != 0 || (props && spa_writeable(spa) &&
3905 3908 (error = spa_prop_set(spa, props)))) {
3906 3909 spa_unload(spa);
3907 3910 spa_deactivate(spa);
3908 3911 spa_remove(spa);
3909 3912 mutex_exit(&spa_namespace_lock);
3910 3913 return (error);
3911 3914 }
3912 3915
3913 3916 spa_async_resume(spa);
3914 3917
3915 3918 /*
3916 3919 * Override any spares and level 2 cache devices as specified by
3917 3920 * the user, as these may have correct device names/devids, etc.
3918 3921 */
3919 3922 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3920 3923 &spares, &nspares) == 0) {
3921 3924 if (spa->spa_spares.sav_config)
3922 3925 VERIFY(nvlist_remove(spa->spa_spares.sav_config,
3923 3926 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
3924 3927 else
3925 3928 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
3926 3929 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3927 3930 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3928 3931 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3929 3932 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3930 3933 spa_load_spares(spa);
3931 3934 spa_config_exit(spa, SCL_ALL, FTAG);
3932 3935 spa->spa_spares.sav_sync = B_TRUE;
3933 3936 }
3934 3937 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3935 3938 &l2cache, &nl2cache) == 0) {
3936 3939 if (spa->spa_l2cache.sav_config)
3937 3940 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
3938 3941 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
3939 3942 else
3940 3943 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3941 3944 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3942 3945 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3943 3946 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3944 3947 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3945 3948 spa_load_l2cache(spa);
3946 3949 spa_config_exit(spa, SCL_ALL, FTAG);
3947 3950 spa->spa_l2cache.sav_sync = B_TRUE;
3948 3951 }
3949 3952
3950 3953 /*
3951 3954 * Check for any removed devices.
3952 3955 */
3953 3956 if (spa->spa_autoreplace) {
3954 3957 spa_aux_check_removed(&spa->spa_spares);
3955 3958 spa_aux_check_removed(&spa->spa_l2cache);
3956 3959 }
3957 3960
3958 3961 if (spa_writeable(spa)) {
3959 3962 /*
3960 3963 * Update the config cache to include the newly-imported pool.
3961 3964 */
3962 3965 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3963 3966 }
3964 3967
3965 3968 /*
3966 3969 * It's possible that the pool was expanded while it was exported.
3967 3970 * We kick off an async task to handle this for us.
3968 3971 */
3969 3972 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
3970 3973
3971 3974 mutex_exit(&spa_namespace_lock);
3972 3975 spa_history_log_version(spa, "import");
3973 3976
3974 3977 return (0);
3975 3978 }
3976 3979
3977 3980 nvlist_t *
3978 3981 spa_tryimport(nvlist_t *tryconfig)
3979 3982 {
3980 3983 nvlist_t *config = NULL;
3981 3984 char *poolname;
3982 3985 spa_t *spa;
3983 3986 uint64_t state;
3984 3987 int error;
3985 3988
3986 3989 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
3987 3990 return (NULL);
3988 3991
3989 3992 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
3990 3993 return (NULL);
3991 3994
3992 3995 /*
3993 3996 * Create and initialize the spa structure.
3994 3997 */
3995 3998 mutex_enter(&spa_namespace_lock);
3996 3999 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
3997 4000 spa_activate(spa, FREAD);
3998 4001
3999 4002 /*
4000 4003 * Pass off the heavy lifting to spa_load().
4001 4004 * Pass TRUE for mosconfig because the user-supplied config
4002 4005 * is actually the one to trust when doing an import.
4003 4006 */
4004 4007 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
4005 4008
4006 4009 /*
4007 4010 * If 'tryconfig' was at least parsable, return the current config.
4008 4011 */
4009 4012 if (spa->spa_root_vdev != NULL) {
4010 4013 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4011 4014 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
4012 4015 poolname) == 0);
4013 4016 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4014 4017 state) == 0);
4015 4018 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
4016 4019 spa->spa_uberblock.ub_timestamp) == 0);
4017 4020 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4018 4021 spa->spa_load_info) == 0);
4019 4022
4020 4023 /*
4021 4024 * If the bootfs property exists on this pool then we
4022 4025 * copy it out so that external consumers can tell which
4023 4026 * pools are bootable.
4024 4027 */
4025 4028 if ((!error || error == EEXIST) && spa->spa_bootfs) {
4026 4029 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4027 4030
4028 4031 /*
4029 4032 * We have to play games with the name since the
4030 4033 * pool was opened as TRYIMPORT_NAME.
4031 4034 */
4032 4035 if (dsl_dsobj_to_dsname(spa_name(spa),
4033 4036 spa->spa_bootfs, tmpname) == 0) {
4034 4037 char *cp;
4035 4038 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4036 4039
4037 4040 cp = strchr(tmpname, '/');
4038 4041 if (cp == NULL) {
4039 4042 (void) strlcpy(dsname, tmpname,
4040 4043 MAXPATHLEN);
4041 4044 } else {
4042 4045 (void) snprintf(dsname, MAXPATHLEN,
4043 4046 "%s/%s", poolname, ++cp);
4044 4047 }
4045 4048 VERIFY(nvlist_add_string(config,
4046 4049 ZPOOL_CONFIG_BOOTFS, dsname) == 0);
4047 4050 kmem_free(dsname, MAXPATHLEN);
4048 4051 }
4049 4052 kmem_free(tmpname, MAXPATHLEN);
4050 4053 }
4051 4054
4052 4055 /*
4053 4056 * Add the list of hot spares and level 2 cache devices.
4054 4057 */
4055 4058 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4056 4059 spa_add_spares(spa, config);
4057 4060 spa_add_l2cache(spa, config);
4058 4061 spa_config_exit(spa, SCL_CONFIG, FTAG);
4059 4062 }
4060 4063
4061 4064 spa_unload(spa);
4062 4065 spa_deactivate(spa);
4063 4066 spa_remove(spa);
4064 4067 mutex_exit(&spa_namespace_lock);
4065 4068
4066 4069 return (config);
4067 4070 }
4068 4071
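Because spa_tryimport() opens the pool under a temporary name, the bootfs dataset name it reports has to be re-rooted onto the caller's pool name; only the part after the first '/' is kept. A standalone sketch of that rewrite (rewrite_bootfs is a hypothetical helper, using snprintf in place of the kernel string routines):

#include <stdio.h>
#include <string.h>

/*
 * Rewrite "tmpname" (which begins with the temporary import name) so that
 * its pool component is replaced by "poolname"; a bare pool name with no
 * '/' is copied through unchanged.
 */
static void
rewrite_bootfs(const char *tmpname, const char *poolname,
    char *dsname, size_t dslen)
{
	const char *cp = strchr(tmpname, '/');

	if (cp == NULL)
		(void) snprintf(dsname, dslen, "%s", tmpname);
	else
		(void) snprintf(dsname, dslen, "%s/%s", poolname, cp + 1);
}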
4069 4072 /*
4070 4073 * Pool export/destroy
4071 4074 *
4072 4075 * The act of destroying or exporting a pool is very simple. We make sure there
4073 4076 * is no more pending I/O and any references to the pool are gone. Then, we
4074 4077 * update the pool state and sync all the labels to disk, removing the
4075 4078 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
4076 4079 * we don't sync the labels or remove the configuration cache.
4077 4080 */
4078 4081 static int
4079 4082 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
4080 4083 boolean_t force, boolean_t hardforce)
4081 4084 {
4082 4085 spa_t *spa;
4083 4086
4084 4087 if (oldconfig)
4085 4088 *oldconfig = NULL;
4086 4089
4087 4090 if (!(spa_mode_global & FWRITE))
4088 4091 return (SET_ERROR(EROFS));
4089 4092
4090 4093 mutex_enter(&spa_namespace_lock);
4091 4094 if ((spa = spa_lookup(pool)) == NULL) {
4092 4095 mutex_exit(&spa_namespace_lock);
4093 4096 return (SET_ERROR(ENOENT));
4094 4097 }
4095 4098
4096 4099 /*
4097 4100 * Put a hold on the pool, drop the namespace lock, stop async tasks,
4098 4101 * reacquire the namespace lock, and see if we can export.
4099 4102 */
4100 4103 spa_open_ref(spa, FTAG);
4101 4104 mutex_exit(&spa_namespace_lock);
4102 4105 spa_async_suspend(spa);
4103 4106 mutex_enter(&spa_namespace_lock);
4104 4107 spa_close(spa, FTAG);
4105 4108
4106 4109 /*
4107 4110 * The pool will be in core if it's openable,
4108 4111 * in which case we can modify its state.
4109 4112 */
4110 4113 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
4111 4114 /*
4112 4115 * Objsets may be open only because they're dirty, so we
4113 4116 * have to force it to sync before checking spa_refcnt.
4114 4117 */
4115 4118 txg_wait_synced(spa->spa_dsl_pool, 0);
4116 4119
4117 4120 /*
4118 4121 * A pool cannot be exported or destroyed if there are active
4119 4122 * references. If we are resetting a pool, allow references by
4120 4123 * fault injection handlers.
4121 4124 */
4122 4125 if (!spa_refcount_zero(spa) ||
4123 4126 (spa->spa_inject_ref != 0 &&
4124 4127 new_state != POOL_STATE_UNINITIALIZED)) {
4125 4128 spa_async_resume(spa);
4126 4129 mutex_exit(&spa_namespace_lock);
4127 4130 return (SET_ERROR(EBUSY));
4128 4131 }
4129 4132
4130 4133 /*
4131 4134 * A pool cannot be exported if it has an active shared spare.
4132 4135		 * This is to prevent other pools from stealing the active spare
4133 4136		 * from an exported pool. At the user's own discretion, such a
4134 4137		 * pool can still be forcibly exported.
4135 4138 */
4136 4139 if (!force && new_state == POOL_STATE_EXPORTED &&
4137 4140 spa_has_active_shared_spare(spa)) {
4138 4141 spa_async_resume(spa);
4139 4142 mutex_exit(&spa_namespace_lock);
4140 4143 return (SET_ERROR(EXDEV));
4141 4144 }
4142 4145
4143 4146 /*
4144 4147 * We want this to be reflected on every label,
4145 4148 * so mark them all dirty. spa_unload() will do the
4146 4149 * final sync that pushes these changes out.
4147 4150 */
4148 4151 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
4149 4152 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4150 4153 spa->spa_state = new_state;
4151 4154 spa->spa_final_txg = spa_last_synced_txg(spa) +
4152 4155 TXG_DEFER_SIZE + 1;
4153 4156 vdev_config_dirty(spa->spa_root_vdev);
4154 4157 spa_config_exit(spa, SCL_ALL, FTAG);
4155 4158 }
4156 4159 }
4157 4160
4158 4161 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
4159 4162
4160 4163 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4161 4164 spa_unload(spa);
4162 4165 spa_deactivate(spa);
4163 4166 }
4164 4167
4165 4168 if (oldconfig && spa->spa_config)
4166 4169 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
4167 4170
4168 4171 if (new_state != POOL_STATE_UNINITIALIZED) {
4169 4172 if (!hardforce)
4170 4173 spa_config_sync(spa, B_TRUE, B_TRUE);
4171 4174 spa_remove(spa);
4172 4175 }
4173 4176 mutex_exit(&spa_namespace_lock);
4174 4177
4175 4178 return (0);
4176 4179 }
4177 4180
4178 4181 /*
4179 4182 * Destroy a storage pool.
4180 4183 */
4181 4184 int
4182 4185 spa_destroy(char *pool)
4183 4186 {
4184 4187 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
4185 4188 B_FALSE, B_FALSE));
4186 4189 }
4187 4190
4188 4191 /*
4189 4192 * Export a storage pool.
4190 4193 */
4191 4194 int
4192 4195 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
4193 4196 boolean_t hardforce)
4194 4197 {
4195 4198 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
4196 4199 force, hardforce));
4197 4200 }
4198 4201
4199 4202 /*
4200 4203 * Similar to spa_export(), this unloads the spa_t without actually removing it
4201 4204 * from the namespace in any way.
4202 4205 */
4203 4206 int
4204 4207 spa_reset(char *pool)
4205 4208 {
4206 4209 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
4207 4210 B_FALSE, B_FALSE));
4208 4211 }
4209 4212
4210 4213 /*
4211 4214 * ==========================================================================
4212 4215 * Device manipulation
4213 4216 * ==========================================================================
4214 4217 */
4215 4218
4216 4219 /*
4217 4220 * Add a device to a storage pool.
4218 4221 */
4219 4222 int
4220 4223 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
4221 4224 {
4222 4225 uint64_t txg, id;
4223 4226 int error;
4224 4227 vdev_t *rvd = spa->spa_root_vdev;
4225 4228 vdev_t *vd, *tvd;
4226 4229 nvlist_t **spares, **l2cache;
4227 4230 uint_t nspares, nl2cache;
4228 4231
4229 4232 ASSERT(spa_writeable(spa));
4230 4233
4231 4234 txg = spa_vdev_enter(spa);
4232 4235
4233 4236 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
4234 4237 VDEV_ALLOC_ADD)) != 0)
4235 4238 return (spa_vdev_exit(spa, NULL, txg, error));
4236 4239
4237 4240 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
4238 4241
4239 4242 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
4240 4243 &nspares) != 0)
4241 4244 nspares = 0;
4242 4245
4243 4246 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
4244 4247 &nl2cache) != 0)
4245 4248 nl2cache = 0;
4246 4249
4247 4250 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
4248 4251 return (spa_vdev_exit(spa, vd, txg, EINVAL));
4249 4252
4250 4253 if (vd->vdev_children != 0 &&
4251 4254 (error = vdev_create(vd, txg, B_FALSE)) != 0)
4252 4255 return (spa_vdev_exit(spa, vd, txg, error));
4253 4256
4254 4257 /*
4255 4258 * We must validate the spares and l2cache devices after checking the
4256 4259 * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
4257 4260 */
4258 4261 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
4259 4262 return (spa_vdev_exit(spa, vd, txg, error));
4260 4263
4261 4264 /*
4262 4265 * Transfer each new top-level vdev from vd to rvd.
4263 4266 */
4264 4267 for (int c = 0; c < vd->vdev_children; c++) {
4265 4268
4266 4269 /*
4267 4270 * Set the vdev id to the first hole, if one exists.
4268 4271 */
4269 4272 for (id = 0; id < rvd->vdev_children; id++) {
4270 4273 if (rvd->vdev_child[id]->vdev_ishole) {
4271 4274 vdev_free(rvd->vdev_child[id]);
4272 4275 break;
4273 4276 }
4274 4277 }
4275 4278 tvd = vd->vdev_child[c];
4276 4279 vdev_remove_child(vd, tvd);
4277 4280 tvd->vdev_id = id;
4278 4281 vdev_add_child(rvd, tvd);
4279 4282 vdev_config_dirty(tvd);
4280 4283 }
4281 4284
4282 4285 if (nspares != 0) {
4283 4286 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
4284 4287 ZPOOL_CONFIG_SPARES);
4285 4288 spa_load_spares(spa);
4286 4289 spa->spa_spares.sav_sync = B_TRUE;
4287 4290 }
4288 4291
4289 4292 if (nl2cache != 0) {
4290 4293 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
4291 4294 ZPOOL_CONFIG_L2CACHE);
4292 4295 spa_load_l2cache(spa);
4293 4296 spa->spa_l2cache.sav_sync = B_TRUE;
4294 4297 }
4295 4298
4296 4299 /*
4297 4300 * We have to be careful when adding new vdevs to an existing pool.
4298 4301 * If other threads start allocating from these vdevs before we
4299 4302 * sync the config cache, and we lose power, then upon reboot we may
4300 4303 * fail to open the pool because there are DVAs that the config cache
4301 4304 * can't translate. Therefore, we first add the vdevs without
4302 4305 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
4303 4306 * and then let spa_config_update() initialize the new metaslabs.
4304 4307 *
4305 4308 * spa_load() checks for added-but-not-initialized vdevs, so that
4306 4309 * if we lose power at any point in this sequence, the remaining
4307 4310 * steps will be completed the next time we load the pool.
4308 4311 */
4309 4312 (void) spa_vdev_exit(spa, vd, txg, 0);
4310 4313
4311 4314 mutex_enter(&spa_namespace_lock);
4312 4315 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4313 4316 mutex_exit(&spa_namespace_lock);
4314 4317
4315 4318 return (0);
4316 4319 }
4317 4320
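When spa_vdev_add() grafts each new top-level vdev into the root, it reuses the slot of the first "hole" child if one exists and otherwise appends at the end (the id equals the current child count). A minimal sketch of that id selection (the is_hole array stands in for the vdev_ishole flags and is hypothetical):

#include <stddef.h>

/*
 * Return the index of the first hole slot, or nchildren if there is no
 * hole, which places the new child at the end.
 */
static size_t
pick_child_id(const int *is_hole, size_t nchildren)
{
	size_t id;

	for (id = 0; id < nchildren; id++) {
		if (is_hole[id])
			break;
	}
	return (id);
}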
4318 4321 /*
4319 4322 * Attach a device to a mirror. The arguments are the path to any device
4320 4323 * in the mirror, and the nvroot for the new device. If the path specifies
4321 4324 * a device that is not mirrored, we automatically insert the mirror vdev.
4322 4325 *
4323 4326 * If 'replacing' is specified, the new device is intended to replace the
4324 4327 * existing device; in this case the two devices are made into their own
4325 4328 * mirror using the 'replacing' vdev, which is functionally identical to
4326 4329 * the mirror vdev (it actually reuses all the same ops) but has a few
4327 4330 * extra rules: you can't attach to it after it's been created, and upon
4328 4331 * completion of resilvering, the first disk (the one being replaced)
4329 4332 * is automatically detached.
4330 4333 */
4331 4334 int
4332 4335 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
4333 4336 {
4334 4337 uint64_t txg, dtl_max_txg;
4335 4338 vdev_t *rvd = spa->spa_root_vdev;
4336 4339 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
4337 4340 vdev_ops_t *pvops;
4338 4341 char *oldvdpath, *newvdpath;
4339 4342 int newvd_isspare;
4340 4343 int error;
4341 4344
4342 4345 ASSERT(spa_writeable(spa));
4343 4346
4344 4347 txg = spa_vdev_enter(spa);
4345 4348
4346 4349 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
4347 4350
4348 4351 if (oldvd == NULL)
4349 4352 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4350 4353
4351 4354 if (!oldvd->vdev_ops->vdev_op_leaf)
4352 4355 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4353 4356
4354 4357 pvd = oldvd->vdev_parent;
4355 4358
4356 4359 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
4357 4360 VDEV_ALLOC_ATTACH)) != 0)
4358 4361 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4359 4362
4360 4363 if (newrootvd->vdev_children != 1)
4361 4364 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4362 4365
4363 4366 newvd = newrootvd->vdev_child[0];
4364 4367
4365 4368 if (!newvd->vdev_ops->vdev_op_leaf)
4366 4369 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4367 4370
4368 4371 if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
4369 4372 return (spa_vdev_exit(spa, newrootvd, txg, error));
4370 4373
4371 4374 /*
4372 4375 * Spares can't replace logs
4373 4376 */
4374 4377 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
4375 4378 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4376 4379
4377 4380 if (!replacing) {
4378 4381 /*
4379 4382 * For attach, the only allowable parent is a mirror or the root
4380 4383 * vdev.
4381 4384 */
4382 4385 if (pvd->vdev_ops != &vdev_mirror_ops &&
4383 4386 pvd->vdev_ops != &vdev_root_ops)
4384 4387 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4385 4388
4386 4389 pvops = &vdev_mirror_ops;
4387 4390 } else {
4388 4391 /*
4389 4392 * Active hot spares can only be replaced by inactive hot
4390 4393 * spares.
4391 4394 */
4392 4395 if (pvd->vdev_ops == &vdev_spare_ops &&
4393 4396 oldvd->vdev_isspare &&
4394 4397 !spa_has_spare(spa, newvd->vdev_guid))
4395 4398 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4396 4399
4397 4400 /*
4398 4401 * If the source is a hot spare, and the parent isn't already a
4399 4402 * spare, then we want to create a new hot spare. Otherwise, we
4400 4403 * want to create a replacing vdev. The user is not allowed to
4401 4404 * attach to a spared vdev child unless the 'isspare' state is
4402 4405 * the same (spare replaces spare, non-spare replaces
4403 4406 * non-spare).
4404 4407 */
4405 4408 if (pvd->vdev_ops == &vdev_replacing_ops &&
4406 4409 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
4407 4410 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4408 4411 } else if (pvd->vdev_ops == &vdev_spare_ops &&
4409 4412 newvd->vdev_isspare != oldvd->vdev_isspare) {
4410 4413 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4411 4414 }
4412 4415
4413 4416 if (newvd->vdev_isspare)
4414 4417 pvops = &vdev_spare_ops;
4415 4418 else
4416 4419 pvops = &vdev_replacing_ops;
4417 4420 }
4418 4421
4419 4422 /*
4420 4423 * Make sure the new device is big enough.
4421 4424 */
4422 4425 if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
4423 4426 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
4424 4427
4425 4428 /*
4426 4429 * The new device cannot have a higher alignment requirement
4427 4430 * than the top-level vdev.
4428 4431 */
4429 4432 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
4430 4433 return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
4431 4434
4432 4435 /*
4433 4436 * If this is an in-place replacement, update oldvd's path and devid
4434 4437 * to make it distinguishable from newvd, and unopenable from now on.
4435 4438 */
4436 4439 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
4437 4440 spa_strfree(oldvd->vdev_path);
4438 4441 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
4439 4442 KM_SLEEP);
4440 4443 (void) sprintf(oldvd->vdev_path, "%s/%s",
4441 4444 newvd->vdev_path, "old");
4442 4445 if (oldvd->vdev_devid != NULL) {
4443 4446 spa_strfree(oldvd->vdev_devid);
4444 4447 oldvd->vdev_devid = NULL;
4445 4448 }
4446 4449 }
4447 4450
4448 4451 /* mark the device being resilvered */
4449 4452 newvd->vdev_resilver_txg = txg;
4450 4453
4451 4454 /*
4452 4455 * If the parent is not a mirror, or if we're replacing, insert the new
4453 4456 * mirror/replacing/spare vdev above oldvd.
4454 4457 */
4455 4458 if (pvd->vdev_ops != pvops)
4456 4459 pvd = vdev_add_parent(oldvd, pvops);
4457 4460
4458 4461 ASSERT(pvd->vdev_top->vdev_parent == rvd);
4459 4462 ASSERT(pvd->vdev_ops == pvops);
4460 4463 ASSERT(oldvd->vdev_parent == pvd);
4461 4464
4462 4465 /*
4463 4466 * Extract the new device from its root and add it to pvd.
4464 4467 */
4465 4468 vdev_remove_child(newrootvd, newvd);
4466 4469 newvd->vdev_id = pvd->vdev_children;
4467 4470 newvd->vdev_crtxg = oldvd->vdev_crtxg;
4468 4471 vdev_add_child(pvd, newvd);
4469 4472
4470 4473 tvd = newvd->vdev_top;
4471 4474 ASSERT(pvd->vdev_top == tvd);
4472 4475 ASSERT(tvd->vdev_parent == rvd);
4473 4476
4474 4477 vdev_config_dirty(tvd);
4475 4478
4476 4479 /*
4477 4480 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
4478 4481 * for any dmu_sync-ed blocks. It will propagate upward when
4479 4482 * spa_vdev_exit() calls vdev_dtl_reassess().
4480 4483 */
4481 4484 dtl_max_txg = txg + TXG_CONCURRENT_STATES;
4482 4485
4483 4486 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
4484 4487 dtl_max_txg - TXG_INITIAL);
4485 4488
4486 4489 if (newvd->vdev_isspare) {
4487 4490 spa_spare_activate(newvd);
4488 4491 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
4489 4492 }
4490 4493
4491 4494 oldvdpath = spa_strdup(oldvd->vdev_path);
4492 4495 newvdpath = spa_strdup(newvd->vdev_path);
4493 4496 newvd_isspare = newvd->vdev_isspare;
4494 4497
4495 4498 /*
4496 4499 * Mark newvd's DTL dirty in this txg.
4497 4500 */
4498 4501 vdev_dirty(tvd, VDD_DTL, newvd, txg);
4499 4502
4500 4503 /*
4501 4504 * Restart the resilver
4502 4505 */
4503 4506 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
4504 4507
4505 4508 /*
4506 4509 * Commit the config
4507 4510 */
4508 4511 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
4509 4512
4510 4513 spa_history_log_internal(spa, "vdev attach", NULL,
4511 4514 "%s vdev=%s %s vdev=%s",
4512 4515 replacing && newvd_isspare ? "spare in" :
4513 4516 replacing ? "replace" : "attach", newvdpath,
4514 4517 replacing ? "for" : "to", oldvdpath);
4515 4518
4516 4519 spa_strfree(oldvdpath);
4517 4520 spa_strfree(newvdpath);
4518 4521
4519 4522 if (spa->spa_bootfs)
4520 4523 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
4521 4524
4522 4525 return (0);
4523 4526 }
4524 4527
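For an in-place replacement, spa_vdev_attach() above renames the outgoing device's path to "<new path>/old" so it stays distinguishable; spa_vdev_detach() checks for that "/old" suffix and undoes the rename when the replacement is detached. A small user-level sketch of the renaming step (make_old_path is a hypothetical helper; the +5 covers "/old" plus the terminating NUL, as in the code above):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Build "<path>/old" in a freshly allocated buffer; NULL on failure. */
static char *
make_old_path(const char *newpath)
{
	size_t len = strlen(newpath) + 5;	/* "/old" + NUL */
	char *oldpath = malloc(len);

	if (oldpath != NULL)
		(void) snprintf(oldpath, len, "%s/%s", newpath, "old");
	return (oldpath);
}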
4525 4528 /*
4526 4529 * Detach a device from a mirror or replacing vdev.
4527 4530 *
4528 4531 * If 'replace_done' is specified, only detach if the parent
4529 4532 * is a replacing vdev.
4530 4533 */
4531 4534 int
4532 4535 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
4533 4536 {
4534 4537 uint64_t txg;
4535 4538 int error;
4536 4539 vdev_t *rvd = spa->spa_root_vdev;
4537 4540 vdev_t *vd, *pvd, *cvd, *tvd;
4538 4541 boolean_t unspare = B_FALSE;
4539 4542 uint64_t unspare_guid = 0;
4540 4543 char *vdpath;
4541 4544
4542 4545 ASSERT(spa_writeable(spa));
4543 4546
4544 4547 txg = spa_vdev_enter(spa);
4545 4548
4546 4549 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4547 4550
4548 4551 if (vd == NULL)
4549 4552 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4550 4553
4551 4554 if (!vd->vdev_ops->vdev_op_leaf)
4552 4555 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4553 4556
4554 4557 pvd = vd->vdev_parent;
4555 4558
4556 4559 /*
4557 4560 * If the parent/child relationship is not as expected, don't do it.
4558 4561 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
4559 4562 * vdev that's replacing B with C. The user's intent in replacing
4560 4563 * is to go from M(A,B) to M(A,C). If the user decides to cancel
4561 4564 * the replace by detaching C, the expected behavior is to end up
4562 4565 * M(A,B). But suppose that right after deciding to detach C,
4563 4566 * the replacement of B completes. We would have M(A,C), and then
4564 4567 * ask to detach C, which would leave us with just A -- not what
4565 4568 * the user wanted. To prevent this, we make sure that the
4566 4569 * parent/child relationship hasn't changed -- in this example,
4567 4570 * that C's parent is still the replacing vdev R.
4568 4571 */
4569 4572 if (pvd->vdev_guid != pguid && pguid != 0)
4570 4573 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4571 4574
4572 4575 /*
4573 4576 * Only 'replacing' or 'spare' vdevs can be replaced.
4574 4577 */
4575 4578 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
4576 4579 pvd->vdev_ops != &vdev_spare_ops)
4577 4580 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4578 4581
4579 4582 ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
4580 4583 spa_version(spa) >= SPA_VERSION_SPARES);
4581 4584
4582 4585 /*
4583 4586 * Only mirror, replacing, and spare vdevs support detach.
4584 4587 */
4585 4588 if (pvd->vdev_ops != &vdev_replacing_ops &&
4586 4589 pvd->vdev_ops != &vdev_mirror_ops &&
4587 4590 pvd->vdev_ops != &vdev_spare_ops)
4588 4591 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4589 4592
4590 4593 /*
4591 4594 * If this device has the only valid copy of some data,
4592 4595 * we cannot safely detach it.
4593 4596 */
4594 4597 if (vdev_dtl_required(vd))
4595 4598 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4596 4599
4597 4600 ASSERT(pvd->vdev_children >= 2);
4598 4601
4599 4602 /*
4600 4603 * If we are detaching the second disk from a replacing vdev, then
4601 4604 * check to see if we changed the original vdev's path to have "/old"
4602 4605 * at the end in spa_vdev_attach(). If so, undo that change now.
4603 4606 */
4604 4607 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
4605 4608 vd->vdev_path != NULL) {
4606 4609 size_t len = strlen(vd->vdev_path);
4607 4610
4608 4611 for (int c = 0; c < pvd->vdev_children; c++) {
4609 4612 cvd = pvd->vdev_child[c];
4610 4613
4611 4614 if (cvd == vd || cvd->vdev_path == NULL)
4612 4615 continue;
4613 4616
4614 4617 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
4615 4618 strcmp(cvd->vdev_path + len, "/old") == 0) {
4616 4619 spa_strfree(cvd->vdev_path);
4617 4620 cvd->vdev_path = spa_strdup(vd->vdev_path);
4618 4621 break;
4619 4622 }
4620 4623 }
4621 4624 }
4622 4625
4623 4626 /*
4624 4627 * If we are detaching the original disk from a spare, then it implies
4625 4628 * that the spare should become a real disk, and be removed from the
4626 4629 * active spare list for the pool.
4627 4630 */
4628 4631 if (pvd->vdev_ops == &vdev_spare_ops &&
4629 4632 vd->vdev_id == 0 &&
4630 4633 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
4631 4634 unspare = B_TRUE;
4632 4635
4633 4636 /*
4634 4637 * Erase the disk labels so the disk can be used for other things.
4635 4638 * This must be done after all other error cases are handled,
4636 4639 * but before we disembowel vd (so we can still do I/O to it).
4637 4640 * But if we can't do it, don't treat the error as fatal --
4638 4641 * it may be that the unwritability of the disk is the reason
4639 4642 * it's being detached!
4640 4643 */
4641 4644 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4642 4645
4643 4646 /*
4644 4647 * Remove vd from its parent and compact the parent's children.
4645 4648 */
4646 4649 vdev_remove_child(pvd, vd);
4647 4650 vdev_compact_children(pvd);
4648 4651
4649 4652 /*
4650 4653 * Remember one of the remaining children so we can get tvd below.
4651 4654 */
4652 4655 cvd = pvd->vdev_child[pvd->vdev_children - 1];
4653 4656
4654 4657 /*
4655 4658 * If we need to remove the remaining child from the list of hot spares,
4656 4659 * do it now, marking the vdev as no longer a spare in the process.
4657 4660 * We must do this before vdev_remove_parent(), because that can
4658 4661 * change the GUID if it creates a new toplevel GUID. For a similar
4659 4662 * reason, we must remove the spare now, in the same txg as the detach;
4660 4663 * otherwise someone could attach a new sibling, change the GUID, and
4661 4664 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
4662 4665 */
4663 4666 if (unspare) {
4664 4667 ASSERT(cvd->vdev_isspare);
4665 4668 spa_spare_remove(cvd);
4666 4669 unspare_guid = cvd->vdev_guid;
4667 4670 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
4668 4671 cvd->vdev_unspare = B_TRUE;
4669 4672 }
4670 4673
4671 4674 /*
4672 4675 * If the parent mirror/replacing vdev only has one child,
4673 4676 * the parent is no longer needed. Remove it from the tree.
4674 4677 */
4675 4678 if (pvd->vdev_children == 1) {
4676 4679 if (pvd->vdev_ops == &vdev_spare_ops)
4677 4680 cvd->vdev_unspare = B_FALSE;
4678 4681 vdev_remove_parent(cvd);
4679 4682 }
4680 4683
4681 4684
4682 4685 /*
4683 4686 * We don't set tvd until now because the parent we just removed
4684 4687 * may have been the previous top-level vdev.
4685 4688 */
4686 4689 tvd = cvd->vdev_top;
4687 4690 ASSERT(tvd->vdev_parent == rvd);
4688 4691
4689 4692 /*
4690 4693 * Reevaluate the parent vdev state.
4691 4694 */
4692 4695 vdev_propagate_state(cvd);
4693 4696
4694 4697 /*
4695 4698 * If the 'autoexpand' property is set on the pool then automatically
4696 4699 * try to expand the size of the pool. For example if the device we
4697 4700 * just detached was smaller than the others, it may be possible to
4698 4701 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
4699 4702 * first so that we can obtain the updated sizes of the leaf vdevs.
4700 4703 */
4701 4704 if (spa->spa_autoexpand) {
4702 4705 vdev_reopen(tvd);
4703 4706 vdev_expand(tvd, txg);
4704 4707 }
4705 4708
4706 4709 vdev_config_dirty(tvd);
4707 4710
4708 4711 /*
4709 4712 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
4710 4713 * vd->vdev_detached is set and free vd's DTL object in syncing context.
4711 4714 * But first make sure we're not on any *other* txg's DTL list, to
4712 4715 * prevent vd from being accessed after it's freed.
4713 4716 */
4714 4717 vdpath = spa_strdup(vd->vdev_path);
4715 4718 for (int t = 0; t < TXG_SIZE; t++)
4716 4719 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
4717 4720 vd->vdev_detached = B_TRUE;
4718 4721 vdev_dirty(tvd, VDD_DTL, vd, txg);
4719 4722
4720 4723 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
4721 4724
4722 4725 /* hang on to the spa before we release the lock */
4723 4726 spa_open_ref(spa, FTAG);
4724 4727
4725 4728 error = spa_vdev_exit(spa, vd, txg, 0);
4726 4729
4727 4730 spa_history_log_internal(spa, "detach", NULL,
4728 4731 "vdev=%s", vdpath);
4729 4732 spa_strfree(vdpath);
4730 4733
4731 4734 /*
4732 4735 * If this was the removal of the original device in a hot spare vdev,
4733 4736 * then we want to go through and remove the device from the hot spare
4734 4737 * list of every other pool.
4735 4738 */
4736 4739 if (unspare) {
4737 4740 spa_t *altspa = NULL;
4738 4741
4739 4742 mutex_enter(&spa_namespace_lock);
4740 4743 while ((altspa = spa_next(altspa)) != NULL) {
4741 4744 if (altspa->spa_state != POOL_STATE_ACTIVE ||
4742 4745 altspa == spa)
4743 4746 continue;
4744 4747
4745 4748 spa_open_ref(altspa, FTAG);
4746 4749 mutex_exit(&spa_namespace_lock);
4747 4750 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
4748 4751 mutex_enter(&spa_namespace_lock);
4749 4752 spa_close(altspa, FTAG);
4750 4753 }
4751 4754 mutex_exit(&spa_namespace_lock);
4752 4755
4753 4756 /* search the rest of the vdevs for spares to remove */
4754 4757 spa_vdev_resilver_done(spa);
4755 4758 }
4756 4759
4757 4760 /* all done with the spa; OK to release */
4758 4761 mutex_enter(&spa_namespace_lock);
4759 4762 spa_close(spa, FTAG);
4760 4763 mutex_exit(&spa_namespace_lock);
4761 4764
4762 4765 return (error);
4763 4766 }
4764 4767
4765 4768 /*
4766 4769 * Split a set of devices from their mirrors, and create a new pool from them.
4767 4770 */
4768 4771 int
4769 4772 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
4770 4773 nvlist_t *props, boolean_t exp)
4771 4774 {
4772 4775 int error = 0;
4773 4776 uint64_t txg, *glist;
4774 4777 spa_t *newspa;
4775 4778 uint_t c, children, lastlog;
4776 4779 nvlist_t **child, *nvl, *tmp;
4777 4780 dmu_tx_t *tx;
4778 4781 char *altroot = NULL;
4779 4782 vdev_t *rvd, **vml = NULL; /* vdev modify list */
4780 4783 boolean_t activate_slog;
4781 4784
4782 4785 ASSERT(spa_writeable(spa));
4783 4786
4784 4787 txg = spa_vdev_enter(spa);
4785 4788
4786 4789 /* clear the log and flush everything up to now */
4787 4790 activate_slog = spa_passivate_log(spa);
4788 4791 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4789 4792 error = spa_offline_log(spa);
4790 4793 txg = spa_vdev_config_enter(spa);
4791 4794
4792 4795 if (activate_slog)
4793 4796 spa_activate_log(spa);
4794 4797
4795 4798 if (error != 0)
4796 4799 return (spa_vdev_exit(spa, NULL, txg, error));
4797 4800
4798 4801 /* check new spa name before going any further */
4799 4802 if (spa_lookup(newname) != NULL)
4800 4803 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
4801 4804
4802 4805 /*
4803 4806 * scan through all the children to ensure they're all mirrors
4804 4807 */
4805 4808 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
4806 4809 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
4807 4810 &children) != 0)
4808 4811 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4809 4812
4810 4813 /* first, check to ensure we've got the right child count */
4811 4814 rvd = spa->spa_root_vdev;
4812 4815 lastlog = 0;
4813 4816 for (c = 0; c < rvd->vdev_children; c++) {
4814 4817 vdev_t *vd = rvd->vdev_child[c];
4815 4818
4816 4819 /* don't count the holes & logs as children */
4817 4820 if (vd->vdev_islog || vd->vdev_ishole) {
4818 4821 if (lastlog == 0)
4819 4822 lastlog = c;
4820 4823 continue;
4821 4824 }
4822 4825
4823 4826 lastlog = 0;
4824 4827 }
4825 4828 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
4826 4829 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4827 4830
4828 4831 /* next, ensure no spare or cache devices are part of the split */
4829 4832 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
4830 4833 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
4831 4834 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4832 4835
4833 4836 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
4834 4837 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
4835 4838
4836 4839 /* then, loop over each vdev and validate it */
4837 4840 for (c = 0; c < children; c++) {
4838 4841 uint64_t is_hole = 0;
4839 4842
4840 4843 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
4841 4844 &is_hole);
4842 4845
4843 4846 if (is_hole != 0) {
4844 4847 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
4845 4848 spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
4846 4849 continue;
4847 4850 } else {
4848 4851 error = SET_ERROR(EINVAL);
4849 4852 break;
4850 4853 }
4851 4854 }
4852 4855
4853 4856 /* which disk is going to be split? */
4854 4857 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
4855 4858 &glist[c]) != 0) {
4856 4859 error = SET_ERROR(EINVAL);
4857 4860 break;
4858 4861 }
4859 4862
4860 4863 /* look it up in the spa */
4861 4864 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
4862 4865 if (vml[c] == NULL) {
4863 4866 error = SET_ERROR(ENODEV);
4864 4867 break;
4865 4868 }
4866 4869
4867 4870 /* make sure there's nothing stopping the split */
4868 4871 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
4869 4872 vml[c]->vdev_islog ||
4870 4873 vml[c]->vdev_ishole ||
4871 4874 vml[c]->vdev_isspare ||
4872 4875 vml[c]->vdev_isl2cache ||
4873 4876 !vdev_writeable(vml[c]) ||
4874 4877 vml[c]->vdev_children != 0 ||
4875 4878 vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
4876 4879 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
4877 4880 error = SET_ERROR(EINVAL);
4878 4881 break;
4879 4882 }
4880 4883
4881 4884 if (vdev_dtl_required(vml[c])) {
4882 4885 error = SET_ERROR(EBUSY);
4883 4886 break;
4884 4887 }
4885 4888
4886 4889 /* we need certain info from the top level */
4887 4890 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
4888 4891 vml[c]->vdev_top->vdev_ms_array) == 0);
4889 4892 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
4890 4893 vml[c]->vdev_top->vdev_ms_shift) == 0);
4891 4894 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
4892 4895 vml[c]->vdev_top->vdev_asize) == 0);
4893 4896 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
4894 4897 vml[c]->vdev_top->vdev_ashift) == 0);
4895 4898 }
4896 4899
4897 4900 if (error != 0) {
4898 4901 kmem_free(vml, children * sizeof (vdev_t *));
4899 4902 kmem_free(glist, children * sizeof (uint64_t));
4900 4903 return (spa_vdev_exit(spa, NULL, txg, error));
4901 4904 }
4902 4905
4903 4906 /* stop writers from using the disks */
4904 4907 for (c = 0; c < children; c++) {
4905 4908 if (vml[c] != NULL)
4906 4909 vml[c]->vdev_offline = B_TRUE;
4907 4910 }
4908 4911 vdev_reopen(spa->spa_root_vdev);
4909 4912
4910 4913 /*
4911 4914 * Temporarily record the splitting vdevs in the spa config. This
4912 4915 * will disappear once the config is regenerated.
4913 4916 */
4914 4917 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
4915 4918 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
4916 4919 glist, children) == 0);
4917 4920 kmem_free(glist, children * sizeof (uint64_t));
4918 4921
4919 4922 mutex_enter(&spa->spa_props_lock);
4920 4923 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
4921 4924 nvl) == 0);
4922 4925 mutex_exit(&spa->spa_props_lock);
4923 4926 spa->spa_config_splitting = nvl;
4924 4927 vdev_config_dirty(spa->spa_root_vdev);
4925 4928
4926 4929 /* configure and create the new pool */
4927 4930 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
4928 4931 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4929 4932 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
4930 4933 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
4931 4934 spa_version(spa)) == 0);
4932 4935 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
4933 4936 spa->spa_config_txg) == 0);
4934 4937 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
4935 4938 spa_generate_guid(NULL)) == 0);
4936 4939 (void) nvlist_lookup_string(props,
4937 4940 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4938 4941
4939 4942 /* add the new pool to the namespace */
4940 4943 newspa = spa_add(newname, config, altroot);
4941 4944 newspa->spa_config_txg = spa->spa_config_txg;
4942 4945 spa_set_log_state(newspa, SPA_LOG_CLEAR);
4943 4946
4944 4947 /* release the spa config lock, retaining the namespace lock */
4945 4948 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4946 4949
4947 4950 if (zio_injection_enabled)
4948 4951 zio_handle_panic_injection(spa, FTAG, 1);
4949 4952
4950 4953 spa_activate(newspa, spa_mode_global);
4951 4954 spa_async_suspend(newspa);
4952 4955
4953 4956 /* create the new pool from the disks of the original pool */
4954 4957 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
4955 4958 if (error)
4956 4959 goto out;
4957 4960
4958 4961 /* if that worked, generate a real config for the new pool */
4959 4962 if (newspa->spa_root_vdev != NULL) {
4960 4963 VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
4961 4964 NV_UNIQUE_NAME, KM_SLEEP) == 0);
4962 4965 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
4963 4966 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
4964 4967 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
4965 4968 B_TRUE));
4966 4969 }
4967 4970
4968 4971 /* set the props */
4969 4972 if (props != NULL) {
4970 4973 spa_configfile_set(newspa, props, B_FALSE);
4971 4974 error = spa_prop_set(newspa, props);
4972 4975 if (error)
4973 4976 goto out;
4974 4977 }
4975 4978
4976 4979 /* flush everything */
4977 4980 txg = spa_vdev_config_enter(newspa);
4978 4981 vdev_config_dirty(newspa->spa_root_vdev);
4979 4982 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
4980 4983
4981 4984 if (zio_injection_enabled)
4982 4985 zio_handle_panic_injection(spa, FTAG, 2);
4983 4986
4984 4987 spa_async_resume(newspa);
4985 4988
4986 4989 /* finally, update the original pool's config */
4987 4990 txg = spa_vdev_config_enter(spa);
4988 4991 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4989 4992 error = dmu_tx_assign(tx, TXG_WAIT);
4990 4993 if (error != 0)
4991 4994 dmu_tx_abort(tx);
4992 4995 for (c = 0; c < children; c++) {
4993 4996 if (vml[c] != NULL) {
4994 4997 vdev_split(vml[c]);
4995 4998 if (error == 0)
4996 4999 spa_history_log_internal(spa, "detach", tx,
4997 5000 "vdev=%s", vml[c]->vdev_path);
4998 5001 vdev_free(vml[c]);
4999 5002 }
5000 5003 }
5001 5004 vdev_config_dirty(spa->spa_root_vdev);
5002 5005 spa->spa_config_splitting = NULL;
5003 5006 nvlist_free(nvl);
5004 5007 if (error == 0)
5005 5008 dmu_tx_commit(tx);
5006 5009 (void) spa_vdev_exit(spa, NULL, txg, 0);
5007 5010
5008 5011 if (zio_injection_enabled)
5009 5012 zio_handle_panic_injection(spa, FTAG, 3);
5010 5013
5011 5014 /* split is complete; log a history record */
5012 5015 spa_history_log_internal(newspa, "split", NULL,
5013 5016 "from pool %s", spa_name(spa));
5014 5017
5015 5018 kmem_free(vml, children * sizeof (vdev_t *));
5016 5019
5017 5020 /* if we're not going to mount the filesystems in userland, export */
5018 5021 if (exp)
5019 5022 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
5020 5023 B_FALSE, B_FALSE);
5021 5024
5022 5025 return (error);
5023 5026
5024 5027 out:
5025 5028 spa_unload(newspa);
5026 5029 spa_deactivate(newspa);
5027 5030 spa_remove(newspa);
5028 5031
5029 5032 txg = spa_vdev_config_enter(spa);
5030 5033
5031 5034 /* re-online all offlined disks */
5032 5035 for (c = 0; c < children; c++) {
5033 5036 if (vml[c] != NULL)
5034 5037 vml[c]->vdev_offline = B_FALSE;
5035 5038 }
5036 5039 vdev_reopen(spa->spa_root_vdev);
5037 5040
5038 5041 nvlist_free(spa->spa_config_splitting);
5039 5042 spa->spa_config_splitting = NULL;
5040 5043 (void) spa_vdev_exit(spa, NULL, txg, error);
5041 5044
5042 5045 kmem_free(vml, children * sizeof (vdev_t *));
5043 5046 return (error);
5044 5047 }
5045 5048
5046 5049 static nvlist_t *
5047 5050 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
5048 5051 {
5049 5052 for (int i = 0; i < count; i++) {
5050 5053 uint64_t guid;
5051 5054
5052 5055 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
5053 5056 &guid) == 0);
5054 5057
5055 5058 if (guid == target_guid)
5056 5059 return (nvpp[i]);
5057 5060 }
5058 5061
5059 5062 return (NULL);
5060 5063 }
5061 5064
5062 5065 static void
5063 5066 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
5064 5067 nvlist_t *dev_to_remove)
5065 5068 {
5066 5069 nvlist_t **newdev = NULL;
5067 5070
5068 5071 if (count > 1)
5069 5072 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
5070 5073
5071 5074 for (int i = 0, j = 0; i < count; i++) {
5072 5075 if (dev[i] == dev_to_remove)
5073 5076 continue;
5074 5077 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
5075 5078 }
5076 5079
5077 5080 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
5078 5081 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
5079 5082
5080 5083 for (int i = 0; i < count - 1; i++)
5081 5084 nvlist_free(newdev[i]);
5082 5085
5083 5086 if (count > 1)
5084 5087 kmem_free(newdev, (count - 1) * sizeof (void *));
5085 5088 }
5086 5089
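spa_vdev_remove_aux() above rebuilds the spares/l2cache nvlist array with the removed entry dropped: allocate count - 1 slots, deep-copy every surviving entry, install the new array in the config, and free the copies. Below is a minimal userland sketch of the same copy-all-but-one pattern, with plain strings standing in for nvlists; remove_one() and the device names are illustrative and not part of any ZFS API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Illustrative only. strdup() stands in for nvlist_dup() and free() for
 * nvlist_free(). Like spa_vdev_remove_aux(), this assumes dev_to_remove
 * really is one of dev[0..count-1].
 */
static char **
remove_one(char **dev, int count, const char *dev_to_remove, int *newcount)
{
    char **newdev = NULL;
    int j = 0;

    if (count > 1)
        newdev = malloc((count - 1) * sizeof (char *));

    for (int i = 0; i < count; i++) {
        if (strcmp(dev[i], dev_to_remove) == 0)
            continue;                 /* drop the device being removed */
        newdev[j++] = strdup(dev[i]); /* deep-copy the survivors */
    }

    *newcount = count - 1;
    return (newdev);
}

int
main(void)
{
    char *devs[] = { "c1t0d0", "c1t1d0", "c1t2d0" };
    int newcount;
    char **left = remove_one(devs, 3, "c1t1d0", &newcount);

    for (int i = 0; i < newcount; i++) {
        printf("%s\n", left[i]);
        free(left[i]);
    }
    free(left);
    return (0);
}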
5087 5090 /*
5088 5091 * Evacuate the device.
5089 5092 */
5090 5093 static int
5091 5094 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
5092 5095 {
5093 5096 uint64_t txg;
5094 5097 int error = 0;
5095 5098
5096 5099 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5097 5100 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5098 5101 ASSERT(vd == vd->vdev_top);
5099 5102
5100 5103 /*
5101 5104 * Evacuate the device. We don't hold the config lock as writer
5102 5105 * since we need to do I/O, but we do keep the
5103 5106 * spa_namespace_lock held. Once this completes the device
5104 5107 * should no longer have any blocks allocated on it.
5105 5108 */
5106 5109 if (vd->vdev_islog) {
5107 5110 if (vd->vdev_stat.vs_alloc != 0)
5108 5111 error = spa_offline_log(spa);
5109 5112 } else {
5110 5113 error = SET_ERROR(ENOTSUP);
5111 5114 }
5112 5115
5113 5116 if (error)
5114 5117 return (error);
5115 5118
5116 5119 /*
5117 5120 * The evacuation succeeded. Remove any remaining MOS metadata
5118 5121 * associated with this vdev, and wait for these changes to sync.
5119 5122 */
5120 5123 ASSERT0(vd->vdev_stat.vs_alloc);
5121 5124 txg = spa_vdev_config_enter(spa);
5122 5125 vd->vdev_removing = B_TRUE;
5123 5126 vdev_dirty(vd, 0, NULL, txg);
5124 5127 vdev_config_dirty(vd);
5125 5128 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5126 5129
5127 5130 return (0);
5128 5131 }
5129 5132
5130 5133 /*
5131 5134 * Complete the removal by cleaning up the namespace.
5132 5135 */
5133 5136 static void
5134 5137 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
5135 5138 {
5136 5139 vdev_t *rvd = spa->spa_root_vdev;
5137 5140 uint64_t id = vd->vdev_id;
5138 5141 boolean_t last_vdev = (id == (rvd->vdev_children - 1));
5139 5142
5140 5143 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5141 5144 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5142 5145 ASSERT(vd == vd->vdev_top);
5143 5146
5144 5147 /*
5145 5148 * Only remove devices that are empty.
5146 5149 */
5147 5150 if (vd->vdev_stat.vs_alloc != 0)
5148 5151 return;
5149 5152
5150 5153 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
5151 5154
5152 5155 if (list_link_active(&vd->vdev_state_dirty_node))
5153 5156 vdev_state_clean(vd);
5154 5157 if (list_link_active(&vd->vdev_config_dirty_node))
5155 5158 vdev_config_clean(vd);
5156 5159
5157 5160 vdev_free(vd);
5158 5161
5159 5162 if (last_vdev) {
5160 5163 vdev_compact_children(rvd);
5161 5164 } else {
5162 5165 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
5163 5166 vdev_add_child(rvd, vd);
5164 5167 }
5165 5168 vdev_config_dirty(rvd);
5166 5169
5167 5170 /*
5168 5171 * Reassess the health of our root vdev.
5169 5172 */
5170 5173 vdev_reopen(rvd);
5171 5174 }
5172 5175
5173 5176 /*
5174 5177 * Remove a device from the pool -
5175 5178 *
5176 5179 * Removing a device from the vdev namespace requires several steps
5177 5180 * and can take a significant amount of time. As a result we use
5178 5181 * the spa_vdev_config_[enter/exit] functions which allow us to
5179 5182 * grab and release the spa_config_lock while still holding the namespace
5180 5183 * lock. During each step the configuration is synced out.
5181 5184 *
5182 5185 * Currently, this supports removing only hot spares, slogs, and level 2 ARC
5183 5186 * devices.
5184 5187 */
5185 5188 int
5186 5189 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
5187 5190 {
5188 5191 vdev_t *vd;
5189 5192 metaslab_group_t *mg;
5190 5193 nvlist_t **spares, **l2cache, *nv;
5191 5194 uint64_t txg = 0;
5192 5195 uint_t nspares, nl2cache;
5193 5196 int error = 0;
5194 5197 boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
5195 5198
5196 5199 ASSERT(spa_writeable(spa));
5197 5200
5198 5201 if (!locked)
5199 5202 txg = spa_vdev_enter(spa);
5200 5203
5201 5204 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
5202 5205
5203 5206 if (spa->spa_spares.sav_vdevs != NULL &&
5204 5207 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
5205 5208 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
5206 5209 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
5207 5210 /*
5208 5211 * Only remove the hot spare if it's not currently in use
5209 5212 * in this pool.
5210 5213 */
5211 5214 if (vd == NULL || unspare) {
5212 5215 spa_vdev_remove_aux(spa->spa_spares.sav_config,
5213 5216 ZPOOL_CONFIG_SPARES, spares, nspares, nv);
5214 5217 spa_load_spares(spa);
5215 5218 spa->spa_spares.sav_sync = B_TRUE;
5216 5219 } else {
5217 5220 error = SET_ERROR(EBUSY);
5218 5221 }
5219 5222 } else if (spa->spa_l2cache.sav_vdevs != NULL &&
5220 5223 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
5221 5224 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
5222 5225 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
5223 5226 /*
5224 5227 * Cache devices can always be removed.
5225 5228 */
5226 5229 spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
5227 5230 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
5228 5231 spa_load_l2cache(spa);
5229 5232 spa->spa_l2cache.sav_sync = B_TRUE;
5230 5233 } else if (vd != NULL && vd->vdev_islog) {
5231 5234 ASSERT(!locked);
5232 5235 ASSERT(vd == vd->vdev_top);
5233 5236
5234 5237 /*
5235 5238 * XXX - Once we have bp-rewrite this should
5236 5239 * become the common case.
5237 5240 */
5238 5241
5239 5242 mg = vd->vdev_mg;
5240 5243
5241 5244 /*
5242 5245 * Stop allocating from this vdev.
5243 5246 */
5244 5247 metaslab_group_passivate(mg);
5245 5248
5246 5249 /*
5247 5250 * Wait for the youngest allocations and frees to sync,
5248 5251 * and then wait for the deferral of those frees to finish.
5249 5252 */
5250 5253 spa_vdev_config_exit(spa, NULL,
5251 5254 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
5252 5255
5253 5256 /*
5254 5257 * Attempt to evacuate the vdev.
5255 5258 */
5256 5259 error = spa_vdev_remove_evacuate(spa, vd);
5257 5260
5258 5261 txg = spa_vdev_config_enter(spa);
5259 5262
5260 5263 /*
5261 5264 * If we couldn't evacuate the vdev, unwind.
5262 5265 */
5263 5266 if (error) {
5264 5267 metaslab_group_activate(mg);
5265 5268 return (spa_vdev_exit(spa, NULL, txg, error));
5266 5269 }
5267 5270
5268 5271 /*
5269 5272 * Clean up the vdev namespace.
5270 5273 */
5271 5274 spa_vdev_remove_from_namespace(spa, vd);
5272 5275
5273 5276 } else if (vd != NULL) {
5274 5277 /*
5275 5278 * Normal vdevs cannot be removed (yet).
5276 5279 */
5277 5280 error = SET_ERROR(ENOTSUP);
5278 5281 } else {
5279 5282 /*
5280 5283 * There is no vdev of any kind with the specified guid.
5281 5284 */
5282 5285 error = SET_ERROR(ENOENT);
5283 5286 }
5284 5287
5285 5288 if (!locked)
5286 5289 return (spa_vdev_exit(spa, NULL, txg, error));
5287 5290
5288 5291 return (error);
5289 5292 }
5290 5293
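For the slog case above, spa_vdev_remove() drops the config lock and waits for txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE to sync before evacuating, so the youngest allocations and frees, and the deferral window for those frees, are all on disk first. A small sketch of that arithmetic follows; the constant values are assumed from the stock sys/txg.h and should be checked against the tree.

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative only. These mirror what sys/txg.h defines in the stock
 * tree (three concurrent txg states; frees deferred up to two txgs).
 */
#define TXG_CONCURRENT_STATES   3   /* open, quiescing, syncing */
#define TXG_DEFER_SIZE          2   /* txgs a free may be deferred */

int
main(void)
{
    uint64_t txg = 100;         /* txg handed back by spa_vdev_enter() */
    uint64_t wait_txg = txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE;

    /*
     * Waiting for wait_txg to sync covers the youngest in-flight
     * allocations and frees (up to TXG_CONCURRENT_STATES txgs ahead of
     * txg) plus the TXG_DEFER_SIZE txgs over which those frees may be
     * deferred.
     */
    printf("entered at txg %llu; spa_vdev_config_exit() waits for %llu\n",
        (unsigned long long)txg, (unsigned long long)wait_txg);
    return (0);
}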
5291 5294 /*
5292 5295 * Find any device that's done replacing, or a vdev marked 'unspare' that's
5293 5296 * currently spared, so we can detach it.
5294 5297 */
5295 5298 static vdev_t *
5296 5299 spa_vdev_resilver_done_hunt(vdev_t *vd)
5297 5300 {
5298 5301 vdev_t *newvd, *oldvd;
5299 5302
5300 5303 for (int c = 0; c < vd->vdev_children; c++) {
5301 5304 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
5302 5305 if (oldvd != NULL)
5303 5306 return (oldvd);
5304 5307 }
5305 5308
5306 5309 /*
5307 5310 * Check for a completed replacement. We always consider the first
5308 5311 * vdev in the list to be the oldest vdev, and the last one to be
5309 5312 * the newest (see spa_vdev_attach() for how that works). In
5310 5313 * the case where the newest vdev is faulted, we will not automatically
5311 5314 * remove it after a resilver completes. This is OK as it will require
5312 5315 * user intervention to determine which disk the admin wishes to keep.
5313 5316 */
5314 5317 if (vd->vdev_ops == &vdev_replacing_ops) {
5315 5318 ASSERT(vd->vdev_children > 1);
5316 5319
5317 5320 newvd = vd->vdev_child[vd->vdev_children - 1];
5318 5321 oldvd = vd->vdev_child[0];
5319 5322
5320 5323 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
5321 5324 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5322 5325 !vdev_dtl_required(oldvd))
5323 5326 return (oldvd);
5324 5327 }
5325 5328
5326 5329 /*
5327 5330 * Check for a completed resilver with the 'unspare' flag set.
5328 5331 */
5329 5332 if (vd->vdev_ops == &vdev_spare_ops) {
5330 5333 vdev_t *first = vd->vdev_child[0];
5331 5334 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
5332 5335
5333 5336 if (last->vdev_unspare) {
5334 5337 oldvd = first;
5335 5338 newvd = last;
5336 5339 } else if (first->vdev_unspare) {
5337 5340 oldvd = last;
5338 5341 newvd = first;
5339 5342 } else {
5340 5343 oldvd = NULL;
5341 5344 }
5342 5345
5343 5346 if (oldvd != NULL &&
5344 5347 vdev_dtl_empty(newvd, DTL_MISSING) &&
5345 5348 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5346 5349 !vdev_dtl_required(oldvd))
5347 5350 return (oldvd);
5348 5351
5349 5352 /*
5350 5353 * If there are more than two spares attached to a disk,
5351 5354 * and those spares are not required, then we want to
5352 5355 * attempt to free them up now so that they can be used
5353 5356 * by other pools. Once we're back down to a single
5354 5357 * disk+spare, we stop removing them.
5355 5358 */
5356 5359 if (vd->vdev_children > 2) {
5357 5360 newvd = vd->vdev_child[1];
5358 5361
5359 5362 if (newvd->vdev_isspare && last->vdev_isspare &&
5360 5363 vdev_dtl_empty(last, DTL_MISSING) &&
5361 5364 vdev_dtl_empty(last, DTL_OUTAGE) &&
5362 5365 !vdev_dtl_required(newvd))
5363 5366 return (newvd);
5364 5367 }
5365 5368 }
5366 5369
5367 5370 return (NULL);
5368 5371 }
5369 5372
5370 5373 static void
5371 5374 spa_vdev_resilver_done(spa_t *spa)
5372 5375 {
5373 5376 vdev_t *vd, *pvd, *ppvd;
5374 5377 uint64_t guid, sguid, pguid, ppguid;
5375 5378
5376 5379 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5377 5380
5378 5381 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
5379 5382 pvd = vd->vdev_parent;
5380 5383 ppvd = pvd->vdev_parent;
5381 5384 guid = vd->vdev_guid;
5382 5385 pguid = pvd->vdev_guid;
5383 5386 ppguid = ppvd->vdev_guid;
5384 5387 sguid = 0;
5385 5388 /*
5386 5389 * If we have just finished replacing a hot spared device, then
5387 5390 * we need to detach the parent's first child (the original hot
5388 5391 * spare) as well.
5389 5392 */
5390 5393 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
5391 5394 ppvd->vdev_children == 2) {
5392 5395 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
5393 5396 sguid = ppvd->vdev_child[1]->vdev_guid;
5394 5397 }
5395 5398 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
5396 5399
5397 5400 spa_config_exit(spa, SCL_ALL, FTAG);
5398 5401 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
5399 5402 return;
5400 5403 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
5401 5404 return;
5402 5405 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5403 5406 }
5404 5407
5405 5408 spa_config_exit(spa, SCL_ALL, FTAG);
5406 5409 }
5407 5410
5408 5411 /*
5409 5412 * Update the stored path or FRU for this vdev.
5410 5413 */
5411 5414 int
5412 5415 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
5413 5416 boolean_t ispath)
5414 5417 {
5415 5418 vdev_t *vd;
5416 5419 boolean_t sync = B_FALSE;
5417 5420
5418 5421 ASSERT(spa_writeable(spa));
5419 5422
5420 5423 spa_vdev_state_enter(spa, SCL_ALL);
5421 5424
5422 5425 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
5423 5426 return (spa_vdev_state_exit(spa, NULL, ENOENT));
5424 5427
5425 5428 if (!vd->vdev_ops->vdev_op_leaf)
5426 5429 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
5427 5430
5428 5431 if (ispath) {
5429 5432 if (strcmp(value, vd->vdev_path) != 0) {
5430 5433 spa_strfree(vd->vdev_path);
5431 5434 vd->vdev_path = spa_strdup(value);
5432 5435 sync = B_TRUE;
5433 5436 }
5434 5437 } else {
5435 5438 if (vd->vdev_fru == NULL) {
5436 5439 vd->vdev_fru = spa_strdup(value);
5437 5440 sync = B_TRUE;
5438 5441 } else if (strcmp(value, vd->vdev_fru) != 0) {
5439 5442 spa_strfree(vd->vdev_fru);
5440 5443 vd->vdev_fru = spa_strdup(value);
5441 5444 sync = B_TRUE;
5442 5445 }
5443 5446 }
5444 5447
5445 5448 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
5446 5449 }
5447 5450
5448 5451 int
5449 5452 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
5450 5453 {
5451 5454 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
5452 5455 }
5453 5456
5454 5457 int
5455 5458 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
5456 5459 {
5457 5460 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
5458 5461 }
5459 5462
5460 5463 /*
5461 5464 * ==========================================================================
5462 5465 * SPA Scanning
5463 5466 * ==========================================================================
5464 5467 */
5465 5468
5466 5469 int
5467 5470 spa_scan_stop(spa_t *spa)
5468 5471 {
5469 5472 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5470 5473 if (dsl_scan_resilvering(spa->spa_dsl_pool))
5471 5474 return (SET_ERROR(EBUSY));
5472 5475 return (dsl_scan_cancel(spa->spa_dsl_pool));
5473 5476 }
5474 5477
5475 5478 int
5476 5479 spa_scan(spa_t *spa, pool_scan_func_t func)
5477 5480 {
5478 5481 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5479 5482
5480 5483 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
5481 5484 return (SET_ERROR(ENOTSUP));
5482 5485
5483 5486 /*
5484 5487 * If a resilver was requested, but there is no DTL on a
5485 5488 * writeable leaf device, we have nothing to do.
5486 5489 */
5487 5490 if (func == POOL_SCAN_RESILVER &&
5488 5491 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
5489 5492 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
5490 5493 return (0);
5491 5494 }
5492 5495
5493 5496 return (dsl_scan(spa->spa_dsl_pool, func));
5494 5497 }
5495 5498
5496 5499 /*
5497 5500 * ==========================================================================
5498 5501 * SPA async task processing
5499 5502 * ==========================================================================
5500 5503 */
5501 5504
5502 5505 static void
5503 5506 spa_async_remove(spa_t *spa, vdev_t *vd)
5504 5507 {
5505 5508 if (vd->vdev_remove_wanted) {
5506 5509 vd->vdev_remove_wanted = B_FALSE;
5507 5510 vd->vdev_delayed_close = B_FALSE;
5508 5511 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
5509 5512
5510 5513 /*
5511 5514 * We want to clear the stats, but we don't want to do a full
5512 5515 * vdev_clear() as that will cause us to throw away
5513 5516 * degraded/faulted state as well as attempt to reopen the
5514 5517 * device, all of which is a waste.
5515 5518 */
5516 5519 vd->vdev_stat.vs_read_errors = 0;
5517 5520 vd->vdev_stat.vs_write_errors = 0;
5518 5521 vd->vdev_stat.vs_checksum_errors = 0;
5519 5522
5520 5523 vdev_state_dirty(vd->vdev_top);
5521 5524 }
5522 5525
5523 5526 for (int c = 0; c < vd->vdev_children; c++)
5524 5527 spa_async_remove(spa, vd->vdev_child[c]);
5525 5528 }
5526 5529
5527 5530 static void
5528 5531 spa_async_probe(spa_t *spa, vdev_t *vd)
5529 5532 {
5530 5533 if (vd->vdev_probe_wanted) {
5531 5534 vd->vdev_probe_wanted = B_FALSE;
5532 5535 vdev_reopen(vd); /* vdev_open() does the actual probe */
5533 5536 }
5534 5537
5535 5538 for (int c = 0; c < vd->vdev_children; c++)
5536 5539 spa_async_probe(spa, vd->vdev_child[c]);
5537 5540 }
5538 5541
5539 5542 static void
5540 5543 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
5541 5544 {
5542 5545 sysevent_id_t eid;
5543 5546 nvlist_t *attr;
5544 5547 char *physpath;
5545 5548
5546 5549 if (!spa->spa_autoexpand)
5547 5550 return;
5548 5551
5549 5552 for (int c = 0; c < vd->vdev_children; c++) {
5550 5553 vdev_t *cvd = vd->vdev_child[c];
5551 5554 spa_async_autoexpand(spa, cvd);
5552 5555 }
5553 5556
5554 5557 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
5555 5558 return;
5556 5559
5557 5560 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
5558 5561 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
5559 5562
5560 5563 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5561 5564 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
5562 5565
5563 5566 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
5564 5567 ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
5565 5568
5566 5569 nvlist_free(attr);
5567 5570 kmem_free(physpath, MAXPATHLEN);
5568 5571 }
5569 5572
5570 5573 static void
5571 5574 spa_async_thread(spa_t *spa)
5572 5575 {
5573 5576 int tasks;
5574 5577
5575 5578 ASSERT(spa->spa_sync_on);
5576 5579
5577 5580 mutex_enter(&spa->spa_async_lock);
5578 5581 tasks = spa->spa_async_tasks;
5579 5582 spa->spa_async_tasks = 0;
5580 5583 mutex_exit(&spa->spa_async_lock);
5581 5584
5582 5585 /*
5583 5586 * See if the config needs to be updated.
5584 5587 */
5585 5588 if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
5586 5589 uint64_t old_space, new_space;
5587 5590
5588 5591 mutex_enter(&spa_namespace_lock);
5589 5592 old_space = metaslab_class_get_space(spa_normal_class(spa));
5590 5593 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5591 5594 new_space = metaslab_class_get_space(spa_normal_class(spa));
5592 5595 mutex_exit(&spa_namespace_lock);
5593 5596
5594 5597 /*
5595 5598 * If the pool grew as a result of the config update,
5596 5599 * then log an internal history event.
5597 5600 */
5598 5601 if (new_space != old_space) {
5599 5602 spa_history_log_internal(spa, "vdev online", NULL,
5600 5603 "pool '%s' size: %llu(+%llu)",
5601 5604 spa_name(spa), new_space, new_space - old_space);
5602 5605 }
5603 5606 }
5604 5607
5605 5608 /*
5606 5609 * See if any devices need to be marked REMOVED.
5607 5610 */
5608 5611 if (tasks & SPA_ASYNC_REMOVE) {
5609 5612 spa_vdev_state_enter(spa, SCL_NONE);
5610 5613 spa_async_remove(spa, spa->spa_root_vdev);
5611 5614 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
5612 5615 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
5613 5616 for (int i = 0; i < spa->spa_spares.sav_count; i++)
5614 5617 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
5615 5618 (void) spa_vdev_state_exit(spa, NULL, 0);
5616 5619 }
5617 5620
5618 5621 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
5619 5622 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5620 5623 spa_async_autoexpand(spa, spa->spa_root_vdev);
5621 5624 spa_config_exit(spa, SCL_CONFIG, FTAG);
5622 5625 }
5623 5626
5624 5627 /*
5625 5628 * See if any devices need to be probed.
5626 5629 */
5627 5630 if (tasks & SPA_ASYNC_PROBE) {
5628 5631 spa_vdev_state_enter(spa, SCL_NONE);
5629 5632 spa_async_probe(spa, spa->spa_root_vdev);
5630 5633 (void) spa_vdev_state_exit(spa, NULL, 0);
5631 5634 }
5632 5635
5633 5636 /*
5634 5637 * If any devices are done replacing, detach them.
5635 5638 */
5636 5639 if (tasks & SPA_ASYNC_RESILVER_DONE)
5637 5640 spa_vdev_resilver_done(spa);
5638 5641
5639 5642 /*
5640 5643 * Kick off a resilver.
5641 5644 */
5642 5645 if (tasks & SPA_ASYNC_RESILVER)
5643 5646 dsl_resilver_restart(spa->spa_dsl_pool, 0);
5644 5647
5645 5648 /*
5646 5649 * Let the world know that we're done.
5647 5650 */
5648 5651 mutex_enter(&spa->spa_async_lock);
5649 5652 spa->spa_async_thread = NULL;
5650 5653 cv_broadcast(&spa->spa_async_cv);
5651 5654 mutex_exit(&spa->spa_async_lock);
5652 5655 thread_exit();
5653 5656 }
5654 5657
5655 5658 void
5656 5659 spa_async_suspend(spa_t *spa)
5657 5660 {
5658 5661 mutex_enter(&spa->spa_async_lock);
5659 5662 spa->spa_async_suspended++;
5660 5663 while (spa->spa_async_thread != NULL)
5661 5664 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
5662 5665 mutex_exit(&spa->spa_async_lock);
5663 5666 }
5664 5667
5665 5668 void
5666 5669 spa_async_resume(spa_t *spa)
5667 5670 {
5668 5671 mutex_enter(&spa->spa_async_lock);
5669 5672 ASSERT(spa->spa_async_suspended != 0);
5670 5673 spa->spa_async_suspended--;
5671 5674 mutex_exit(&spa->spa_async_lock);
5672 5675 }
5673 5676
5674 5677 static boolean_t
5675 5678 spa_async_tasks_pending(spa_t *spa)
5676 5679 {
5677 5680 uint_t non_config_tasks;
5678 5681 uint_t config_task;
5679 5682 boolean_t config_task_suspended;
5680 5683
5681 5684 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
5682 5685 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
5683 5686 if (spa->spa_ccw_fail_time == 0) {
5684 5687 config_task_suspended = B_FALSE;
5685 5688 } else {
5686 5689 config_task_suspended =
5687 5690 (gethrtime() - spa->spa_ccw_fail_time) <
5688 5691 (zfs_ccw_retry_interval * NANOSEC);
5689 5692 }
5690 5693
5691 5694 return (non_config_tasks || (config_task && !config_task_suspended));
5692 5695 }
5693 5696
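spa_async_tasks_pending() above holds a failed config cache write back from being retried until zfs_ccw_retry_interval seconds have passed since the failure, comparing gethrtime() deltas in nanoseconds. Here is a userland sketch of the same gate, with clock_gettime(CLOCK_MONOTONIC) standing in for gethrtime(); the NANOSEC value and the 300-second interval are used only for illustration.

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define NANOSEC 1000000000LL            /* nanoseconds per second */

static int ccw_retry_interval = 300;    /* seconds; illustrative value */

/* Userland stand-in for gethrtime(): monotonic time in nanoseconds. */
static int64_t
hrtime_ns(void)
{
    struct timespec ts;

    (void) clock_gettime(CLOCK_MONOTONIC, &ts);
    return ((int64_t)ts.tv_sec * NANOSEC + ts.tv_nsec);
}

/*
 * Returns 1 while the retry is still being held off, i.e. less than
 * ccw_retry_interval seconds since the recorded failure time.
 * A fail_time of 0 means no failure is recorded, so never hold off.
 */
static int
ccw_retry_suspended(int64_t fail_time)
{
    if (fail_time == 0)
        return (0);
    return ((hrtime_ns() - fail_time) <
        ((int64_t)ccw_retry_interval * NANOSEC));
}

int
main(void)
{
    int64_t failed_at = hrtime_ns();    /* pretend a write just failed */

    printf("suspended right after failure: %d\n",
        ccw_retry_suspended(failed_at));
    printf("suspended with no failure recorded: %d\n",
        ccw_retry_suspended(0));
    return (0);
}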
5694 5697 static void
5695 5698 spa_async_dispatch(spa_t *spa)
5696 5699 {
5697 5700 mutex_enter(&spa->spa_async_lock);
5698 5701 if (spa_async_tasks_pending(spa) &&
5699 5702 !spa->spa_async_suspended &&
5700 5703 spa->spa_async_thread == NULL &&
5701 5704 rootdir != NULL)
5702 5705 spa->spa_async_thread = thread_create(NULL, 0,
5703 5706 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
5704 5707 mutex_exit(&spa->spa_async_lock);
5705 5708 }
5706 5709
5707 5710 void
5708 5711 spa_async_request(spa_t *spa, int task)
5709 5712 {
5710 5713 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
5711 5714 mutex_enter(&spa->spa_async_lock);
5712 5715 spa->spa_async_tasks |= task;
5713 5716 mutex_exit(&spa->spa_async_lock);
5714 5717 }
5715 5718
5716 5719 /*
5717 5720 * ==========================================================================
5718 5721 * SPA syncing routines
5719 5722 * ==========================================================================
5720 5723 */
5721 5724
5722 5725 static int
5723 5726 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5724 5727 {
5725 5728 bpobj_t *bpo = arg;
5726 5729 bpobj_enqueue(bpo, bp, tx);
5727 5730 return (0);
5728 5731 }
5729 5732
5730 5733 static int
5731 5734 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5732 5735 {
5733 5736 zio_t *zio = arg;
5734 5737
5735 5738 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5736 5739 zio->io_flags));
5737 5740 return (0);
5738 5741 }
5739 5742
5743 +/*
5744 + * Note: this simple function is not inlined to make it easier to dtrace the
5745 + * amount of time spent syncing frees.
5746 + */
5740 5747 static void
5748 +spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
5749 +{
5750 + zio_t *zio = zio_root(spa, NULL, NULL, 0);
5751 + bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
5752 + VERIFY(zio_wait(zio) == 0);
5753 +}
5754 +
5755 +/*
5756 + * Note: this simple function is not inlined to make it easier to dtrace the
5757 + * amount of time spent syncing deferred frees.
5758 + */
5759 +static void
5760 +spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
5761 +{
5762 + zio_t *zio = zio_root(spa, NULL, NULL, 0);
5763 + VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
5764 + spa_free_sync_cb, zio, tx), ==, 0);
5765 + VERIFY0(zio_wait(zio));
5766 +}
5767 +
5768 +
5769 +static void
5741 5770 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5742 5771 {
5743 5772 char *packed = NULL;
5744 5773 size_t bufsize;
5745 5774 size_t nvsize = 0;
5746 5775 dmu_buf_t *db;
5747 5776
5748 5777 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5749 5778
5750 5779 /*
5751 5780 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5752 5781 * information. This avoids the dbuf_will_dirty() path and
5753 5782 * saves us a pre-read to get data we don't actually care about.
5754 5783 */
5755 5784 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
5756 5785 packed = kmem_alloc(bufsize, KM_SLEEP);
5757 5786
5758 5787 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5759 5788 KM_SLEEP) == 0);
5760 5789 bzero(packed + nvsize, bufsize - nvsize);
5761 5790
5762 5791 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
5763 5792
5764 5793 kmem_free(packed, bufsize);
5765 5794
5766 5795 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5767 5796 dmu_buf_will_dirty(db, tx);
5768 5797 *(uint64_t *)db->db_data = nvsize;
5769 5798 dmu_buf_rele(db, FTAG);
5770 5799 }
5771 5800
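spa_sync_nvlist() above rounds the packed nvlist size up to a whole number of SPA_CONFIG_BLOCKSIZE blocks and zeroes the tail, so the dmu_write() covers full blocks and never needs a pre-read of data it is about to overwrite. A sketch of the round-up-and-pad step follows; the P2ROUNDUP definition mirrors sys/sysmacros.h, and the 16K block size is an assumed stand-in for the real SPA_CONFIG_BLOCKSIZE in sys/spa.h.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Power-of-two round-up, as in sys/sysmacros.h; 16K is assumed here. */
#define P2ROUNDUP(x, align)     (-(-(x) & -(align)))
#define CONFIG_BLOCKSIZE        (1ULL << 14)

int
main(void)
{
    uint64_t nvsize = 21000;    /* pretend this is the packed nvlist size */
    uint64_t bufsize = P2ROUNDUP(nvsize, CONFIG_BLOCKSIZE);
    char *packed = malloc(bufsize);

    /*
     * Same shape as spa_sync_nvlist(): fill the first nvsize bytes
     * (nvlist_pack() in the real code) and zero the padding so no
     * stale bytes follow the nvlist on disk.
     */
    memset(packed, 0xab, nvsize);
    memset(packed + nvsize, 0, bufsize - nvsize);

    printf("nvsize=%llu bufsize=%llu pad=%llu\n",
        (unsigned long long)nvsize, (unsigned long long)bufsize,
        (unsigned long long)(bufsize - nvsize));
    free(packed);
    return (0);
}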
5772 5801 static void
5773 5802 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5774 5803 const char *config, const char *entry)
5775 5804 {
5776 5805 nvlist_t *nvroot;
5777 5806 nvlist_t **list;
5778 5807 int i;
5779 5808
5780 5809 if (!sav->sav_sync)
5781 5810 return;
5782 5811
5783 5812 /*
5784 5813 * Update the MOS nvlist describing the list of available devices.
5785 5814 * spa_validate_aux() will have already made sure this nvlist is
5786 5815 * valid and the vdevs are labeled appropriately.
5787 5816 */
5788 5817 if (sav->sav_object == 0) {
5789 5818 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
5790 5819 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
5791 5820 sizeof (uint64_t), tx);
5792 5821 VERIFY(zap_update(spa->spa_meta_objset,
5793 5822 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
5794 5823 &sav->sav_object, tx) == 0);
5795 5824 }
5796 5825
5797 5826 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5798 5827 if (sav->sav_count == 0) {
5799 5828 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
5800 5829 } else {
5801 5830 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
5802 5831 for (i = 0; i < sav->sav_count; i++)
5803 5832 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
5804 5833 B_FALSE, VDEV_CONFIG_L2CACHE);
5805 5834 VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
5806 5835 sav->sav_count) == 0);
5807 5836 for (i = 0; i < sav->sav_count; i++)
5808 5837 nvlist_free(list[i]);
5809 5838 kmem_free(list, sav->sav_count * sizeof (void *));
5810 5839 }
5811 5840
5812 5841 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
5813 5842 nvlist_free(nvroot);
5814 5843
5815 5844 sav->sav_sync = B_FALSE;
5816 5845 }
5817 5846
5818 5847 static void
5819 5848 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
5820 5849 {
5821 5850 nvlist_t *config;
5822 5851
5823 5852 if (list_is_empty(&spa->spa_config_dirty_list))
5824 5853 return;
5825 5854
5826 5855 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5827 5856
5828 5857 config = spa_config_generate(spa, spa->spa_root_vdev,
5829 5858 dmu_tx_get_txg(tx), B_FALSE);
5830 5859
5831 5860 /*
5832 5861 * If we're upgrading the spa version then make sure that
5833 5862 * the config object gets updated with the correct version.
5834 5863 */
5835 5864 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
5836 5865 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
5837 5866 spa->spa_uberblock.ub_version);
5838 5867
5839 5868 spa_config_exit(spa, SCL_STATE, FTAG);
5840 5869
5841 5870 if (spa->spa_config_syncing)
5842 5871 nvlist_free(spa->spa_config_syncing);
5843 5872 spa->spa_config_syncing = config;
5844 5873
5845 5874 spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
5846 5875 }
5847 5876
5848 5877 static void
5849 5878 spa_sync_version(void *arg, dmu_tx_t *tx)
5850 5879 {
5851 5880 uint64_t *versionp = arg;
5852 5881 uint64_t version = *versionp;
5853 5882 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
5854 5883
5855 5884 /*
5856 5885 * Setting the version is special cased when first creating the pool.
5857 5886 */
5858 5887 ASSERT(tx->tx_txg != TXG_INITIAL);
5859 5888
5860 5889 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
5861 5890 ASSERT(version >= spa_version(spa));
5862 5891
5863 5892 spa->spa_uberblock.ub_version = version;
5864 5893 vdev_config_dirty(spa->spa_root_vdev);
5865 5894 spa_history_log_internal(spa, "set", tx, "version=%lld", version);
5866 5895 }
5867 5896
5868 5897 /*
5869 5898 * Set zpool properties.
5870 5899 */
5871 5900 static void
5872 5901 spa_sync_props(void *arg, dmu_tx_t *tx)
5873 5902 {
5874 5903 nvlist_t *nvp = arg;
5875 5904 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
5876 5905 objset_t *mos = spa->spa_meta_objset;
5877 5906 nvpair_t *elem = NULL;
5878 5907
5879 5908 mutex_enter(&spa->spa_props_lock);
5880 5909
5881 5910 while ((elem = nvlist_next_nvpair(nvp, elem))) {
5882 5911 uint64_t intval;
5883 5912 char *strval, *fname;
5884 5913 zpool_prop_t prop;
5885 5914 const char *propname;
5886 5915 zprop_type_t proptype;
5887 5916 zfeature_info_t *feature;
5888 5917
5889 5918 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
5890 5919 case ZPROP_INVAL:
5891 5920 /*
5892 5921 * We checked this earlier in spa_prop_validate().
5893 5922 */
5894 5923 ASSERT(zpool_prop_feature(nvpair_name(elem)));
5895 5924
5896 5925 fname = strchr(nvpair_name(elem), '@') + 1;
5897 5926 VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
5898 5927
5899 5928 spa_feature_enable(spa, feature, tx);
5900 5929 spa_history_log_internal(spa, "set", tx,
5901 5930 "%s=enabled", nvpair_name(elem));
5902 5931 break;
5903 5932
5904 5933 case ZPOOL_PROP_VERSION:
5905 5934 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5906 5935 /*
5907 5936 * The version is synced separately before other
5908 5937 * properties and should be correct by now.
5909 5938 */
5910 5939 ASSERT3U(spa_version(spa), >=, intval);
5911 5940 break;
5912 5941
5913 5942 case ZPOOL_PROP_ALTROOT:
5914 5943 /*
5915 5944 * 'altroot' is a non-persistent property. It should
5916 5945 * have been set temporarily at creation or import time.
5917 5946 */
5918 5947 ASSERT(spa->spa_root != NULL);
5919 5948 break;
5920 5949
5921 5950 case ZPOOL_PROP_READONLY:
5922 5951 case ZPOOL_PROP_CACHEFILE:
5923 5952 /*
5924 5953 * 'readonly' and 'cachefile' are also non-persistent
5925 5954 * properties.
5926 5955 */
5927 5956 break;
5928 5957 case ZPOOL_PROP_COMMENT:
5929 5958 VERIFY(nvpair_value_string(elem, &strval) == 0);
5930 5959 if (spa->spa_comment != NULL)
5931 5960 spa_strfree(spa->spa_comment);
5932 5961 spa->spa_comment = spa_strdup(strval);
5933 5962 /*
5934 5963 * We need to dirty the configuration on all the vdevs
5935 5964 * so that their labels get updated. It's unnecessary
5936 5965 * to do this for pool creation since the vdev's
5937 5966 * configuration has already been dirtied.
5938 5967 */
5939 5968 if (tx->tx_txg != TXG_INITIAL)
5940 5969 vdev_config_dirty(spa->spa_root_vdev);
5941 5970 spa_history_log_internal(spa, "set", tx,
5942 5971 "%s=%s", nvpair_name(elem), strval);
5943 5972 break;
5944 5973 default:
5945 5974 /*
5946 5975 * Set pool property values in the poolprops mos object.
5947 5976 */
5948 5977 if (spa->spa_pool_props_object == 0) {
5949 5978 spa->spa_pool_props_object =
5950 5979 zap_create_link(mos, DMU_OT_POOL_PROPS,
5951 5980 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
5952 5981 tx);
5953 5982 }
5954 5983
5955 5984 /* normalize the property name */
5956 5985 propname = zpool_prop_to_name(prop);
5957 5986 proptype = zpool_prop_get_type(prop);
5958 5987
5959 5988 if (nvpair_type(elem) == DATA_TYPE_STRING) {
5960 5989 ASSERT(proptype == PROP_TYPE_STRING);
5961 5990 VERIFY(nvpair_value_string(elem, &strval) == 0);
5962 5991 VERIFY(zap_update(mos,
5963 5992 spa->spa_pool_props_object, propname,
5964 5993 1, strlen(strval) + 1, strval, tx) == 0);
5965 5994 spa_history_log_internal(spa, "set", tx,
5966 5995 "%s=%s", nvpair_name(elem), strval);
5967 5996 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5968 5997 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5969 5998
5970 5999 if (proptype == PROP_TYPE_INDEX) {
5971 6000 const char *unused;
5972 6001 VERIFY(zpool_prop_index_to_string(
5973 6002 prop, intval, &unused) == 0);
5974 6003 }
5975 6004 VERIFY(zap_update(mos,
5976 6005 spa->spa_pool_props_object, propname,
5977 6006 8, 1, &intval, tx) == 0);
5978 6007 spa_history_log_internal(spa, "set", tx,
5979 6008 "%s=%lld", nvpair_name(elem), intval);
5980 6009 } else {
5981 6010 ASSERT(0); /* not allowed */
5982 6011 }
5983 6012
5984 6013 switch (prop) {
5985 6014 case ZPOOL_PROP_DELEGATION:
5986 6015 spa->spa_delegation = intval;
5987 6016 break;
5988 6017 case ZPOOL_PROP_BOOTFS:
5989 6018 spa->spa_bootfs = intval;
5990 6019 break;
5991 6020 case ZPOOL_PROP_FAILUREMODE:
5992 6021 spa->spa_failmode = intval;
5993 6022 break;
5994 6023 case ZPOOL_PROP_AUTOEXPAND:
5995 6024 spa->spa_autoexpand = intval;
5996 6025 if (tx->tx_txg != TXG_INITIAL)
5997 6026 spa_async_request(spa,
5998 6027 SPA_ASYNC_AUTOEXPAND);
5999 6028 break;
6000 6029 case ZPOOL_PROP_DEDUPDITTO:
6001 6030 spa->spa_dedup_ditto = intval;
6002 6031 break;
6003 6032 default:
6004 6033 break;
6005 6034 }
6006 6035 }
6007 6036
6008 6037 }
6009 6038
6010 6039 mutex_exit(&spa->spa_props_lock);
6011 6040 }
6012 6041
6013 6042 /*
6014 6043 * Perform one-time upgrade on-disk changes. spa_version() does not
6015 6044 * reflect the new version this txg, so there must be no changes this
6016 6045 * txg to anything that the upgrade code depends on after it executes.
6017 6046 * Therefore this must be called after dsl_pool_sync() does the sync
6018 6047 * tasks.
6019 6048 */
6020 6049 static void
6021 6050 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
6022 6051 {
6023 6052 dsl_pool_t *dp = spa->spa_dsl_pool;
6024 6053
6025 6054 ASSERT(spa->spa_sync_pass == 1);
6026 6055
6027 6056 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
6028 6057
6029 6058 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
6030 6059 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
6031 6060 dsl_pool_create_origin(dp, tx);
6032 6061
6033 6062 /* Keeping the origin open increases spa_minref */
6034 6063 spa->spa_minref += 3;
6035 6064 }
6036 6065
6037 6066 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
6038 6067 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
6039 6068 dsl_pool_upgrade_clones(dp, tx);
6040 6069 }
6041 6070
6042 6071 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
6043 6072 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
6044 6073 dsl_pool_upgrade_dir_clones(dp, tx);
6045 6074
6046 6075 /* Keeping the freedir open increases spa_minref */
6047 6076 spa->spa_minref += 3;
6048 6077 }
6049 6078
6050 6079 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6051 6080 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6052 6081 spa_feature_create_zap_objects(spa, tx);
6053 6082 }
6054 6083 rrw_exit(&dp->dp_config_rwlock, FTAG);
6055 6084 }
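Each upgrade block above runs exactly once: only in the txg where the last-synced version (spa_ubsync.ub_version, what is on disk) is still below a given boundary while the in-core version (spa_uberblock.ub_version, what this txg will write) has reached it. A tiny sketch of that crossing test, with purely illustrative version numbers:

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative only: 1 when this txg crosses the boundary and the
 * one-time upgrade work should run, 0 otherwise.
 */
static int
crossing(uint64_t last_synced, uint64_t in_core, uint64_t boundary)
{
    return (last_synced < boundary && in_core >= boundary);
}

int
main(void)
{
    uint64_t boundary = 11;     /* stand-in for some SPA_VERSION_* value */

    printf("%d\n", crossing(10, 11, boundary)); /* 1: run the upgrade */
    printf("%d\n", crossing(11, 11, boundary)); /* 0: already upgraded */
    printf("%d\n", crossing(10, 10, boundary)); /* 0: not upgrading yet */
    return (0);
}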
6056 6085
6057 6086 /*
6058 6087 * Sync the specified transaction group. New blocks may be dirtied as
6059 6088 * part of the process, so we iterate until it converges.
6060 6089 */
6061 6090 void
6062 6091 spa_sync(spa_t *spa, uint64_t txg)
6063 6092 {
6064 6093 dsl_pool_t *dp = spa->spa_dsl_pool;
6065 6094 objset_t *mos = spa->spa_meta_objset;
6066 - bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
6067 6095 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
6068 6096 vdev_t *rvd = spa->spa_root_vdev;
6069 6097 vdev_t *vd;
6070 6098 dmu_tx_t *tx;
6071 6099 int error;
6072 6100
6073 6101 VERIFY(spa_writeable(spa));
6074 6102
6075 6103 /*
6076 6104 * Lock out configuration changes.
6077 6105 */
6078 6106 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6079 6107
6080 6108 spa->spa_syncing_txg = txg;
6081 6109 spa->spa_sync_pass = 0;
6082 6110
6083 6111 /*
6084 6112 * If there are any pending vdev state changes, convert them
6085 6113 * into config changes that go out with this transaction group.
6086 6114 */
6087 6115 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6088 6116 while (list_head(&spa->spa_state_dirty_list) != NULL) {
6089 6117 /*
6090 6118 * We need the write lock here because, for aux vdevs,
6091 6119 * calling vdev_config_dirty() modifies sav_config.
6092 6120 * This is ugly and will become unnecessary when we
6093 6121 * eliminate the aux vdev wart by integrating all vdevs
6094 6122 * into the root vdev tree.
6095 6123 */
6096 6124 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6097 6125 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
6098 6126 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
6099 6127 vdev_state_clean(vd);
6100 6128 vdev_config_dirty(vd);
6101 6129 }
6102 6130 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6103 6131 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
6104 6132 }
6105 6133 spa_config_exit(spa, SCL_STATE, FTAG);
6106 6134
6107 6135 tx = dmu_tx_create_assigned(dp, txg);
6108 6136
6109 6137 spa->spa_sync_starttime = gethrtime();
6110 6138 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
6111 6139 spa->spa_sync_starttime + spa->spa_deadman_synctime));
6112 6140
6113 6141 /*
6114 6142 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
6115 6143 * set spa_deflate if we have no raid-z vdevs.
6116 6144 */
6117 6145 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
6118 6146 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
6119 6147 int i;
6120 6148
6121 6149 for (i = 0; i < rvd->vdev_children; i++) {
6122 6150 vd = rvd->vdev_child[i];
6123 6151 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
6124 6152 break;
6125 6153 }
6126 6154 if (i == rvd->vdev_children) {
6127 6155 spa->spa_deflate = TRUE;
6128 6156 VERIFY(0 == zap_add(spa->spa_meta_objset,
6129 6157 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
6130 6158 sizeof (uint64_t), 1, &spa->spa_deflate, tx));
6131 6159 }
6132 6160 }
6133 6161
6134 6162 /*
6135 6163 * If anything has changed in this txg, or if someone is waiting
6136 6164 * for this txg to sync (eg, spa_vdev_remove()), push the
6137 6165 * deferred frees from the previous txg. If not, leave them
6138 6166 * alone so that we don't generate work on an otherwise idle
6139 6167 * system.
6140 6168 */
6141 6169 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
6142 6170 !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
6143 6171 !txg_list_empty(&dp->dp_sync_tasks, txg) ||
6144 6172 ((dsl_scan_active(dp->dp_scan) ||
6145 6173 txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
6146 - zio_t *zio = zio_root(spa, NULL, NULL, 0);
6147 - VERIFY3U(bpobj_iterate(defer_bpo,
6148 - spa_free_sync_cb, zio, tx), ==, 0);
6149 - VERIFY0(zio_wait(zio));
6174 + spa_sync_deferred_frees(spa, tx);
6150 6175 }
6151 6176
6152 6177 /*
6153 6178 * Iterate to convergence.
6154 6179 */
6155 6180 do {
6156 6181 int pass = ++spa->spa_sync_pass;
6157 6182
6158 6183 spa_sync_config_object(spa, tx);
6159 6184 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
6160 6185 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
6161 6186 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
6162 6187 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
6163 6188 spa_errlog_sync(spa, txg);
6164 6189 dsl_pool_sync(dp, txg);
6165 6190
6166 6191 if (pass < zfs_sync_pass_deferred_free) {
6167 - zio_t *zio = zio_root(spa, NULL, NULL, 0);
6168 - bplist_iterate(free_bpl, spa_free_sync_cb,
6169 - zio, tx);
6170 - VERIFY(zio_wait(zio) == 0);
6192 + spa_sync_frees(spa, free_bpl, tx);
6171 6193 } else {
6172 6194 bplist_iterate(free_bpl, bpobj_enqueue_cb,
6173 - defer_bpo, tx);
6195 + &spa->spa_deferred_bpobj, tx);
6174 6196 }
6175 6197
6176 6198 ddt_sync(spa, txg);
6177 6199 dsl_scan_sync(dp, tx);
6178 6200
6179 6201 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
6180 6202 vdev_sync(vd, txg);
6181 6203
6182 6204 if (pass == 1)
6183 6205 spa_sync_upgrades(spa, tx);
6184 6206
6185 6207 } while (dmu_objset_is_dirty(mos, txg));
6186 6208
6187 6209 /*
6188 6210 * Rewrite the vdev configuration (which includes the uberblock)
6189 6211 * to commit the transaction group.
6190 6212 *
6191 6213 * If there are no dirty vdevs, we sync the uberblock to a few
6192 6214 * random top-level vdevs that are known to be visible in the
6193 6215 * config cache (see spa_vdev_add() for a complete description).
6194 6216 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
6195 6217 */
6196 6218 for (;;) {
6197 6219 /*
6198 6220 * We hold SCL_STATE to prevent vdev open/close/etc.
6199 6221 * while we're attempting to write the vdev labels.
6200 6222 */
6201 6223 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6202 6224
6203 6225 if (list_is_empty(&spa->spa_config_dirty_list)) {
6204 6226 vdev_t *svd[SPA_DVAS_PER_BP];
6205 6227 int svdcount = 0;
6206 6228 int children = rvd->vdev_children;
6207 6229 int c0 = spa_get_random(children);
6208 6230
6209 6231 for (int c = 0; c < children; c++) {
6210 6232 vd = rvd->vdev_child[(c0 + c) % children];
6211 6233 if (vd->vdev_ms_array == 0 || vd->vdev_islog)
6212 6234 continue;
6213 6235 svd[svdcount++] = vd;
6214 6236 if (svdcount == SPA_DVAS_PER_BP)
6215 6237 break;
6216 6238 }
6217 6239 error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
6218 6240 if (error != 0)
6219 6241 error = vdev_config_sync(svd, svdcount, txg,
6220 6242 B_TRUE);
6221 6243 } else {
6222 6244 error = vdev_config_sync(rvd->vdev_child,
6223 6245 rvd->vdev_children, txg, B_FALSE);
6224 6246 if (error != 0)
6225 6247 error = vdev_config_sync(rvd->vdev_child,
6226 6248 rvd->vdev_children, txg, B_TRUE);
6227 6249 }
6228 6250
6229 6251 if (error == 0)
6230 6252 spa->spa_last_synced_guid = rvd->vdev_guid;
6231 6253
6232 6254 spa_config_exit(spa, SCL_STATE, FTAG);
6233 6255
6234 6256 if (error == 0)
6235 6257 break;
6236 6258 zio_suspend(spa, NULL);
6237 6259 zio_resume_wait(spa);
6238 6260 }
6239 6261 dmu_tx_commit(tx);
6240 6262
6241 6263 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
6242 6264
6243 6265 /*
6244 6266 * Clear the dirty config list.
6245 6267 */
6246 6268 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
6247 6269 vdev_config_clean(vd);
6248 6270
6249 6271 /*
6250 6272 * Now that the new config has synced transactionally,
6251 6273 * let it become visible to the config cache.
6252 6274 */
6253 6275 if (spa->spa_config_syncing != NULL) {
6254 6276 spa_config_set(spa, spa->spa_config_syncing);
6255 6277 spa->spa_config_txg = txg;
6256 6278 spa->spa_config_syncing = NULL;
6257 6279 }
6258 6280
6259 6281 spa->spa_ubsync = spa->spa_uberblock;
6260 6282
6261 6283 dsl_pool_sync_done(dp, txg);
6262 6284
6263 6285 /*
6264 6286 * Update usable space statistics.
6265 6287 */
6266 6288 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
6267 6289 vdev_sync_done(vd, txg);
6268 6290
6269 6291 spa_update_dspace(spa);
6270 6292
6271 6293 /*
6272 6294 * It had better be the case that we didn't dirty anything
6273 6295 * since vdev_config_sync().
6274 6296 */
6275 6297 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
6276 6298 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
6277 6299 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
6278 6300
6279 6301 spa->spa_sync_pass = 0;
6280 6302
6281 6303 spa_config_exit(spa, SCL_CONFIG, FTAG);
6282 6304
6283 6305 spa_handle_ignored_writes(spa);
6284 6306
6285 6307 /*
6286 6308 * If any async tasks have been requested, kick them off.
6287 6309 */
6288 6310 spa_async_dispatch(spa);
6289 6311 }
6290 6312
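When nothing is config-dirty, spa_sync() above writes the uberblock to a few top-level vdevs picked by walking the children from a random starting index, skipping holes and slogs, and stopping once SPA_DVAS_PER_BP candidates are collected. Below is a userland sketch of that selection walk; SPA_DVAS_PER_BP is assumed to be 3 here (one slot per DVA in a block pointer) and rand() stands in for spa_get_random().

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define DVAS_PER_BP 3   /* assumed value of SPA_DVAS_PER_BP */

int
main(void)
{
    /* 1 marks a usable top-level vdev, 0 marks a hole or log slot. */
    int usable[] = { 1, 0, 1, 1, 1 };
    int children = sizeof (usable) / sizeof (usable[0]);
    int svd[DVAS_PER_BP];
    int svdcount = 0;

    srand((unsigned)time(NULL));
    int c0 = rand() % children;         /* random starting child */

    for (int c = 0; c < children; c++) {
        int idx = (c0 + c) % children;  /* wrap around the child array */
        if (!usable[idx])
            continue;
        svd[svdcount++] = idx;
        if (svdcount == DVAS_PER_BP)
            break;
    }

    for (int i = 0; i < svdcount; i++)
        printf("label sync target: child %d\n", svd[i]);
    return (0);
}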
6291 6313 /*
6292 6314 * Sync all pools. We don't want to hold the namespace lock across these
6293 6315 * operations, so we take a reference on the spa_t and drop the lock during the
6294 6316 * sync.
6295 6317 */
6296 6318 void
6297 6319 spa_sync_allpools(void)
6298 6320 {
6299 6321 spa_t *spa = NULL;
6300 6322 mutex_enter(&spa_namespace_lock);
6301 6323 while ((spa = spa_next(spa)) != NULL) {
6302 6324 if (spa_state(spa) != POOL_STATE_ACTIVE ||
6303 6325 !spa_writeable(spa) || spa_suspended(spa))
6304 6326 continue;
6305 6327 spa_open_ref(spa, FTAG);
6306 6328 mutex_exit(&spa_namespace_lock);
6307 6329 txg_wait_synced(spa_get_dsl(spa), 0);
6308 6330 mutex_enter(&spa_namespace_lock);
6309 6331 spa_close(spa, FTAG);
6310 6332 }
6311 6333 mutex_exit(&spa_namespace_lock);
6312 6334 }
6313 6335
6314 6336 /*
6315 6337 * ==========================================================================
6316 6338 * Miscellaneous routines
6317 6339 * ==========================================================================
6318 6340 */
6319 6341
6320 6342 /*
6321 6343 * Remove all pools in the system.
6322 6344 */
6323 6345 void
6324 6346 spa_evict_all(void)
6325 6347 {
6326 6348 spa_t *spa;
6327 6349
6328 6350 /*
6329 6351 * Remove all cached state. All pools should be closed now,
6330 6352 * so every spa in the AVL tree should be unreferenced.
6331 6353 */
6332 6354 mutex_enter(&spa_namespace_lock);
6333 6355 while ((spa = spa_next(NULL)) != NULL) {
6334 6356 /*
6335 6357 * Stop async tasks. The async thread may need to detach
6336 6358 * a device that's been replaced, which requires grabbing
6337 6359 * spa_namespace_lock, so we must drop it here.
6338 6360 */
6339 6361 spa_open_ref(spa, FTAG);
6340 6362 mutex_exit(&spa_namespace_lock);
6341 6363 spa_async_suspend(spa);
6342 6364 mutex_enter(&spa_namespace_lock);
6343 6365 spa_close(spa, FTAG);
6344 6366
6345 6367 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
6346 6368 spa_unload(spa);
6347 6369 spa_deactivate(spa);
6348 6370 }
6349 6371 spa_remove(spa);
6350 6372 }
6351 6373 mutex_exit(&spa_namespace_lock);
6352 6374 }
6353 6375
6354 6376 vdev_t *
6355 6377 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
6356 6378 {
6357 6379 vdev_t *vd;
6358 6380 int i;
6359 6381
6360 6382 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
6361 6383 return (vd);
6362 6384
6363 6385 if (aux) {
6364 6386 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
6365 6387 vd = spa->spa_l2cache.sav_vdevs[i];
6366 6388 if (vd->vdev_guid == guid)
6367 6389 return (vd);
6368 6390 }
6369 6391
6370 6392 for (i = 0; i < spa->spa_spares.sav_count; i++) {
6371 6393 vd = spa->spa_spares.sav_vdevs[i];
6372 6394 if (vd->vdev_guid == guid)
6373 6395 return (vd);
6374 6396 }
6375 6397 }
6376 6398
6377 6399 return (NULL);
6378 6400 }
6379 6401
6380 6402 void
6381 6403 spa_upgrade(spa_t *spa, uint64_t version)
6382 6404 {
6383 6405 ASSERT(spa_writeable(spa));
6384 6406
6385 6407 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6386 6408
6387 6409 /*
6388 6410 * This should only be called for a non-faulted pool, and since a
6389 6411 * future version would result in an unopenable pool, this shouldn't be
6390 6412 * possible.
6391 6413 */
6392 6414 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
6393 6415 ASSERT(version >= spa->spa_uberblock.ub_version);
6394 6416
6395 6417 spa->spa_uberblock.ub_version = version;
6396 6418 vdev_config_dirty(spa->spa_root_vdev);
6397 6419
6398 6420 spa_config_exit(spa, SCL_ALL, FTAG);
6399 6421
6400 6422 txg_wait_synced(spa_get_dsl(spa), 0);
6401 6423 }
6402 6424
6403 6425 boolean_t
6404 6426 spa_has_spare(spa_t *spa, uint64_t guid)
6405 6427 {
6406 6428 int i;
6407 6429 uint64_t spareguid;
6408 6430 spa_aux_vdev_t *sav = &spa->spa_spares;
6409 6431
6410 6432 for (i = 0; i < sav->sav_count; i++)
6411 6433 if (sav->sav_vdevs[i]->vdev_guid == guid)
6412 6434 return (B_TRUE);
6413 6435
6414 6436 for (i = 0; i < sav->sav_npending; i++) {
6415 6437 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
6416 6438 &spareguid) == 0 && spareguid == guid)
6417 6439 return (B_TRUE);
6418 6440 }
6419 6441
6420 6442 return (B_FALSE);
6421 6443 }
6422 6444
6423 6445 /*
6424 6446 * Check if a pool has an active shared spare device.
6425 6447 * Note: the reference count of an active spare is 2, once as a spare and once as a replacement
6426 6448 */
6427 6449 static boolean_t
6428 6450 spa_has_active_shared_spare(spa_t *spa)
6429 6451 {
6430 6452 int i, refcnt;
6431 6453 uint64_t pool;
6432 6454 spa_aux_vdev_t *sav = &spa->spa_spares;
6433 6455
6434 6456 for (i = 0; i < sav->sav_count; i++) {
6435 6457 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
6436 6458 &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
6437 6459 refcnt > 2)
6438 6460 return (B_TRUE);
6439 6461 }
6440 6462
6441 6463 return (B_FALSE);
6442 6464 }
6443 6465
6444 6466 /*
6445 6467 * Post a sysevent corresponding to the given event. The 'name' must be one of
6446 6468 * the event definitions in sys/sysevent/eventdefs.h. The payload will be
6447 6469 * filled in from the spa and (optionally) the vdev. This doesn't do anything
6448 6470 * in the userland libzpool, as we don't want consumers to misinterpret ztest
6449 6471 * or zdb as real changes.
6450 6472 */
6451 6473 void
6452 6474 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
6453 6475 {
6454 6476 #ifdef _KERNEL
6455 6477 sysevent_t *ev;
6456 6478 sysevent_attr_list_t *attr = NULL;
6457 6479 sysevent_value_t value;
6458 6480 sysevent_id_t eid;
6459 6481
6460 6482 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
6461 6483 SE_SLEEP);
6462 6484
6463 6485 value.value_type = SE_DATA_TYPE_STRING;
6464 6486 value.value.sv_string = spa_name(spa);
6465 6487 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
6466 6488 goto done;
6467 6489
6468 6490 value.value_type = SE_DATA_TYPE_UINT64;
6469 6491 value.value.sv_uint64 = spa_guid(spa);
6470 6492 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
6471 6493 goto done;
6472 6494
6473 6495 if (vd) {
6474 6496 value.value_type = SE_DATA_TYPE_UINT64;
6475 6497 value.value.sv_uint64 = vd->vdev_guid;
6476 6498 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
6477 6499 SE_SLEEP) != 0)
6478 6500 goto done;
6479 6501
6480 6502 if (vd->vdev_path) {
6481 6503 value.value_type = SE_DATA_TYPE_STRING;
6482 6504 value.value.sv_string = vd->vdev_path;
6483 6505 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
6484 6506 &value, SE_SLEEP) != 0)
6485 6507 goto done;
6486 6508 }
6487 6509 }
6488 6510
6489 6511 if (sysevent_attach_attributes(ev, attr) != 0)
6490 6512 goto done;
6491 6513 attr = NULL;
6492 6514
6493 6515 (void) log_sysevent(ev, SE_SLEEP, &eid);
6494 6516
6495 6517 done:
6496 6518 if (attr)
6497 6519 sysevent_free_attr(attr);
6498 6520 sysevent_free(ev);
6499 6521 #endif
6500 6522 }