Print this page
    
3742 zfs comments need cleaner, more consistent style
Submitted by:   Will Andrews <willa@spectralogic.com>
Submitted by:   Alan Somers <alans@spectralogic.com>
Reviewed by:    Matthew Ahrens <mahrens@delphix.com>
Reviewed by:    George Wilson <george.wilson@delphix.com>
Reviewed by:    Eric Schrock <eric.schrock@delphix.com>
    
      
        | Split | Close | 
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/spa.c
          +++ new/usr/src/uts/common/fs/zfs/spa.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2013 by Delphix. All rights reserved.
  25   25   * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  26   26   */
  27   27  
  28   28  /*
  29   29   * SPA: Storage Pool Allocator
  30   30   *
  31   31   * This file contains all the routines used when modifying on-disk SPA state.
  32   32   * This includes opening, importing, destroying, exporting a pool, and syncing a
  33   33   * pool.
  34   34   */
  35   35  
  36   36  #include <sys/zfs_context.h>
  37   37  #include <sys/fm/fs/zfs.h>
  38   38  #include <sys/spa_impl.h>
  39   39  #include <sys/zio.h>
  40   40  #include <sys/zio_checksum.h>
  41   41  #include <sys/dmu.h>
  42   42  #include <sys/dmu_tx.h>
  43   43  #include <sys/zap.h>
  44   44  #include <sys/zil.h>
  45   45  #include <sys/ddt.h>
  46   46  #include <sys/vdev_impl.h>
  47   47  #include <sys/metaslab.h>
  48   48  #include <sys/metaslab_impl.h>
  49   49  #include <sys/uberblock_impl.h>
  50   50  #include <sys/txg.h>
  51   51  #include <sys/avl.h>
  52   52  #include <sys/dmu_traverse.h>
  53   53  #include <sys/dmu_objset.h>
  54   54  #include <sys/unique.h>
  55   55  #include <sys/dsl_pool.h>
  56   56  #include <sys/dsl_dataset.h>
  57   57  #include <sys/dsl_dir.h>
  58   58  #include <sys/dsl_prop.h>
  59   59  #include <sys/dsl_synctask.h>
  60   60  #include <sys/fs/zfs.h>
  61   61  #include <sys/arc.h>
  62   62  #include <sys/callb.h>
  63   63  #include <sys/systeminfo.h>
  64   64  #include <sys/spa_boot.h>
  65   65  #include <sys/zfs_ioctl.h>
  66   66  #include <sys/dsl_scan.h>
  67   67  #include <sys/zfeature.h>
  68   68  #include <sys/dsl_destroy.h>
  69   69  
  70   70  #ifdef  _KERNEL
  71   71  #include <sys/bootprops.h>
  72   72  #include <sys/callb.h>
  73   73  #include <sys/cpupart.h>
  74   74  #include <sys/pool.h>
  75   75  #include <sys/sysdc.h>
  76   76  #include <sys/zone.h>
  77   77  #endif  /* _KERNEL */
  78   78  
  79   79  #include "zfs_prop.h"
  80   80  #include "zfs_comutil.h"
  81   81  
  82   82  typedef enum zti_modes {
  83   83          ZTI_MODE_FIXED,                 /* value is # of threads (min 1) */
  84   84          ZTI_MODE_ONLINE_PERCENT,        /* value is % of online CPUs */
  85   85          ZTI_MODE_BATCH,                 /* cpu-intensive; value is ignored */
  86   86          ZTI_MODE_NULL,                  /* don't create a taskq */
  87   87          ZTI_NMODES
  88   88  } zti_modes_t;
  89   89  
  90   90  #define ZTI_P(n, q)     { ZTI_MODE_FIXED, (n), (q) }
  91   91  #define ZTI_PCT(n)      { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
  92   92  #define ZTI_BATCH       { ZTI_MODE_BATCH, 0, 1 }
  93   93  #define ZTI_NULL        { ZTI_MODE_NULL, 0, 0 }
  94   94  
  95   95  #define ZTI_N(n)        ZTI_P(n, 1)
  96   96  #define ZTI_ONE         ZTI_N(1)
  97   97  
  98   98  typedef struct zio_taskq_info {
  99   99          zti_modes_t zti_mode;
 100  100          uint_t zti_value;
 101  101          uint_t zti_count;
 102  102  } zio_taskq_info_t;
 103  103  
 104  104  static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 105  105          "issue", "issue_high", "intr", "intr_high"
 106  106  };
 107  107  
 108  108  /*
 109  109   * This table defines the taskq settings for each ZFS I/O type. When
 110  110   * initializing a pool, we use this table to create an appropriately sized
 111  111   * taskq. Some operations are low volume and therefore have a small, static
 112  112   * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 113  113   * macros. Other operations process a large amount of data; the ZTI_BATCH
 114  114   * macro causes us to create a taskq oriented for throughput. Some operations
  115  115   * are so high frequency and short-lived that the taskq itself can become a
 116  116   * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 117  117   * additional degree of parallelism specified by the number of threads per-
 118  118   * taskq and the number of taskqs; when dispatching an event in this case, the
 119  119   * particular taskq is chosen at random.
 120  120   *
 121  121   * The different taskq priorities are to handle the different contexts (issue
 122  122   * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 123  123   * need to be handled with minimum delay.
 124  124   */
 125  125  const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 126  126          /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
 127  127          { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* NULL */
 128  128          { ZTI_N(8),     ZTI_NULL,       ZTI_BATCH,      ZTI_NULL }, /* READ */
 129  129          { ZTI_BATCH,    ZTI_N(5),       ZTI_N(8),       ZTI_N(5) }, /* WRITE */
 130  130          { ZTI_P(12, 8), ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* FREE */
 131  131          { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* CLAIM */
 132  132          { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* IOCTL */
 133  133  };
 134  134  
 135  135  static void spa_sync_version(void *arg, dmu_tx_t *tx);
 136  136  static void spa_sync_props(void *arg, dmu_tx_t *tx);
 137  137  static boolean_t spa_has_active_shared_spare(spa_t *spa);
 138  138  static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
 139  139      spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
 140  140      char **ereport);
 141  141  static void spa_vdev_resilver_done(spa_t *spa);
 142  142  
 143  143  uint_t          zio_taskq_batch_pct = 100;      /* 1 thread per cpu in pset */
 144  144  id_t            zio_taskq_psrset_bind = PS_NONE;
 145  145  boolean_t       zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
 146  146  uint_t          zio_taskq_basedc = 80;          /* base duty cycle */
 147  147  
 148  148  boolean_t       spa_create_process = B_TRUE;    /* no process ==> no sysdc */
 149  149  extern int      zfs_sync_pass_deferred_free;
 150  150  
 151  151  /*
 152  152   * This (illegal) pool name is used when temporarily importing a spa_t in order
 153  153   * to get the vdev stats associated with the imported devices.
 154  154   */
 155  155  #define TRYIMPORT_NAME  "$import"
 156  156  
 157  157  /*
 158  158   * ==========================================================================
 159  159   * SPA properties routines
 160  160   * ==========================================================================
 161  161   */
 162  162  
 163  163  /*
 164  164   * Add a (source=src, propname=propval) list to an nvlist.
 165  165   */
 166  166  static void
 167  167  spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
 168  168      uint64_t intval, zprop_source_t src)
 169  169  {
 170  170          const char *propname = zpool_prop_to_name(prop);
 171  171          nvlist_t *propval;
 172  172  
 173  173          VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 174  174          VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
 175  175  
 176  176          if (strval != NULL)
 177  177                  VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
 178  178          else
 179  179                  VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
 180  180  
 181  181          VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
 182  182          nvlist_free(propval);
 183  183  }
 184  184  
 185  185  /*
 186  186   * Get property values from the spa configuration.
 187  187   */
 188  188  static void
 189  189  spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 190  190  {
 191  191          vdev_t *rvd = spa->spa_root_vdev;
 192  192          dsl_pool_t *pool = spa->spa_dsl_pool;
 193  193          uint64_t size;
 194  194          uint64_t alloc;
 195  195          uint64_t space;
 196  196          uint64_t cap, version;
 197  197          zprop_source_t src = ZPROP_SRC_NONE;
 198  198          spa_config_dirent_t *dp;
 199  199  
 200  200          ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 201  201  
 202  202          if (rvd != NULL) {
 203  203                  alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 204  204                  size = metaslab_class_get_space(spa_normal_class(spa));
 205  205                  spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 206  206                  spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 207  207                  spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
 208  208                  spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
 209  209                      size - alloc, src);
 210  210  
 211  211                  space = 0;
 212  212                  for (int c = 0; c < rvd->vdev_children; c++) {
 213  213                          vdev_t *tvd = rvd->vdev_child[c];
 214  214                          space += tvd->vdev_max_asize - tvd->vdev_asize;
 215  215                  }
 216  216                  spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
 217  217                      src);
 218  218  
 219  219                  spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
 220  220                      (spa_mode(spa) == FREAD), src);
 221  221  
 222  222                  cap = (size == 0) ? 0 : (alloc * 100 / size);
 223  223                  spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 224  224  
 225  225                  spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
 226  226                      ddt_get_pool_dedup_ratio(spa), src);
 227  227  
 228  228                  spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
 229  229                      rvd->vdev_state, src);
 230  230  
 231  231                  version = spa_version(spa);
 232  232                  if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
 233  233                          src = ZPROP_SRC_DEFAULT;
 234  234                  else
 235  235                          src = ZPROP_SRC_LOCAL;
 236  236                  spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
 237  237          }
 238  238  
 239  239          if (pool != NULL) {
 240  240                  dsl_dir_t *freedir = pool->dp_free_dir;
 241  241  
 242  242                  /*
 243  243                   * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
 244  244                   * when opening pools before this version freedir will be NULL.
 245  245                   */
 246  246                  if (freedir != NULL) {
 247  247                          spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
 248  248                              freedir->dd_phys->dd_used_bytes, src);
 249  249                  } else {
 250  250                          spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
 251  251                              NULL, 0, src);
 252  252                  }
 253  253          }
 254  254  
 255  255          spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 256  256  
 257  257          if (spa->spa_comment != NULL) {
 258  258                  spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
 259  259                      0, ZPROP_SRC_LOCAL);
 260  260          }
 261  261  
 262  262          if (spa->spa_root != NULL)
 263  263                  spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
 264  264                      0, ZPROP_SRC_LOCAL);
 265  265  
 266  266          if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 267  267                  if (dp->scd_path == NULL) {
 268  268                          spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 269  269                              "none", 0, ZPROP_SRC_LOCAL);
 270  270                  } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
 271  271                          spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 272  272                              dp->scd_path, 0, ZPROP_SRC_LOCAL);
 273  273                  }
 274  274          }
 275  275  }
 276  276  
 277  277  /*
 278  278   * Get zpool property values.
 279  279   */
 280  280  int
 281  281  spa_prop_get(spa_t *spa, nvlist_t **nvp)
 282  282  {
 283  283          objset_t *mos = spa->spa_meta_objset;
 284  284          zap_cursor_t zc;
 285  285          zap_attribute_t za;
 286  286          int err;
 287  287  
 288  288          VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 289  289  
 290  290          mutex_enter(&spa->spa_props_lock);
 291  291  
 292  292          /*
 293  293           * Get properties from the spa config.
 294  294           */
 295  295          spa_prop_get_config(spa, nvp);
 296  296  
 297  297          /* If no pool property object, no more prop to get. */
 298  298          if (mos == NULL || spa->spa_pool_props_object == 0) {
 299  299                  mutex_exit(&spa->spa_props_lock);
 300  300                  return (0);
 301  301          }
 302  302  
 303  303          /*
 304  304           * Get properties from the MOS pool property object.
 305  305           */
 306  306          for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
 307  307              (err = zap_cursor_retrieve(&zc, &za)) == 0;
 308  308              zap_cursor_advance(&zc)) {
 309  309                  uint64_t intval = 0;
 310  310                  char *strval = NULL;
 311  311                  zprop_source_t src = ZPROP_SRC_DEFAULT;
 312  312                  zpool_prop_t prop;
 313  313  
 314  314                  if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
 315  315                          continue;
 316  316  
 317  317                  switch (za.za_integer_length) {
 318  318                  case 8:
 319  319                          /* integer property */
 320  320                          if (za.za_first_integer !=
 321  321                              zpool_prop_default_numeric(prop))
 322  322                                  src = ZPROP_SRC_LOCAL;
 323  323  
 324  324                          if (prop == ZPOOL_PROP_BOOTFS) {
 325  325                                  dsl_pool_t *dp;
 326  326                                  dsl_dataset_t *ds = NULL;
 327  327  
 328  328                                  dp = spa_get_dsl(spa);
 329  329                                  dsl_pool_config_enter(dp, FTAG);
 330  330                                  if (err = dsl_dataset_hold_obj(dp,
 331  331                                      za.za_first_integer, FTAG, &ds)) {
 332  332                                          dsl_pool_config_exit(dp, FTAG);
 333  333                                          break;
 334  334                                  }
 335  335  
 336  336                                  strval = kmem_alloc(
 337  337                                      MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
 338  338                                      KM_SLEEP);
 339  339                                  dsl_dataset_name(ds, strval);
 340  340                                  dsl_dataset_rele(ds, FTAG);
 341  341                                  dsl_pool_config_exit(dp, FTAG);
 342  342                          } else {
 343  343                                  strval = NULL;
 344  344                                  intval = za.za_first_integer;
 345  345                          }
 346  346  
 347  347                          spa_prop_add_list(*nvp, prop, strval, intval, src);
 348  348  
 349  349                          if (strval != NULL)
 350  350                                  kmem_free(strval,
 351  351                                      MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
 352  352  
 353  353                          break;
 354  354  
 355  355                  case 1:
 356  356                          /* string property */
 357  357                          strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
 358  358                          err = zap_lookup(mos, spa->spa_pool_props_object,
 359  359                              za.za_name, 1, za.za_num_integers, strval);
 360  360                          if (err) {
 361  361                                  kmem_free(strval, za.za_num_integers);
 362  362                                  break;
 363  363                          }
 364  364                          spa_prop_add_list(*nvp, prop, strval, 0, src);
 365  365                          kmem_free(strval, za.za_num_integers);
 366  366                          break;
 367  367  
 368  368                  default:
 369  369                          break;
 370  370                  }
 371  371          }
 372  372          zap_cursor_fini(&zc);
 373  373          mutex_exit(&spa->spa_props_lock);
 374  374  out:
 375  375          if (err && err != ENOENT) {
 376  376                  nvlist_free(*nvp);
 377  377                  *nvp = NULL;
 378  378                  return (err);
 379  379          }
 380  380  
 381  381          return (0);
 382  382  }
 383  383  
 384  384  /*
 385  385   * Validate the given pool properties nvlist and modify the list
 386  386   * for the property values to be set.
 387  387   */
 388  388  static int
 389  389  spa_prop_validate(spa_t *spa, nvlist_t *props)
 390  390  {
 391  391          nvpair_t *elem;
 392  392          int error = 0, reset_bootfs = 0;
 393  393          uint64_t objnum = 0;
 394  394          boolean_t has_feature = B_FALSE;
 395  395  
 396  396          elem = NULL;
 397  397          while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
 398  398                  uint64_t intval;
 399  399                  char *strval, *slash, *check, *fname;
 400  400                  const char *propname = nvpair_name(elem);
 401  401                  zpool_prop_t prop = zpool_name_to_prop(propname);
 402  402  
 403  403                  switch (prop) {
 404  404                  case ZPROP_INVAL:
 405  405                          if (!zpool_prop_feature(propname)) {
 406  406                                  error = SET_ERROR(EINVAL);
 407  407                                  break;
 408  408                          }
 409  409  
 410  410                          /*
 411  411                           * Sanitize the input.
 412  412                           */
 413  413                          if (nvpair_type(elem) != DATA_TYPE_UINT64) {
 414  414                                  error = SET_ERROR(EINVAL);
 415  415                                  break;
 416  416                          }
 417  417  
 418  418                          if (nvpair_value_uint64(elem, &intval) != 0) {
 419  419                                  error = SET_ERROR(EINVAL);
 420  420                                  break;
 421  421                          }
 422  422  
 423  423                          if (intval != 0) {
 424  424                                  error = SET_ERROR(EINVAL);
 425  425                                  break;
 426  426                          }
 427  427  
 428  428                          fname = strchr(propname, '@') + 1;
 429  429                          if (zfeature_lookup_name(fname, NULL) != 0) {
 430  430                                  error = SET_ERROR(EINVAL);
 431  431                                  break;
 432  432                          }
 433  433  
 434  434                          has_feature = B_TRUE;
 435  435                          break;
 436  436  
 437  437                  case ZPOOL_PROP_VERSION:
 438  438                          error = nvpair_value_uint64(elem, &intval);
 439  439                          if (!error &&
 440  440                              (intval < spa_version(spa) ||
 441  441                              intval > SPA_VERSION_BEFORE_FEATURES ||
 442  442                              has_feature))
 443  443                                  error = SET_ERROR(EINVAL);
 444  444                          break;
 445  445  
 446  446                  case ZPOOL_PROP_DELEGATION:
 447  447                  case ZPOOL_PROP_AUTOREPLACE:
 448  448                  case ZPOOL_PROP_LISTSNAPS:
 449  449                  case ZPOOL_PROP_AUTOEXPAND:
 450  450                          error = nvpair_value_uint64(elem, &intval);
 451  451                          if (!error && intval > 1)
 452  452                                  error = SET_ERROR(EINVAL);
 453  453                          break;
 454  454  
 455  455                  case ZPOOL_PROP_BOOTFS:
 456  456                          /*
 457  457                           * If the pool version is less than SPA_VERSION_BOOTFS,
 458  458                           * or the pool is still being created (version == 0),
 459  459                           * the bootfs property cannot be set.
 460  460                           */
 461  461                          if (spa_version(spa) < SPA_VERSION_BOOTFS) {
 462  462                                  error = SET_ERROR(ENOTSUP);
 463  463                                  break;
 464  464                          }
 465  465  
 466  466                          /*
 467  467                           * Make sure the vdev config is bootable
 468  468                           */
 469  469                          if (!vdev_is_bootable(spa->spa_root_vdev)) {
 470  470                                  error = SET_ERROR(ENOTSUP);
 471  471                                  break;
 472  472                          }
 473  473  
 474  474                          reset_bootfs = 1;
 475  475  
 476  476                          error = nvpair_value_string(elem, &strval);
 477  477  
 478  478                          if (!error) {
 479  479                                  objset_t *os;
 480  480                                  uint64_t compress;
 481  481  
 482  482                                  if (strval == NULL || strval[0] == '\0') {
 483  483                                          objnum = zpool_prop_default_numeric(
 484  484                                              ZPOOL_PROP_BOOTFS);
 485  485                                          break;
 486  486                                  }
 487  487  
 488  488                                  if (error = dmu_objset_hold(strval, FTAG, &os))
 489  489                                          break;
 490  490  
 491  491                                  /* Must be ZPL and not gzip compressed. */
 492  492  
 493  493                                  if (dmu_objset_type(os) != DMU_OST_ZFS) {
 494  494                                          error = SET_ERROR(ENOTSUP);
 495  495                                  } else if ((error =
 496  496                                      dsl_prop_get_int_ds(dmu_objset_ds(os),
 497  497                                      zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 498  498                                      &compress)) == 0 &&
 499  499                                      !BOOTFS_COMPRESS_VALID(compress)) {
 500  500                                          error = SET_ERROR(ENOTSUP);
 501  501                                  } else {
 502  502                                          objnum = dmu_objset_id(os);
 503  503                                  }
 504  504                                  dmu_objset_rele(os, FTAG);
 505  505                          }
 506  506                          break;
 507  507  
 508  508                  case ZPOOL_PROP_FAILUREMODE:
 509  509                          error = nvpair_value_uint64(elem, &intval);
 510  510                          if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
 511  511                              intval > ZIO_FAILURE_MODE_PANIC))
 512  512                                  error = SET_ERROR(EINVAL);
 513  513  
 514  514                          /*
 515  515                           * This is a special case which only occurs when
 516  516                           * the pool has completely failed. This allows
 517  517                           * the user to change the in-core failmode property
 518  518                           * without syncing it out to disk (I/Os might
 519  519                           * currently be blocked). We do this by returning
 520  520                           * EIO to the caller (spa_prop_set) to trick it
 521  521                           * into thinking we encountered a property validation
 522  522                           * error.
 523  523                           */
 524  524                          if (!error && spa_suspended(spa)) {
 525  525                                  spa->spa_failmode = intval;
 526  526                                  error = SET_ERROR(EIO);
 527  527                          }
 528  528                          break;
 529  529  
 530  530                  case ZPOOL_PROP_CACHEFILE:
 531  531                          if ((error = nvpair_value_string(elem, &strval)) != 0)
 532  532                                  break;
 533  533  
 534  534                          if (strval[0] == '\0')
 535  535                                  break;
 536  536  
 537  537                          if (strcmp(strval, "none") == 0)
 538  538                                  break;
 539  539  
 540  540                          if (strval[0] != '/') {
 541  541                                  error = SET_ERROR(EINVAL);
 542  542                                  break;
 543  543                          }
 544  544  
 545  545                          slash = strrchr(strval, '/');
 546  546                          ASSERT(slash != NULL);
 547  547  
 548  548                          if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
 549  549                              strcmp(slash, "/..") == 0)
 550  550                                  error = SET_ERROR(EINVAL);
 551  551                          break;
 552  552  
 553  553                  case ZPOOL_PROP_COMMENT:
 554  554                          if ((error = nvpair_value_string(elem, &strval)) != 0)
 555  555                                  break;
 556  556                          for (check = strval; *check != '\0'; check++) {
 557  557                                  /*
 558  558                                   * The kernel doesn't have an easy isprint()
 559  559                                   * check.  For this kernel check, we merely
 560  560                                   * check ASCII apart from DEL.  Fix this if
 561  561                                   * there is an easy-to-use kernel isprint().
 562  562                                   */
 563  563                                  if (*check >= 0x7f) {
 564  564                                          error = SET_ERROR(EINVAL);
 565  565                                          break;
 566  566                                  }
 567  567                                  check++;
 568  568                          }
 569  569                          if (strlen(strval) > ZPROP_MAX_COMMENT)
 570  570                                  error = E2BIG;
 571  571                          break;
 572  572  
 573  573                  case ZPOOL_PROP_DEDUPDITTO:
 574  574                          if (spa_version(spa) < SPA_VERSION_DEDUP)
 575  575                                  error = SET_ERROR(ENOTSUP);
 576  576                          else
 577  577                                  error = nvpair_value_uint64(elem, &intval);
 578  578                          if (error == 0 &&
 579  579                              intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
 580  580                                  error = SET_ERROR(EINVAL);
 581  581                          break;
 582  582                  }
 583  583  
 584  584                  if (error)
 585  585                          break;
 586  586          }
 587  587  
 588  588          if (!error && reset_bootfs) {
 589  589                  error = nvlist_remove(props,
 590  590                      zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
 591  591  
 592  592                  if (!error) {
 593  593                          error = nvlist_add_uint64(props,
 594  594                              zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
 595  595                  }
 596  596          }
 597  597  
 598  598          return (error);
 599  599  }
 600  600  
 601  601  void
 602  602  spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
 603  603  {
 604  604          char *cachefile;
 605  605          spa_config_dirent_t *dp;
 606  606  
 607  607          if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
 608  608              &cachefile) != 0)
 609  609                  return;
 610  610  
 611  611          dp = kmem_alloc(sizeof (spa_config_dirent_t),
 612  612              KM_SLEEP);
 613  613  
 614  614          if (cachefile[0] == '\0')
 615  615                  dp->scd_path = spa_strdup(spa_config_path);
 616  616          else if (strcmp(cachefile, "none") == 0)
 617  617                  dp->scd_path = NULL;
 618  618          else
 619  619                  dp->scd_path = spa_strdup(cachefile);
 620  620  
 621  621          list_insert_head(&spa->spa_config_list, dp);
 622  622          if (need_sync)
 623  623                  spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 624  624  }
 625  625  
 626  626  int
 627  627  spa_prop_set(spa_t *spa, nvlist_t *nvp)
 628  628  {
 629  629          int error;
 630  630          nvpair_t *elem = NULL;
 631  631          boolean_t need_sync = B_FALSE;
 632  632  
 633  633          if ((error = spa_prop_validate(spa, nvp)) != 0)
 634  634                  return (error);
 635  635  
 636  636          while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
 637  637                  zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
 638  638  
 639  639                  if (prop == ZPOOL_PROP_CACHEFILE ||
 640  640                      prop == ZPOOL_PROP_ALTROOT ||
 641  641                      prop == ZPOOL_PROP_READONLY)
 642  642                          continue;
 643  643  
 644  644                  if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
 645  645                          uint64_t ver;
 646  646  
 647  647                          if (prop == ZPOOL_PROP_VERSION) {
 648  648                                  VERIFY(nvpair_value_uint64(elem, &ver) == 0);
 649  649                          } else {
 650  650                                  ASSERT(zpool_prop_feature(nvpair_name(elem)));
 651  651                                  ver = SPA_VERSION_FEATURES;
 652  652                                  need_sync = B_TRUE;
 653  653                          }
 654  654  
 655  655                          /* Save time if the version is already set. */
 656  656                          if (ver == spa_version(spa))
 657  657                                  continue;
 658  658  
 659  659                          /*
 660  660                           * In addition to the pool directory object, we might
 661  661                           * create the pool properties object, the features for
 662  662                           * read object, the features for write object, or the
 663  663                           * feature descriptions object.
 664  664                           */
 665  665                          error = dsl_sync_task(spa->spa_name, NULL,
 666  666                              spa_sync_version, &ver, 6);
 667  667                          if (error)
 668  668                                  return (error);
 669  669                          continue;
 670  670                  }
 671  671  
 672  672                  need_sync = B_TRUE;
 673  673                  break;
 674  674          }
 675  675  
 676  676          if (need_sync) {
 677  677                  return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
 678  678                      nvp, 6));
 679  679          }
 680  680  
 681  681          return (0);
 682  682  }
 683  683  
 684  684  /*
 685  685   * If the bootfs property value is dsobj, clear it.
 686  686   */
 687  687  void
 688  688  spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 689  689  {
 690  690          if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
 691  691                  VERIFY(zap_remove(spa->spa_meta_objset,
 692  692                      spa->spa_pool_props_object,
 693  693                      zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
 694  694                  spa->spa_bootfs = 0;
 695  695          }
 696  696  }
 697  697  
 698  698  /*ARGSUSED*/
 699  699  static int
 700  700  spa_change_guid_check(void *arg, dmu_tx_t *tx)
 701  701  {
 702  702          uint64_t *newguid = arg;
 703  703          spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 704  704          vdev_t *rvd = spa->spa_root_vdev;
 705  705          uint64_t vdev_state;
 706  706  
 707  707          spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 708  708          vdev_state = rvd->vdev_state;
 709  709          spa_config_exit(spa, SCL_STATE, FTAG);
 710  710  
 711  711          if (vdev_state != VDEV_STATE_HEALTHY)
 712  712                  return (SET_ERROR(ENXIO));
 713  713  
 714  714          ASSERT3U(spa_guid(spa), !=, *newguid);
 715  715  
 716  716          return (0);
 717  717  }
 718  718  
 719  719  static void
 720  720  spa_change_guid_sync(void *arg, dmu_tx_t *tx)
 721  721  {
 722  722          uint64_t *newguid = arg;
 723  723          spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 724  724          uint64_t oldguid;
 725  725          vdev_t *rvd = spa->spa_root_vdev;
 726  726  
 727  727          oldguid = spa_guid(spa);
 728  728  
 729  729          spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 730  730          rvd->vdev_guid = *newguid;
 731  731          rvd->vdev_guid_sum += (*newguid - oldguid);
 732  732          vdev_config_dirty(rvd);
 733  733          spa_config_exit(spa, SCL_STATE, FTAG);
 734  734  
 735  735          spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
 736  736              oldguid, *newguid);
 737  737  }
 738  738  
 739  739  /*
 740  740   * Change the GUID for the pool.  This is done so that we can later
 741  741   * re-import a pool built from a clone of our own vdevs.  We will modify
 742  742   * the root vdev's guid, our own pool guid, and then mark all of our
 743  743   * vdevs dirty.  Note that we must make sure that all our vdevs are
 744  744   * online when we do this, or else any vdevs that weren't present
 745  745   * would be orphaned from our pool.  We are also going to issue a
 746  746   * sysevent to update any watchers.
 747  747   */
 748  748  int
 749  749  spa_change_guid(spa_t *spa)
 750  750  {
 751  751          int error;
 752  752          uint64_t guid;
 753  753  
 754  754          mutex_enter(&spa_namespace_lock);
 755  755          guid = spa_generate_guid(NULL);
 756  756  
 757  757          error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
 758  758              spa_change_guid_sync, &guid, 5);
 759  759  
 760  760          if (error == 0) {
 761  761                  spa_config_sync(spa, B_FALSE, B_TRUE);
 762  762                  spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
 763  763          }
 764  764  
 765  765          mutex_exit(&spa_namespace_lock);
 766  766  
 767  767          return (error);
 768  768  }
 769  769  
 770  770  /*
 771  771   * ==========================================================================
 772  772   * SPA state manipulation (open/create/destroy/import/export)
 773  773   * ==========================================================================
 774  774   */
 775  775  
 776  776  static int
 777  777  spa_error_entry_compare(const void *a, const void *b)
 778  778  {
 779  779          spa_error_entry_t *sa = (spa_error_entry_t *)a;
 780  780          spa_error_entry_t *sb = (spa_error_entry_t *)b;
 781  781          int ret;
 782  782  
 783  783          ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
 784  784              sizeof (zbookmark_t));
 785  785  
 786  786          if (ret < 0)
 787  787                  return (-1);
 788  788          else if (ret > 0)
 789  789                  return (1);
 790  790          else
 791  791                  return (0);
 792  792  }
 793  793  
 794  794  /*
 795  795   * Utility function which retrieves copies of the current logs and
 796  796   * re-initializes them in the process.
 797  797   */
 798  798  void
 799  799  spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
 800  800  {
 801  801          ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
 802  802  
 803  803          bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
 804  804          bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
 805  805  
 806  806          avl_create(&spa->spa_errlist_scrub,
 807  807              spa_error_entry_compare, sizeof (spa_error_entry_t),
 808  808              offsetof(spa_error_entry_t, se_avl));
 809  809          avl_create(&spa->spa_errlist_last,
 810  810              spa_error_entry_compare, sizeof (spa_error_entry_t),
 811  811              offsetof(spa_error_entry_t, se_avl));
 812  812  }
 813  813  
 814  814  static void
 815  815  spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 816  816  {
 817  817          const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
 818  818          enum zti_modes mode = ztip->zti_mode;
 819  819          uint_t value = ztip->zti_value;
 820  820          uint_t count = ztip->zti_count;
 821  821          spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 822  822          char name[32];
 823  823          uint_t flags = 0;
 824  824          boolean_t batch = B_FALSE;
 825  825  
 826  826          if (mode == ZTI_MODE_NULL) {
 827  827                  tqs->stqs_count = 0;
 828  828                  tqs->stqs_taskq = NULL;
 829  829                  return;
 830  830          }
 831  831  
 832  832          ASSERT3U(count, >, 0);
 833  833  
 834  834          tqs->stqs_count = count;
 835  835          tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
 836  836  
 837  837          for (uint_t i = 0; i < count; i++) {
 838  838                  taskq_t *tq;
 839  839  
 840  840                  switch (mode) {
 841  841                  case ZTI_MODE_FIXED:
 842  842                          ASSERT3U(value, >=, 1);
 843  843                          value = MAX(value, 1);
 844  844                          break;
 845  845  
 846  846                  case ZTI_MODE_BATCH:
 847  847                          batch = B_TRUE;
 848  848                          flags |= TASKQ_THREADS_CPU_PCT;
 849  849                          value = zio_taskq_batch_pct;
 850  850                          break;
 851  851  
 852  852                  case ZTI_MODE_ONLINE_PERCENT:
 853  853                          flags |= TASKQ_THREADS_CPU_PCT;
 854  854                          break;
 855  855  
 856  856                  default:
 857  857                          panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 858  858                              "spa_activate()",
 859  859                              zio_type_name[t], zio_taskq_types[q], mode, value);
 860  860                          break;
 861  861                  }
 862  862  
 863  863                  if (count > 1) {
 864  864                          (void) snprintf(name, sizeof (name), "%s_%s_%u",
 865  865                              zio_type_name[t], zio_taskq_types[q], i);
 866  866                  } else {
 867  867                          (void) snprintf(name, sizeof (name), "%s_%s",
 868  868                              zio_type_name[t], zio_taskq_types[q]);
 869  869                  }
 870  870  
 871  871                  if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 872  872                          if (batch)
 873  873                                  flags |= TASKQ_DC_BATCH;
 874  874  
 875  875                          tq = taskq_create_sysdc(name, value, 50, INT_MAX,
 876  876                              spa->spa_proc, zio_taskq_basedc, flags);
 877  877                  } else {
 878  878                          tq = taskq_create_proc(name, value, maxclsyspri, 50,
 879  879                              INT_MAX, spa->spa_proc, flags);
 880  880                  }
 881  881  
 882  882                  tqs->stqs_taskq[i] = tq;
 883  883          }
 884  884  }
 885  885  
 886  886  static void
 887  887  spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 888  888  {
 889  889          spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 890  890  
 891  891          if (tqs->stqs_taskq == NULL) {
 892  892                  ASSERT0(tqs->stqs_count);
 893  893                  return;
 894  894          }
 895  895  
 896  896          for (uint_t i = 0; i < tqs->stqs_count; i++) {
 897  897                  ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
 898  898                  taskq_destroy(tqs->stqs_taskq[i]);
 899  899          }
 900  900  
 901  901          kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
 902  902          tqs->stqs_taskq = NULL;
 903  903  }
 904  904  
 905  905  /*
 906  906   * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 907  907   * Note that a type may have multiple discrete taskqs to avoid lock contention
 908  908   * on the taskq itself. In that case we choose which taskq at random by using
 909  909   * the low bits of gethrtime().
 910  910   */
 911  911  void
 912  912  spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
 913  913      task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
 914  914  {
 915  915          spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 916  916          taskq_t *tq;
 917  917  
 918  918          ASSERT3P(tqs->stqs_taskq, !=, NULL);
 919  919          ASSERT3U(tqs->stqs_count, !=, 0);
 920  920  
 921  921          if (tqs->stqs_count == 1) {
 922  922                  tq = tqs->stqs_taskq[0];
 923  923          } else {
 924  924                  tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
 925  925          }
 926  926  
 927  927          taskq_dispatch_ent(tq, func, arg, flags, ent);
 928  928  }
 929  929  
 930  930  static void
 931  931  spa_create_zio_taskqs(spa_t *spa)
 932  932  {
 933  933          for (int t = 0; t < ZIO_TYPES; t++) {
 934  934                  for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
 935  935                          spa_taskqs_init(spa, t, q);
 936  936                  }
 937  937          }
 938  938  }
 939  939  
#ifdef _KERNEL
/*
 * Thread body of a pool's covering process; spa_activate() passes this
 * function to newproc() with the spa_t as its argument.
 *
 * The thread names its process "zpool-<poolname>", optionally binds itself
 * to the processor set given by zio_taskq_psrset_bind, records itself in
 * spa_proc / spa_did and creates the pool's zio taskqs.  It then moves
 * spa_proc_state from SPA_PROC_CREATED to SPA_PROC_ACTIVE and sleeps on
 * spa_proc_cv until the state leaves SPA_PROC_ACTIVE.  On wakeup it marks
 * the state SPA_PROC_GONE, points spa_proc back at p0 and exits.
 */
static void
spa_thread(void *arg)
{
        callb_cpr_t cprinfo;

        spa_t *spa = arg;
        user_t *pu = PTOU(curproc);

        CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
            spa->spa_name);

        ASSERT(curproc != &p0);
        /* Label the process "zpool-<poolname>" in both psargs and comm. */
        (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
            "zpool-%s", spa->spa_name);
        (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

        /* bind this thread to the requested psrset */
        if (zio_taskq_psrset_bind != PS_NONE) {
                pool_lock();
                mutex_enter(&cpu_lock);
                mutex_enter(&pidlock);
                mutex_enter(&curproc->p_lock);

                if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
                    0, NULL, NULL) == 0)  {
                        curthread->t_bind_pset = zio_taskq_psrset_bind;
                } else {
                        /* A failed bind is only reported; the thread goes on. */
                        cmn_err(CE_WARN,
                            "Couldn't bind process for zfs pool \"%s\" to "
                            "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
                }

                /* Locks are dropped in the reverse order of acquisition. */
                mutex_exit(&curproc->p_lock);
                mutex_exit(&pidlock);
                mutex_exit(&cpu_lock);
                pool_unlock();
        }

        if (zio_taskq_sysdc) {
                sysdc_thread_enter(curthread, 100, 0);
        }

        /*
         * Publish our identity before creating the taskqs:
         * spa_taskqs_init() passes spa_proc to the taskq creation routines.
         */
        spa->spa_proc = curproc;
        spa->spa_did = curthread->t_did;

        spa_create_zio_taskqs(spa);

        mutex_enter(&spa->spa_proc_lock);
        ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

        /* Tell spa_activate(), which waits on spa_proc_cv, that we are up. */
        spa->spa_proc_state = SPA_PROC_ACTIVE;
        cv_broadcast(&spa->spa_proc_cv);

        /* Sleep until spa_deactivate() moves the state out of ACTIVE. */
        CALLB_CPR_SAFE_BEGIN(&cprinfo);
        while (spa->spa_proc_state == SPA_PROC_ACTIVE)
                cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
        CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

        /* Acknowledge the shutdown request and detach from the pool. */
        ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
        spa->spa_proc_state = SPA_PROC_GONE;
        spa->spa_proc = &p0;
        cv_broadcast(&spa->spa_proc_cv);
        CALLB_CPR_EXIT(&cprinfo);       /* drops spa_proc_lock */

        mutex_enter(&curproc->p_lock);
        lwp_exit();
}
#endif
1009 1009  
1010 1010  /*
1011 1011   * Activate an uninitialized pool.
1012 1012   */
1013 1013  static void
1014 1014  spa_activate(spa_t *spa, int mode)
1015 1015  {
1016 1016          ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
1017 1017  
1018 1018          spa->spa_state = POOL_STATE_ACTIVE;
1019 1019          spa->spa_mode = mode;
1020 1020  
1021 1021          spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
1022 1022          spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
1023 1023  
1024 1024          /* Try to create a covering process */
1025 1025          mutex_enter(&spa->spa_proc_lock);
1026 1026          ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
1027 1027          ASSERT(spa->spa_proc == &p0);
1028 1028          spa->spa_did = 0;
1029 1029  
1030 1030          /* Only create a process if we're going to be around a while. */
1031 1031          if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
1032 1032                  if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
1033 1033                      NULL, 0) == 0) {
1034 1034                          spa->spa_proc_state = SPA_PROC_CREATED;
1035 1035                          while (spa->spa_proc_state == SPA_PROC_CREATED) {
1036 1036                                  cv_wait(&spa->spa_proc_cv,
1037 1037                                      &spa->spa_proc_lock);
1038 1038                          }
1039 1039                          ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1040 1040                          ASSERT(spa->spa_proc != &p0);
1041 1041                          ASSERT(spa->spa_did != 0);
1042 1042                  } else {
1043 1043  #ifdef _KERNEL
1044 1044                          cmn_err(CE_WARN,
1045 1045                              "Couldn't create process for zfs pool \"%s\"\n",
1046 1046                              spa->spa_name);
1047 1047  #endif
1048 1048                  }
1049 1049          }
1050 1050          mutex_exit(&spa->spa_proc_lock);
1051 1051  
1052 1052          /* If we didn't create a process, we need to create our taskqs. */
1053 1053          if (spa->spa_proc == &p0) {
1054 1054                  spa_create_zio_taskqs(spa);
1055 1055          }
1056 1056  
1057 1057          list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1058 1058              offsetof(vdev_t, vdev_config_dirty_node));
1059 1059          list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1060 1060              offsetof(vdev_t, vdev_state_dirty_node));
1061 1061  
1062 1062          txg_list_create(&spa->spa_vdev_txg_list,
1063 1063              offsetof(struct vdev, vdev_txg_node));
1064 1064  
1065 1065          avl_create(&spa->spa_errlist_scrub,
1066 1066              spa_error_entry_compare, sizeof (spa_error_entry_t),
1067 1067              offsetof(spa_error_entry_t, se_avl));
1068 1068          avl_create(&spa->spa_errlist_last,
1069 1069              spa_error_entry_compare, sizeof (spa_error_entry_t),
1070 1070              offsetof(spa_error_entry_t, se_avl));
1071 1071  }
1072 1072  
/*
 * Opposite of spa_activate().
 *
 * Expects syncing to be off and the dsl pool, root vdev and async zio root
 * to be gone already (see the assertions).  Destroys the lists, taskqs,
 * metaslab classes and error trees set up by spa_activate(), returns the
 * pool to POOL_STATE_UNINITIALIZED, and, if a covering process exists,
 * asks spa_thread() to exit and waits for it to do so.
 */
static void
spa_deactivate(spa_t *spa)
{
        ASSERT(spa->spa_sync_on == B_FALSE);
        ASSERT(spa->spa_dsl_pool == NULL);
        ASSERT(spa->spa_root_vdev == NULL);
        ASSERT(spa->spa_async_zio_root == NULL);
        ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

        txg_list_destroy(&spa->spa_vdev_txg_list);

        list_destroy(&spa->spa_config_dirty_list);
        list_destroy(&spa->spa_state_dirty_list);

        /* Tear down the taskqs for every (I/O type, taskq type) pair. */
        for (int t = 0; t < ZIO_TYPES; t++) {
                for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
                        spa_taskqs_fini(spa, t, q);
                }
        }

        metaslab_class_destroy(spa->spa_normal_class);
        spa->spa_normal_class = NULL;

        metaslab_class_destroy(spa->spa_log_class);
        spa->spa_log_class = NULL;

        /*
         * If this was part of an import or the open otherwise failed, we may
         * still have errors left in the queues.  Empty them just in case.
         */
        spa_errlog_drain(spa);

        avl_destroy(&spa->spa_errlist_scrub);
        avl_destroy(&spa->spa_errlist_last);

        spa->spa_state = POOL_STATE_UNINITIALIZED;

        /*
         * Shut down the covering process, if there is one: request
         * deactivation, then wait for spa_thread() to move the state on to
         * SPA_PROC_GONE before resetting it to SPA_PROC_NONE.
         */
        mutex_enter(&spa->spa_proc_lock);
        if (spa->spa_proc_state != SPA_PROC_NONE) {
                ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
                spa->spa_proc_state = SPA_PROC_DEACTIVATE;
                cv_broadcast(&spa->spa_proc_cv);
                while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
                        ASSERT(spa->spa_proc != &p0);
                        cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
                }
                ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
                spa->spa_proc_state = SPA_PROC_NONE;
        }
        ASSERT(spa->spa_proc == &p0);
        mutex_exit(&spa->spa_proc_lock);

        /*
         * We want to make sure spa_thread() has actually exited the ZFS
         * module, so that the module can't be unloaded out from underneath
         * it.
         */
        if (spa->spa_did != 0) {
                thread_join(spa->spa_did);
                spa->spa_did = 0;
        }
}
1138 1138  
1139 1139  /*
1140 1140   * Verify a pool configuration, and construct the vdev tree appropriately.  This
1141 1141   * will create all the necessary vdevs in the appropriate layout, with each vdev
1142 1142   * in the CLOSED state.  This will prep the pool before open/creation/import.
1143 1143   * All vdev validation is done by the vdev_alloc() routine.
1144 1144   */
1145 1145  static int
1146 1146  spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1147 1147      uint_t id, int atype)
1148 1148  {
1149 1149          nvlist_t **child;
1150 1150          uint_t children;
1151 1151          int error;
1152 1152  
1153 1153          if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1154 1154                  return (error);
1155 1155  
1156 1156          if ((*vdp)->vdev_ops->vdev_op_leaf)
1157 1157                  return (0);
1158 1158  
1159 1159          error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1160 1160              &child, &children);
1161 1161  
1162 1162          if (error == ENOENT)
1163 1163                  return (0);
1164 1164  
1165 1165          if (error) {
1166 1166                  vdev_free(*vdp);
1167 1167                  *vdp = NULL;
1168 1168                  return (SET_ERROR(EINVAL));
1169 1169          }
1170 1170  
1171 1171          for (int c = 0; c < children; c++) {
1172 1172                  vdev_t *vd;
1173 1173                  if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1174 1174                      atype)) != 0) {
1175 1175                          vdev_free(*vdp);
1176 1176                          *vdp = NULL;
1177 1177                          return (error);
1178 1178                  }
1179 1179          }
1180 1180  
1181 1181          ASSERT(*vdp != NULL);
1182 1182  
1183 1183          return (0);
1184 1184  }
1185 1185  
/*
 * Opposite of spa_load().
 *
 * Caller must hold spa_namespace_lock.  Activity is quiesced first (async
 * tasks, the sync thread, outstanding async I/O), then the dsl pool is
 * closed and, with all config locks held as writer, the root vdev, spares
 * and l2cache devices are freed along with their stashed configs.
 */
static void
spa_unload(spa_t *spa)
{
        int i;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        /*
         * Stop async tasks.
         */
        spa_async_suspend(spa);

        /*
         * Stop syncing.
         */
        if (spa->spa_sync_on) {
                txg_sync_stop(spa->spa_dsl_pool);
                spa->spa_sync_on = B_FALSE;
        }

        /*
         * Wait for any outstanding async I/O to complete.
         */
        if (spa->spa_async_zio_root != NULL) {
                (void) zio_wait(spa->spa_async_zio_root);
                spa->spa_async_zio_root = NULL;
        }

        bpobj_close(&spa->spa_deferred_bpobj);

        /*
         * Close the dsl pool.
         */
        if (spa->spa_dsl_pool) {
                dsl_pool_close(spa->spa_dsl_pool);
                spa->spa_dsl_pool = NULL;
                spa->spa_meta_objset = NULL;
        }

        ddt_unload(spa);

        /* Everything below runs with all config locks held as writer. */
        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

        /*
         * Drop and purge level 2 cache
         */
        spa_l2cache_drop(spa);

        /*
         * Close all vdevs.
         */
        if (spa->spa_root_vdev)
                vdev_free(spa->spa_root_vdev);
        /* Freeing the root vdev is expected to clear spa_root_vdev. */
        ASSERT(spa->spa_root_vdev == NULL);

        /* Release the spare vdevs, their pointer array and stashed config. */
        for (i = 0; i < spa->spa_spares.sav_count; i++)
                vdev_free(spa->spa_spares.sav_vdevs[i]);
        if (spa->spa_spares.sav_vdevs) {
                kmem_free(spa->spa_spares.sav_vdevs,
                    spa->spa_spares.sav_count * sizeof (void *));
                spa->spa_spares.sav_vdevs = NULL;
        }
        if (spa->spa_spares.sav_config) {
                nvlist_free(spa->spa_spares.sav_config);
                spa->spa_spares.sav_config = NULL;
        }
        spa->spa_spares.sav_count = 0;

        /* Same for the l2cache vdevs, clearing their stats before freeing. */
        for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
                vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
                vdev_free(spa->spa_l2cache.sav_vdevs[i]);
        }
        if (spa->spa_l2cache.sav_vdevs) {
                kmem_free(spa->spa_l2cache.sav_vdevs,
                    spa->spa_l2cache.sav_count * sizeof (void *));
                spa->spa_l2cache.sav_vdevs = NULL;
        }
        if (spa->spa_l2cache.sav_config) {
                nvlist_free(spa->spa_l2cache.sav_config);
                spa->spa_l2cache.sav_config = NULL;
        }
        spa->spa_l2cache.sav_count = 0;

        /* Reset the async suspend count for this pool. */
        spa->spa_async_suspended = 0;

        if (spa->spa_comment != NULL) {
                spa_strfree(spa->spa_comment);
                spa->spa_comment = NULL;
        }

        spa_config_exit(spa, SCL_ALL, FTAG);
}
1281 1281  
/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 *
 * The caller must hold all config locks (SCL_ALL) as writer.
 */
static void
spa_load_spares(spa_t *spa)
{
        nvlist_t **spares;
        uint_t nspares;
        int i;
        vdev_t *vd, *tvd;

        ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

        /*
         * First, close and free any existing spare vdevs.
         */
        for (i = 0; i < spa->spa_spares.sav_count; i++) {
                vd = spa->spa_spares.sav_vdevs[i];

                /* Undo the call to spa_spare_add() below */
                if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
                    B_FALSE)) != NULL && tvd->vdev_isspare)
                        spa_spare_remove(tvd);
                vdev_close(vd);
                vdev_free(vd);
        }

        if (spa->spa_spares.sav_vdevs)
                kmem_free(spa->spa_spares.sav_vdevs,
                    spa->spa_spares.sav_count * sizeof (void *));

        /* No stashed config means no spares at all. */
        if (spa->spa_spares.sav_config == NULL)
                nspares = 0;
        else
                VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
                    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

        spa->spa_spares.sav_count = (int)nspares;
        spa->spa_spares.sav_vdevs = NULL;

        if (nspares == 0)
                return;

        /*
         * Construct the array of vdevs, opening them to get status in the
         * process.  For each spare, there are potentially two different vdev_t
         * structures associated with it: one in the list of spares (used only
         * for basic validation purposes) and one in the active vdev
         * configuration (if it's spared in).  During this phase we open and
         * validate each vdev on the spare list.  If the vdev also exists in the
         * active configuration, then we also mark this vdev as an active spare.
         */
        spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
            KM_SLEEP);
        for (i = 0; i < spa->spa_spares.sav_count; i++) {
                VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
                    VDEV_ALLOC_SPARE) == 0);
                ASSERT(vd != NULL);

                spa->spa_spares.sav_vdevs[i] = vd;

                if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
                    B_FALSE)) != NULL) {
                        if (!tvd->vdev_isspare)
                                spa_spare_add(tvd);

                        /*
                         * We only mark the spare active if we were successfully
                         * able to load the vdev.  Otherwise, importing a pool
                         * with a bad active spare would result in strange
                         * behavior, because multiple pools would think the
                         * spare is actively in use.
                         *
                         * There is a vulnerability here to an equally bizarre
                         * circumstance, where a dead active spare is later
                         * brought back to life (onlined or otherwise).  Given
                         * the rarity of this scenario, and the extra complexity
                         * it adds, we ignore the possibility.
                         */
                        if (!vdev_is_dead(tvd))
                                spa_spare_activate(tvd);
                }

                /* A spare is its own top-level vdev and belongs to spa_spares. */
                vd->vdev_top = vd;
                vd->vdev_aux = &spa->spa_spares;

                /* A spare that cannot be opened stays in the array as is. */
                if (vdev_open(vd) != 0)
                        continue;

                if (vdev_validate_aux(vd) == 0)
                        spa_spare_add(vd);
        }

        /*
         * Recompute the stashed list of spares, with status information
         * this time.
         */
        VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
            DATA_TYPE_NVLIST_ARRAY) == 0);

        /*
         * 'spares' is reused here as a temporary array of freshly generated
         * configs; the array and its elements are freed again once they have
         * been added to sav_config.
         */
        spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
            KM_SLEEP);
        for (i = 0; i < spa->spa_spares.sav_count; i++)
                spares[i] = vdev_config_generate(spa,
                    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
        VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
            ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
        for (i = 0; i < spa->spa_spares.sav_count; i++)
                nvlist_free(spares[i]);
        kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}
1396 1396  
1397 1397  /*
1398 1398   * Load (or re-load) the current list of vdevs describing the active l2cache for
1399 1399   * this pool.  When this is called, we have some form of basic information in
1400 1400   * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
1401 1401   * then re-generate a more complete list including status information.
1402 1402   * Devices which are already active have their details maintained, and are
1403 1403   * not re-opened.
1404 1404   */
1405 1405  static void
1406 1406  spa_load_l2cache(spa_t *spa)
1407 1407  {
1408 1408          nvlist_t **l2cache;
1409 1409          uint_t nl2cache;
1410 1410          int i, j, oldnvdevs;
1411 1411          uint64_t guid;
1412 1412          vdev_t *vd, **oldvdevs, **newvdevs;
1413 1413          spa_aux_vdev_t *sav = &spa->spa_l2cache;
1414 1414  
1415 1415          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1416 1416  
1417 1417          if (sav->sav_config != NULL) {
1418 1418                  VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1419 1419                      ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1420 1420                  newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1421 1421          } else {
1422 1422                  nl2cache = 0;
1423 1423                  newvdevs = NULL;
1424 1424          }
1425 1425  
1426 1426          oldvdevs = sav->sav_vdevs;
1427 1427          oldnvdevs = sav->sav_count;
1428 1428          sav->sav_vdevs = NULL;
1429 1429          sav->sav_count = 0;
1430 1430  
1431 1431          /*
1432 1432           * Process new nvlist of vdevs.
1433 1433           */
1434 1434          for (i = 0; i < nl2cache; i++) {
1435 1435                  VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1436 1436                      &guid) == 0);
1437 1437  
1438 1438                  newvdevs[i] = NULL;
1439 1439                  for (j = 0; j < oldnvdevs; j++) {
1440 1440                          vd = oldvdevs[j];
1441 1441                          if (vd != NULL && guid == vd->vdev_guid) {
1442 1442                                  /*
1443 1443                                   * Retain previous vdev for add/remove ops.
1444 1444                                   */
1445 1445                                  newvdevs[i] = vd;
1446 1446                                  oldvdevs[j] = NULL;
1447 1447                                  break;
1448 1448                          }
1449 1449                  }
1450 1450  
1451 1451                  if (newvdevs[i] == NULL) {
1452 1452                          /*
1453 1453                           * Create new vdev
1454 1454                           */
1455 1455                          VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1456 1456                              VDEV_ALLOC_L2CACHE) == 0);
1457 1457                          ASSERT(vd != NULL);
1458 1458                          newvdevs[i] = vd;
1459 1459  
1460 1460                          /*
1461 1461                           * Commit this vdev as an l2cache device,
1462 1462                           * even if it fails to open.
1463 1463                           */
1464 1464                          spa_l2cache_add(vd);
1465 1465  
1466 1466                          vd->vdev_top = vd;
1467 1467                          vd->vdev_aux = sav;
1468 1468  
1469 1469                          spa_l2cache_activate(vd);
1470 1470  
1471 1471                          if (vdev_open(vd) != 0)
1472 1472                                  continue;
1473 1473  
1474 1474                          (void) vdev_validate_aux(vd);
1475 1475  
1476 1476                          if (!vdev_is_dead(vd))
1477 1477                                  l2arc_add_vdev(spa, vd);
1478 1478                  }
1479 1479          }
1480 1480  
1481 1481          /*
1482 1482           * Purge vdevs that were dropped
1483 1483           */
1484 1484          for (i = 0; i < oldnvdevs; i++) {
1485 1485                  uint64_t pool;
1486 1486  
1487 1487                  vd = oldvdevs[i];
1488 1488                  if (vd != NULL) {
1489 1489                          ASSERT(vd->vdev_isl2cache);
1490 1490  
1491 1491                          if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1492 1492                              pool != 0ULL && l2arc_vdev_present(vd))
1493 1493                                  l2arc_remove_vdev(vd);
1494 1494                          vdev_clear_stats(vd);
1495 1495                          vdev_free(vd);
1496 1496                  }
1497 1497          }
1498 1498  
1499 1499          if (oldvdevs)
1500 1500                  kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1501 1501  
1502 1502          if (sav->sav_config == NULL)
1503 1503                  goto out;
1504 1504  
1505 1505          sav->sav_vdevs = newvdevs;
1506 1506          sav->sav_count = (int)nl2cache;
1507 1507  
1508 1508          /*
1509 1509           * Recompute the stashed list of l2cache devices, with status
1510 1510           * information this time.
1511 1511           */
1512 1512          VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1513 1513              DATA_TYPE_NVLIST_ARRAY) == 0);
1514 1514  
1515 1515          l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1516 1516          for (i = 0; i < sav->sav_count; i++)
1517 1517                  l2cache[i] = vdev_config_generate(spa,
1518 1518                      sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1519 1519          VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1520 1520              ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1521 1521  out:
1522 1522          for (i = 0; i < sav->sav_count; i++)
1523 1523                  nvlist_free(l2cache[i]);
1524 1524          if (sav->sav_count)
1525 1525                  kmem_free(l2cache, sav->sav_count * sizeof (void *));
1526 1526  }
1527 1527  
1528 1528  static int
1529 1529  load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1530 1530  {
1531 1531          dmu_buf_t *db;
1532 1532          char *packed = NULL;
1533 1533          size_t nvsize = 0;
1534 1534          int error;
1535 1535          *value = NULL;
1536 1536  
1537 1537          VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
1538 1538          nvsize = *(uint64_t *)db->db_data;
1539 1539          dmu_buf_rele(db, FTAG);
1540 1540  
1541 1541          packed = kmem_alloc(nvsize, KM_SLEEP);
1542 1542          error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1543 1543              DMU_READ_PREFETCH);
1544 1544          if (error == 0)
1545 1545                  error = nvlist_unpack(packed, nvsize, value, 0);
1546 1546          kmem_free(packed, nvsize);
1547 1547  
1548 1548          return (error);
1549 1549  }
1550 1550  
1551 1551  /*
1552 1552   * Checks to see if the given vdev could not be opened, in which case we post a
1553 1553   * sysevent to notify the autoreplace code that the device has been removed.
1554 1554   */
1555 1555  static void
1556 1556  spa_check_removed(vdev_t *vd)
1557 1557  {
1558 1558          for (int c = 0; c < vd->vdev_children; c++)
1559 1559                  spa_check_removed(vd->vdev_child[c]);
1560 1560  
1561 1561          if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
1562 1562              !vd->vdev_ishole) {
1563 1563                  zfs_post_autoreplace(vd->vdev_spa, vd);
1564 1564                  spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
1565 1565          }
1566 1566  }
1567 1567  
1568 1568  /*
1569 1569   * Validate the current config against the MOS config
1570 1570   */
1571 1571  static boolean_t
1572 1572  spa_config_valid(spa_t *spa, nvlist_t *config)
1573 1573  {
1574 1574          vdev_t *mrvd, *rvd = spa->spa_root_vdev;
1575 1575          nvlist_t *nv;
1576 1576  
1577 1577          VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
1578 1578  
1579 1579          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1580 1580          VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
1581 1581  
1582 1582          ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
1583 1583  
1584 1584          /*
1585 1585           * If we're doing a normal import, then build up any additional
1586 1586           * diagnostic information about missing devices in this config.
1587 1587           * We'll pass this up to the user for further processing.
1588 1588           */
1589 1589          if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1590 1590                  nvlist_t **child, *nv;
1591 1591                  uint64_t idx = 0;
1592 1592  
1593 1593                  child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1594 1594                      KM_SLEEP);
1595 1595                  VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1596 1596  
1597 1597                  for (int c = 0; c < rvd->vdev_children; c++) {
1598 1598                          vdev_t *tvd = rvd->vdev_child[c];
1599 1599                          vdev_t *mtvd  = mrvd->vdev_child[c];
1600 1600  
1601 1601                          if (tvd->vdev_ops == &vdev_missing_ops &&
1602 1602                              mtvd->vdev_ops != &vdev_missing_ops &&
1603 1603                              mtvd->vdev_islog)
1604 1604                                  child[idx++] = vdev_config_generate(spa, mtvd,
1605 1605                                      B_FALSE, 0);
1606 1606                  }
1607 1607  
1608 1608                  if (idx) {
1609 1609                          VERIFY(nvlist_add_nvlist_array(nv,
1610 1610                              ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
1611 1611                          VERIFY(nvlist_add_nvlist(spa->spa_load_info,
1612 1612                              ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
1613 1613  
1614 1614                          for (int i = 0; i < idx; i++)
1615 1615                                  nvlist_free(child[i]);
1616 1616                  }
1617 1617                  nvlist_free(nv);
1618 1618                  kmem_free(child, rvd->vdev_children * sizeof (char **));
1619 1619          }
1620 1620  
1621 1621          /*
1622 1622           * Compare the root vdev tree with the information we have
1623 1623           * from the MOS config (mrvd). Check each top-level vdev
1624 1624           * with the corresponding MOS config top-level (mtvd).
1625 1625           */
1626 1626          for (int c = 0; c < rvd->vdev_children; c++) {
1627 1627                  vdev_t *tvd = rvd->vdev_child[c];
1628 1628                  vdev_t *mtvd  = mrvd->vdev_child[c];
1629 1629  
1630 1630                  /*
1631 1631                   * Resolve any "missing" vdevs in the current configuration.
1632 1632                   * If we find that the MOS config has more accurate information
1633 1633                   * about the top-level vdev then use that vdev instead.
1634 1634                   */
1635 1635                  if (tvd->vdev_ops == &vdev_missing_ops &&
1636 1636                      mtvd->vdev_ops != &vdev_missing_ops) {
1637 1637  
1638 1638                          if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
1639 1639                                  continue;
1640 1640  
1641 1641                          /*
1642 1642                           * Device specific actions.
1643 1643                           */
1644 1644                          if (mtvd->vdev_islog) {
1645 1645                                  spa_set_log_state(spa, SPA_LOG_CLEAR);
1646 1646                          } else {
1647 1647                                  /*
1648 1648                                   * XXX - once we have 'readonly' pool
1649 1649                                   * support we should be able to handle
1650 1650                                   * missing data devices by transitioning
1651 1651                                   * the pool to readonly.
1652 1652                                   */
1653 1653                                  continue;
1654 1654                          }
1655 1655  
1656 1656                          /*
1657 1657                           * Swap the missing vdev with the data we were
1658 1658                           * able to obtain from the MOS config.
1659 1659                           */
1660 1660                          vdev_remove_child(rvd, tvd);
1661 1661                          vdev_remove_child(mrvd, mtvd);
1662 1662  
1663 1663                          vdev_add_child(rvd, mtvd);
1664 1664                          vdev_add_child(mrvd, tvd);
1665 1665  
1666 1666                          spa_config_exit(spa, SCL_ALL, FTAG);
1667 1667                          vdev_load(mtvd);
1668 1668                          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1669 1669  
1670 1670                          vdev_reopen(rvd);
1671 1671                  } else if (mtvd->vdev_islog) {
1672 1672                          /*
1673 1673                           * Load the slog device's state from the MOS config
1674 1674                           * since it's possible that the label does not
1675 1675                           * contain the most up-to-date information.
1676 1676                           */
1677 1677                          vdev_load_log_state(tvd, mtvd);
1678 1678                          vdev_reopen(tvd);
1679 1679                  }
1680 1680          }
1681 1681          vdev_free(mrvd);
1682 1682          spa_config_exit(spa, SCL_ALL, FTAG);
1683 1683  
1684 1684          /*
1685 1685           * Ensure we were able to validate the config.
1686 1686           */
1687 1687          return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
1688 1688  }
1689 1689  
1690 1690  /*
1691 1691   * Check for missing log devices
1692 1692   */
1693 1693  static boolean_t
1694 1694  spa_check_logs(spa_t *spa)
1695 1695  {
1696 1696          boolean_t rv = B_FALSE;
1697 1697  
1698 1698          switch (spa->spa_log_state) {
1699 1699          case SPA_LOG_MISSING:
1700 1700                  /* need to recheck in case slog has been restored */
1701 1701          case SPA_LOG_UNKNOWN:
1702 1702                  rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
1703 1703                      NULL, DS_FIND_CHILDREN) != 0);
1704 1704                  if (rv)
1705 1705                          spa_set_log_state(spa, SPA_LOG_MISSING);
1706 1706                  break;
1707 1707          }
1708 1708          return (rv);
1709 1709  }
1710 1710  
1711 1711  static boolean_t
1712 1712  spa_passivate_log(spa_t *spa)
1713 1713  {
1714 1714          vdev_t *rvd = spa->spa_root_vdev;
1715 1715          boolean_t slog_found = B_FALSE;
1716 1716  
1717 1717          ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1718 1718  
1719 1719          if (!spa_has_slogs(spa))
1720 1720                  return (B_FALSE);
1721 1721  
1722 1722          for (int c = 0; c < rvd->vdev_children; c++) {
1723 1723                  vdev_t *tvd = rvd->vdev_child[c];
1724 1724                  metaslab_group_t *mg = tvd->vdev_mg;
1725 1725  
1726 1726                  if (tvd->vdev_islog) {
1727 1727                          metaslab_group_passivate(mg);
1728 1728                          slog_found = B_TRUE;
1729 1729                  }
1730 1730          }
1731 1731  
1732 1732          return (slog_found);
1733 1733  }
1734 1734  
1735 1735  static void
1736 1736  spa_activate_log(spa_t *spa)
1737 1737  {
1738 1738          vdev_t *rvd = spa->spa_root_vdev;
1739 1739  
1740 1740          ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1741 1741  
1742 1742          for (int c = 0; c < rvd->vdev_children; c++) {
1743 1743                  vdev_t *tvd = rvd->vdev_child[c];
1744 1744                  metaslab_group_t *mg = tvd->vdev_mg;
1745 1745  
1746 1746                  if (tvd->vdev_islog)
1747 1747                          metaslab_group_activate(mg);
1748 1748          }
1749 1749  }
1750 1750  
1751 1751  int
1752 1752  spa_offline_log(spa_t *spa)
1753 1753  {
1754 1754          int error;
1755 1755  
1756 1756          error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
1757 1757              NULL, DS_FIND_CHILDREN);
1758 1758          if (error == 0) {
1759 1759                  /*
1760 1760                   * We successfully offlined the log device, sync out the
1761 1761                   * current txg so that the "stubby" block can be removed
1762 1762                   * by zil_sync().
1763 1763                   */
1764 1764                  txg_wait_synced(spa->spa_dsl_pool, 0);
1765 1765          }
1766 1766          return (error);
1767 1767  }
1768 1768  
1769 1769  static void
1770 1770  spa_aux_check_removed(spa_aux_vdev_t *sav)
1771 1771  {
1772 1772          for (int i = 0; i < sav->sav_count; i++)
1773 1773                  spa_check_removed(sav->sav_vdevs[i]);
1774 1774  }
1775 1775  
1776 1776  void
1777 1777  spa_claim_notify(zio_t *zio)
1778 1778  {
1779 1779          spa_t *spa = zio->io_spa;
1780 1780  
1781 1781          if (zio->io_error)
1782 1782                  return;
1783 1783  
1784 1784          mutex_enter(&spa->spa_props_lock);      /* any mutex will do */
1785 1785          if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1786 1786                  spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1787 1787          mutex_exit(&spa->spa_props_lock);
1788 1788  }
1789 1789  
1790 1790  typedef struct spa_load_error {
1791 1791          uint64_t        sle_meta_count;
1792 1792          uint64_t        sle_data_count;
1793 1793  } spa_load_error_t;
1794 1794  
1795 1795  static void
1796 1796  spa_load_verify_done(zio_t *zio)
1797 1797  {
1798 1798          blkptr_t *bp = zio->io_bp;
1799 1799          spa_load_error_t *sle = zio->io_private;
1800 1800          dmu_object_type_t type = BP_GET_TYPE(bp);
1801 1801          int error = zio->io_error;
1802 1802  
1803 1803          if (error) {
1804 1804                  if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1805 1805                      type != DMU_OT_INTENT_LOG)
1806 1806                          atomic_add_64(&sle->sle_meta_count, 1);
1807 1807                  else
1808 1808                          atomic_add_64(&sle->sle_data_count, 1);
1809 1809          }
1810 1810          zio_data_buf_free(zio->io_data, zio->io_size);
1811 1811  }
1812 1812  
1813 1813  /*ARGSUSED*/
1814 1814  static int
1815 1815  spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1816 1816      const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1817 1817  {
1818 1818          if (bp != NULL) {
1819 1819                  zio_t *rio = arg;
1820 1820                  size_t size = BP_GET_PSIZE(bp);
1821 1821                  void *data = zio_data_buf_alloc(size);
1822 1822  
1823 1823                  zio_nowait(zio_read(rio, spa, bp, data, size,
1824 1824                      spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1825 1825                      ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
1826 1826                      ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
1827 1827          }
1828 1828          return (0);
1829 1829  }
1830 1830  
1831 1831  static int
1832 1832  spa_load_verify(spa_t *spa)
1833 1833  {
1834 1834          zio_t *rio;
1835 1835          spa_load_error_t sle = { 0 };
1836 1836          zpool_rewind_policy_t policy;
1837 1837          boolean_t verify_ok = B_FALSE;
1838 1838          int error;
1839 1839  
1840 1840          zpool_get_rewind_policy(spa->spa_config, &policy);
1841 1841  
1842 1842          if (policy.zrp_request & ZPOOL_NEVER_REWIND)
1843 1843                  return (0);
1844 1844  
1845 1845          rio = zio_root(spa, NULL, &sle,
1846 1846              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1847 1847  
1848 1848          error = traverse_pool(spa, spa->spa_verify_min_txg,
1849 1849              TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
1850 1850  
1851 1851          (void) zio_wait(rio);
1852 1852  
1853 1853          spa->spa_load_meta_errors = sle.sle_meta_count;
1854 1854          spa->spa_load_data_errors = sle.sle_data_count;
1855 1855  
1856 1856          if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
1857 1857              sle.sle_data_count <= policy.zrp_maxdata) {
1858 1858                  int64_t loss = 0;
1859 1859  
1860 1860                  verify_ok = B_TRUE;
1861 1861                  spa->spa_load_txg = spa->spa_uberblock.ub_txg;
1862 1862                  spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
1863 1863  
1864 1864                  loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
1865 1865                  VERIFY(nvlist_add_uint64(spa->spa_load_info,
1866 1866                      ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
1867 1867                  VERIFY(nvlist_add_int64(spa->spa_load_info,
1868 1868                      ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
1869 1869                  VERIFY(nvlist_add_uint64(spa->spa_load_info,
1870 1870                      ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
1871 1871          } else {
1872 1872                  spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
1873 1873          }
1874 1874  
1875 1875          if (error) {
1876 1876                  if (error != ENXIO && error != EIO)
1877 1877                          error = SET_ERROR(EIO);
1878 1878                  return (error);
1879 1879          }
1880 1880  
1881 1881          return (verify_ok ? 0 : EIO);
1882 1882  }
1883 1883  
1884 1884  /*
1885 1885   * Find a value in the pool props object.
1886 1886   */
1887 1887  static void
1888 1888  spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
1889 1889  {
1890 1890          (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
1891 1891              zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
1892 1892  }
1893 1893  
1894 1894  /*
1895 1895   * Find a value in the pool directory object.
1896 1896   */
1897 1897  static int
1898 1898  spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
1899 1899  {
1900 1900          return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1901 1901              name, sizeof (uint64_t), 1, val));
1902 1902  }
1903 1903  
1904 1904  static int
1905 1905  spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
1906 1906  {
1907 1907          vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
1908 1908          return (err);
1909 1909  }
1910 1910  
1911 1911  /*
1912 1912   * Fix up config after a partly-completed split.  This is done with the
1913 1913   * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
1914 1914   * pool have that entry in their config, but only the splitting one contains
1915 1915   * a list of all the guids of the vdevs that are being split off.
1916 1916   *
1917 1917   * This function determines what to do with that list: either rejoin
1918 1918   * all the disks to the pool, or complete the splitting process.  To attempt
1919 1919   * the rejoin, each disk that is offlined is marked online again, and
1920 1920   * we do a reopen() call.  If the vdev label for every disk that was
1921 1921   * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
1922 1922   * then we call vdev_split() on each disk, and complete the split.
1923 1923   *
1924 1924   * Otherwise we leave the config alone, with all the vdevs in place in
1925 1925   * the original pool.
1926 1926   */
1927 1927  static void
1928 1928  spa_try_repair(spa_t *spa, nvlist_t *config)
1929 1929  {
1930 1930          uint_t extracted;
1931 1931          uint64_t *glist;
1932 1932          uint_t i, gcount;
1933 1933          nvlist_t *nvl;
1934 1934          vdev_t **vd;
1935 1935          boolean_t attempt_reopen;
1936 1936  
1937 1937          if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
1938 1938                  return;
1939 1939  
1940 1940          /* check that the config is complete */
1941 1941          if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
1942 1942              &glist, &gcount) != 0)
1943 1943                  return;
1944 1944  
1945 1945          vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
1946 1946  
1947 1947          /* attempt to online all the vdevs & validate */
1948 1948          attempt_reopen = B_TRUE;
1949 1949          for (i = 0; i < gcount; i++) {
1950 1950                  if (glist[i] == 0)      /* vdev is hole */
1951 1951                          continue;
1952 1952  
1953 1953                  vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
1954 1954                  if (vd[i] == NULL) {
1955 1955                          /*
1956 1956                           * Don't bother attempting to reopen the disks;
1957 1957                           * just do the split.
1958 1958                           */
1959 1959                          attempt_reopen = B_FALSE;
1960 1960                  } else {
1961 1961                          /* attempt to re-online it */
1962 1962                          vd[i]->vdev_offline = B_FALSE;
1963 1963                  }
1964 1964          }
1965 1965  
1966 1966          if (attempt_reopen) {
1967 1967                  vdev_reopen(spa->spa_root_vdev);
1968 1968  
1969 1969                  /* check each device to see what state it's in */
1970 1970                  for (extracted = 0, i = 0; i < gcount; i++) {
1971 1971                          if (vd[i] != NULL &&
1972 1972                              vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
1973 1973                                  break;
1974 1974                          ++extracted;
1975 1975                  }
1976 1976          }
1977 1977  
1978 1978          /*
1979 1979           * If every disk has been moved to the new pool, or if we never
1980 1980           * even attempted to look at them, then we split them off for
1981 1981           * good.
1982 1982           */
1983 1983          if (!attempt_reopen || gcount == extracted) {
1984 1984                  for (i = 0; i < gcount; i++)
1985 1985                          if (vd[i] != NULL)
1986 1986                                  vdev_split(vd[i]);
1987 1987                  vdev_reopen(spa->spa_root_vdev);
1988 1988          }
1989 1989  
1990 1990          kmem_free(vd, gcount * sizeof (vdev_t *));
1991 1991  }
1992 1992  
1993 1993  static int
1994 1994  spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
1995 1995      boolean_t mosconfig)
1996 1996  {
1997 1997          nvlist_t *config = spa->spa_config;
1998 1998          char *ereport = FM_EREPORT_ZFS_POOL;
1999 1999          char *comment;
2000 2000          int error;
2001 2001          uint64_t pool_guid;
2002 2002          nvlist_t *nvl;
2003 2003  
2004 2004          if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
2005 2005                  return (SET_ERROR(EINVAL));
2006 2006  
2007 2007          ASSERT(spa->spa_comment == NULL);
2008 2008          if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
2009 2009                  spa->spa_comment = spa_strdup(comment);
2010 2010  
2011 2011          /*
2012 2012           * Versioning wasn't explicitly added to the label until later, so if
2013 2013           * it's not present treat it as the initial version.
2014 2014           */
2015 2015          if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
2016 2016              &spa->spa_ubsync.ub_version) != 0)
2017 2017                  spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
2018 2018  
2019 2019          (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
2020 2020              &spa->spa_config_txg);
2021 2021  
2022 2022          if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
2023 2023              spa_guid_exists(pool_guid, 0)) {
2024 2024                  error = SET_ERROR(EEXIST);
2025 2025          } else {
2026 2026                  spa->spa_config_guid = pool_guid;
2027 2027  
2028 2028                  if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
2029 2029                      &nvl) == 0) {
2030 2030                          VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
2031 2031                              KM_SLEEP) == 0);
2032 2032                  }
2033 2033  
2034 2034                  nvlist_free(spa->spa_load_info);
2035 2035                  spa->spa_load_info = fnvlist_alloc();
2036 2036  
2037 2037                  gethrestime(&spa->spa_loaded_ts);
2038 2038                  error = spa_load_impl(spa, pool_guid, config, state, type,
2039 2039                      mosconfig, &ereport);
2040 2040          }
2041 2041  
2042 2042          spa->spa_minref = refcount_count(&spa->spa_refcount);
2043 2043          if (error) {
2044 2044                  if (error != EEXIST) {
2045 2045                          spa->spa_loaded_ts.tv_sec = 0;
2046 2046                          spa->spa_loaded_ts.tv_nsec = 0;
2047 2047                  }
2048 2048                  if (error != EBADF) {
2049 2049                          zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
2050 2050                  }
2051 2051          }
2052 2052          spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2053 2053          spa->spa_ena = 0;
2054 2054  
2055 2055          return (error);
2056 2056  }
2057 2057  
2058 2058  /*
2059 2059   * Load an existing storage pool, using the pool's builtin spa_config as a
2060 2060   * source of configuration information.
            *
            * 'config' must contain a ZPOOL_CONFIG_VDEV_TREE nvlist, otherwise EINVAL
            * is returned.  'type' selects how that tree is parsed
            * (SPA_IMPORT_EXISTING uses VDEV_ALLOC_LOAD, anything else uses
            * VDEV_ALLOC_SPLIT).  For SPA_IMPORT_ASSEMBLE, label validation, the
            * guid-sum check, attaching spares and l2cache devices, and the final
            * config/log validation are all skipped.
            *
            * If 'mosconfig' is false the config is treated as untrusted: spa_mode is
            * forced to FREAD, and once the MOS can be read the config stored there is
            * installed with spa_config_set(), the pool is unloaded and reactivated in
            * its original mode, and spa_load() is called again with mosconfig set to
            * B_TRUE.
            *
            * Returns 0 on success or an error number on failure.  *ereport is written
            * here only when spa_check_logs() fails, in which case it is set to
            * FM_EREPORT_ZFS_LOG_REPLAY.  The caller must hold spa_namespace_lock.
2061 2061   */
2062 2062  static int
2063 2063  spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
2064 2064      spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
2065 2065      char **ereport)
2066 2066  {
2067 2067          int error = 0;
2068 2068          nvlist_t *nvroot = NULL;
2069 2069          nvlist_t *label;
2070 2070          vdev_t *rvd;
2071 2071          uberblock_t *ub = &spa->spa_uberblock;
                   /*
                    * config_cache_txg and orig_mode are captured on entry; they are
                    * consulted much later (stale-config check and re-activation).
                    */
2072 2072          uint64_t children, config_cache_txg = spa->spa_config_txg;
2073 2073          int orig_mode = spa->spa_mode;
2074 2074          int parse;
2075 2075          uint64_t obj;
2076 2076          boolean_t missing_feat_write = B_FALSE;
2077 2077  
2078 2078          /*
2079 2079           * If this is an untrusted config, access the pool in read-only mode.
2080 2080           * This prevents things like resilvering recently removed devices.
2081 2081           */
2082 2082          if (!mosconfig)
2083 2083                  spa->spa_mode = FREAD;
2084 2084  
2085 2085          ASSERT(MUTEX_HELD(&spa_namespace_lock));
2086 2086  
2087 2087          spa->spa_load_state = state;
2088 2088  
2089 2089          if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
2090 2090                  return (SET_ERROR(EINVAL));
2091 2091  
2092 2092          parse = (type == SPA_IMPORT_EXISTING ?
2093 2093              VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2094 2094  
2095 2095          /*
2096 2096           * Create "The Godfather" zio to hold all async IOs
2097 2097           */
2098 2098          spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2099 2099              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2100 2100  
2101 2101          /*
2102 2102           * Parse the configuration into a vdev tree.  We explicitly set the
2103 2103           * value that will be returned by spa_version() since parsing the
2104 2104           * configuration requires knowing the version number.
2105 2105           */
2106 2106          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2107 2107          error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
2108 2108          spa_config_exit(spa, SCL_ALL, FTAG);
2109 2109  
2110 2110          if (error != 0)
2111 2111                  return (error);
2112 2112  
2113 2113          ASSERT(spa->spa_root_vdev == rvd);
2114 2114  
2115 2115          if (type != SPA_IMPORT_ASSEMBLE) {
2116 2116                  ASSERT(spa_guid(spa) == pool_guid);
2117 2117          }
2118 2118  
2119 2119          /*
2120 2120           * Try to open all vdevs, loading each label in the process.
2121 2121           */
2122 2122          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2123 2123          error = vdev_open(rvd);
2124 2124          spa_config_exit(spa, SCL_ALL, FTAG);
2125 2125          if (error != 0)
2126 2126                  return (error);
2127 2127  
2128 2128          /*
2129 2129           * We need to validate the vdev labels against the configuration that
2130 2130           * we have in hand, which is dependent on the setting of mosconfig. If
2131 2131           * mosconfig is true then we're validating the vdev labels based on
2132 2132           * that config.  Otherwise, we're validating against the cached config
2133 2133           * (zpool.cache) that was read when we loaded the zfs module, and then
2134 2134           * later we will recursively call spa_load() and validate against
2135 2135           * the vdev config.
2136 2136           *
2137 2137           * If we're assembling a new pool that's been split off from an
2138 2138           * existing pool, the labels haven't yet been updated so we skip
2139 2139           * validation for now.
2140 2140           */
2141 2141          if (type != SPA_IMPORT_ASSEMBLE) {
2142 2142                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2143 2143                  error = vdev_validate(rvd, mosconfig);
2144 2144                  spa_config_exit(spa, SCL_ALL, FTAG);
2145 2145  
2146 2146                  if (error != 0)
2147 2147                          return (error);
2148 2148  
2149 2149                  if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2150 2150                          return (SET_ERROR(ENXIO));
2151 2151          }
2152 2152  
2153 2153          /*
2154 2154           * Find the best uberblock.
2155 2155           */
2156 2156          vdev_uberblock_load(rvd, ub, &label);
2157 2157  
2158 2158          /*
2159 2159           * If we weren't able to find a single valid uberblock, return failure.
2160 2160           */
2161 2161          if (ub->ub_txg == 0) {
2162 2162                  nvlist_free(label);
2163 2163                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2164 2164          }
2165 2165  
2166 2166          /*
2167 2167           * If the pool has an unsupported version we can't open it.
2168 2168           */
2169 2169          if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2170 2170                  nvlist_free(label);
2171 2171                  return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
2172 2172          }
2173 2173  
2174 2174          if (ub->ub_version >= SPA_VERSION_FEATURES) {
2175 2175                  nvlist_t *features;
2176 2176  
2177 2177                  /*
2178 2178                   * If we weren't able to find what's necessary for reading the
2179 2179                   * MOS in the label, return failure.
2180 2180                   */
2181 2181                  if (label == NULL || nvlist_lookup_nvlist(label,
2182 2182                      ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
2183 2183                          nvlist_free(label);
2184 2184                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2185 2185                              ENXIO));
2186 2186                  }
2187 2187  
2188 2188                  /*
2189 2189                   * Update our in-core representation with the definitive values
2190 2190                   * from the label.
2191 2191                   */
2192 2192                  nvlist_free(spa->spa_label_features);
2193 2193                  VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2194 2194          }
2195 2195  
2196 2196          nvlist_free(label);
2197 2197  
2198 2198          /*
2199 2199           * Look through entries in the label nvlist's features_for_read. If
2200 2200           * there is a feature listed there which we don't understand then we
2201 2201           * cannot open a pool.
2202 2202           */
2203 2203          if (ub->ub_version >= SPA_VERSION_FEATURES) {
2204 2204                  nvlist_t *unsup_feat;
2205 2205  
2206 2206                  VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2207 2207                      0);
2208 2208  
2209 2209                  for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2210 2210                      NULL); nvp != NULL;
2211 2211                      nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2212 2212                          if (!zfeature_is_supported(nvpair_name(nvp))) {
2213 2213                                  VERIFY(nvlist_add_string(unsup_feat,
2214 2214                                      nvpair_name(nvp), "") == 0);
2215 2215                          }
2216 2216                  }
2217 2217  
2218 2218                  if (!nvlist_empty(unsup_feat)) {
2219 2219                          VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2220 2220                              ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2221 2221                          nvlist_free(unsup_feat);
2222 2222                          return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2223 2223                              ENOTSUP));
2224 2224                  }
2225 2225  
2226 2226                  nvlist_free(unsup_feat);
2227 2227          }
2228 2228  
2229 2229          /*
2230 2230           * If the vdev guid sum doesn't match the uberblock, we have an
2231 2231           * incomplete configuration.  We first check to see if the pool
2232 2232           * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
2233 2233           * If it is, defer the vdev_guid_sum check till later so we
2234 2234           * can handle missing vdevs.
2235 2235           */
2236 2236          if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
2237 2237              &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
2238 2238              rvd->vdev_guid_sum != ub->ub_guid_sum)
2239 2239                  return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2240 2240  
2241 2241          if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2242 2242                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2243 2243                  spa_try_repair(spa, config);
2244 2244                  spa_config_exit(spa, SCL_ALL, FTAG);
2245 2245                  nvlist_free(spa->spa_config_splitting);
2246 2246                  spa->spa_config_splitting = NULL;
2247 2247          }
2248 2248  
2249 2249          /*
2250 2250           * Initialize internal SPA structures.
2251 2251           */
2252 2252          spa->spa_state = POOL_STATE_ACTIVE;
2253 2253          spa->spa_ubsync = spa->spa_uberblock;
2254 2254          spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2255 2255              TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2256 2256          spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2257 2257              spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2258 2258          spa->spa_claim_max_txg = spa->spa_first_txg;
2259 2259          spa->spa_prev_software_version = ub->ub_software_version;
2260 2260  
2261 2261          error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2262 2262          if (error)
2263 2263                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2264 2264          spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2265 2265  
2266 2266          if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
2267 2267                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2268 2268  
                   /*
                    * For pools at SPA_VERSION_FEATURES or later, locate the MOS
                    * objects that track features and record which features are
                    * enabled or unsupported in spa_load_info.
                    */
2269 2269          if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2270 2270                  boolean_t missing_feat_read = B_FALSE;
2271 2271                  nvlist_t *unsup_feat, *enabled_feat;
2272 2272  
2273 2273                  if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2274 2274                      &spa->spa_feat_for_read_obj) != 0) {
2275 2275                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2276 2276                  }
2277 2277  
2278 2278                  if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2279 2279                      &spa->spa_feat_for_write_obj) != 0) {
2280 2280                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2281 2281                  }
2282 2282  
2283 2283                  if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2284 2284                      &spa->spa_feat_desc_obj) != 0) {
2285 2285                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2286 2286                  }
2287 2287  
2288 2288                  enabled_feat = fnvlist_alloc();
2289 2289                  unsup_feat = fnvlist_alloc();
2290 2290  
2291 2291                  if (!feature_is_supported(spa->spa_meta_objset,
2292 2292                      spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
2293 2293                      unsup_feat, enabled_feat))
2294 2294                          missing_feat_read = B_TRUE;
2295 2295  
                           /*
                            * The for-write features are only checked when the pool
                            * is writeable or this is a tryimport (explained below).
                            */
2296 2296                  if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
2297 2297                          if (!feature_is_supported(spa->spa_meta_objset,
2298 2298                              spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
2299 2299                              unsup_feat, enabled_feat)) {
2300 2300                                  missing_feat_write = B_TRUE;
2301 2301                          }
2302 2302                  }
2303 2303  
2304 2304                  fnvlist_add_nvlist(spa->spa_load_info,
2305 2305                      ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
2306 2306  
2307 2307                  if (!nvlist_empty(unsup_feat)) {
2308 2308                          fnvlist_add_nvlist(spa->spa_load_info,
2309 2309                              ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
2310 2310                  }
2311 2311  
2312 2312                  fnvlist_free(enabled_feat);
2313 2313                  fnvlist_free(unsup_feat);
2314 2314  
2315 2315                  if (!missing_feat_read) {
2316 2316                          fnvlist_add_boolean(spa->spa_load_info,
2317 2317                              ZPOOL_CONFIG_CAN_RDONLY);
2318 2318                  }
2319 2319  
2320 2320                  /*
2321 2321                   * If the state is SPA_LOAD_TRYIMPORT, our objective is
2322 2322                   * twofold: to determine whether the pool is available for
2323 2323                   * import in read-write mode and (if it is not) whether the
2324 2324                   * pool is available for import in read-only mode. If the pool
2325 2325                   * is available for import in read-write mode, it is displayed
2326 2326                   * as available in userland; if it is not available for import
2327 2327                   * in read-only mode, it is displayed as unavailable in
2328 2328                   * userland. If the pool is available for import in read-only
2329 2329                   * mode but not read-write mode, it is displayed as unavailable
2330 2330                   * in userland with a special note that the pool is actually
2331 2331                   * available for open in read-only mode.
2332 2332                   *
2333 2333                   * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
2334 2334                   * missing a feature for write, we must first determine whether
2335 2335                   * the pool can be opened read-only before returning to
2336 2336                   * userland in order to know whether to display the
2337 2337                   * abovementioned note.
2338 2338                   */
2339 2339                  if (missing_feat_read || (missing_feat_write &&
2340 2340                      spa_writeable(spa))) {
2341 2341                          return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2342 2342                              ENOTSUP));
2343 2343                  }
2344 2344          }
2345 2345  
                   /*
                    * spa_is_initializing is raised only for the duration of
                    * dsl_pool_open().
                    */
2346 2346          spa->spa_is_initializing = B_TRUE;
2347 2347          error = dsl_pool_open(spa->spa_dsl_pool);
2348 2348          spa->spa_is_initializing = B_FALSE;
2349 2349          if (error != 0)
2350 2350                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2351 2351  
                   /*
                    * Untrusted config: read the config object from the MOS, check
                    * that the pool was not last accessed by a different host, then
                    * restart the load using the MOS config.
                    */
2352 2352          if (!mosconfig) {
2353 2353                  uint64_t hostid;
2354 2354                  nvlist_t *policy = NULL, *nvconfig;
2355 2355  
2356 2356                  if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2357 2357                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2358 2358  
                           /*
                            * The hostid check applies only to non-root pools whose
                            * MOS config records a hostid.
                            */
2359 2359                  if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2360 2360                      ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2361 2361                          char *hostname;
2362 2362                          unsigned long myhostid = 0;
2363 2363  
                                   /*
                                    * NOTE(review): this VERIFY fires if the config has
                                    * a hostid but no hostname -- confirm the two are
                                    * always written together.
                                    */
2364 2364                          VERIFY(nvlist_lookup_string(nvconfig,
2365 2365                              ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2366 2366  
2367 2367  #ifdef  _KERNEL
2368 2368                          myhostid = zone_get_hostid(NULL);
2369 2369  #else   /* _KERNEL */
2370 2370                          /*
2371 2371                           * We're emulating the system's hostid in userland, so
2372 2372                           * we can't use zone_get_hostid().
2373 2373                           */
2374 2374                          (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
2375 2375  #endif  /* _KERNEL */
                                   /*
                                    * A zero hostid on either side disables the check.
                                    */
2376 2376                          if (hostid != 0 && myhostid != 0 &&
2377 2377                              hostid != myhostid) {
2378 2378                                  nvlist_free(nvconfig);
2379 2379                                  cmn_err(CE_WARN, "pool '%s' could not be "
2380 2380                                      "loaded as it was last accessed by "
2381 2381                                      "another system (host: %s hostid: 0x%lx). "
2382 2382                                      "See: http://illumos.org/msg/ZFS-8000-EY",
2383 2383                                      spa_name(spa), hostname,
2384 2384                                      (unsigned long)hostid);
2385 2385                                  return (SET_ERROR(EBADF));
2386 2386                          }
2387 2387                  }
                           /*
                            * Copy any rewind policy from the current config into the
                            * MOS config before it is installed with spa_config_set().
                            */
2388 2388                  if (nvlist_lookup_nvlist(spa->spa_config,
2389 2389                      ZPOOL_REWIND_POLICY, &policy) == 0)
2390 2390                          VERIFY(nvlist_add_nvlist(nvconfig,
2391 2391                              ZPOOL_REWIND_POLICY, policy) == 0);
2392 2392  
                           /*
                            * Install the MOS config and start over in the original
                            * mode.  mosconfig is B_TRUE this time, so this branch is
                            * not taken again.
                            */
2393 2393                  spa_config_set(spa, nvconfig);
2394 2394                  spa_unload(spa);
2395 2395                  spa_deactivate(spa);
2396 2396                  spa_activate(spa, orig_mode);
2397 2397  
2398 2398                  return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
2399 2399          }
2400 2400  
                   /*
                    * Open the bpobj named by DMU_POOL_SYNC_BPOBJ as spa_deferred_bpobj.
                    */
2401 2401          if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
2402 2402                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2403 2403          error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2404 2404          if (error != 0)
2405 2405                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2406 2406  
2407 2407          /*
2408 2408           * Load the bit that tells us to use the new accounting function
2409 2409           * (raid-z deflation).  If we have an older pool, this will not
2410 2410           * be present.
2411 2411           */
2412 2412          error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
2413 2413          if (error != 0 && error != ENOENT)
2414 2414                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2415 2415  
                   /*
                    * The creation version is optional as well (ENOENT is tolerated).
                    */
2416 2416          error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
2417 2417              &spa->spa_creation_version);
2418 2418          if (error != 0 && error != ENOENT)
2419 2419                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2420 2420  
2421 2421          /*
2422 2422           * Load the persistent error log.  If we have an older pool, this will
2423 2423           * not be present.
2424 2424           */
2425 2425          error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
2426 2426          if (error != 0 && error != ENOENT)
2427 2427                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2428 2428  
2429 2429          error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
2430 2430              &spa->spa_errlog_scrub);
2431 2431          if (error != 0 && error != ENOENT)
2432 2432                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2433 2433  
2434 2434          /*
2435 2435           * Load the history object.  If we have an older pool, this
2436 2436           * will not be present.
2437 2437           */
2438 2438          error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
2439 2439          if (error != 0 && error != ENOENT)
2440 2440                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2441 2441  
2442 2442          /*
2443 2443           * If we're assembling the pool from the split-off vdevs of
2444 2444           * an existing pool, we don't want to attach the spares & cache
2445 2445           * devices.
2446 2446           */
2447 2447  
2448 2448          /*
2449 2449           * Load any hot spares for this pool.
2450 2450           */
2451 2451          error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
2452 2452          if (error != 0 && error != ENOENT)
2453 2453                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2454 2454          if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2455 2455                  ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
2456 2456                  if (load_nvlist(spa, spa->spa_spares.sav_object,
2457 2457                      &spa->spa_spares.sav_config) != 0)
2458 2458                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2459 2459  
2460 2460                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2461 2461                  spa_load_spares(spa);
2462 2462                  spa_config_exit(spa, SCL_ALL, FTAG);
2463 2463          } else if (error == 0) {
                           /*
                            * Split assembly: the spares object exists but was not
                            * loaded above.  NOTE(review): sav_sync presumably forces
                            * the list to be rewritten -- confirm.
                            */
2464 2464                  spa->spa_spares.sav_sync = B_TRUE;
2465 2465          }
2466 2466  
2467 2467          /*
2468 2468           * Load any level 2 ARC devices for this pool.
2469 2469           */
2470 2470          error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
2471 2471              &spa->spa_l2cache.sav_object);
2472 2472          if (error != 0 && error != ENOENT)
2473 2473                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2474 2474          if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2475 2475                  ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
2476 2476                  if (load_nvlist(spa, spa->spa_l2cache.sav_object,
2477 2477                      &spa->spa_l2cache.sav_config) != 0)
2478 2478                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2479 2479  
2480 2480                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2481 2481                  spa_load_l2cache(spa);
2482 2482                  spa_config_exit(spa, SCL_ALL, FTAG);
2483 2483          } else if (error == 0) {
                           /*
                            * Split assembly: the l2cache object exists but was not
                            * loaded above.  NOTE(review): sav_sync presumably forces
                            * the list to be rewritten -- confirm.
                            */
2484 2484                  spa->spa_l2cache.sav_sync = B_TRUE;
2485 2485          }
2486 2486  
                   /*
                    * Start from the default delegation setting; it may be overridden
                    * below if the pool has a properties object.
                    */
2487 2487          spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2488 2488  
2489 2489          error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
2490 2490          if (error && error != ENOENT)
2491 2491                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2492 2492  
2493 2493          if (error == 0) {
                           /*
                            * The spa_prop_find() results are not checked, so start
                            * from 0 (off) in case the lookup leaves this unwritten;
                            * otherwise spa_autoreplace would be derived from an
                            * indeterminate value.
                            */
2494 2494                  uint64_t autoreplace = 0;
2495 2495  
2496 2496                  spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
2497 2497                  spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
2498 2498                  spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
2499 2499                  spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
2500 2500                  spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
2501 2501                  spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
2502 2502                      &spa->spa_dedup_ditto);
2503 2503  
2504 2504                  spa->spa_autoreplace = (autoreplace != 0);
2505 2505          }
2506 2506  
2507 2507          /*
2508 2508           * If the 'autoreplace' property is set, then post a resource notifying
2509 2509           * the ZFS DE that it should not issue any faults for unopenable
2510 2510           * devices.  We also iterate over the vdevs, and post a sysevent for any
2511 2511           * unopenable vdevs so that the normal autoreplace handler can take
2512 2512           * over.
2513 2513           */
2514 2514          if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
2515 2515                  spa_check_removed(spa->spa_root_vdev);
2516 2516                  /*
2517 2517                   * For the import case, this is done in spa_import(), because
2518 2518                   * at this point we're using the spare definitions from
2519 2519                   * the MOS config, not necessarily from the userland config.
2520 2520                   */
2521 2521                  if (state != SPA_LOAD_IMPORT) {
2522 2522                          spa_aux_check_removed(&spa->spa_spares);
2523 2523                          spa_aux_check_removed(&spa->spa_l2cache);
2524 2524                  }
2525 2525          }
2526 2526  
2527 2527          /*
2528 2528           * Load the vdev state for all toplevel vdevs.
2529 2529           */
2530 2530          vdev_load(rvd);
2531 2531  
2532 2532          /*
2533 2533           * Propagate the leaf DTLs we just loaded all the way up the tree.
2534 2534           */
2535 2535          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2536 2536          vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
2537 2537          spa_config_exit(spa, SCL_ALL, FTAG);
2538 2538  
2539 2539          /*
2540 2540           * Load the DDTs (dedup tables).
2541 2541           */
2542 2542          error = ddt_load(spa);
2543 2543          if (error != 0)
2544 2544                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2545 2545  
2546 2546          spa_update_dspace(spa);
2547 2547  
2548 2548          /*
2549 2549           * Validate the config, using the MOS config to fill in any
2550 2550           * information which might be missing.  If we fail to validate
2551 2551           * the config then declare the pool unfit for use. If we're
2552 2552           * assembling a pool from a split, the log is not transferred
2553 2553           * over.
2554 2554           */
2555 2555          if (type != SPA_IMPORT_ASSEMBLE) {
2556 2556                  nvlist_t *nvconfig;
2557 2557  
2558 2558                  if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2559 2559                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2560 2560  
2561 2561                  if (!spa_config_valid(spa, nvconfig)) {
2562 2562                          nvlist_free(nvconfig);
2563 2563                          return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2564 2564                              ENXIO));
2565 2565                  }
2566 2566                  nvlist_free(nvconfig);
2567 2567  
2568 2568                  /*
2569 2569                   * Now that we've validated the config, check the state of the
2570 2570                   * root vdev.  If it can't be opened, it indicates one or
2571 2571                   * more toplevel vdevs are faulted.
2572 2572                   */
2573 2573                  if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2574 2574                          return (SET_ERROR(ENXIO));
2575 2575  
2576 2576                  if (spa_check_logs(spa)) {
2577 2577                          *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2578 2578                          return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2579 2579                  }
2580 2580          }
2581 2581  
2582 2582          if (missing_feat_write) {
2583 2583                  ASSERT(state == SPA_LOAD_TRYIMPORT);
2584 2584  
2585 2585                  /*
2586 2586                   * At this point, we know that we can open the pool in
2587 2587                   * read-only mode but not read-write mode. We now have enough
2588 2588                   * information and can return to userland.
2589 2589                   */
2590 2590                  return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
2591 2591          }
2592 2592  
2593 2593          /*
2594 2594           * We've successfully opened the pool, verify that we're ready
2595 2595           * to start pushing transactions.
2596 2596           */
2597 2597          if (state != SPA_LOAD_TRYIMPORT) {
                           /*
                            * The assignment in the condition is intentional: the
                            * verify error is what gets returned.
                            */
2598 2598                  if (error = spa_load_verify(spa))
2599 2599                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2600 2600                              error));
2601 2601          }
2602 2602  
2603 2603          if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2604 2604              spa->spa_load_max_txg == UINT64_MAX)) {
2605 2605                  dmu_tx_t *tx;
2606 2606                  int need_update = B_FALSE;
2607 2607  
2608 2608                  ASSERT(state != SPA_LOAD_TRYIMPORT);
2609 2609  
2610 2610                  /*
2611 2611                   * Claim log blocks that haven't been committed yet.
2612 2612                   * This must all happen in a single txg.
2613 2613                   * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2614 2614                   * invoked from zil_claim_log_block()'s i/o done callback.
2615 2615                   * Price of rollback is that we abandon the log.
2616 2616                   */
2617 2617                  spa->spa_claiming = B_TRUE;
2618 2618  
2619 2619                  tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2620 2620                      spa_first_txg(spa));
2621 2621                  (void) dmu_objset_find(spa_name(spa),
2622 2622                      zil_claim, tx, DS_FIND_CHILDREN);
2623 2623                  dmu_tx_commit(tx);
2624 2624  
2625 2625                  spa->spa_claiming = B_FALSE;
2626 2626  
2627 2627                  spa_set_log_state(spa, SPA_LOG_GOOD);
2628 2628                  spa->spa_sync_on = B_TRUE;
2629 2629                  txg_sync_start(spa->spa_dsl_pool);
2630 2630  
2631 2631                  /*
2632 2632                   * Wait for all claims to sync.  We sync up to the highest
2633 2633                   * claimed log block birth time so that claimed log blocks
2634 2634                   * don't appear to be from the future.  spa_claim_max_txg
2635 2635                   * will have been set for us by either zil_check_log_chain()
2636 2636                   * (invoked from spa_check_logs()) or zil_claim() above.
2637 2637                   */
2638 2638                  txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2639 2639  
2640 2640                  /*
2641 2641                   * If the config cache is stale, or we have uninitialized
2642 2642                   * metaslabs (see spa_vdev_add()), then update the config.
2643 2643                   *
2644 2644                   * If this is a verbatim import, trust the current
2645 2645                   * in-core spa_config and update the disk labels.
2646 2646                   */
2647 2647                  if (config_cache_txg != spa->spa_config_txg ||
2648 2648                      state == SPA_LOAD_IMPORT ||
2649 2649                      state == SPA_LOAD_RECOVER ||
2650 2650                      (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2651 2651                          need_update = B_TRUE;
2652 2652  
2653 2653                  for (int c = 0; c < rvd->vdev_children; c++)
2654 2654                          if (rvd->vdev_child[c]->vdev_ms_array == 0)
2655 2655                                  need_update = B_TRUE;
2656 2656  
2657 2657                  /*
2658 2658                   * Update the config cache asynchronously in case we're the
2659 2659                   * root pool, in which case the config cache isn't writable yet.
2660 2660                   */
2661 2661                  if (need_update)
2662 2662                          spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2663 2663  
2664 2664                  /*
2665 2665                   * Check all DTLs to see if anything needs resilvering.
2666 2666                   */
2667 2667                  if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2668 2668                      vdev_resilver_needed(rvd, NULL, NULL))
2669 2669                          spa_async_request(spa, SPA_ASYNC_RESILVER);
2670 2670  
2671 2671                  /*
2672 2672                   * Log the fact that we booted up (so that we can detect if
2673 2673                   * we rebooted in the middle of an operation).
2674 2674                   */
2675 2675                  spa_history_log_version(spa, "open");
2676 2676  
2677 2677                  /*
2678 2678                   * Delete any inconsistent datasets.
2679 2679                   */
2680 2680                  (void) dmu_objset_find(spa_name(spa),
2681 2681                      dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2682 2682  
2683 2683                  /*
2684 2684                   * Clean up any stale temporary dataset userrefs.
2685 2685                   */
2686 2686                  dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
2687 2687          }
2688 2688  
2689 2689          return (0);
2690 2690  }
2691 2691  
           /*
            * Re-run spa_load() with spa_load_max_txg lowered by one.
            *
            * The pool is unloaded and deactivated, spa_load_max_txg is decremented,
            * and the pool is activated again in the mode it had on entry.
            * spa_async_suspend() is called before spa_load() is invoked as
            * SPA_IMPORT_EXISTING with the caller's 'state' and 'mosconfig'.
            *
            * Returns whatever spa_load() returns.
            */
2692 2692  static int
2693 2693  spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2694 2694  {
                   /*
                    * Remember the mode now; it is handed back to spa_activate() after
                    * the teardown below.
                    */
2695 2695          int mode = spa->spa_mode;
2696 2696  
2697 2697          spa_unload(spa);
2698 2698          spa_deactivate(spa);
2699 2699  
                   /*
                    * NOTE(review): no guard against spa_load_max_txg already being 0
                    * is visible here -- presumably the caller bounds the retries;
                    * confirm.
                    */
2700 2700          spa->spa_load_max_txg--;
2701 2701  
2702 2702          spa_activate(spa, mode);
2703 2703          spa_async_suspend(spa);
2704 2704  
2705 2705          return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2706 2706  }
2707 2707  
2708 2708  /*
2709 2709   * If spa_load() fails this function will try loading prior txg's. If
2710 2710   * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
2711 2711   * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
2712 2712   * function will not rewind the pool and will return the same error as
2713 2713   * spa_load().
2714 2714   */
2715 2715  static int
2716 2716  spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2717 2717      uint64_t max_request, int rewind_flags)
2718 2718  {
2719 2719          nvlist_t *loadinfo = NULL;
2720 2720          nvlist_t *config = NULL;
2721 2721          int load_error, rewind_error;
2722 2722          uint64_t safe_rewind_txg;
2723 2723          uint64_t min_txg;
2724 2724  
2725 2725          if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2726 2726                  spa->spa_load_max_txg = spa->spa_load_txg;
2727 2727                  spa_set_log_state(spa, SPA_LOG_CLEAR);
2728 2728          } else {
2729 2729                  spa->spa_load_max_txg = max_request;
2730 2730          }
2731 2731  
2732 2732          load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2733 2733              mosconfig);
2734 2734          if (load_error == 0)
2735 2735                  return (0);
2736 2736  
2737 2737          if (spa->spa_root_vdev != NULL)
2738 2738                  config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2739 2739  
2740 2740          spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2741 2741          spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2742 2742  
2743 2743          if (rewind_flags & ZPOOL_NEVER_REWIND) {
2744 2744                  nvlist_free(config);
2745 2745                  return (load_error);
2746 2746          }
2747 2747  
2748 2748          if (state == SPA_LOAD_RECOVER) {
2749 2749                  /* Price of rolling back is discarding txgs, including log */
2750 2750                  spa_set_log_state(spa, SPA_LOG_CLEAR);
2751 2751          } else {
2752 2752                  /*
2753 2753                   * If we aren't rolling back save the load info from our first
2754 2754                   * import attempt so that we can restore it after attempting
2755 2755                   * to rewind.
2756 2756                   */
2757 2757                  loadinfo = spa->spa_load_info;
2758 2758                  spa->spa_load_info = fnvlist_alloc();
2759 2759          }
2760 2760  
2761 2761          spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2762 2762          safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2763 2763          min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2764 2764              TXG_INITIAL : safe_rewind_txg;
2765 2765  
2766 2766          /*
2767 2767           * Continue as long as we're finding errors, we're still within
2768 2768           * the acceptable rewind range, and we're still finding uberblocks
2769 2769           */
2770 2770          while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2771 2771              spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2772 2772                  if (spa->spa_load_max_txg < safe_rewind_txg)
2773 2773                          spa->spa_extreme_rewind = B_TRUE;
2774 2774                  rewind_error = spa_load_retry(spa, state, mosconfig);
2775 2775          }
2776 2776  
2777 2777          spa->spa_extreme_rewind = B_FALSE;
2778 2778          spa->spa_load_max_txg = UINT64_MAX;
2779 2779  
2780 2780          if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2781 2781                  spa_config_set(spa, config);
2782 2782  
2783 2783          if (state == SPA_LOAD_RECOVER) {
2784 2784                  ASSERT3P(loadinfo, ==, NULL);
2785 2785                  return (rewind_error);
2786 2786          } else {
2787 2787                  /* Store the rewind info as part of the initial load info */
2788 2788                  fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
2789 2789                      spa->spa_load_info);
2790 2790  
2791 2791                  /* Restore the initial load info */
2792 2792                  fnvlist_free(spa->spa_load_info);
2793 2793                  spa->spa_load_info = loadinfo;
2794 2794  
2795 2795                  return (load_error);
2796 2796          }
2797 2797  }
2798 2798  
2799 2799  /*
2800 2800   * Pool Open/Import
2801 2801   *
2802 2802   * The import case is identical to an open except that the configuration is sent
2803 2803   * down from userland, instead of grabbed from the configuration cache.  For the
2804 2804   * case of an open, the pool configuration will exist in the
2805 2805   * POOL_STATE_UNINITIALIZED state.
2806 2806   *
2807 2807   * The stats information (gen/count/ustats) is used to gather vdev statistics at
2808 2808   * the same time open the pool, without having to keep around the spa_t in some
2809 2809   * ambiguous state.
2810 2810   */
2811 2811  static int
2812 2812  spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2813 2813      nvlist_t **config)
2814 2814  {
2815 2815          spa_t *spa;
2816 2816          spa_load_state_t state = SPA_LOAD_OPEN;
2817 2817          int error;
2818 2818          int locked = B_FALSE;
2819 2819  
2820 2820          *spapp = NULL;
2821 2821  
2822 2822          /*
2823 2823           * As disgusting as this is, we need to support recursive calls to this
2824 2824           * function because dsl_dir_open() is called during spa_load(), and ends
2825 2825           * up calling spa_open() again.  The real fix is to figure out how to
2826 2826           * avoid dsl_dir_open() calling this in the first place.
2827 2827           */
2828 2828          if (mutex_owner(&spa_namespace_lock) != curthread) {
2829 2829                  mutex_enter(&spa_namespace_lock);
2830 2830                  locked = B_TRUE;
2831 2831          }
2832 2832  
2833 2833          if ((spa = spa_lookup(pool)) == NULL) {
2834 2834                  if (locked)
2835 2835                          mutex_exit(&spa_namespace_lock);
2836 2836                  return (SET_ERROR(ENOENT));
2837 2837          }
2838 2838  
2839 2839          if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
2840 2840                  zpool_rewind_policy_t policy;
2841 2841  
2842 2842                  zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
2843 2843                      &policy);
2844 2844                  if (policy.zrp_request & ZPOOL_DO_REWIND)
2845 2845                          state = SPA_LOAD_RECOVER;
2846 2846  
2847 2847                  spa_activate(spa, spa_mode_global);
2848 2848  
2849 2849                  if (state != SPA_LOAD_RECOVER)
2850 2850                          spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
2851 2851  
2852 2852                  error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
2853 2853                      policy.zrp_request);
2854 2854  
2855 2855                  if (error == EBADF) {
2856 2856                          /*
2857 2857                           * If vdev_validate() returns failure (indicated by
2858 2858                           * EBADF), it indicates that one of the vdevs indicates
2859 2859                           * that the pool has been exported or destroyed.  If
2860 2860                           * this is the case, the config cache is out of sync and
2861 2861                           * we should remove the pool from the namespace.
2862 2862                           */
2863 2863                          spa_unload(spa);
2864 2864                          spa_deactivate(spa);
2865 2865                          spa_config_sync(spa, B_TRUE, B_TRUE);
2866 2866                          spa_remove(spa);
2867 2867                          if (locked)
2868 2868                                  mutex_exit(&spa_namespace_lock);
2869 2869                          return (SET_ERROR(ENOENT));
2870 2870                  }
2871 2871  
2872 2872                  if (error) {
2873 2873                          /*
2874 2874                           * We can't open the pool, but we still have useful
2875 2875                           * information: the state of each vdev after the
2876 2876                           * attempted vdev_open().  Return this to the user.
2877 2877                           */
2878 2878                          if (config != NULL && spa->spa_config) {
2879 2879                                  VERIFY(nvlist_dup(spa->spa_config, config,
2880 2880                                      KM_SLEEP) == 0);
2881 2881                                  VERIFY(nvlist_add_nvlist(*config,
2882 2882                                      ZPOOL_CONFIG_LOAD_INFO,
2883 2883                                      spa->spa_load_info) == 0);
2884 2884                          }
2885 2885                          spa_unload(spa);
2886 2886                          spa_deactivate(spa);
2887 2887                          spa->spa_last_open_failed = error;
2888 2888                          if (locked)
2889 2889                                  mutex_exit(&spa_namespace_lock);
2890 2890                          *spapp = NULL;
2891 2891                          return (error);
2892 2892                  }
2893 2893          }
2894 2894  
2895 2895          spa_open_ref(spa, tag);
2896 2896  
2897 2897          if (config != NULL)
2898 2898                  *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2899 2899  
2900 2900          /*
2901 2901           * If we've recovered the pool, pass back any information we
2902 2902           * gathered while doing the load.
2903 2903           */
2904 2904          if (state == SPA_LOAD_RECOVER) {
2905 2905                  VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
2906 2906                      spa->spa_load_info) == 0);
2907 2907          }
2908 2908  
2909 2909          if (locked) {
2910 2910                  spa->spa_last_open_failed = 0;
2911 2911                  spa->spa_last_ubsync_txg = 0;
2912 2912                  spa->spa_load_txg = 0;
2913 2913                  mutex_exit(&spa_namespace_lock);
2914 2914          }
2915 2915  
2916 2916          *spapp = spa;
2917 2917  
2918 2918          return (0);
2919 2919  }
2920 2920  
2921 2921  int
2922 2922  spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
2923 2923      nvlist_t **config)
2924 2924  {
2925 2925          return (spa_open_common(name, spapp, tag, policy, config));
2926 2926  }
2927 2927  
2928 2928  int
2929 2929  spa_open(const char *name, spa_t **spapp, void *tag)
2930 2930  {
2931 2931          return (spa_open_common(name, spapp, tag, NULL, NULL));
2932 2932  }
2933 2933  
2934 2934  /*
2935 2935   * Lookup the given spa_t, incrementing the inject count in the process,
2936 2936   * preventing it from being exported or destroyed.
2937 2937   */
2938 2938  spa_t *
2939 2939  spa_inject_addref(char *name)
2940 2940  {
2941 2941          spa_t *spa;
2942 2942  
2943 2943          mutex_enter(&spa_namespace_lock);
2944 2944          if ((spa = spa_lookup(name)) == NULL) {
2945 2945                  mutex_exit(&spa_namespace_lock);
2946 2946                  return (NULL);
2947 2947          }
2948 2948          spa->spa_inject_ref++;
2949 2949          mutex_exit(&spa_namespace_lock);
2950 2950  
2951 2951          return (spa);
2952 2952  }
2953 2953  
2954 2954  void
2955 2955  spa_inject_delref(spa_t *spa)
2956 2956  {
2957 2957          mutex_enter(&spa_namespace_lock);
2958 2958          spa->spa_inject_ref--;
2959 2959          mutex_exit(&spa_namespace_lock);
2960 2960  }
2961 2961  
2962 2962  /*
2963 2963   * Add spares device information to the nvlist.
2964 2964   */
2965 2965  static void
2966 2966  spa_add_spares(spa_t *spa, nvlist_t *config)
2967 2967  {
2968 2968          nvlist_t **spares;
2969 2969          uint_t i, nspares;
2970 2970          nvlist_t *nvroot;
2971 2971          uint64_t guid;
2972 2972          vdev_stat_t *vs;
2973 2973          uint_t vsc;
2974 2974          uint64_t pool;
2975 2975  
2976 2976          ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2977 2977  
2978 2978          if (spa->spa_spares.sav_count == 0)
2979 2979                  return;
2980 2980  
2981 2981          VERIFY(nvlist_lookup_nvlist(config,
2982 2982              ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2983 2983          VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
2984 2984              ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2985 2985          if (nspares != 0) {
2986 2986                  VERIFY(nvlist_add_nvlist_array(nvroot,
2987 2987                      ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2988 2988                  VERIFY(nvlist_lookup_nvlist_array(nvroot,
2989 2989                      ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2990 2990  
2991 2991                  /*
2992 2992                   * Go through and find any spares which have since been
2993 2993                   * repurposed as an active spare.  If this is the case, update
2994 2994                   * their status appropriately.
2995 2995                   */
2996 2996                  for (i = 0; i < nspares; i++) {
2997 2997                          VERIFY(nvlist_lookup_uint64(spares[i],
2998 2998                              ZPOOL_CONFIG_GUID, &guid) == 0);
2999 2999                          if (spa_spare_exists(guid, &pool, NULL) &&
3000 3000                              pool != 0ULL) {
3001 3001                                  VERIFY(nvlist_lookup_uint64_array(
3002 3002                                      spares[i], ZPOOL_CONFIG_VDEV_STATS,
3003 3003                                      (uint64_t **)&vs, &vsc) == 0);
3004 3004                                  vs->vs_state = VDEV_STATE_CANT_OPEN;
3005 3005                                  vs->vs_aux = VDEV_AUX_SPARED;
3006 3006                          }
3007 3007                  }
3008 3008          }
3009 3009  }
3010 3010  
3011 3011  /*
3012 3012   * Add l2cache device information to the nvlist, including vdev stats.
3013 3013   */
3014 3014  static void
3015 3015  spa_add_l2cache(spa_t *spa, nvlist_t *config)
3016 3016  {
3017 3017          nvlist_t **l2cache;
3018 3018          uint_t i, j, nl2cache;
3019 3019          nvlist_t *nvroot;
3020 3020          uint64_t guid;
3021 3021          vdev_t *vd;
3022 3022          vdev_stat_t *vs;
3023 3023          uint_t vsc;
3024 3024  
3025 3025          ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3026 3026  
3027 3027          if (spa->spa_l2cache.sav_count == 0)
3028 3028                  return;
3029 3029  
3030 3030          VERIFY(nvlist_lookup_nvlist(config,
3031 3031              ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3032 3032          VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3033 3033              ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3034 3034          if (nl2cache != 0) {
3035 3035                  VERIFY(nvlist_add_nvlist_array(nvroot,
3036 3036                      ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3037 3037                  VERIFY(nvlist_lookup_nvlist_array(nvroot,
3038 3038                      ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3039 3039  
3040 3040                  /*
3041 3041                   * Update level 2 cache device stats.
3042 3042                   */
3043 3043  
3044 3044                  for (i = 0; i < nl2cache; i++) {
3045 3045                          VERIFY(nvlist_lookup_uint64(l2cache[i],
3046 3046                              ZPOOL_CONFIG_GUID, &guid) == 0);
3047 3047  
3048 3048                          vd = NULL;
3049 3049                          for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
3050 3050                                  if (guid ==
3051 3051                                      spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
3052 3052                                          vd = spa->spa_l2cache.sav_vdevs[j];
3053 3053                                          break;
3054 3054                                  }
3055 3055                          }
3056 3056                          ASSERT(vd != NULL);
3057 3057  
3058 3058                          VERIFY(nvlist_lookup_uint64_array(l2cache[i],
3059 3059                              ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
3060 3060                              == 0);
3061 3061                          vdev_get_stats(vd, vs);
3062 3062                  }
3063 3063          }
3064 3064  }
3065 3065  
3066 3066  static void
3067 3067  spa_add_feature_stats(spa_t *spa, nvlist_t *config)
3068 3068  {
3069 3069          nvlist_t *features;
3070 3070          zap_cursor_t zc;
3071 3071          zap_attribute_t za;
3072 3072  
3073 3073          ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3074 3074          VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3075 3075  
3076 3076          if (spa->spa_feat_for_read_obj != 0) {
3077 3077                  for (zap_cursor_init(&zc, spa->spa_meta_objset,
3078 3078                      spa->spa_feat_for_read_obj);
3079 3079                      zap_cursor_retrieve(&zc, &za) == 0;
3080 3080                      zap_cursor_advance(&zc)) {
3081 3081                          ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3082 3082                              za.za_num_integers == 1);
3083 3083                          VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3084 3084                              za.za_first_integer));
3085 3085                  }
3086 3086                  zap_cursor_fini(&zc);
3087 3087          }
3088 3088  
3089 3089          if (spa->spa_feat_for_write_obj != 0) {
3090 3090                  for (zap_cursor_init(&zc, spa->spa_meta_objset,
3091 3091                      spa->spa_feat_for_write_obj);
3092 3092                      zap_cursor_retrieve(&zc, &za) == 0;
3093 3093                      zap_cursor_advance(&zc)) {
3094 3094                          ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3095 3095                              za.za_num_integers == 1);
3096 3096                          VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3097 3097                              za.za_first_integer));
3098 3098                  }
3099 3099                  zap_cursor_fini(&zc);
3100 3100          }
3101 3101  
3102 3102          VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
3103 3103              features) == 0);
3104 3104          nvlist_free(features);
3105 3105  }
3106 3106  
3107 3107  int
3108 3108  spa_get_stats(const char *name, nvlist_t **config,
3109 3109      char *altroot, size_t buflen)
3110 3110  {
3111 3111          int error;
3112 3112          spa_t *spa;
3113 3113  
3114 3114          *config = NULL;
3115 3115          error = spa_open_common(name, &spa, FTAG, NULL, config);
3116 3116  
3117 3117          if (spa != NULL) {
3118 3118                  /*
3119 3119                   * This still leaves a window of inconsistency where the spares
3120 3120                   * or l2cache devices could change and the config would be
3121 3121                   * self-inconsistent.
3122 3122                   */
3123 3123                  spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3124 3124  
3125 3125                  if (*config != NULL) {
3126 3126                          uint64_t loadtimes[2];
3127 3127  
3128 3128                          loadtimes[0] = spa->spa_loaded_ts.tv_sec;
3129 3129                          loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
3130 3130                          VERIFY(nvlist_add_uint64_array(*config,
3131 3131                              ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
3132 3132  
3133 3133                          VERIFY(nvlist_add_uint64(*config,
3134 3134                              ZPOOL_CONFIG_ERRCOUNT,
3135 3135                              spa_get_errlog_size(spa)) == 0);
3136 3136  
3137 3137                          if (spa_suspended(spa))
3138 3138                                  VERIFY(nvlist_add_uint64(*config,
3139 3139                                      ZPOOL_CONFIG_SUSPENDED,
3140 3140                                      spa->spa_failmode) == 0);
3141 3141  
3142 3142                          spa_add_spares(spa, *config);
3143 3143                          spa_add_l2cache(spa, *config);
3144 3144                          spa_add_feature_stats(spa, *config);
3145 3145                  }
3146 3146          }
3147 3147  
3148 3148          /*
3149 3149           * We want to get the alternate root even for faulted pools, so we cheat
3150 3150           * and call spa_lookup() directly.
3151 3151           */
3152 3152          if (altroot) {
3153 3153                  if (spa == NULL) {
3154 3154                          mutex_enter(&spa_namespace_lock);
3155 3155                          spa = spa_lookup(name);
3156 3156                          if (spa)
3157 3157                                  spa_altroot(spa, altroot, buflen);
3158 3158                          else
3159 3159                                  altroot[0] = '\0';
3160 3160                          spa = NULL;
3161 3161                          mutex_exit(&spa_namespace_lock);
3162 3162                  } else {
3163 3163                          spa_altroot(spa, altroot, buflen);
3164 3164                  }
3165 3165          }
3166 3166  
3167 3167          if (spa != NULL) {
3168 3168                  spa_config_exit(spa, SCL_CONFIG, FTAG);
3169 3169                  spa_close(spa, FTAG);
3170 3170          }
3171 3171  
3172 3172          return (error);
3173 3173  }
3174 3174  
3175 3175  /*
3176 3176   * Validate that the auxiliary device array is well formed.  We must have an
3177 3177   * array of nvlists, each which describes a valid leaf vdev.  If this is an
3178 3178   * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
3179 3179   * specified, as long as they are well-formed.
3180 3180   */
3181 3181  static int
3182 3182  spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
3183 3183      spa_aux_vdev_t *sav, const char *config, uint64_t version,
3184 3184      vdev_labeltype_t label)
3185 3185  {
3186 3186          nvlist_t **dev;
3187 3187          uint_t i, ndev;
3188 3188          vdev_t *vd;
3189 3189          int error;
3190 3190  
3191 3191          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3192 3192  
3193 3193          /*
3194 3194           * It's acceptable to have no devs specified.
3195 3195           */
3196 3196          if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
3197 3197                  return (0);
3198 3198  
3199 3199          if (ndev == 0)
3200 3200                  return (SET_ERROR(EINVAL));
3201 3201  
3202 3202          /*
3203 3203           * Make sure the pool is formatted with a version that supports this
3204 3204           * device type.
3205 3205           */
3206 3206          if (spa_version(spa) < version)
3207 3207                  return (SET_ERROR(ENOTSUP));
3208 3208  
3209 3209          /*
3210 3210           * Set the pending device list so we correctly handle device in-use
3211 3211           * checking.
3212 3212           */
3213 3213          sav->sav_pending = dev;
3214 3214          sav->sav_npending = ndev;
3215 3215  
3216 3216          for (i = 0; i < ndev; i++) {
3217 3217                  if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
3218 3218                      mode)) != 0)
3219 3219                          goto out;
3220 3220  
3221 3221                  if (!vd->vdev_ops->vdev_op_leaf) {
3222 3222                          vdev_free(vd);
3223 3223                          error = SET_ERROR(EINVAL);
3224 3224                          goto out;
3225 3225                  }
3226 3226  
3227 3227                  /*
3228 3228                   * The L2ARC currently only supports disk devices in
3229 3229                   * kernel context.  For user-level testing, we allow it.
3230 3230                   */
3231 3231  #ifdef _KERNEL
3232 3232                  if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
3233 3233                      strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
3234 3234                          error = SET_ERROR(ENOTBLK);
3235 3235                          vdev_free(vd);
3236 3236                          goto out;
3237 3237                  }
3238 3238  #endif
3239 3239                  vd->vdev_top = vd;
3240 3240  
3241 3241                  if ((error = vdev_open(vd)) == 0 &&
3242 3242                      (error = vdev_label_init(vd, crtxg, label)) == 0) {
3243 3243                          VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
3244 3244                              vd->vdev_guid) == 0);
3245 3245                  }
3246 3246  
3247 3247                  vdev_free(vd);
3248 3248  
3249 3249                  if (error &&
3250 3250                      (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
3251 3251                          goto out;
3252 3252                  else
3253 3253                          error = 0;
3254 3254          }
3255 3255  
3256 3256  out:
3257 3257          sav->sav_pending = NULL;
3258 3258          sav->sav_npending = 0;
3259 3259          return (error);
3260 3260  }
3261 3261  
3262 3262  static int
3263 3263  spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
3264 3264  {
3265 3265          int error;
3266 3266  
3267 3267          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3268 3268  
3269 3269          if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3270 3270              &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
3271 3271              VDEV_LABEL_SPARE)) != 0) {
3272 3272                  return (error);
3273 3273          }
3274 3274  
3275 3275          return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3276 3276              &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
3277 3277              VDEV_LABEL_L2CACHE));
3278 3278  }
3279 3279  
3280 3280  static void
3281 3281  spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
3282 3282      const char *config)
3283 3283  {
3284 3284          int i;
3285 3285  
3286 3286          if (sav->sav_config != NULL) {
3287 3287                  nvlist_t **olddevs;
3288 3288                  uint_t oldndevs;
3289 3289                  nvlist_t **newdevs;
3290 3290  
3291 3291                  /*
3292 3292                   * Generate new dev list by concatentating with the
3293 3293                   * current dev list.
3294 3294                   */
3295 3295                  VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
3296 3296                      &olddevs, &oldndevs) == 0);
3297 3297  
3298 3298                  newdevs = kmem_alloc(sizeof (void *) *
3299 3299                      (ndevs + oldndevs), KM_SLEEP);
3300 3300                  for (i = 0; i < oldndevs; i++)
3301 3301                          VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
3302 3302                              KM_SLEEP) == 0);
3303 3303                  for (i = 0; i < ndevs; i++)
3304 3304                          VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
3305 3305                              KM_SLEEP) == 0);
3306 3306  
3307 3307                  VERIFY(nvlist_remove(sav->sav_config, config,
3308 3308                      DATA_TYPE_NVLIST_ARRAY) == 0);
3309 3309  
3310 3310                  VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3311 3311                      config, newdevs, ndevs + oldndevs) == 0);
3312 3312                  for (i = 0; i < oldndevs + ndevs; i++)
3313 3313                          nvlist_free(newdevs[i]);
3314 3314                  kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
3315 3315          } else {
3316 3316                  /*
3317 3317                   * Generate a new dev list.
3318 3318                   */
3319 3319                  VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
3320 3320                      KM_SLEEP) == 0);
3321 3321                  VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
3322 3322                      devs, ndevs) == 0);
3323 3323          }
3324 3324  }
3325 3325  
3326 3326  /*
3327 3327   * Stop and drop level 2 ARC devices
3328 3328   */
3329 3329  void
3330 3330  spa_l2cache_drop(spa_t *spa)
3331 3331  {
3332 3332          vdev_t *vd;
3333 3333          int i;
3334 3334          spa_aux_vdev_t *sav = &spa->spa_l2cache;
3335 3335  
3336 3336          for (i = 0; i < sav->sav_count; i++) {
3337 3337                  uint64_t pool;
3338 3338  
3339 3339                  vd = sav->sav_vdevs[i];
3340 3340                  ASSERT(vd != NULL);
3341 3341  
3342 3342                  if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
3343 3343                      pool != 0ULL && l2arc_vdev_present(vd))
3344 3344                          l2arc_remove_vdev(vd);
3345 3345          }
3346 3346  }
3347 3347  
3348 3348  /*
3349 3349   * Pool Creation
3350 3350   */
3351 3351  int
3352 3352  spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
3353 3353      nvlist_t *zplprops)
3354 3354  {
3355 3355          spa_t *spa;
3356 3356          char *altroot = NULL;
3357 3357          vdev_t *rvd;
3358 3358          dsl_pool_t *dp;
3359 3359          dmu_tx_t *tx;
3360 3360          int error = 0;
3361 3361          uint64_t txg = TXG_INITIAL;
3362 3362          nvlist_t **spares, **l2cache;
3363 3363          uint_t nspares, nl2cache;
3364 3364          uint64_t version, obj;
3365 3365          boolean_t has_features;
3366 3366  
3367 3367          /*
3368 3368           * If this pool already exists, return failure.
3369 3369           */
3370 3370          mutex_enter(&spa_namespace_lock);
3371 3371          if (spa_lookup(pool) != NULL) {
3372 3372                  mutex_exit(&spa_namespace_lock);
3373 3373                  return (SET_ERROR(EEXIST));
3374 3374          }
3375 3375  
3376 3376          /*
3377 3377           * Allocate a new spa_t structure.
3378 3378           */
3379 3379          (void) nvlist_lookup_string(props,
3380 3380              zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3381 3381          spa = spa_add(pool, NULL, altroot);
3382 3382          spa_activate(spa, spa_mode_global);
3383 3383  
3384 3384          if (props && (error = spa_prop_validate(spa, props))) {
3385 3385                  spa_deactivate(spa);
3386 3386                  spa_remove(spa);
3387 3387                  mutex_exit(&spa_namespace_lock);
3388 3388                  return (error);
3389 3389          }
3390 3390  
3391 3391          has_features = B_FALSE;
3392 3392          for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
3393 3393              elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
3394 3394                  if (zpool_prop_feature(nvpair_name(elem)))
3395 3395                          has_features = B_TRUE;
3396 3396          }
3397 3397  
3398 3398          if (has_features || nvlist_lookup_uint64(props,
3399 3399              zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
3400 3400                  version = SPA_VERSION;
3401 3401          }
3402 3402          ASSERT(SPA_VERSION_IS_SUPPORTED(version));
3403 3403  
3404 3404          spa->spa_first_txg = txg;
3405 3405          spa->spa_uberblock.ub_txg = txg - 1;
3406 3406          spa->spa_uberblock.ub_version = version;
3407 3407          spa->spa_ubsync = spa->spa_uberblock;
3408 3408  
3409 3409          /*
3410 3410           * Create "The Godfather" zio to hold all async IOs
3411 3411           */
3412 3412          spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
3413 3413              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
3414 3414  
3415 3415          /*
3416 3416           * Create the root vdev.
3417 3417           */
3418 3418          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3419 3419  
3420 3420          error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
3421 3421  
3422 3422          ASSERT(error != 0 || rvd != NULL);
3423 3423          ASSERT(error != 0 || spa->spa_root_vdev == rvd);
3424 3424  
3425 3425          if (error == 0 && !zfs_allocatable_devs(nvroot))
3426 3426                  error = SET_ERROR(EINVAL);
3427 3427  
3428 3428          if (error == 0 &&
3429 3429              (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
3430 3430              (error = spa_validate_aux(spa, nvroot, txg,
3431 3431              VDEV_ALLOC_ADD)) == 0) {
3432 3432                  for (int c = 0; c < rvd->vdev_children; c++) {
3433 3433                          vdev_metaslab_set_size(rvd->vdev_child[c]);
3434 3434                          vdev_expand(rvd->vdev_child[c], txg);
3435 3435                  }
3436 3436          }
3437 3437  
3438 3438          spa_config_exit(spa, SCL_ALL, FTAG);
3439 3439  
3440 3440          if (error != 0) {
3441 3441                  spa_unload(spa);
3442 3442                  spa_deactivate(spa);
3443 3443                  spa_remove(spa);
3444 3444                  mutex_exit(&spa_namespace_lock);
3445 3445                  return (error);
3446 3446          }
3447 3447  
3448 3448          /*
3449 3449           * Get the list of spares, if specified.
3450 3450           */
3451 3451          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3452 3452              &spares, &nspares) == 0) {
3453 3453                  VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
3454 3454                      KM_SLEEP) == 0);
3455 3455                  VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3456 3456                      ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3457 3457                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3458 3458                  spa_load_spares(spa);
3459 3459                  spa_config_exit(spa, SCL_ALL, FTAG);
3460 3460                  spa->spa_spares.sav_sync = B_TRUE;
3461 3461          }
3462 3462  
3463 3463          /*
3464 3464           * Get the list of level 2 cache devices, if specified.
3465 3465           */
3466 3466          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3467 3467              &l2cache, &nl2cache) == 0) {
3468 3468                  VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3469 3469                      NV_UNIQUE_NAME, KM_SLEEP) == 0);
3470 3470                  VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3471 3471                      ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3472 3472                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3473 3473                  spa_load_l2cache(spa);
3474 3474                  spa_config_exit(spa, SCL_ALL, FTAG);
3475 3475                  spa->spa_l2cache.sav_sync = B_TRUE;
3476 3476          }
3477 3477  
3478 3478          spa->spa_is_initializing = B_TRUE;
3479 3479          spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3480 3480          spa->spa_meta_objset = dp->dp_meta_objset;
3481 3481          spa->spa_is_initializing = B_FALSE;
3482 3482  
3483 3483          /*
3484 3484           * Create DDTs (dedup tables).
3485 3485           */
3486 3486          ddt_create(spa);
3487 3487  
3488 3488          spa_update_dspace(spa);
3489 3489  
3490 3490          tx = dmu_tx_create_assigned(dp, txg);
3491 3491  
3492 3492          /*
3493 3493           * Create the pool config object.
3494 3494           */
3495 3495          spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3496 3496              DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3497 3497              DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3498 3498  
3499 3499          if (zap_add(spa->spa_meta_objset,
3500 3500              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3501 3501              sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3502 3502                  cmn_err(CE_PANIC, "failed to add pool config");
3503 3503          }
3504 3504  
3505 3505          if (spa_version(spa) >= SPA_VERSION_FEATURES)
3506 3506                  spa_feature_create_zap_objects(spa, tx);
3507 3507  
3508 3508          if (zap_add(spa->spa_meta_objset,
3509 3509              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3510 3510              sizeof (uint64_t), 1, &version, tx) != 0) {
3511 3511                  cmn_err(CE_PANIC, "failed to add pool version");
3512 3512          }
3513 3513  
3514 3514          /* Newly created pools with the right version are always deflated. */
3515 3515          if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3516 3516                  spa->spa_deflate = TRUE;
3517 3517                  if (zap_add(spa->spa_meta_objset,
3518 3518                      DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3519 3519                      sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3520 3520                          cmn_err(CE_PANIC, "failed to add deflate");
3521 3521                  }
3522 3522          }
3523 3523  
3524 3524          /*
3525 3525           * Create the deferred-free bpobj.  Turn off compression
3526 3526           * because sync-to-convergence takes longer if the blocksize
3527 3527           * keeps changing.
3528 3528           */
3529 3529          obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
3530 3530          dmu_object_set_compress(spa->spa_meta_objset, obj,
3531 3531              ZIO_COMPRESS_OFF, tx);
3532 3532          if (zap_add(spa->spa_meta_objset,
3533 3533              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
3534 3534              sizeof (uint64_t), 1, &obj, tx) != 0) {
3535 3535                  cmn_err(CE_PANIC, "failed to add bpobj");
3536 3536          }
3537 3537          VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
3538 3538              spa->spa_meta_objset, obj));
3539 3539  
3540 3540          /*
3541 3541           * Create the pool's history object.
3542 3542           */
3543 3543          if (version >= SPA_VERSION_ZPOOL_HISTORY)
3544 3544                  spa_history_create_obj(spa, tx);
3545 3545  
3546 3546          /*
3547 3547           * Set pool properties.
3548 3548           */
3549 3549          spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
3550 3550          spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3551 3551          spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
3552 3552          spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
3553 3553  
3554 3554          if (props != NULL) {
3555 3555                  spa_configfile_set(spa, props, B_FALSE);
3556 3556                  spa_sync_props(props, tx);
3557 3557          }
3558 3558  
3559 3559          dmu_tx_commit(tx);
3560 3560  
3561 3561          spa->spa_sync_on = B_TRUE;
3562 3562          txg_sync_start(spa->spa_dsl_pool);
3563 3563  
3564 3564          /*
3565 3565           * We explicitly wait for the first transaction to complete so that our
3566 3566           * bean counters are appropriately updated.
3567 3567           */
3568 3568          txg_wait_synced(spa->spa_dsl_pool, txg);
3569 3569  
3570 3570          spa_config_sync(spa, B_FALSE, B_TRUE);
3571 3571  
3572 3572          spa_history_log_version(spa, "create");
3573 3573  
3574 3574          spa->spa_minref = refcount_count(&spa->spa_refcount);
3575 3575  
3576 3576          mutex_exit(&spa_namespace_lock);
3577 3577  
3578 3578          return (0);
3579 3579  }
3580 3580  
3581 3581  #ifdef _KERNEL
3582 3582  /*
3583 3583   * Get the root pool information from the root disk, then import the root pool
3584 3584   * during the system boot up time.
3585 3585   */
3586 3586  extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
3587 3587  
           /*
            * Build a pool configuration from the label of the boot device.
            *
            * The label is read with vdev_disk_read_rootlabel(devpath, devid, ...).
            * Its vdev tree (a single top-level vdev) is wrapped in a freshly built
            * root vdev nvlist, which then replaces the label's vdev tree entry.
            *
            * devpath, devid - passed through unchanged to vdev_disk_read_rootlabel().
            * guid           - out: the ZPOOL_CONFIG_GUID value found in the label.
            *
            * Returns the modified label nvlist, or NULL if the label could not be
            * read.  The in-file caller, spa_import_rootpool(), releases the result
            * with nvlist_free().
            */
3588 3588  static nvlist_t *
3589 3589  spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
3590 3590  {
3591 3591          nvlist_t *config;
3592 3592          nvlist_t *nvtop, *nvroot;
3593 3593          uint64_t pgid;
3594 3594  
3595 3595          if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
3596 3596                  return (NULL);
3597 3597  
3598 3598          /*
3599 3599           * Add this top-level vdev to the child array.
                    * The label must carry a vdev tree, a pool guid and a vdev guid;
                    * a missing entry trips VERIFY.
3600 3600           */
3601 3601          VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3602 3602              &nvtop) == 0);
3603 3603          VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3604 3604              &pgid) == 0);
3605 3605          VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
3606 3606  
3607 3607          /*
3608 3608           * Put this pool's top-level vdevs into a root vdev.
                    * The root vdev gets id 0, the pool guid as its guid, and nvtop
                    * as its only child.
3609 3609           */
3610 3610          VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3611 3611          VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3612 3612              VDEV_TYPE_ROOT) == 0);
3613 3613          VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3614 3614          VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3615 3615          VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3616 3616              &nvtop, 1) == 0);
3617 3617  
3618 3618          /*
3619 3619           * Replace the existing vdev_tree with the new root vdev in
3620 3620           * this pool's configuration (remove the old, add the new).
                    * The local nvroot is freed right after being added, so config
                    * presumably keeps its own copy (nvlist_add_nvlist semantics).
3621 3621           */
3622 3622          VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3623 3623          nvlist_free(nvroot);
3624 3624          return (config);
3625 3625  }
3626 3626  
3627 3627  /*
3628 3628   * Walk the vdev tree and see if we can find a device with "better"
3629 3629   * configuration. A configuration is "better" if the label on that
3630 3630   * device has a more recent txg.
            *
            * vd  - root of the (sub)tree to examine; children are visited first.
            * avd - in/out: best candidate so far; replaced by a leaf whose label
            *       txg is strictly greater than *txg.
            * txg - in/out: txg of the current best candidate; raised together
            *       with *avd.
            *
            * Both *avd and *txg must be seeded by the caller.  A leaf whose label
            * cannot be read is skipped silently.
3631 3631   */
3632 3632  static void
3633 3633  spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3634 3634  {
3635 3635          for (int c = 0; c < vd->vdev_children; c++)
3636 3636                  spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3637 3637  
3638 3638          if (vd->vdev_ops->vdev_op_leaf) {
3639 3639                  nvlist_t *label;
3640 3640                  uint64_t label_txg;
3641 3641  
                           /* An unreadable label disqualifies this leaf; nothing to free. */
3642 3642                  if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
3643 3643                      &label) != 0)
3644 3644                          return;
3645 3645  
3646 3646                  VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
3647 3647                      &label_txg) == 0);
3648 3648  
3649 3649                  /*
3650 3650                   * Do we have a better boot device?
3651 3651                   */
3652 3652                  if (label_txg > *txg) {
3653 3653                          *txg = label_txg;
3654 3654                          *avd = vd;
3655 3655                  }
3656 3656                  nvlist_free(label);
3657 3657          }
3658 3658  }
3659 3659  
3660 3660  /*
3661 3661   * Import a root pool.
3662 3662   *
3663 3663   * For x86, devpath_list will consist of devid and/or physpath name of
3664 3664   * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3665 3665   * The GRUB "findroot" command will return the vdev we should boot.
3666 3666   *
3667 3667   * For Sparc, devpath_list consists of the physpath name of the booting device
3668 3668   * no matter the rootpool is a single device pool or a mirrored pool.
3669 3669   * e.g.
3670 3670   *      "/pci@1f,0/ide@d/disk@0,0:a"
            *
            * The label on the boot device is turned into a pool config, the pool
            * is (re)added to the namespace as a verbatim-import root pool, and a
            * temporary vdev tree is built only to validate the choice of boot
            * device.  That tree and the local config are freed before returning,
            * on success as well as on failure.
            *
            * Returns:
            *      0       the boot device is acceptable
            *      EIO     the pool label could not be read from devpath
            *      ENOENT  no vdev in the config matches the boot device guid
            *      EINVAL  another device has a newer label, or the boot device
            *              sits under a spare vdev without being the spare
            *      other   whatever spa_config_parse() returned
3671 3671   */
3672 3672  int
3673 3673  spa_import_rootpool(char *devpath, char *devid)
3674 3674  {
3675 3675          spa_t *spa;
3676 3676          vdev_t *rvd, *bvd, *avd = NULL;
3677 3677          nvlist_t *config, *nvtop;
3678 3678          uint64_t guid, txg;
3679 3679          char *pname;
3680 3680          int error;
3681 3681  
3682 3682          /*
3683 3683           * Read the label from the boot device and generate a configuration.
3684 3684           */
3685 3685          config = spa_generate_rootconf(devpath, devid, &guid);
3686 3686  #if defined(_OBP) && defined(_KERNEL)
3687 3687          if (config == NULL) {
3688 3688                  if (strstr(devpath, "/iscsi/ssd") != NULL) {
3689 3689                          /* iscsi boot */
3690 3690                          get_iscsi_bootpath_phy(devpath);
3691 3691                          config = spa_generate_rootconf(devpath, devid, &guid);
3692 3692                  }
3693 3693          }
3694 3694  #endif
3695 3695          if (config == NULL) {
3696 3696                  cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
3697 3697                      devpath);
3698 3698                  return (SET_ERROR(EIO));
3699 3699          }
3700 3700  
                   /* pname points into config; it is only valid until config is freed. */
3701 3701          VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3702 3702              &pname) == 0);
3703 3703          VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3704 3704  
3705 3705          mutex_enter(&spa_namespace_lock);
3706 3706          if ((spa = spa_lookup(pname)) != NULL) {
3707 3707                  /*
3708 3708                   * Remove the existing root pool from the namespace so that we
3709 3709                   * can replace it with the correct config we just read in.
3710 3710                   */
3711 3711                  spa_remove(spa);
3712 3712          }
3713 3713  
3714 3714          spa = spa_add(pname, config, NULL);
3715 3715          spa->spa_is_root = B_TRUE;
3716 3716          spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3717 3717  
3718 3718          /*
3719 3719           * Build up a vdev tree based on the boot device's label config.
3720 3720           */
3721 3721          VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3722 3722              &nvtop) == 0);
3723 3723          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3724 3724          error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3725 3725              VDEV_ALLOC_ROOTPOOL);
3726 3726          spa_config_exit(spa, SCL_ALL, FTAG);
3727 3727          if (error) {
3728 3728                  mutex_exit(&spa_namespace_lock);
                           /*
                            * Log before freeing config: pname is a pointer into
                            * config's storage and must not be used afterwards.
                            */
3729 3729                  cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
3730 3730                      pname);
3731 3731                  nvlist_free(config);
3732 3732                  return (error);
3733 3733          }
3734 3734  
3735 3735          /*
3736 3736           * Get the boot vdev.
3737 3737           */
3738 3738          if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3739 3739                  cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
3740 3740                      (u_longlong_t)guid);
3741 3741                  error = SET_ERROR(ENOENT);
3742 3742                  goto out;
3743 3743          }
3744 3744  
3745 3745          /*
3746 3746           * Determine if there is a better boot device.
                    * The search is seeded with the boot vdev and its label txg, so
                    * avd only changes if some leaf carries a strictly newer label.
3747 3747           */
3748 3748          avd = bvd;
3749 3749          spa_alt_rootvdev(rvd, &avd, &txg);
3750 3750          if (avd != bvd) {
3751 3751                  cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
3752 3752                      "try booting from '%s'", avd->vdev_path);
3753 3753                  error = SET_ERROR(EINVAL);
3754 3754                  goto out;
3755 3755          }
3756 3756  
3757 3757          /*
3758 3758           * If the boot device is part of a spare vdev then ensure that
3759 3759           * we're booting off the active spare.
3760 3760           */
3761 3761          if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3762 3762              !bvd->vdev_isspare) {
3763 3763                  cmn_err(CE_NOTE, "The boot device is currently spared. Please "
3764 3764                      "try booting from '%s'",
3765 3765                      bvd->vdev_parent->
3766 3766                      vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
3767 3767                  error = SET_ERROR(EINVAL);
3768 3768                  goto out;
3769 3769          }
3770 3770  
3771 3771          error = 0;
3772 3772  out:
                   /* Reached on success too: the parsed tree was only for validation. */
3773 3773          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3774 3774          vdev_free(rvd);
3775 3775          spa_config_exit(spa, SCL_ALL, FTAG);
3776 3776          mutex_exit(&spa_namespace_lock);
3777 3777  
3778 3778          nvlist_free(config);
3779 3779          return (error);
3780 3780  }
3781 3781  
3782 3782  #endif
3783 3783  
3784 3784  /*
3785 3785   * Import a non-root pool into the system.
            *
            * pool   - name to import under; EEXIST is returned if it is in use.
            * config - pool configuration.  It is handed to spa_add() and, on the
            *          non-verbatim path, gains a ZPOOL_CONFIG_LOAD_INFO entry
            *          describing what was learned during the load.
            * props  - optional pool properties (may be NULL).  The altroot and
            *          readonly properties are consulted here; readonly opens
            *          the pool with mode FREAD.
            * flags  - stored in spa_import_flags.  With ZFS_IMPORT_VERBATIM the
            *          pool is only added to the namespace and config cache; it
            *          is not activated or loaded.
            *
            * Returns 0 on success.  On a load, aux-device validation or
            * property-set failure the pool is unloaded, deactivated and removed
            * from the namespace, and that error is returned.
3786 3786   */
3787 3787  int
3788 3788  spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
3789 3789  {
3790 3790          spa_t *spa;
3791 3791          char *altroot = NULL;
3792 3792          spa_load_state_t state = SPA_LOAD_IMPORT;
3793 3793          zpool_rewind_policy_t policy;
3794 3794          uint64_t mode = spa_mode_global;
3795 3795          uint64_t readonly = B_FALSE;
3796 3796          int error;
3797 3797          nvlist_t *nvroot;
3798 3798          nvlist_t **spares, **l2cache;
3799 3799          uint_t nspares, nl2cache;
3800 3800  
3801 3801          /*
3802 3802           * If a pool with this name exists, return failure.
3803 3803           */
3804 3804          mutex_enter(&spa_namespace_lock);
3805 3805          if (spa_lookup(pool) != NULL) {
3806 3806                  mutex_exit(&spa_namespace_lock);
3807 3807                  return (SET_ERROR(EEXIST));
3808 3808          }
3809 3809  
3810 3810          /*
3811 3811           * Create and initialize the spa structure.
                    * The property lookups are best effort: if a lookup fails the
                    * defaults set above (no altroot, read-write) are kept.
3812 3812           */
3813 3813          (void) nvlist_lookup_string(props,
3814 3814              zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3815 3815          (void) nvlist_lookup_uint64(props,
3816 3816              zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
3817 3817          if (readonly)
3818 3818                  mode = FREAD;
3819 3819          spa = spa_add(pool, config, altroot);
3820 3820          spa->spa_import_flags = flags;
3821 3821  
3822 3822          /*
3823 3823           * Verbatim import - Take a pool and insert it into the namespace
3824 3824           * as if it had been loaded at boot.
3825 3825           */
3826 3826          if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
3827 3827                  if (props != NULL)
3828 3828                          spa_configfile_set(spa, props, B_FALSE);
3829 3829  
3830 3830                  spa_config_sync(spa, B_FALSE, B_TRUE);
3831 3831  
3832 3832                  mutex_exit(&spa_namespace_lock);
3833 3833                  spa_history_log_version(spa, "import");
3834 3834  
3835 3835                  return (0);
3836 3836          }
3837 3837  
3838 3838          spa_activate(spa, mode);
3839 3839  
3840 3840          /*
3841 3841           * Don't start async tasks until we know everything is healthy.
3842 3842           */
3843 3843          spa_async_suspend(spa);
3844 3844  
                   /* A rewind request in the config switches the load to recovery mode. */
3845 3845          zpool_get_rewind_policy(config, &policy);
3846 3846          if (policy.zrp_request & ZPOOL_DO_REWIND)
3847 3847                  state = SPA_LOAD_RECOVER;
3848 3848  
3849 3849          /*
3850 3850           * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
3851 3851           * because the user-supplied config is actually the one to trust when
3852 3852           * doing an import.
3853 3853           */
3854 3854          if (state != SPA_LOAD_RECOVER)
3855 3855                  spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
3856 3856  
3857 3857          error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
3858 3858              policy.zrp_request);
3859 3859  
3860 3860          /*
3861 3861           * Propagate anything learned while loading the pool and pass it
3862 3862           * back to caller (i.e. rewind info, missing devices, etc).
                    * This is done whether or not the load succeeded.
3863 3863           */
3864 3864          VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
3865 3865              spa->spa_load_info) == 0);
3866 3866  
3867 3867          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3868 3868          /*
3869 3869           * Toss any existing sparelist, as it doesn't have any validity
3870 3870           * anymore, and conflicts with spa_has_spare().
3871 3871           */
3872 3872          if (spa->spa_spares.sav_config) {
3873 3873                  nvlist_free(spa->spa_spares.sav_config);
3874 3874                  spa->spa_spares.sav_config = NULL;
3875 3875                  spa_load_spares(spa);
3876 3876          }
                   /* Same treatment for any existing level 2 cache device list. */
3877 3877          if (spa->spa_l2cache.sav_config) {
3878 3878                  nvlist_free(spa->spa_l2cache.sav_config);
3879 3879                  spa->spa_l2cache.sav_config = NULL;
3880 3880                  spa_load_l2cache(spa);
3881 3881          }
3882 3882  
                   /*
                    * Validate the aux devices named in the supplied config, but
                    * only if the load itself succeeded; the first error is kept.
                    */
3883 3883          VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3884 3884              &nvroot) == 0);
3885 3885          if (error == 0)
3886 3886                  error = spa_validate_aux(spa, nvroot, -1ULL,
3887 3887                      VDEV_ALLOC_SPARE);
3888 3888          if (error == 0)
3889 3889                  error = spa_validate_aux(spa, nvroot, -1ULL,
3890 3890                      VDEV_ALLOC_L2CACHE);
3891 3891          spa_config_exit(spa, SCL_ALL, FTAG);
3892 3892  
3893 3893          if (props != NULL)
3894 3894                  spa_configfile_set(spa, props, B_FALSE);
3895 3895  
                   /*
                    * Properties are applied only when everything so far succeeded
                    * and the pool is writeable.  Any failure tears the pool down.
                    */
3896 3896          if (error != 0 || (props && spa_writeable(spa) &&
3897 3897              (error = spa_prop_set(spa, props)))) {
3898 3898                  spa_unload(spa);
3899 3899                  spa_deactivate(spa);
3900 3900                  spa_remove(spa);
3901 3901                  mutex_exit(&spa_namespace_lock);
3902 3902                  return (error);
3903 3903          }
3904 3904  
3905 3905          spa_async_resume(spa);
3906 3906  
3907 3907          /*
3908 3908           * Override any spares and level 2 cache devices as specified by
3909 3909           * the user, as these may have correct device names/devids, etc.
3910 3910           */
3911 3911          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3912 3912              &spares, &nspares) == 0) {
3913 3913                  if (spa->spa_spares.sav_config)
3914 3914                          VERIFY(nvlist_remove(spa->spa_spares.sav_config,
3915 3915                              ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
3916 3916                  else
3917 3917                          VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
3918 3918                              NV_UNIQUE_NAME, KM_SLEEP) == 0);
3919 3919                  VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3920 3920                      ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3921 3921                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3922 3922                  spa_load_spares(spa);
3923 3923                  spa_config_exit(spa, SCL_ALL, FTAG);
3924 3924                  spa->spa_spares.sav_sync = B_TRUE;
3925 3925          }
3926 3926          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3927 3927              &l2cache, &nl2cache) == 0) {
3928 3928                  if (spa->spa_l2cache.sav_config)
3929 3929                          VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
3930 3930                              ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
3931 3931                  else
3932 3932                          VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3933 3933                              NV_UNIQUE_NAME, KM_SLEEP) == 0);
3934 3934                  VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3935 3935                      ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3936 3936                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3937 3937                  spa_load_l2cache(spa);
3938 3938                  spa_config_exit(spa, SCL_ALL, FTAG);
3939 3939                  spa->spa_l2cache.sav_sync = B_TRUE;
3940 3940          }
3941 3941  
3942 3942          /*
3943 3943           * Check for any removed devices.
3944 3944           */
3945 3945          if (spa->spa_autoreplace) {
3946 3946                  spa_aux_check_removed(&spa->spa_spares);
3947 3947                  spa_aux_check_removed(&spa->spa_l2cache);
3948 3948          }
3949 3949  
3950 3950          if (spa_writeable(spa)) {
3951 3951                  /*
3952 3952                   * Update the config cache to include the newly-imported pool.
3953 3953                   */
3954 3954                  spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3955 3955          }
3956 3956  
3957 3957          /*
3958 3958           * It's possible that the pool was expanded while it was exported.
3959 3959           * We kick off an async task to handle this for us.
3960 3960           */
3961 3961          spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
3962 3962  
3963 3963          mutex_exit(&spa_namespace_lock);
3964 3964          spa_history_log_version(spa, "import");
3965 3965  
3966 3966          return (0);
3967 3967  }
3968 3968  
           /*
            * Probe a candidate pool configuration without importing it.
            *
            * A temporary pool named TRYIMPORT_NAME is created from tryconfig,
            * activated with mode FREAD and loaded with SPA_LOAD_TRYIMPORT.  If
            * the load got far enough to produce a root vdev, a config is
            * generated from the loaded state and annotated with the pool name
            * and state taken from tryconfig, the uberblock timestamp, the load
            * info, the bootfs dataset name (when one is set and can be
            * resolved), and the spare and level 2 cache device lists.  The
            * temporary pool is always unloaded, deactivated and removed before
            * returning.
            *
            * Returns the generated config, or NULL if tryconfig lacks a pool name
            * or pool state, or if no root vdev was produced by the load.
            * NOTE(review): the caller presumably owns the returned nvlist and
            * must free it; confirm against callers.
            */
3969 3969  nvlist_t *
3970 3970  spa_tryimport(nvlist_t *tryconfig)
3971 3971  {
3972 3972          nvlist_t *config = NULL;
3973 3973          char *poolname;
3974 3974          spa_t *spa;
3975 3975          uint64_t state;
3976 3976          int error;
3977 3977  
3978 3978          if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
3979 3979                  return (NULL);
3980 3980  
3981 3981          if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
3982 3982                  return (NULL);
3983 3983  
3984 3984          /*
3985 3985           * Create and initialize the spa structure.
3986 3986           */
3987 3987          mutex_enter(&spa_namespace_lock);
3988 3988          spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
3989 3989          spa_activate(spa, FREAD);
3990 3990  
3991 3991          /*
3992 3992           * Pass off the heavy lifting to spa_load().
3993 3993           * Pass TRUE for mosconfig because the user-supplied config
3994 3994           * is actually the one to trust when doing an import.
3995 3995           */
3996 3996          error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
3997 3997  
3998 3998          /*
3999 3999           * If 'tryconfig' was at least parsable, return the current config.
                    * The load error is consulted below only for the bootfs lookup.
4000 4000           */
4001 4001          if (spa->spa_root_vdev != NULL) {
4002 4002                  config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4003 4003                  VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
4004 4004                      poolname) == 0);
4005 4005                  VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4006 4006                      state) == 0);
4007 4007                  VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
4008 4008                      spa->spa_uberblock.ub_timestamp) == 0);
4009 4009                  VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4010 4010                      spa->spa_load_info) == 0);
4011 4011  
4012 4012                  /*
4013 4013                   * If the bootfs property exists on this pool then we
4014 4014                   * copy it out so that external consumers can tell which
4015 4015                   * pools are bootable.
4016 4016                   */
4017 4017                  if ((!error || error == EEXIST) && spa->spa_bootfs) {
4018 4018                          char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4019 4019  
4020 4020                          /*
4021 4021                           * We have to play games with the name since the
4022 4022                           * pool was opened as TRYIMPORT_NAME.
                                    * Everything up to the first '/' in the
                                    * resolved name is replaced by the real pool
                                    * name; a name without '/' is copied as is.
4023 4023                           */
4024 4024                          if (dsl_dsobj_to_dsname(spa_name(spa),
4025 4025                              spa->spa_bootfs, tmpname) == 0) {
4026 4026                                  char *cp;
4027 4027                                  char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4028 4028  
4029 4029                                  cp = strchr(tmpname, '/');
4030 4030                                  if (cp == NULL) {
4031 4031                                          (void) strlcpy(dsname, tmpname,
4032 4032                                              MAXPATHLEN);
4033 4033                                  } else {
4034 4034                                          (void) snprintf(dsname, MAXPATHLEN,
4035 4035                                              "%s/%s", poolname, ++cp);
4036 4036                                  }
4037 4037                                  VERIFY(nvlist_add_string(config,
4038 4038                                      ZPOOL_CONFIG_BOOTFS, dsname) == 0);
4039 4039                                  kmem_free(dsname, MAXPATHLEN);
4040 4040                          }
4041 4041                          kmem_free(tmpname, MAXPATHLEN);
4042 4042                  }
4043 4043  
4044 4044                  /*
4045 4045                   * Add the list of hot spares and level 2 cache devices.
4046 4046                   */
4047 4047                  spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4048 4048                  spa_add_spares(spa, config);
4049 4049                  spa_add_l2cache(spa, config);
4050 4050                  spa_config_exit(spa, SCL_CONFIG, FTAG);
4051 4051          }
4052 4052  
                   /* The temporary pool is torn down regardless of the load result. */
4053 4053          spa_unload(spa);
4054 4054          spa_deactivate(spa);
4055 4055          spa_remove(spa);
4056 4056          mutex_exit(&spa_namespace_lock);
4057 4057  
4058 4058          return (config);
4059 4059  }
4060 4060  
4061 4061  /*
4062 4062   * Pool export/destroy
4063 4063   *
4064 4064   * The act of destroying or exporting a pool is very simple.  We make sure there
4065 4065   * is no more pending I/O and any references to the pool are gone.  Then, we
4066 4066   * update the pool state and sync all the labels to disk, removing the
4067 4067   * configuration from the cache afterwards. If the 'hardforce' flag is set, then
4068 4068   * we don't sync the labels or remove the configuration cache.
            *
            * 'new_state' selects the operation: POOL_STATE_DESTROYED,
            * POOL_STATE_EXPORTED, or POOL_STATE_UNINITIALIZED (a reset, which
            * leaves the spa_t in the namespace).  If 'oldconfig' is non-NULL it is
            * cleared on entry and, on success, receives a copy of spa_config when
            * one exists.  'force' permits exporting a pool that has an active
            * shared spare.
            *
            * Returns 0 on success, EROFS if spa_mode_global lacks FWRITE, ENOENT if
            * the pool is not found, EBUSY if references to the pool remain, or
            * EXDEV for an unforced export of a pool with an active shared spare.
4069 4069   */
4070 4070  static int
4071 4071  spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
4072 4072      boolean_t force, boolean_t hardforce)
4073 4073  {
4074 4074          spa_t *spa;
4075 4075  
4076 4076          if (oldconfig)
4077 4077                  *oldconfig = NULL;
4078 4078  
4079 4079          if (!(spa_mode_global & FWRITE))
4080 4080                  return (SET_ERROR(EROFS));
4081 4081  
4082 4082          mutex_enter(&spa_namespace_lock);
4083 4083          if ((spa = spa_lookup(pool)) == NULL) {
4084 4084                  mutex_exit(&spa_namespace_lock);
4085 4085                  return (SET_ERROR(ENOENT));
4086 4086          }
4087 4087  
4088 4088          /*
4089 4089           * Put a hold on the pool, drop the namespace lock, stop async tasks,
4090 4090           * reacquire the namespace lock, and see if we can export.
4091 4091           */
4092 4092          spa_open_ref(spa, FTAG);
4093 4093          mutex_exit(&spa_namespace_lock);
4094 4094          spa_async_suspend(spa);
4095 4095          mutex_enter(&spa_namespace_lock);
4096 4096          spa_close(spa, FTAG);
4097 4097  
4098 4098          /*
4099 4099           * The pool will be in core if it's openable,
4100 4100           * in which case we can modify its state.
4101 4101           */
4102 4102          if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
4103 4103                  /*
4104 4104                   * Objsets may be open only because they're dirty, so we
4105 4105                   * have to force it to sync before checking spa_refcnt.
4106 4106                   */
4107 4107                  txg_wait_synced(spa->spa_dsl_pool, 0);
4108 4108  
4109 4109                  /*
4110 4110                   * A pool cannot be exported or destroyed if there are active
4111 4111                   * references.  If we are resetting a pool, allow references by
4112 4112                   * fault injection handlers.
4113 4113                   */
4114 4114                  if (!spa_refcount_zero(spa) ||
4115 4115                      (spa->spa_inject_ref != 0 &&
4116 4116                      new_state != POOL_STATE_UNINITIALIZED)) {
4117 4117                          spa_async_resume(spa);
4118 4118                          mutex_exit(&spa_namespace_lock);
4119 4119                          return (SET_ERROR(EBUSY));
4120 4120                  }
4121 4121  
4122 4122                  /*
4123 4123                   * A pool cannot be exported if it has an active shared spare.
4124 4124                   * This is to prevent other pools stealing the active spare
4125 4125                   * from an exported pool.  If the user insists ('force'), such
4126 4126                   * a pool can still be forcibly exported.
4127 4127                   */
4128 4128                  if (!force && new_state == POOL_STATE_EXPORTED &&
4129 4129                      spa_has_active_shared_spare(spa)) {
4130 4130                          spa_async_resume(spa);
4131 4131                          mutex_exit(&spa_namespace_lock);
4132 4132                          return (SET_ERROR(EXDEV));
4133 4133                  }
4134 4134  
4135 4135                  /*
4136 4136                   * We want this to be reflected on every label,
4137 4137                   * so mark them all dirty.  spa_unload() will do the
4138 4138                   * final sync that pushes these changes out.
4139 4139                   */
4140 4140                  if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
4141 4141                          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4142 4142                          spa->spa_state = new_state;
4143 4143                          spa->spa_final_txg = spa_last_synced_txg(spa) +
4144 4144                              TXG_DEFER_SIZE + 1;
4145 4145                          vdev_config_dirty(spa->spa_root_vdev);
4146 4146                          spa_config_exit(spa, SCL_ALL, FTAG);
4147 4147                  }
4148 4148          }
4149 4149  
                   /*
                    * NOTE(review): ESC_ZFS_POOL_DESTROY is posted for every new_state,
                    * including export and reset -- confirm that this is intended.
                    */
4150 4150          spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
4151 4151  
4152 4152          if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4153 4153                  spa_unload(spa);
4154 4154                  spa_deactivate(spa);
4155 4155          }
4156 4156  
4157 4157          if (oldconfig && spa->spa_config)
4158 4158                  VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
4159 4159  
                   /*
                    * A reset (POOL_STATE_UNINITIALIZED) keeps the spa_t in the
                    * namespace; only export and destroy remove it.
                    */
4160 4160          if (new_state != POOL_STATE_UNINITIALIZED) {
4161 4161                  if (!hardforce)
4162 4162                          spa_config_sync(spa, B_TRUE, B_TRUE);
4163 4163                  spa_remove(spa);
4164 4164          }
4165 4165          mutex_exit(&spa_namespace_lock);
4166 4166  
4167 4167          return (0);
4168 4168  }
4169 4169  
4170 4170  /*
4171 4171   * Destroy a storage pool.
            *
            * Calls spa_export_common() with POOL_STATE_DESTROYED, no 'oldconfig',
            * and both force flags B_FALSE; returns its result unchanged.
4172 4172   */
4173 4173  int
4174 4174  spa_destroy(char *pool)
4175 4175  {
4176 4176          return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
4177 4177              B_FALSE, B_FALSE));
4178 4178  }
4179 4179  
4180 4180  /*
4181 4181   * Export a storage pool.
            *
            * Calls spa_export_common() with POOL_STATE_EXPORTED, passing
            * 'oldconfig', 'force' and 'hardforce' straight through; returns its
            * result unchanged.
4182 4182   */
4183 4183  int
4184 4184  spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
4185 4185      boolean_t hardforce)
4186 4186  {
4187 4187          return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
4188 4188              force, hardforce));
4189 4189  }
4190 4190  
4191 4191  /*
4192 4192   * Similar to spa_export(), this unloads the spa_t without actually removing it
4193 4193   * from the namespace in any way.
            *
            * Calls spa_export_common() with POOL_STATE_UNINITIALIZED, no
            * 'oldconfig', and both force flags B_FALSE; returns its result
            * unchanged.
4194 4194   */
4195 4195  int
4196 4196  spa_reset(char *pool)
4197 4197  {
4198 4198          return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
4199 4199              B_FALSE, B_FALSE));
4200 4200  }
4201 4201  
4202 4202  /*
4203 4203   * ==========================================================================
4204 4204   * Device manipulation
4205 4205   * ==========================================================================
4206 4206   */
4207 4207  
4208 4208  /*
4209 4209   * Add a device to a storage pool.
            *
            * 'nvroot' describes the vdev tree to add; it may carry top-level
            * children, a ZPOOL_CONFIG_SPARES array, a ZPOOL_CONFIG_L2CACHE array,
            * or any combination of these.
            *
            * Returns 0 on success.  On failure, returns the result of
            * spa_vdev_exit() for the failing step: the error from
            * spa_config_parse(), vdev_create() or spa_validate_aux(), or EINVAL
            * when 'nvroot' contains nothing to add.
4210 4210   */
4211 4211  int
4212 4212  spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
4213 4213  {
4214 4214          uint64_t txg, id;
4215 4215          int error;
4216 4216          vdev_t *rvd = spa->spa_root_vdev;
4217 4217          vdev_t *vd, *tvd;
4218 4218          nvlist_t **spares, **l2cache;
4219 4219          uint_t nspares, nl2cache;
4220 4220  
4221 4221          ASSERT(spa_writeable(spa));
4222 4222  
4223 4223          txg = spa_vdev_enter(spa);
4224 4224  
4225 4225          if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
4226 4226              VDEV_ALLOC_ADD)) != 0)
4227 4227                  return (spa_vdev_exit(spa, NULL, txg, error));
4228 4228  
4229 4229          spa->spa_pending_vdev = vd;     /* spa_vdev_exit() will clear this */
4230 4230  
                   /* A missing spares or l2cache array simply means "none". */
4231 4231          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
4232 4232              &nspares) != 0)
4233 4233                  nspares = 0;
4234 4234  
4235 4235          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
4236 4236              &nl2cache) != 0)
4237 4237                  nl2cache = 0;
4238 4238  
                   /* Nothing at all to add: reject the request. */
4239 4239          if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
4240 4240                  return (spa_vdev_exit(spa, vd, txg, EINVAL));
4241 4241  
4242 4242          if (vd->vdev_children != 0 &&
4243 4243              (error = vdev_create(vd, txg, B_FALSE)) != 0)
4244 4244                  return (spa_vdev_exit(spa, vd, txg, error));
4245 4245  
4246 4246          /*
4247 4247           * We must validate the spares and l2cache devices after checking the
4248 4248           * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
4249 4249           */
4250 4250          if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
4251 4251                  return (spa_vdev_exit(spa, vd, txg, error));
4252 4252  
4253 4253          /*
4254 4254           * Transfer each new top-level vdev from vd to rvd.
4255 4255           */
4256 4256          for (int c = 0; c < vd->vdev_children; c++) {
4257 4257  
4258 4258                  /*
4259 4259                   * Set the vdev id to the first hole, if one exists.
                            * If the search finds no hole, id ends up equal to
                            * rvd->vdev_children, i.e. one past the current last child.
4260 4260                   */
4261 4261                  for (id = 0; id < rvd->vdev_children; id++) {
4262 4262                          if (rvd->vdev_child[id]->vdev_ishole) {
4263 4263                                  vdev_free(rvd->vdev_child[id]);
4264 4264                                  break;
4265 4265                          }
4266 4266                  }
4267 4267                  tvd = vd->vdev_child[c];
4268 4268                  vdev_remove_child(vd, tvd);
4269 4269                  tvd->vdev_id = id;
4270 4270                  vdev_add_child(rvd, tvd);
4271 4271                  vdev_config_dirty(tvd);
4272 4272          }
4273 4273  
4274 4274          if (nspares != 0) {
4275 4275                  spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
4276 4276                      ZPOOL_CONFIG_SPARES);
4277 4277                  spa_load_spares(spa);
4278 4278                  spa->spa_spares.sav_sync = B_TRUE;
4279 4279          }
4280 4280  
4281 4281          if (nl2cache != 0) {
4282 4282                  spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
4283 4283                      ZPOOL_CONFIG_L2CACHE);
4284 4284                  spa_load_l2cache(spa);
4285 4285                  spa->spa_l2cache.sav_sync = B_TRUE;
4286 4286          }
4287 4287  
4288 4288          /*
4289 4289           * We have to be careful when adding new vdevs to an existing pool.
4290 4290           * If other threads start allocating from these vdevs before we
4291 4291           * sync the config cache, and we lose power, then upon reboot we may
4292 4292           * fail to open the pool because there are DVAs that the config cache
4293 4293           * can't translate.  Therefore, we first add the vdevs without
4294 4294           * initializing metaslabs; sync the config cache (via spa_vdev_exit());
4295 4295           * and then let spa_config_update() initialize the new metaslabs.
4296 4296           *
4297 4297           * spa_load() checks for added-but-not-initialized vdevs, so that
4298 4298           * if we lose power at any point in this sequence, the remaining
4299 4299           * steps will be completed the next time we load the pool.
4300 4300           */
4301 4301          (void) spa_vdev_exit(spa, vd, txg, 0);
4302 4302  
4303 4303          mutex_enter(&spa_namespace_lock);
4304 4304          spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4305 4305          mutex_exit(&spa_namespace_lock);
4306 4306  
4307 4307          return (0);
4308 4308  }
4309 4309  
4310 4310  /*
4311 4311   * Attach a device to a mirror.  The arguments are the path to any device
4312 4312   * in the mirror, and the nvroot for the new device.  If the path specifies
4313 4313   * a device that is not mirrored, we automatically insert the mirror vdev.
4314 4314   *
4315 4315   * If 'replacing' is specified, the new device is intended to replace the
4316 4316   * existing device; in this case the two devices are made into their own
4317 4317   * mirror using the 'replacing' vdev, which is functionally identical to
4318 4318   * the mirror vdev (it actually reuses all the same ops) but has a few
4319 4319   * extra rules: you can't attach to it after it's been created, and upon
4320 4320   * completion of resilvering, the first disk (the one being replaced)
4321 4321   * is automatically detached.
            *
            * The existing device is identified by 'guid' and must be a leaf vdev.
            * 'nvroot' must describe exactly one leaf child.
            *
            * Returns 0 on success.  Failures return the result of spa_vdev_exit()
            * with ENODEV (guid not found), ENOTSUP (unsupported vdev combination),
            * EINVAL (bad nvroot), EOVERFLOW (new device too small), EDOM (new
            * device needs a larger ashift), or the error from vdev_create().
4322 4322   */
4323 4323  int
4324 4324  spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
4325 4325  {
4326 4326          uint64_t txg, dtl_max_txg;
4327 4327          vdev_t *rvd = spa->spa_root_vdev;
4328 4328          vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
4329 4329          vdev_ops_t *pvops;
4330 4330          char *oldvdpath, *newvdpath;
4331 4331          int newvd_isspare;
4332 4332          int error;
4333 4333  
4334 4334          ASSERT(spa_writeable(spa));
4335 4335  
4336 4336          txg = spa_vdev_enter(spa);
4337 4337  
4338 4338          oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
4339 4339  
4340 4340          if (oldvd == NULL)
4341 4341                  return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4342 4342  
4343 4343          if (!oldvd->vdev_ops->vdev_op_leaf)
4344 4344                  return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4345 4345  
4346 4346          pvd = oldvd->vdev_parent;
4347 4347  
                   /*
                    * NOTE(review): the error code from spa_config_parse() is discarded
                    * and EINVAL is returned instead -- confirm that this is intended.
                    */
4348 4348          if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
4349 4349              VDEV_ALLOC_ATTACH)) != 0)
4350 4350                  return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4351 4351  
4352 4352          if (newrootvd->vdev_children != 1)
4353 4353                  return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4354 4354  
4355 4355          newvd = newrootvd->vdev_child[0];
4356 4356  
4357 4357          if (!newvd->vdev_ops->vdev_op_leaf)
4358 4358                  return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4359 4359  
4360 4360          if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
4361 4361                  return (spa_vdev_exit(spa, newrootvd, txg, error));
4362 4362  
4363 4363          /*
4364 4364           * Spares can't replace logs
4365 4365           */
4366 4366          if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
4367 4367                  return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4368 4368  
4369 4369          if (!replacing) {
4370 4370                  /*
4371 4371                   * For attach, the only allowable parent is a mirror or the root
4372 4372                   * vdev.
4373 4373                   */
4374 4374                  if (pvd->vdev_ops != &vdev_mirror_ops &&
4375 4375                      pvd->vdev_ops != &vdev_root_ops)
4376 4376                          return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4377 4377  
4378 4378                  pvops = &vdev_mirror_ops;
4379 4379          } else {
4380 4380                  /*
4381 4381                   * Active hot spares can only be replaced by inactive hot
4382 4382                   * spares.
4383 4383                   */
4384 4384                  if (pvd->vdev_ops == &vdev_spare_ops &&
4385 4385                      oldvd->vdev_isspare &&
4386 4386                      !spa_has_spare(spa, newvd->vdev_guid))
4387 4387                          return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4388 4388  
4389 4389                  /*
4390 4390                   * If the source is a hot spare, and the parent isn't already a
4391 4391                   * spare, then we want to create a new hot spare.  Otherwise, we
4392 4392                   * want to create a replacing vdev.  The user is not allowed to
4393 4393                   * attach to a spared vdev child unless the 'isspare' state is
4394 4394                   * the same (spare replaces spare, non-spare replaces
4395 4395                   * non-spare).
4396 4396                   */
4397 4397                  if (pvd->vdev_ops == &vdev_replacing_ops &&
4398 4398                      spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
4399 4399                          return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4400 4400                  } else if (pvd->vdev_ops == &vdev_spare_ops &&
4401 4401                      newvd->vdev_isspare != oldvd->vdev_isspare) {
4402 4402                          return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4403 4403                  }
4404 4404  
4405 4405                  if (newvd->vdev_isspare)
4406 4406                          pvops = &vdev_spare_ops;
4407 4407                  else
4408 4408                          pvops = &vdev_replacing_ops;
4409 4409          }
4410 4410  
4411 4411          /*
4412 4412           * Make sure the new device is big enough.
4413 4413           */
4414 4414          if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
4415 4415                  return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
4416 4416  
4417 4417          /*
4418 4418           * The new device cannot have a higher alignment requirement
4419 4419           * than the top-level vdev.
4420 4420           */
4421 4421          if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
4422 4422                  return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
4423 4423  
4424 4424          /*
4425 4425           * If this is an in-place replacement, update oldvd's path and devid
4426 4426           * to make it distinguishable from newvd, and unopenable from now on.
                    * The allocation adds 5 bytes: four for "/old" plus the terminating
                    * NUL.
                    *
                    * NOTE(review): this assumes both vdev_path pointers are non-NULL --
                    * confirm against the callers.
4427 4427           */
4428 4428          if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
4429 4429                  spa_strfree(oldvd->vdev_path);
4430 4430                  oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
4431 4431                      KM_SLEEP);
4432 4432                  (void) sprintf(oldvd->vdev_path, "%s/%s",
4433 4433                      newvd->vdev_path, "old");
4434 4434                  if (oldvd->vdev_devid != NULL) {
4435 4435                          spa_strfree(oldvd->vdev_devid);
4436 4436                          oldvd->vdev_devid = NULL;
4437 4437                  }
4438 4438          }
4439 4439  
4440 4440          /* mark the device being resilvered */
4441 4441          newvd->vdev_resilvering = B_TRUE;
4442 4442  
4443 4443          /*
4444 4444           * If the parent is not a mirror, or if we're replacing, insert the new
4445 4445           * mirror/replacing/spare vdev above oldvd.
4446 4446           */
4447 4447          if (pvd->vdev_ops != pvops)
4448 4448                  pvd = vdev_add_parent(oldvd, pvops);
4449 4449  
4450 4450          ASSERT(pvd->vdev_top->vdev_parent == rvd);
4451 4451          ASSERT(pvd->vdev_ops == pvops);
4452 4452          ASSERT(oldvd->vdev_parent == pvd);
4453 4453  
4454 4454          /*
4455 4455           * Extract the new device from its root and add it to pvd.
4456 4456           */
4457 4457          vdev_remove_child(newrootvd, newvd);
4458 4458          newvd->vdev_id = pvd->vdev_children;
4459 4459          newvd->vdev_crtxg = oldvd->vdev_crtxg;
4460 4460          vdev_add_child(pvd, newvd);
4461 4461  
4462 4462          tvd = newvd->vdev_top;
4463 4463          ASSERT(pvd->vdev_top == tvd);
4464 4464          ASSERT(tvd->vdev_parent == rvd);
4465 4465  
4466 4466          vdev_config_dirty(tvd);
4467 4467  
4468 4468          /*
4469 4469           * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
4470 4470           * for any dmu_sync-ed blocks.  It will propagate upward when
4471 4471           * spa_vdev_exit() calls vdev_dtl_reassess().
4472 4472           */
4473 4473          dtl_max_txg = txg + TXG_CONCURRENT_STATES;
4474 4474  
4475 4475          vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
4476 4476              dtl_max_txg - TXG_INITIAL);
4477 4477  
4478 4478          if (newvd->vdev_isspare) {
4479 4479                  spa_spare_activate(newvd);
4480 4480                  spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
4481 4481          }
4482 4482  
                   /*
                    * Take private copies of everything the history record needs; it
                    * is written after spa_vdev_exit() below.
                    */
4483 4483          oldvdpath = spa_strdup(oldvd->vdev_path);
4484 4484          newvdpath = spa_strdup(newvd->vdev_path);
4485 4485          newvd_isspare = newvd->vdev_isspare;
4486 4486  
4487 4487          /*
4488 4488           * Mark newvd's DTL dirty in this txg.
4489 4489           */
4490 4490          vdev_dirty(tvd, VDD_DTL, newvd, txg);
4491 4491  
4492 4492          /*
4493 4493           * Restart the resilver
4494 4494           */
4495 4495          dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
4496 4496  
4497 4497          /*
4498 4498           * Commit the config
4499 4499           */
4500 4500          (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
4501 4501  
4502 4502          spa_history_log_internal(spa, "vdev attach", NULL,
4503 4503              "%s vdev=%s %s vdev=%s",
4504 4504              replacing && newvd_isspare ? "spare in" :
4505 4505              replacing ? "replace" : "attach", newvdpath,
4506 4506              replacing ? "for" : "to", oldvdpath);
4507 4507  
4508 4508          spa_strfree(oldvdpath);
  
    | ↓ open down ↓ | 4508 lines elided | ↑ open up ↑ | 
4509 4509          spa_strfree(newvdpath);
4510 4510  
4511 4511          if (spa->spa_bootfs)
4512 4512                  spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
4513 4513  
4514 4514          return (0);
4515 4515  }
4516 4516  
4517 4517  /*
4518 4518   * Detach a device from a mirror or replacing vdev.
     4519 + *
4519 4520   * If 'replace_done' is specified, only detach if the parent
4520 4521   * is a replacing or spare vdev.
            *
            * The device is identified by 'guid' and must be a leaf.  If 'pguid' is
            * non-zero it must match the GUID of the device's current parent.
            *
            * Returns the result of spa_vdev_exit(): 0 on success, ENODEV if the
            * guid is not found, ENOTSUP if the device or its parent does not
            * support detach, or EBUSY if the parent has changed or the device
            * holds the only valid copy of some data.
4521 4522   */
4522 4523  int
4523 4524  spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
4524 4525  {
4525 4526          uint64_t txg;
4526 4527          int error;
4527 4528          vdev_t *rvd = spa->spa_root_vdev;
4528 4529          vdev_t *vd, *pvd, *cvd, *tvd;
4529 4530          boolean_t unspare = B_FALSE;
4530 4531          uint64_t unspare_guid = 0;
4531 4532          char *vdpath;
4532 4533  
4533 4534          ASSERT(spa_writeable(spa));
4534 4535  
4535 4536          txg = spa_vdev_enter(spa);
4536 4537  
4537 4538          vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4538 4539  
4539 4540          if (vd == NULL)
4540 4541                  return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4541 4542  
4542 4543          if (!vd->vdev_ops->vdev_op_leaf)
4543 4544                  return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4544 4545  
4545 4546          pvd = vd->vdev_parent;
4546 4547  
4547 4548          /*
4548 4549           * If the parent/child relationship is not as expected, don't do it.
4549 4550           * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
4550 4551           * vdev that's replacing B with C.  The user's intent in replacing
4551 4552           * is to go from M(A,B) to M(A,C).  If the user decides to cancel
4552 4553           * the replace by detaching C, the expected behavior is to end up
4553 4554           * M(A,B).  But suppose that right after deciding to detach C,
4554 4555           * the replacement of B completes.  We would have M(A,C), and then
4555 4556           * ask to detach C, which would leave us with just A -- not what
4556 4557           * the user wanted.  To prevent this, we make sure that the
4557 4558           * parent/child relationship hasn't changed -- in this example,
4558 4559           * that C's parent is still the replacing vdev R.
4559 4560           */
4560 4561          if (pvd->vdev_guid != pguid && pguid != 0)
4561 4562                  return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4562 4563  
4563 4564          /*
4564 4565           * Only 'replacing' or 'spare' vdevs can be replaced.
4565 4566           */
4566 4567          if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
4567 4568              pvd->vdev_ops != &vdev_spare_ops)
4568 4569                  return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4569 4570  
4570 4571          ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
4571 4572              spa_version(spa) >= SPA_VERSION_SPARES);
4572 4573  
4573 4574          /*
4574 4575           * Only mirror, replacing, and spare vdevs support detach.
4575 4576           */
4576 4577          if (pvd->vdev_ops != &vdev_replacing_ops &&
4577 4578              pvd->vdev_ops != &vdev_mirror_ops &&
4578 4579              pvd->vdev_ops != &vdev_spare_ops)
4579 4580                  return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4580 4581  
4581 4582          /*
4582 4583           * If this device has the only valid copy of some data,
4583 4584           * we cannot safely detach it.
4584 4585           */
4585 4586          if (vdev_dtl_required(vd))
4586 4587                  return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4587 4588  
4588 4589          ASSERT(pvd->vdev_children >= 2);
4589 4590  
4590 4591          /*
4591 4592           * If we are detaching the second disk from a replacing vdev, then
4592 4593           * check to see if we changed the original vdev's path to have "/old"
4593 4594           * at the end in spa_vdev_attach().  If so, undo that change now.
4594 4595           */
4595 4596          if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
4596 4597              vd->vdev_path != NULL) {
4597 4598                  size_t len = strlen(vd->vdev_path);
4598 4599  
4599 4600                  for (int c = 0; c < pvd->vdev_children; c++) {
4600 4601                          cvd = pvd->vdev_child[c];
4601 4602  
4602 4603                          if (cvd == vd || cvd->vdev_path == NULL)
4603 4604                                  continue;
4604 4605  
4605 4606                          if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
4606 4607                              strcmp(cvd->vdev_path + len, "/old") == 0) {
4607 4608                                  spa_strfree(cvd->vdev_path);
4608 4609                                  cvd->vdev_path = spa_strdup(vd->vdev_path);
4609 4610                                  break;
4610 4611                          }
4611 4612                  }
4612 4613          }
4613 4614  
4614 4615          /*
4615 4616           * If we are detaching the original disk from a spare, then it implies
4616 4617           * that the spare should become a real disk, and be removed from the
4617 4618           * active spare list for the pool.
4618 4619           */
4619 4620          if (pvd->vdev_ops == &vdev_spare_ops &&
4620 4621              vd->vdev_id == 0 &&
4621 4622              pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
4622 4623                  unspare = B_TRUE;
4623 4624  
4624 4625          /*
4625 4626           * Erase the disk labels so the disk can be used for other things.
4626 4627           * This must be done after all other error cases are handled,
4627 4628           * but before we disembowel vd (so we can still do I/O to it).
4628 4629           * But if we can't do it, don't treat the error as fatal --
4629 4630           * it may be that the unwritability of the disk is the reason
4630 4631           * it's being detached!
4631 4632           */
4632 4633          error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4633 4634  
4634 4635          /*
4635 4636           * Remove vd from its parent and compact the parent's children.
4636 4637           */
4637 4638          vdev_remove_child(pvd, vd);
4638 4639          vdev_compact_children(pvd);
4639 4640  
4640 4641          /*
4641 4642           * Remember one of the remaining children so we can get tvd below.
4642 4643           */
4643 4644          cvd = pvd->vdev_child[pvd->vdev_children - 1];
4644 4645  
4645 4646          /*
4646 4647           * If we need to remove the remaining child from the list of hot spares,
4647 4648           * do it now, marking the vdev as no longer a spare in the process.
4648 4649           * We must do this before vdev_remove_parent(), because that can
4649 4650           * change the GUID if it creates a new toplevel GUID.  For a similar
4650 4651           * reason, we must remove the spare now, in the same txg as the detach;
4651 4652           * otherwise someone could attach a new sibling, change the GUID, and
4652 4653           * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
4653 4654           */
4654 4655          if (unspare) {
4655 4656                  ASSERT(cvd->vdev_isspare);
4656 4657                  spa_spare_remove(cvd);
4657 4658                  unspare_guid = cvd->vdev_guid;
4658 4659                  (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
4659 4660                  cvd->vdev_unspare = B_TRUE;
4660 4661          }
4661 4662  
4662 4663          /*
4663 4664           * If the parent mirror/replacing vdev only has one child,
4664 4665           * the parent is no longer needed.  Remove it from the tree.
4665 4666           */
4666 4667          if (pvd->vdev_children == 1) {
4667 4668                  if (pvd->vdev_ops == &vdev_spare_ops)
4668 4669                          cvd->vdev_unspare = B_FALSE;
4669 4670                  vdev_remove_parent(cvd);
4670 4671                  cvd->vdev_resilvering = B_FALSE;
4671 4672          }
4672 4673  
4673 4674  
4674 4675          /*
4675 4676           * We don't set tvd until now because the parent we just removed
4676 4677           * may have been the previous top-level vdev.
4677 4678           */
4678 4679          tvd = cvd->vdev_top;
4679 4680          ASSERT(tvd->vdev_parent == rvd);
4680 4681  
4681 4682          /*
4682 4683           * Reevaluate the parent vdev state.
4683 4684           */
4684 4685          vdev_propagate_state(cvd);
4685 4686  
4686 4687          /*
4687 4688           * If the 'autoexpand' property is set on the pool then automatically
4688 4689           * try to expand the size of the pool. For example if the device we
4689 4690           * just detached was smaller than the others, it may be possible to
4690 4691           * add metaslabs (i.e. grow the pool). We need to reopen the vdev
4691 4692           * first so that we can obtain the updated sizes of the leaf vdevs.
4692 4693           */
4693 4694          if (spa->spa_autoexpand) {
4694 4695                  vdev_reopen(tvd);
4695 4696                  vdev_expand(tvd, txg);
4696 4697          }
4697 4698  
4698 4699          vdev_config_dirty(tvd);
4699 4700  
4700 4701          /*
4701 4702           * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
4702 4703           * vd->vdev_detached is set and free vd's DTL object in syncing context.
4703 4704           * But first make sure we're not on any *other* txg's DTL list, to
4704 4705           * prevent vd from being accessed after it's freed.
                    *
                    * vd->vdev_path may be NULL (it is tested for NULL earlier in this
                    * function), and spa_strdup() is not assumed to accept NULL, so
                    * substitute a placeholder for the history record in that case.
4705 4706           */
4706 4707          vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none");
4707 4708          for (int t = 0; t < TXG_SIZE; t++)
4708 4709                  (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
4709 4710          vd->vdev_detached = B_TRUE;
4710 4711          vdev_dirty(tvd, VDD_DTL, vd, txg);
4711 4712  
4712 4713          spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
4713 4714  
4714 4715          /* hang on to the spa before we release the lock */
4715 4716          spa_open_ref(spa, FTAG);
4716 4717  
4717 4718          error = spa_vdev_exit(spa, vd, txg, 0);
4718 4719  
4719 4720          spa_history_log_internal(spa, "detach", NULL,
4720 4721              "vdev=%s", vdpath);
4721 4722          spa_strfree(vdpath);
4722 4723  
4723 4724          /*
4724 4725           * If this was the removal of the original device in a hot spare vdev,
4725 4726           * then we want to go through and remove the device from the hot spare
4726 4727           * list of every other pool.
4727 4728           */
4728 4729          if (unspare) {
4729 4730                  spa_t *altspa = NULL;
4730 4731  
4731 4732                  mutex_enter(&spa_namespace_lock);
4732 4733                  while ((altspa = spa_next(altspa)) != NULL) {
4733 4734                          if (altspa->spa_state != POOL_STATE_ACTIVE ||
4734 4735                              altspa == spa)
4735 4736                                  continue;
4736 4737  
4737 4738                          spa_open_ref(altspa, FTAG);
4738 4739                          mutex_exit(&spa_namespace_lock);
4739 4740                          (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
4740 4741                          mutex_enter(&spa_namespace_lock);
4741 4742                          spa_close(altspa, FTAG);
4742 4743                  }
4743 4744                  mutex_exit(&spa_namespace_lock);
4744 4745  
4745 4746                  /* search the rest of the vdevs for spares to remove */
4746 4747                  spa_vdev_resilver_done(spa);
4747 4748          }
4748 4749  
4749 4750          /* all done with the spa; OK to release */
4750 4751          mutex_enter(&spa_namespace_lock);
4751 4752          spa_close(spa, FTAG);
4752 4753          mutex_exit(&spa_namespace_lock);
4753 4754  
4754 4755          return (error);
4755 4756  }
4756 4757  
4757 4758  /*
4758 4759   * Split a set of devices from their mirrors, and create a new pool from them.
            *
            * 'config' supplies a vdev tree naming one child per top-level vdev of
            * 'spa' (a trailing run of log/hole vdevs excepted).  'newname' is the
            * name of the pool to create, 'props' optionally carries properties for
            * it (altroot is looked up here), and 'exp' requests that the new pool be
            * exported once the split is done.
            *
            * Returns 0 or an errno.  Validation failures exit before any disk is
            * touched; failures after the new pool has been added go through 'out',
            * which discards the new pool and brings the offlined disks back.
4759 4760   */
4760 4761  int
4761 4762  spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
4762 4763      nvlist_t *props, boolean_t exp)
4763 4764  {
4764 4765          int error = 0;
4765 4766          uint64_t txg, *glist;
4766 4767          spa_t *newspa;
4767 4768          uint_t c, children, lastlog;
4768 4769          nvlist_t **child, *nvl, *tmp;
4769 4770          dmu_tx_t *tx;
4770 4771          char *altroot = NULL;
4771 4772          vdev_t *rvd, **vml = NULL;                      /* vdev modify list */
4772 4773          boolean_t activate_slog;
4773 4774  
4774 4775          ASSERT(spa_writeable(spa));
4775 4776  
4776 4777          txg = spa_vdev_enter(spa);
4777 4778  
4778 4779          /* clear the log and flush everything up to now */
4779 4780          activate_slog = spa_passivate_log(spa);
4780 4781          (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4781 4782          error = spa_offline_log(spa);
4782 4783          txg = spa_vdev_config_enter(spa);
4783 4784  
                   /* the log is reactivated before 'error' is examined */
4784 4785          if (activate_slog)
4785 4786                  spa_activate_log(spa);
4786 4787  
4787 4788          if (error != 0)
4788 4789                  return (spa_vdev_exit(spa, NULL, txg, error));
4789 4790  
4790 4791          /* check new spa name before going any further */
4791 4792          if (spa_lookup(newname) != NULL)
4792 4793                  return (spa_vdev_exit(spa, NULL, txg, EEXIST));
4793 4794  
4794 4795          /*
4795 4796           * scan through all the children to ensure they're all mirrors
4796 4797           */
4797 4798          if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
4798 4799              nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
4799 4800              &children) != 0)
4800 4801                  return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4801 4802  
4802 4803          /* first, check to ensure we've got the right child count */
4803 4804          rvd = spa->spa_root_vdev;
                   /*
                    * lastlog ends up as the index at which the trailing run of
                    * log/hole vdevs begins; it is reset to 0 whenever a normal
                    * vdev follows, so 0 doubles as "no trailing run".
                    */
4804 4805          lastlog = 0;
4805 4806          for (c = 0; c < rvd->vdev_children; c++) {
4806 4807                  vdev_t *vd = rvd->vdev_child[c];
4807 4808  
4808 4809                  /* don't count the holes & logs as children */
4809 4810                  if (vd->vdev_islog || vd->vdev_ishole) {
4810 4811                          if (lastlog == 0)
4811 4812                                  lastlog = c;
4812 4813                          continue;
4813 4814                  }
4814 4815  
4815 4816                  lastlog = 0;
4816 4817          }
4817 4818          if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
4818 4819                  return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4819 4820  
4820 4821          /* next, ensure no spare or cache devices are part of the split */
4821 4822          if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
4822 4823              nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
4823 4824                  return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4824 4825  
                   /*
                    * Both arrays are zero-filled, so a slot skipped below as a hole
                    * keeps a NULL vdev pointer and a zero guid.
                    */
4825 4826          vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
4826 4827          glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
4827 4828  
4828 4829          /* then, loop over each vdev and validate it */
4829 4830          for (c = 0; c < children; c++) {
4830 4831                  uint64_t is_hole = 0;
4831 4832  
4832 4833                  (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
4833 4834                      &is_hole);
4834 4835  
                           /*
                            * A hole in the request is only acceptable where the
                            * pool itself has a hole or a log at the same index.
                            */
4835 4836                  if (is_hole != 0) {
4836 4837                          if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
4837 4838                              spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
4838 4839                                  continue;
4839 4840                          } else {
4840 4841                                  error = SET_ERROR(EINVAL);
4841 4842                                  break;
4842 4843                          }
4843 4844                  }
4844 4845  
4845 4846                  /* which disk is going to be split? */
4846 4847                  if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
4847 4848                      &glist[c]) != 0) {
4848 4849                          error = SET_ERROR(EINVAL);
4849 4850                          break;
4850 4851                  }
4851 4852  
4852 4853                  /* look it up in the spa */
4853 4854                  vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
4854 4855                  if (vml[c] == NULL) {
4855 4856                          error = SET_ERROR(ENODEV);
4856 4857                          break;
4857 4858                  }
4858 4859  
4859 4860                  /* make sure there's nothing stopping the split */
4860 4861                  if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
4861 4862                      vml[c]->vdev_islog ||
4862 4863                      vml[c]->vdev_ishole ||
4863 4864                      vml[c]->vdev_isspare ||
4864 4865                      vml[c]->vdev_isl2cache ||
4865 4866                      !vdev_writeable(vml[c]) ||
4866 4867                      vml[c]->vdev_children != 0 ||
4867 4868                      vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
4868 4869                      c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
4869 4870                          error = SET_ERROR(EINVAL);
4870 4871                          break;
4871 4872                  }
4872 4873  
4873 4874                  if (vdev_dtl_required(vml[c])) {
4874 4875                          error = SET_ERROR(EBUSY);
4875 4876                          break;
4876 4877                  }
4877 4878  
4878 4879                  /* we need certain info from the top level */
4879 4880                  VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
4880 4881                      vml[c]->vdev_top->vdev_ms_array) == 0);
4881 4882                  VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
4882 4883                      vml[c]->vdev_top->vdev_ms_shift) == 0);
4883 4884                  VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
4884 4885                      vml[c]->vdev_top->vdev_asize) == 0);
4885 4886                  VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
4886 4887                      vml[c]->vdev_top->vdev_ashift) == 0);
4887 4888          }
4888 4889  
                   /* nothing has been modified in the pool yet; just clean up */
4889 4890          if (error != 0) {
4890 4891                  kmem_free(vml, children * sizeof (vdev_t *));
4891 4892                  kmem_free(glist, children * sizeof (uint64_t));
4892 4893                  return (spa_vdev_exit(spa, NULL, txg, error));
4893 4894          }
4894 4895  
4895 4896          /* stop writers from using the disks */
4896 4897          for (c = 0; c < children; c++) {
4897 4898                  if (vml[c] != NULL)
4898 4899                          vml[c]->vdev_offline = B_TRUE;
4899 4900          }
4900 4901          vdev_reopen(spa->spa_root_vdev);
4901 4902  
4902 4903          /*
4903 4904           * Temporarily record the splitting vdevs in the spa config.  This
4904 4905           * will disappear once the config is regenerated.
4905 4906           */
4906 4907          VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
4907 4908          VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
4908 4909              glist, children) == 0);
4909 4910          kmem_free(glist, children * sizeof (uint64_t));
4910 4911  
4911 4912          mutex_enter(&spa->spa_props_lock);
4912 4913          VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
4913 4914              nvl) == 0);
4914 4915          mutex_exit(&spa->spa_props_lock);
4915 4916          spa->spa_config_splitting = nvl;
4916 4917          vdev_config_dirty(spa->spa_root_vdev);
4917 4918  
4918 4919          /* configure and create the new pool */
4919 4920          VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
4920 4921          VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4921 4922              exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
4922 4923          VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
4923 4924              spa_version(spa)) == 0);
4924 4925          VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
4925 4926              spa->spa_config_txg) == 0);
4926 4927          VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
4927 4928              spa_generate_guid(NULL)) == 0);
                   /* result ignored: altroot simply stays NULL if it is not set */
4928 4929          (void) nvlist_lookup_string(props,
4929 4930              zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4930 4931  
4931 4932          /* add the new pool to the namespace */
4932 4933          newspa = spa_add(newname, config, altroot);
4933 4934          newspa->spa_config_txg = spa->spa_config_txg;
4934 4935          spa_set_log_state(newspa, SPA_LOG_CLEAR);
4935 4936  
4936 4937          /* release the spa config lock, retaining the namespace lock */
4937 4938          spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4938 4939  
4939 4940          if (zio_injection_enabled)
4940 4941                  zio_handle_panic_injection(spa, FTAG, 1);
4941 4942  
4942 4943          spa_activate(newspa, spa_mode_global);
4943 4944          spa_async_suspend(newspa);
4944 4945  
4945 4946          /* create the new pool from the disks of the original pool */
4946 4947          error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
4947 4948          if (error)
4948 4949                  goto out;
4949 4950  
4950 4951          /* if that worked, generate a real config for the new pool */
4951 4952          if (newspa->spa_root_vdev != NULL) {
4952 4953                  VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
4953 4954                      NV_UNIQUE_NAME, KM_SLEEP) == 0);
4954 4955                  VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
4955 4956                      ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
4956 4957                  spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
4957 4958                      B_TRUE));
4958 4959          }
4959 4960  
4960 4961          /* set the props */
4961 4962          if (props != NULL) {
4962 4963                  spa_configfile_set(newspa, props, B_FALSE);
4963 4964                  error = spa_prop_set(newspa, props);
4964 4965                  if (error)
4965 4966                          goto out;
4966 4967          }
4967 4968  
4968 4969          /* flush everything */
4969 4970          txg = spa_vdev_config_enter(newspa);
4970 4971          vdev_config_dirty(newspa->spa_root_vdev);
4971 4972          (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
4972 4973  
4973 4974          if (zio_injection_enabled)
4974 4975                  zio_handle_panic_injection(spa, FTAG, 2);
4975 4976  
4976 4977          spa_async_resume(newspa);
4977 4978  
4978 4979          /* finally, update the original pool's config */
4979 4980          txg = spa_vdev_config_enter(spa);
4980 4981          tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4981 4982          error = dmu_tx_assign(tx, TXG_WAIT);
                   /*
                    * A failed assign only suppresses the per-vdev "detach" history
                    * records and the commit below; the vdevs are split and freed
                    * regardless.
                    */
4982 4983          if (error != 0)
4983 4984                  dmu_tx_abort(tx);
4984 4985          for (c = 0; c < children; c++) {
4985 4986                  if (vml[c] != NULL) {
4986 4987                          vdev_split(vml[c]);
4987 4988                          if (error == 0)
4988 4989                                  spa_history_log_internal(spa, "detach", tx,
4989 4990                                      "vdev=%s", vml[c]->vdev_path);
4990 4991                          vdev_free(vml[c]);
4991 4992                  }
4992 4993          }
4993 4994          vdev_config_dirty(spa->spa_root_vdev);
4994 4995          spa->spa_config_splitting = NULL;
4995 4996          nvlist_free(nvl);
4996 4997          if (error == 0)
4997 4998                  dmu_tx_commit(tx);
4998 4999          (void) spa_vdev_exit(spa, NULL, txg, 0);
4999 5000  
5000 5001          if (zio_injection_enabled)
5001 5002                  zio_handle_panic_injection(spa, FTAG, 3);
5002 5003  
5003 5004          /* split is complete; log a history record */
5004 5005          spa_history_log_internal(newspa, "split", NULL,
5005 5006              "from pool %s", spa_name(spa));
5006 5007  
5007 5008          kmem_free(vml, children * sizeof (vdev_t *));
5008 5009  
5009 5010          /* if we're not going to mount the filesystems in userland, export */
5010 5011          if (exp)
5011 5012                  error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
5012 5013                      B_FALSE, B_FALSE);
5013 5014  
5014 5015          return (error);
5015 5016  
                   /*
                    * Failure after the new pool was added: discard the new pool,
                    * then put the original pool back the way it was.
                    */
5016 5017  out:
5017 5018          spa_unload(newspa);
5018 5019          spa_deactivate(newspa);
5019 5020          spa_remove(newspa);
5020 5021  
5021 5022          txg = spa_vdev_config_enter(spa);
5022 5023  
5023 5024          /* re-online all offlined disks */
5024 5025          for (c = 0; c < children; c++) {
5025 5026                  if (vml[c] != NULL)
5026 5027                          vml[c]->vdev_offline = B_FALSE;
5027 5028          }
5028 5029          vdev_reopen(spa->spa_root_vdev);
5029 5030  
5030 5031          nvlist_free(spa->spa_config_splitting);
5031 5032          spa->spa_config_splitting = NULL;
5032 5033          (void) spa_vdev_exit(spa, NULL, txg, error);
5033 5034  
5034 5035          kmem_free(vml, children * sizeof (vdev_t *));
5035 5036          return (error);
5036 5037  }
5037 5038  
5038 5039  static nvlist_t *
5039 5040  spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
5040 5041  {
5041 5042          for (int i = 0; i < count; i++) {
5042 5043                  uint64_t guid;
5043 5044  
5044 5045                  VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
5045 5046                      &guid) == 0);
5046 5047  
5047 5048                  if (guid == target_guid)
5048 5049                          return (nvpp[i]);
5049 5050          }
5050 5051  
5051 5052          return (NULL);
5052 5053  }
5053 5054  
5054 5055  static void
5055 5056  spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
5056 5057          nvlist_t *dev_to_remove)
5057 5058  {
5058 5059          nvlist_t **newdev = NULL;
5059 5060  
5060 5061          if (count > 1)
5061 5062                  newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
5062 5063  
5063 5064          for (int i = 0, j = 0; i < count; i++) {
5064 5065                  if (dev[i] == dev_to_remove)
5065 5066                          continue;
5066 5067                  VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
5067 5068          }
5068 5069  
5069 5070          VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
5070 5071          VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
5071 5072  
5072 5073          for (int i = 0; i < count - 1; i++)
5073 5074                  nvlist_free(newdev[i]);
5074 5075  
5075 5076          if (count > 1)
5076 5077                  kmem_free(newdev, (count - 1) * sizeof (void *));
5077 5078  }
5078 5079  
5079 5080  /*
5080 5081   * Evacuate the device.
5081 5082   */
5082 5083  static int
5083 5084  spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
5084 5085  {
5085 5086          uint64_t txg;
5086 5087          int error = 0;
5087 5088  
5088 5089          ASSERT(MUTEX_HELD(&spa_namespace_lock));
5089 5090          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5090 5091          ASSERT(vd == vd->vdev_top);
5091 5092  
5092 5093          /*
5093 5094           * Evacuate the device.  We don't hold the config lock as writer
5094 5095           * since we need to do I/O but we do keep the
5095 5096           * spa_namespace_lock held.  Once this completes the device
5096 5097           * should no longer have any blocks allocated on it.
5097 5098           */
5098 5099          if (vd->vdev_islog) {
5099 5100                  if (vd->vdev_stat.vs_alloc != 0)
5100 5101                          error = spa_offline_log(spa);
5101 5102          } else {
5102 5103                  error = SET_ERROR(ENOTSUP);
5103 5104          }
5104 5105  
5105 5106          if (error)
5106 5107                  return (error);
5107 5108  
5108 5109          /*
5109 5110           * The evacuation succeeded.  Remove any remaining MOS metadata
5110 5111           * associated with this vdev, and wait for these changes to sync.
5111 5112           */
5112 5113          ASSERT0(vd->vdev_stat.vs_alloc);
5113 5114          txg = spa_vdev_config_enter(spa);
5114 5115          vd->vdev_removing = B_TRUE;
5115 5116          vdev_dirty(vd, 0, NULL, txg);
5116 5117          vdev_config_dirty(vd);
5117 5118          spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5118 5119  
5119 5120          return (0);
5120 5121  }
5121 5122  
5122 5123  /*
5123 5124   * Complete the removal by cleaning up the namespace.
            *
            * Called with spa_namespace_lock held and all config locks held as
            * writer.  'vd' must be a top-level vdev.  If it still has space
            * allocated this is a no-op; otherwise it is freed and its slot under
            * the root vdev is either compacted away (last child) or refilled with
            * a hole vdev carrying the same id.
5124 5125   */
5125 5126  static void
5126 5127  spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
5127 5128  {
5128 5129          vdev_t *rvd = spa->spa_root_vdev;
                   /* id and last_vdev are captured up front: vd is freed below */
5129 5130          uint64_t id = vd->vdev_id;
5130 5131          boolean_t last_vdev = (id == (rvd->vdev_children - 1));
5131 5132  
5132 5133          ASSERT(MUTEX_HELD(&spa_namespace_lock));
5133 5134          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5134 5135          ASSERT(vd == vd->vdev_top);
5135 5136  
5136 5137          /*
5137 5138           * Only remove any devices which are empty.
5138 5139           */
5139 5140          if (vd->vdev_stat.vs_alloc != 0)
5140 5141                  return;
5141 5142  
                   /* best effort: the result of the label update is ignored */
5142 5143          (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
5143 5144  
                   /* take vd off the dirty lists before it is freed */
5144 5145          if (list_link_active(&vd->vdev_state_dirty_node))
5145 5146                  vdev_state_clean(vd);
5146 5147          if (list_link_active(&vd->vdev_config_dirty_node))
5147 5148                  vdev_config_clean(vd);
5148 5149  
5149 5150          vdev_free(vd);
5150 5151  
                   /*
                    * From here on 'vd' no longer refers to the removed device; in
                    * the else branch it is reused for the replacement hole.
                    */
5151 5152          if (last_vdev) {
5152 5153                  vdev_compact_children(rvd);
5153 5154          } else {
5154 5155                  vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
5155 5156                  vdev_add_child(rvd, vd);
5156 5157          }
5157 5158          vdev_config_dirty(rvd);
5158 5159  
5159 5160          /*
5160 5161           * Reassess the health of our root vdev.
5161 5162           */
5162 5163          vdev_reopen(rvd);
  
    | ↓ open down ↓ | 634 lines elided | ↑ open up ↑ | 
5163 5164  }
5164 5165  
5165 5166  /*
5166 5167   * Remove a device from the pool -
5167 5168   *
5168 5169   * Removing a device from the vdev namespace requires several steps
5169 5170   * and can take a significant amount of time.  As a result we use
5170 5171   * the spa_vdev_config_[enter/exit] functions which allow us to
5171 5172   * grab and release the spa_config_lock while still holding the namespace
5172 5173   * lock.  During each step the configuration is synced out.
5173      - */
5174      -
5175      -/*
5176      - * Remove a device from the pool.  Currently, this supports removing only hot
5177      - * spares, slogs, and level 2 ARC devices.
     5174 + *
     5175 + * Currently, this supports removing only hot spares, slogs, and level 2 ARC
     5176 + * devices.
5178 5177   */
5179 5178  int
5180 5179  spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
5181 5180  {
5182 5181          vdev_t *vd;
5183 5182          metaslab_group_t *mg;
5184 5183          nvlist_t **spares, **l2cache, *nv;
5185 5184          uint64_t txg = 0;
5186 5185          uint_t nspares, nl2cache;
5187 5186          int error = 0;
                   /*
                    * When the caller already holds spa_namespace_lock, the
                    * spa_vdev_enter()/spa_vdev_exit() pair is skipped and txg
                    * stays 0.
                    */
5188 5187          boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
5189 5188  
5190 5189          ASSERT(spa_writeable(spa));
5191 5190  
5192 5191          if (!locked)
5193 5192                  txg = spa_vdev_enter(spa);
5194 5193  
                   /* vd stays NULL if guid is not an active vdev of this pool */
5195 5194          vd = spa_lookup_by_guid(spa, guid, B_FALSE);
5196 5195  
                   /*
                    * The guid is tried against the spare list, then the l2cache
                    * list, and only then treated as a vdev in the pool's tree.
                    */
5197 5196          if (spa->spa_spares.sav_vdevs != NULL &&
5198 5197              nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
5199 5198              ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
5200 5199              (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
5201 5200                  /*
5202 5201                   * Only remove the hot spare if it's not currently in use
5203 5202                   * in this pool.
5204 5203                   */
5205 5204                  if (vd == NULL || unspare) {
5206 5205                          spa_vdev_remove_aux(spa->spa_spares.sav_config,
5207 5206                              ZPOOL_CONFIG_SPARES, spares, nspares, nv);
5208 5207                          spa_load_spares(spa);
5209 5208                          spa->spa_spares.sav_sync = B_TRUE;
5210 5209                  } else {
5211 5210                          error = SET_ERROR(EBUSY);
5212 5211                  }
5213 5212          } else if (spa->spa_l2cache.sav_vdevs != NULL &&
5214 5213              nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
5215 5214              ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
5216 5215              (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
5217 5216                  /*
5218 5217                   * Cache devices can always be removed.
5219 5218                   */
5220 5219                  spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
5221 5220                      ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
5222 5221                  spa_load_l2cache(spa);
5223 5222                  spa->spa_l2cache.sav_sync = B_TRUE;
5224 5223          } else if (vd != NULL && vd->vdev_islog) {
                           /*
                            * This branch uses txg and calls spa_vdev_exit()
                            * unconditionally on failure, so it relies on the
                            * lock having been taken by spa_vdev_enter() above.
                            */
5225 5224                  ASSERT(!locked);
5226 5225                  ASSERT(vd == vd->vdev_top);
5227 5226  
5228 5227                  /*
5229 5228                   * XXX - Once we have bp-rewrite this should
5230 5229                   * become the common case.
5231 5230                   */
5232 5231  
5233 5232                  mg = vd->vdev_mg;
5234 5233  
5235 5234                  /*
5236 5235                   * Stop allocating from this vdev.
5237 5236                   */
5238 5237                  metaslab_group_passivate(mg);
5239 5238  
5240 5239                  /*
5241 5240                   * Wait for the youngest allocations and frees to sync,
5242 5241                   * and then wait for the deferral of those frees to finish.
5243 5242                   */
5244 5243                  spa_vdev_config_exit(spa, NULL,
5245 5244                      txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
5246 5245  
5247 5246                  /*
5248 5247                   * Attempt to evacuate the vdev.
5249 5248                   */
5250 5249                  error = spa_vdev_remove_evacuate(spa, vd);
5251 5250  
                           /* retake the config lock whether or not that worked */
5252 5251                  txg = spa_vdev_config_enter(spa);
5253 5252  
5254 5253                  /*
5255 5254                   * If we couldn't evacuate the vdev, unwind.
5256 5255                   */
5257 5256                  if (error) {
5258 5257                          metaslab_group_activate(mg);
5259 5258                          return (spa_vdev_exit(spa, NULL, txg, error));
5260 5259                  }
5261 5260  
5262 5261                  /*
5263 5262                   * Clean up the vdev namespace.
5264 5263                   */
5265 5264                  spa_vdev_remove_from_namespace(spa, vd);
5266 5265  
5267 5266          } else if (vd != NULL) {
5268 5267                  /*
5269 5268                   * Normal vdevs cannot be removed (yet).
5270 5269                   */
5271 5270                  error = SET_ERROR(ENOTSUP);
5272 5271          } else {
5273 5272                  /*
5274 5273                   * There is no vdev of any kind with the specified guid.
5275 5274                   */
5276 5275                  error = SET_ERROR(ENOENT);
  
    | ↓ open down ↓ | 89 lines elided | ↑ open up ↑ | 
5277 5276          }
5278 5277  
                   /* only undo spa_vdev_enter() if this call performed it */
5279 5278          if (!locked)
5280 5279                  return (spa_vdev_exit(spa, NULL, txg, error));
5281 5280  
5282 5281          return (error);
5283 5282  }
5284 5283  
5285 5284  /*
5286 5285   * Find any device that's done replacing, or a vdev marked 'unspare' that's
5287      - * current spared, so we can detach it.
     5286 + * currently spared, so we can detach it.
5288 5287   */
5289 5288  static vdev_t *
5290 5289  spa_vdev_resilver_done_hunt(vdev_t *vd)
5291 5290  {
5292 5291          vdev_t *newvd, *oldvd;
5293 5292  
5294 5293          for (int c = 0; c < vd->vdev_children; c++) {
5295 5294                  oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
5296 5295                  if (oldvd != NULL)
5297 5296                          return (oldvd);
5298 5297          }
5299 5298  
5300 5299          /*
5301 5300           * Check for a completed replacement.  We always consider the first
5302 5301           * vdev in the list to be the oldest vdev, and the last one to be
5303 5302           * the newest (see spa_vdev_attach() for how that works).  In
5304 5303           * the case where the newest vdev is faulted, we will not automatically
5305 5304           * remove it after a resilver completes.  This is OK as it will require
5306 5305           * user intervention to determine which disk the admin wishes to keep.
5307 5306           */
5308 5307          if (vd->vdev_ops == &vdev_replacing_ops) {
5309 5308                  ASSERT(vd->vdev_children > 1);
5310 5309  
5311 5310                  newvd = vd->vdev_child[vd->vdev_children - 1];
5312 5311                  oldvd = vd->vdev_child[0];
5313 5312  
5314 5313                  if (vdev_dtl_empty(newvd, DTL_MISSING) &&
5315 5314                      vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5316 5315                      !vdev_dtl_required(oldvd))
5317 5316                          return (oldvd);
5318 5317          }
5319 5318  
5320 5319          /*
5321 5320           * Check for a completed resilver with the 'unspare' flag set.
5322 5321           */
5323 5322          if (vd->vdev_ops == &vdev_spare_ops) {
5324 5323                  vdev_t *first = vd->vdev_child[0];
5325 5324                  vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
5326 5325  
5327 5326                  if (last->vdev_unspare) {
5328 5327                          oldvd = first;
5329 5328                          newvd = last;
5330 5329                  } else if (first->vdev_unspare) {
5331 5330                          oldvd = last;
5332 5331                          newvd = first;
5333 5332                  } else {
5334 5333                          oldvd = NULL;
5335 5334                  }
5336 5335  
5337 5336                  if (oldvd != NULL &&
5338 5337                      vdev_dtl_empty(newvd, DTL_MISSING) &&
5339 5338                      vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5340 5339                      !vdev_dtl_required(oldvd))
5341 5340                          return (oldvd);
5342 5341  
5343 5342                  /*
5344 5343                   * If there are more than two spares attached to a disk,
5345 5344                   * and those spares are not required, then we want to
5346 5345                   * attempt to free them up now so that they can be used
5347 5346                   * by other pools.  Once we're back down to a single
5348 5347                   * disk+spare, we stop removing them.
5349 5348                   */
5350 5349                  if (vd->vdev_children > 2) {
5351 5350                          newvd = vd->vdev_child[1];
5352 5351  
5353 5352                          if (newvd->vdev_isspare && last->vdev_isspare &&
5354 5353                              vdev_dtl_empty(last, DTL_MISSING) &&
5355 5354                              vdev_dtl_empty(last, DTL_OUTAGE) &&
5356 5355                              !vdev_dtl_required(newvd))
5357 5356                                  return (newvd);
5358 5357                  }
5359 5358          }
5360 5359  
5361 5360          return (NULL);
5362 5361  }
5363 5362  
5364 5363  static void
5365 5364  spa_vdev_resilver_done(spa_t *spa)
5366 5365  {
                   /*
                    * Repeatedly ask spa_vdev_resilver_done_hunt() for a vdev that
                    * can be detached and detach it, until none is left or a detach
                    * fails.  The guids needed are copied out while all config locks
                    * are held as writer; the locks are dropped around each
                    * spa_vdev_detach() call and retaken before the next hunt.
                    */
5367 5366          vdev_t *vd, *pvd, *ppvd;
5368 5367          uint64_t guid, sguid, pguid, ppguid;
5369 5368  
5370 5369          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5371 5370  
5372 5371          while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
5373 5372                  pvd = vd->vdev_parent;
5374 5373                  ppvd = pvd->vdev_parent;
5375 5374                  guid = vd->vdev_guid;
5376 5375                  pguid = pvd->vdev_guid;
5377 5376                  ppguid = ppvd->vdev_guid;
                           /* sguid == 0 means there is no second detach to do */
5378 5377                  sguid = 0;
5379 5378                  /*
5380 5379                   * If we have just finished replacing a hot spared device, then
5381 5380                   * we need to detach the parent's first child (the original hot
5382 5381                   * spare) as well.
5383 5382                   */
5384 5383                  if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
5385 5384                      ppvd->vdev_children == 2) {
5386 5385                          ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
5387 5386                          sguid = ppvd->vdev_child[1]->vdev_guid;
5388 5387                  }
5389 5388                  spa_config_exit(spa, SCL_ALL, FTAG);
                           /*
                            * The config lock is not held at this point, so the
                            * early returns below leave nothing to release.
                            */
5390 5389                  if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
5391 5390                          return;
5392 5391                  if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
5393 5392                          return;
5394 5393                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5395 5394          }
5396 5395  
5397 5396          spa_config_exit(spa, SCL_ALL, FTAG);
5398 5397  }
5399 5398  
5400 5399  /*
5401 5400   * Update the stored path or FRU for this vdev.
5402 5401   */
5403 5402  int
5404 5403  spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
5405 5404      boolean_t ispath)
5406 5405  {
5407 5406          vdev_t *vd;
5408 5407          boolean_t sync = B_FALSE;
5409 5408  
5410 5409          ASSERT(spa_writeable(spa));
5411 5410  
5412 5411          spa_vdev_state_enter(spa, SCL_ALL);
5413 5412  
5414 5413          if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
5415 5414                  return (spa_vdev_state_exit(spa, NULL, ENOENT));
5416 5415  
5417 5416          if (!vd->vdev_ops->vdev_op_leaf)
5418 5417                  return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
5419 5418  
5420 5419          if (ispath) {
5421 5420                  if (strcmp(value, vd->vdev_path) != 0) {
5422 5421                          spa_strfree(vd->vdev_path);
5423 5422                          vd->vdev_path = spa_strdup(value);
5424 5423                          sync = B_TRUE;
5425 5424                  }
5426 5425          } else {
5427 5426                  if (vd->vdev_fru == NULL) {
5428 5427                          vd->vdev_fru = spa_strdup(value);
5429 5428                          sync = B_TRUE;
5430 5429                  } else if (strcmp(value, vd->vdev_fru) != 0) {
5431 5430                          spa_strfree(vd->vdev_fru);
5432 5431                          vd->vdev_fru = spa_strdup(value);
5433 5432                          sync = B_TRUE;
5434 5433                  }
5435 5434          }
5436 5435  
5437 5436          return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
5438 5437  }
5439 5438  
5440 5439  int
5441 5440  spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
5442 5441  {
5443 5442          return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
5444 5443  }
5445 5444  
5446 5445  int
5447 5446  spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
5448 5447  {
5449 5448          return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
5450 5449  }
5451 5450  
5452 5451  /*
5453 5452   * ==========================================================================
5454 5453   * SPA Scanning
5455 5454   * ==========================================================================
5456 5455   */
5457 5456  
5458 5457  int
5459 5458  spa_scan_stop(spa_t *spa)
5460 5459  {
5461 5460          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5462 5461          if (dsl_scan_resilvering(spa->spa_dsl_pool))
5463 5462                  return (SET_ERROR(EBUSY));
5464 5463          return (dsl_scan_cancel(spa->spa_dsl_pool));
5465 5464  }
5466 5465  
5467 5466  int
5468 5467  spa_scan(spa_t *spa, pool_scan_func_t func)
5469 5468  {
5470 5469          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5471 5470  
5472 5471          if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
5473 5472                  return (SET_ERROR(ENOTSUP));
5474 5473  
5475 5474          /*
5476 5475           * If a resilver was requested, but there is no DTL on a
5477 5476           * writeable leaf device, we have nothing to do.
5478 5477           */
5479 5478          if (func == POOL_SCAN_RESILVER &&
5480 5479              !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
5481 5480                  spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
5482 5481                  return (0);
5483 5482          }
5484 5483  
5485 5484          return (dsl_scan(spa->spa_dsl_pool, func));
5486 5485  }
5487 5486  
5488 5487  /*
5489 5488   * ==========================================================================
5490 5489   * SPA async task processing
5491 5490   * ==========================================================================
5492 5491   */
5493 5492  
5494 5493  static void
5495 5494  spa_async_remove(spa_t *spa, vdev_t *vd)
5496 5495  {
5497 5496          if (vd->vdev_remove_wanted) {
5498 5497                  vd->vdev_remove_wanted = B_FALSE;
5499 5498                  vd->vdev_delayed_close = B_FALSE;
5500 5499                  vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
5501 5500  
5502 5501                  /*
5503 5502                   * We want to clear the stats, but we don't want to do a full
5504 5503                   * vdev_clear() as that will cause us to throw away
5505 5504                   * degraded/faulted state as well as attempt to reopen the
5506 5505                   * device, all of which is a waste.
5507 5506                   */
5508 5507                  vd->vdev_stat.vs_read_errors = 0;
5509 5508                  vd->vdev_stat.vs_write_errors = 0;
5510 5509                  vd->vdev_stat.vs_checksum_errors = 0;
5511 5510  
5512 5511                  vdev_state_dirty(vd->vdev_top);
5513 5512          }
5514 5513  
5515 5514          for (int c = 0; c < vd->vdev_children; c++)
5516 5515                  spa_async_remove(spa, vd->vdev_child[c]);
5517 5516  }
5518 5517  
5519 5518  static void
5520 5519  spa_async_probe(spa_t *spa, vdev_t *vd)
5521 5520  {
5522 5521          if (vd->vdev_probe_wanted) {
5523 5522                  vd->vdev_probe_wanted = B_FALSE;
5524 5523                  vdev_reopen(vd);        /* vdev_open() does the actual probe */
5525 5524          }
5526 5525  
5527 5526          for (int c = 0; c < vd->vdev_children; c++)
5528 5527                  spa_async_probe(spa, vd->vdev_child[c]);
5529 5528  }
5530 5529  
5531 5530  static void
5532 5531  spa_async_autoexpand(spa_t *spa, vdev_t *vd)
5533 5532  {
5534 5533          sysevent_id_t eid;
5535 5534          nvlist_t *attr;
5536 5535          char *physpath;
5537 5536  
5538 5537          if (!spa->spa_autoexpand)
5539 5538                  return;
5540 5539  
5541 5540          for (int c = 0; c < vd->vdev_children; c++) {
5542 5541                  vdev_t *cvd = vd->vdev_child[c];
5543 5542                  spa_async_autoexpand(spa, cvd);
5544 5543          }
5545 5544  
5546 5545          if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
5547 5546                  return;
5548 5547  
5549 5548          physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
5550 5549          (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
5551 5550  
5552 5551          VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5553 5552          VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
5554 5553  
5555 5554          (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
5556 5555              ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
5557 5556  
5558 5557          nvlist_free(attr);
5559 5558          kmem_free(physpath, MAXPATHLEN);
5560 5559  }
5561 5560  
5562 5561  static void
5563 5562  spa_async_thread(spa_t *spa)
5564 5563  {
5565 5564          int tasks;
5566 5565  
5567 5566          ASSERT(spa->spa_sync_on);
5568 5567  
5569 5568          mutex_enter(&spa->spa_async_lock);
5570 5569          tasks = spa->spa_async_tasks;
5571 5570          spa->spa_async_tasks = 0;
5572 5571          mutex_exit(&spa->spa_async_lock);
5573 5572  
5574 5573          /*
5575 5574           * See if the config needs to be updated.
5576 5575           */
5577 5576          if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
5578 5577                  uint64_t old_space, new_space;
5579 5578  
5580 5579                  mutex_enter(&spa_namespace_lock);
5581 5580                  old_space = metaslab_class_get_space(spa_normal_class(spa));
5582 5581                  spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5583 5582                  new_space = metaslab_class_get_space(spa_normal_class(spa));
5584 5583                  mutex_exit(&spa_namespace_lock);
5585 5584  
5586 5585                  /*
5587 5586                   * If the pool grew as a result of the config update,
5588 5587                   * then log an internal history event.
5589 5588                   */
5590 5589                  if (new_space != old_space) {
5591 5590                          spa_history_log_internal(spa, "vdev online", NULL,
5592 5591                              "pool '%s' size: %llu(+%llu)",
5593 5592                              spa_name(spa), new_space, new_space - old_space);
5594 5593                  }
5595 5594          }
5596 5595  
5597 5596          /*
5598 5597           * See if any devices need to be marked REMOVED.
5599 5598           */
5600 5599          if (tasks & SPA_ASYNC_REMOVE) {
5601 5600                  spa_vdev_state_enter(spa, SCL_NONE);
5602 5601                  spa_async_remove(spa, spa->spa_root_vdev);
5603 5602                  for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
5604 5603                          spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
5605 5604                  for (int i = 0; i < spa->spa_spares.sav_count; i++)
5606 5605                          spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
5607 5606                  (void) spa_vdev_state_exit(spa, NULL, 0);
5608 5607          }
5609 5608  
5610 5609          if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
5611 5610                  spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5612 5611                  spa_async_autoexpand(spa, spa->spa_root_vdev);
5613 5612                  spa_config_exit(spa, SCL_CONFIG, FTAG);
5614 5613          }
5615 5614  
5616 5615          /*
5617 5616           * See if any devices need to be probed.
5618 5617           */
5619 5618          if (tasks & SPA_ASYNC_PROBE) {
5620 5619                  spa_vdev_state_enter(spa, SCL_NONE);
5621 5620                  spa_async_probe(spa, spa->spa_root_vdev);
5622 5621                  (void) spa_vdev_state_exit(spa, NULL, 0);
5623 5622          }
5624 5623  
5625 5624          /*
5626 5625           * If any devices are done replacing, detach them.
5627 5626           */
5628 5627          if (tasks & SPA_ASYNC_RESILVER_DONE)
5629 5628                  spa_vdev_resilver_done(spa);
5630 5629  
5631 5630          /*
5632 5631           * Kick off a resilver.
5633 5632           */
5634 5633          if (tasks & SPA_ASYNC_RESILVER)
5635 5634                  dsl_resilver_restart(spa->spa_dsl_pool, 0);
5636 5635  
5637 5636          /*
5638 5637           * Let the world know that we're done.
5639 5638           */
5640 5639          mutex_enter(&spa->spa_async_lock);
5641 5640          spa->spa_async_thread = NULL;
5642 5641          cv_broadcast(&spa->spa_async_cv);
5643 5642          mutex_exit(&spa->spa_async_lock);
5644 5643          thread_exit();
5645 5644  }
5646 5645  
5647 5646  void
5648 5647  spa_async_suspend(spa_t *spa)
5649 5648  {
5650 5649          mutex_enter(&spa->spa_async_lock);
5651 5650          spa->spa_async_suspended++;
5652 5651          while (spa->spa_async_thread != NULL)
5653 5652                  cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
5654 5653          mutex_exit(&spa->spa_async_lock);
5655 5654  }
5656 5655  
5657 5656  void
5658 5657  spa_async_resume(spa_t *spa)
5659 5658  {
5660 5659          mutex_enter(&spa->spa_async_lock);
5661 5660          ASSERT(spa->spa_async_suspended != 0);
5662 5661          spa->spa_async_suspended--;
5663 5662          mutex_exit(&spa->spa_async_lock);
5664 5663  }
5665 5664  
5666 5665  static void
5667 5666  spa_async_dispatch(spa_t *spa)
5668 5667  {
5669 5668          mutex_enter(&spa->spa_async_lock);
5670 5669          if (spa->spa_async_tasks && !spa->spa_async_suspended &&
5671 5670              spa->spa_async_thread == NULL &&
5672 5671              rootdir != NULL && !vn_is_readonly(rootdir))
5673 5672                  spa->spa_async_thread = thread_create(NULL, 0,
5674 5673                      spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
5675 5674          mutex_exit(&spa->spa_async_lock);
5676 5675  }
5677 5676  
5678 5677  void
5679 5678  spa_async_request(spa_t *spa, int task)
5680 5679  {
5681 5680          zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
5682 5681          mutex_enter(&spa->spa_async_lock);
5683 5682          spa->spa_async_tasks |= task;
5684 5683          mutex_exit(&spa->spa_async_lock);
5685 5684  }
5686 5685  
5687 5686  /*
5688 5687   * ==========================================================================
5689 5688   * SPA syncing routines
5690 5689   * ==========================================================================
5691 5690   */
5692 5691  
5693 5692  static int
5694 5693  bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5695 5694  {
5696 5695          bpobj_t *bpo = arg;
5697 5696          bpobj_enqueue(bpo, bp, tx);
5698 5697          return (0);
5699 5698  }
5700 5699  
5701 5700  static int
5702 5701  spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5703 5702  {
5704 5703          zio_t *zio = arg;
5705 5704  
5706 5705          zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5707 5706              zio->io_flags));
5708 5707          return (0);
5709 5708  }
5710 5709  
5711 5710  static void
5712 5711  spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5713 5712  {
5714 5713          char *packed = NULL;
5715 5714          size_t bufsize;
5716 5715          size_t nvsize = 0;
5717 5716          dmu_buf_t *db;
5718 5717  
5719 5718          VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5720 5719  
5721 5720          /*
5722 5721           * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5723 5722           * information.  This avoids the dbuf_will_dirty() path and
5724 5723           * saves us a pre-read to get data we don't actually care about.
5725 5724           */
5726 5725          bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
5727 5726          packed = kmem_alloc(bufsize, KM_SLEEP);
5728 5727  
5729 5728          VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5730 5729              KM_SLEEP) == 0);
5731 5730          bzero(packed + nvsize, bufsize - nvsize);
5732 5731  
5733 5732          dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
5734 5733  
5735 5734          kmem_free(packed, bufsize);
5736 5735  
5737 5736          VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5738 5737          dmu_buf_will_dirty(db, tx);
5739 5738          *(uint64_t *)db->db_data = nvsize;
5740 5739          dmu_buf_rele(db, FTAG);
5741 5740  }
5742 5741  
5743 5742  static void
5744 5743  spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5745 5744      const char *config, const char *entry)
5746 5745  {
5747 5746          nvlist_t *nvroot;
5748 5747          nvlist_t **list;
5749 5748          int i;
5750 5749  
5751 5750          if (!sav->sav_sync)
5752 5751                  return;
5753 5752  
5754 5753          /*
5755 5754           * Update the MOS nvlist describing the list of available devices.
5756 5755           * spa_validate_aux() will have already made sure this nvlist is
5757 5756           * valid and the vdevs are labeled appropriately.
5758 5757           */
5759 5758          if (sav->sav_object == 0) {
5760 5759                  sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
5761 5760                      DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
5762 5761                      sizeof (uint64_t), tx);
5763 5762                  VERIFY(zap_update(spa->spa_meta_objset,
5764 5763                      DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
5765 5764                      &sav->sav_object, tx) == 0);
5766 5765          }
5767 5766  
5768 5767          VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5769 5768          if (sav->sav_count == 0) {
5770 5769                  VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
5771 5770          } else {
5772 5771                  list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
5773 5772                  for (i = 0; i < sav->sav_count; i++)
5774 5773                          list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
5775 5774                              B_FALSE, VDEV_CONFIG_L2CACHE);
5776 5775                  VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
5777 5776                      sav->sav_count) == 0);
5778 5777                  for (i = 0; i < sav->sav_count; i++)
5779 5778                          nvlist_free(list[i]);
5780 5779                  kmem_free(list, sav->sav_count * sizeof (void *));
5781 5780          }
5782 5781  
5783 5782          spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
5784 5783          nvlist_free(nvroot);
5785 5784  
5786 5785          sav->sav_sync = B_FALSE;
5787 5786  }
5788 5787  
5789 5788  static void
5790 5789  spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
5791 5790  {
5792 5791          nvlist_t *config;
5793 5792  
5794 5793          if (list_is_empty(&spa->spa_config_dirty_list))
5795 5794                  return;
5796 5795  
5797 5796          spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5798 5797  
5799 5798          config = spa_config_generate(spa, spa->spa_root_vdev,
5800 5799              dmu_tx_get_txg(tx), B_FALSE);
5801 5800  
5802 5801          /*
5803 5802           * If we're upgrading the spa version then make sure that
5804 5803           * the config object gets updated with the correct version.
5805 5804           */
5806 5805          if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
5807 5806                  fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
5808 5807                      spa->spa_uberblock.ub_version);
5809 5808  
5810 5809          spa_config_exit(spa, SCL_STATE, FTAG);
5811 5810  
5812 5811          if (spa->spa_config_syncing)
5813 5812                  nvlist_free(spa->spa_config_syncing);
5814 5813          spa->spa_config_syncing = config;
5815 5814  
5816 5815          spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
5817 5816  }
5818 5817  
5819 5818  static void
5820 5819  spa_sync_version(void *arg, dmu_tx_t *tx)
5821 5820  {
5822 5821          uint64_t *versionp = arg;
5823 5822          uint64_t version = *versionp;
5824 5823          spa_t *spa = dmu_tx_pool(tx)->dp_spa;
5825 5824  
5826 5825          /*
5827 5826           * Setting the version is special cased when first creating the pool.
5828 5827           */
5829 5828          ASSERT(tx->tx_txg != TXG_INITIAL);
5830 5829  
5831 5830          ASSERT(SPA_VERSION_IS_SUPPORTED(version));
5832 5831          ASSERT(version >= spa_version(spa));
5833 5832  
5834 5833          spa->spa_uberblock.ub_version = version;
5835 5834          vdev_config_dirty(spa->spa_root_vdev);
5836 5835          spa_history_log_internal(spa, "set", tx, "version=%lld", version);
5837 5836  }
5838 5837  
5839 5838  /*
5840 5839   * Set zpool properties.
5841 5840   */
5842 5841  static void
5843 5842  spa_sync_props(void *arg, dmu_tx_t *tx)
5844 5843  {
5845 5844          nvlist_t *nvp = arg;
5846 5845          spa_t *spa = dmu_tx_pool(tx)->dp_spa;
5847 5846          objset_t *mos = spa->spa_meta_objset;
5848 5847          nvpair_t *elem = NULL;
5849 5848  
5850 5849          mutex_enter(&spa->spa_props_lock);
5851 5850  
5852 5851          while ((elem = nvlist_next_nvpair(nvp, elem))) {
5853 5852                  uint64_t intval;
5854 5853                  char *strval, *fname;
5855 5854                  zpool_prop_t prop;
5856 5855                  const char *propname;
5857 5856                  zprop_type_t proptype;
5858 5857                  zfeature_info_t *feature;
5859 5858  
5860 5859                  switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
5861 5860                  case ZPROP_INVAL:
5862 5861                          /*
5863 5862                           * We checked this earlier in spa_prop_validate().
5864 5863                           */
5865 5864                          ASSERT(zpool_prop_feature(nvpair_name(elem)));
5866 5865  
5867 5866                          fname = strchr(nvpair_name(elem), '@') + 1;
5868 5867                          VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
5869 5868  
5870 5869                          spa_feature_enable(spa, feature, tx);
5871 5870                          spa_history_log_internal(spa, "set", tx,
5872 5871                              "%s=enabled", nvpair_name(elem));
5873 5872                          break;
5874 5873  
5875 5874                  case ZPOOL_PROP_VERSION:
5876 5875                          VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5877 5876                          /*
5878 5877                           * The version is synced seperatly before other
5879 5878                           * properties and should be correct by now.
5880 5879                           */
5881 5880                          ASSERT3U(spa_version(spa), >=, intval);
5882 5881                          break;
5883 5882  
5884 5883                  case ZPOOL_PROP_ALTROOT:
5885 5884                          /*
5886 5885                           * 'altroot' is a non-persistent property. It should
5887 5886                           * have been set temporarily at creation or import time.
5888 5887                           */
5889 5888                          ASSERT(spa->spa_root != NULL);
5890 5889                          break;
5891 5890  
5892 5891                  case ZPOOL_PROP_READONLY:
5893 5892                  case ZPOOL_PROP_CACHEFILE:
5894 5893                          /*
5895 5894                           * 'readonly' and 'cachefile' are also non-persisitent
5896 5895                           * properties.
5897 5896                           */
5898 5897                          break;
5899 5898                  case ZPOOL_PROP_COMMENT:
5900 5899                          VERIFY(nvpair_value_string(elem, &strval) == 0);
5901 5900                          if (spa->spa_comment != NULL)
5902 5901                                  spa_strfree(spa->spa_comment);
5903 5902                          spa->spa_comment = spa_strdup(strval);
5904 5903                          /*
5905 5904                           * We need to dirty the configuration on all the vdevs
5906 5905                           * so that their labels get updated.  It's unnecessary
5907 5906                           * to do this for pool creation since the vdev's
5908 5907                           * configuratoin has already been dirtied.
5909 5908                           */
5910 5909                          if (tx->tx_txg != TXG_INITIAL)
5911 5910                                  vdev_config_dirty(spa->spa_root_vdev);
5912 5911                          spa_history_log_internal(spa, "set", tx,
5913 5912                              "%s=%s", nvpair_name(elem), strval);
5914 5913                          break;
5915 5914                  default:
5916 5915                          /*
5917 5916                           * Set pool property values in the poolprops mos object.
5918 5917                           */
5919 5918                          if (spa->spa_pool_props_object == 0) {
5920 5919                                  spa->spa_pool_props_object =
5921 5920                                      zap_create_link(mos, DMU_OT_POOL_PROPS,
5922 5921                                      DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
5923 5922                                      tx);
5924 5923                          }
5925 5924  
5926 5925                          /* normalize the property name */
5927 5926                          propname = zpool_prop_to_name(prop);
5928 5927                          proptype = zpool_prop_get_type(prop);
5929 5928  
5930 5929                          if (nvpair_type(elem) == DATA_TYPE_STRING) {
5931 5930                                  ASSERT(proptype == PROP_TYPE_STRING);
5932 5931                                  VERIFY(nvpair_value_string(elem, &strval) == 0);
5933 5932                                  VERIFY(zap_update(mos,
5934 5933                                      spa->spa_pool_props_object, propname,
5935 5934                                      1, strlen(strval) + 1, strval, tx) == 0);
5936 5935                                  spa_history_log_internal(spa, "set", tx,
5937 5936                                      "%s=%s", nvpair_name(elem), strval);
5938 5937                          } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5939 5938                                  VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5940 5939  
5941 5940                                  if (proptype == PROP_TYPE_INDEX) {
5942 5941                                          const char *unused;
5943 5942                                          VERIFY(zpool_prop_index_to_string(
5944 5943                                              prop, intval, &unused) == 0);
5945 5944                                  }
5946 5945                                  VERIFY(zap_update(mos,
5947 5946                                      spa->spa_pool_props_object, propname,
5948 5947                                      8, 1, &intval, tx) == 0);
5949 5948                                  spa_history_log_internal(spa, "set", tx,
5950 5949                                      "%s=%lld", nvpair_name(elem), intval);
5951 5950                          } else {
5952 5951                                  ASSERT(0); /* not allowed */
5953 5952                          }
5954 5953  
5955 5954                          switch (prop) {
5956 5955                          case ZPOOL_PROP_DELEGATION:
5957 5956                                  spa->spa_delegation = intval;
5958 5957                                  break;
5959 5958                          case ZPOOL_PROP_BOOTFS:
5960 5959                                  spa->spa_bootfs = intval;
5961 5960                                  break;
5962 5961                          case ZPOOL_PROP_FAILUREMODE:
5963 5962                                  spa->spa_failmode = intval;
5964 5963                                  break;
5965 5964                          case ZPOOL_PROP_AUTOEXPAND:
5966 5965                                  spa->spa_autoexpand = intval;
5967 5966                                  if (tx->tx_txg != TXG_INITIAL)
5968 5967                                          spa_async_request(spa,
5969 5968                                              SPA_ASYNC_AUTOEXPAND);
5970 5969                                  break;
5971 5970                          case ZPOOL_PROP_DEDUPDITTO:
5972 5971                                  spa->spa_dedup_ditto = intval;
5973 5972                                  break;
5974 5973                          default:
5975 5974                                  break;
5976 5975                          }
5977 5976                  }
5978 5977  
5979 5978          }
5980 5979  
5981 5980          mutex_exit(&spa->spa_props_lock);
5982 5981  }
5983 5982  
5984 5983  /*
5985 5984   * Perform one-time upgrade on-disk changes.  spa_version() does not
5986 5985   * reflect the new version this txg, so there must be no changes this
5987 5986   * txg to anything that the upgrade code depends on after it executes.
5988 5987   * Therefore this must be called after dsl_pool_sync() does the sync
5989 5988   * tasks.
5990 5989   */
5991 5990  static void
5992 5991  spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
5993 5992  {
5994 5993          dsl_pool_t *dp = spa->spa_dsl_pool;
5995 5994  
5996 5995          ASSERT(spa->spa_sync_pass == 1);
5997 5996  
5998 5997          rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
5999 5998  
6000 5999          if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
6001 6000              spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
6002 6001                  dsl_pool_create_origin(dp, tx);
6003 6002  
6004 6003                  /* Keeping the origin open increases spa_minref */
6005 6004                  spa->spa_minref += 3;
6006 6005          }
6007 6006  
6008 6007          if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
6009 6008              spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
6010 6009                  dsl_pool_upgrade_clones(dp, tx);
6011 6010          }
6012 6011  
6013 6012          if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
6014 6013              spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
6015 6014                  dsl_pool_upgrade_dir_clones(dp, tx);
6016 6015  
6017 6016                  /* Keeping the freedir open increases spa_minref */
6018 6017                  spa->spa_minref += 3;
6019 6018          }
6020 6019  
6021 6020          if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6022 6021              spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6023 6022                  spa_feature_create_zap_objects(spa, tx);
6024 6023          }
6025 6024          rrw_exit(&dp->dp_config_rwlock, FTAG);
6026 6025  }
6027 6026  
6028 6027  /*
6029 6028   * Sync the specified transaction group.  New blocks may be dirtied as
6030 6029   * part of the process, so we iterate until it converges.
6031 6030   */
6032 6031  void
6033 6032  spa_sync(spa_t *spa, uint64_t txg)
6034 6033  {
6035 6034          dsl_pool_t *dp = spa->spa_dsl_pool;
6036 6035          objset_t *mos = spa->spa_meta_objset;
6037 6036          bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
6038 6037          bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
6039 6038          vdev_t *rvd = spa->spa_root_vdev;
6040 6039          vdev_t *vd;
6041 6040          dmu_tx_t *tx;
6042 6041          int error;
6043 6042  
6044 6043          VERIFY(spa_writeable(spa));
6045 6044  
6046 6045          /*
6047 6046           * Lock out configuration changes.
6048 6047           */
6049 6048          spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6050 6049  
6051 6050          spa->spa_syncing_txg = txg;
6052 6051          spa->spa_sync_pass = 0;
6053 6052  
6054 6053          /*
6055 6054           * If there are any pending vdev state changes, convert them
6056 6055           * into config changes that go out with this transaction group.
6057 6056           */
6058 6057          spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6059 6058          while (list_head(&spa->spa_state_dirty_list) != NULL) {
6060 6059                  /*
6061 6060                   * We need the write lock here because, for aux vdevs,
6062 6061                   * calling vdev_config_dirty() modifies sav_config.
6063 6062                   * This is ugly and will become unnecessary when we
6064 6063                   * eliminate the aux vdev wart by integrating all vdevs
6065 6064                   * into the root vdev tree.
6066 6065                   */
6067 6066                  spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6068 6067                  spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
6069 6068                  while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
6070 6069                          vdev_state_clean(vd);
6071 6070                          vdev_config_dirty(vd);
6072 6071                  }
6073 6072                  spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6074 6073                  spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
6075 6074          }
6076 6075          spa_config_exit(spa, SCL_STATE, FTAG);
6077 6076  
6078 6077          tx = dmu_tx_create_assigned(dp, txg);
6079 6078  
6080 6079          spa->spa_sync_starttime = gethrtime();
6081 6080          VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
6082 6081              spa->spa_sync_starttime + spa->spa_deadman_synctime));
6083 6082  
6084 6083          /*
6085 6084           * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
6086 6085           * set spa_deflate if we have no raid-z vdevs.
6087 6086           */
6088 6087          if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
6089 6088              spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
6090 6089                  int i;
6091 6090  
6092 6091                  for (i = 0; i < rvd->vdev_children; i++) {
6093 6092                          vd = rvd->vdev_child[i];
6094 6093                          if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
6095 6094                                  break;
6096 6095                  }
6097 6096                  if (i == rvd->vdev_children) {
6098 6097                          spa->spa_deflate = TRUE;
6099 6098                          VERIFY(0 == zap_add(spa->spa_meta_objset,
6100 6099                              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
6101 6100                              sizeof (uint64_t), 1, &spa->spa_deflate, tx));
6102 6101                  }
6103 6102          }
6104 6103  
6105 6104          /*
6106 6105           * If anything has changed in this txg, or if someone is waiting
6107 6106           * for this txg to sync (eg, spa_vdev_remove()), push the
6108 6107           * deferred frees from the previous txg.  If not, leave them
6109 6108           * alone so that we don't generate work on an otherwise idle
6110 6109           * system.
6111 6110           */
6112 6111          if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
6113 6112              !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
6114 6113              !txg_list_empty(&dp->dp_sync_tasks, txg) ||
6115 6114              ((dsl_scan_active(dp->dp_scan) ||
6116 6115              txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
6117 6116                  zio_t *zio = zio_root(spa, NULL, NULL, 0);
6118 6117                  VERIFY3U(bpobj_iterate(defer_bpo,
6119 6118                      spa_free_sync_cb, zio, tx), ==, 0);
6120 6119                  VERIFY0(zio_wait(zio));
6121 6120          }
6122 6121  
6123 6122          /*
6124 6123           * Iterate to convergence.
6125 6124           */
6126 6125          do {
6127 6126                  int pass = ++spa->spa_sync_pass;
6128 6127  
6129 6128                  spa_sync_config_object(spa, tx);
6130 6129                  spa_sync_aux_dev(spa, &spa->spa_spares, tx,
6131 6130                      ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
6132 6131                  spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
6133 6132                      ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
6134 6133                  spa_errlog_sync(spa, txg);
6135 6134                  dsl_pool_sync(dp, txg);
6136 6135  
6137 6136                  if (pass < zfs_sync_pass_deferred_free) {
6138 6137                          zio_t *zio = zio_root(spa, NULL, NULL, 0);
6139 6138                          bplist_iterate(free_bpl, spa_free_sync_cb,
6140 6139                              zio, tx);
6141 6140                          VERIFY(zio_wait(zio) == 0);
6142 6141                  } else {
6143 6142                          bplist_iterate(free_bpl, bpobj_enqueue_cb,
6144 6143                              defer_bpo, tx);
6145 6144                  }
6146 6145  
6147 6146                  ddt_sync(spa, txg);
6148 6147                  dsl_scan_sync(dp, tx);
6149 6148  
6150 6149                  while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
6151 6150                          vdev_sync(vd, txg);
6152 6151  
6153 6152                  if (pass == 1)
6154 6153                          spa_sync_upgrades(spa, tx);
6155 6154  
6156 6155          } while (dmu_objset_is_dirty(mos, txg));
6157 6156  
6158 6157          /*
6159 6158           * Rewrite the vdev configuration (which includes the uberblock)
6160 6159           * to commit the transaction group.
6161 6160           *
6162 6161           * If there are no dirty vdevs, we sync the uberblock to a few
6163 6162           * random top-level vdevs that are known to be visible in the
6164 6163           * config cache (see spa_vdev_add() for a complete description).
6165 6164           * If there *are* dirty vdevs, sync the uberblock to all vdevs.
6166 6165           */
6167 6166          for (;;) {
6168 6167                  /*
6169 6168                   * We hold SCL_STATE to prevent vdev open/close/etc.
6170 6169                   * while we're attempting to write the vdev labels.
6171 6170                   */
6172 6171                  spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6173 6172  
6174 6173                  if (list_is_empty(&spa->spa_config_dirty_list)) {
6175 6174                          vdev_t *svd[SPA_DVAS_PER_BP];
6176 6175                          int svdcount = 0;
6177 6176                          int children = rvd->vdev_children;
6178 6177                          int c0 = spa_get_random(children);
6179 6178  
6180 6179                          for (int c = 0; c < children; c++) {
6181 6180                                  vd = rvd->vdev_child[(c0 + c) % children];
6182 6181                                  if (vd->vdev_ms_array == 0 || vd->vdev_islog)
6183 6182                                          continue;
6184 6183                                  svd[svdcount++] = vd;
6185 6184                                  if (svdcount == SPA_DVAS_PER_BP)
6186 6185                                          break;
6187 6186                          }
6188 6187                          error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
6189 6188                          if (error != 0)
6190 6189                                  error = vdev_config_sync(svd, svdcount, txg,
6191 6190                                      B_TRUE);
6192 6191                  } else {
6193 6192                          error = vdev_config_sync(rvd->vdev_child,
6194 6193                              rvd->vdev_children, txg, B_FALSE);
6195 6194                          if (error != 0)
6196 6195                                  error = vdev_config_sync(rvd->vdev_child,
6197 6196                                      rvd->vdev_children, txg, B_TRUE);
6198 6197                  }
6199 6198  
6200 6199                  if (error == 0)
6201 6200                          spa->spa_last_synced_guid = rvd->vdev_guid;
6202 6201  
6203 6202                  spa_config_exit(spa, SCL_STATE, FTAG);
6204 6203  
6205 6204                  if (error == 0)
6206 6205                          break;
6207 6206                  zio_suspend(spa, NULL);
6208 6207                  zio_resume_wait(spa);
6209 6208          }
6210 6209          dmu_tx_commit(tx);
6211 6210  
6212 6211          VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
6213 6212  
6214 6213          /*
6215 6214           * Clear the dirty config list.
6216 6215           */
6217 6216          while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
6218 6217                  vdev_config_clean(vd);
6219 6218  
6220 6219          /*
6221 6220           * Now that the new config has synced transactionally,
6222 6221           * let it become visible to the config cache.
6223 6222           */
6224 6223          if (spa->spa_config_syncing != NULL) {
6225 6224                  spa_config_set(spa, spa->spa_config_syncing);
6226 6225                  spa->spa_config_txg = txg;
6227 6226                  spa->spa_config_syncing = NULL;
6228 6227          }
6229 6228  
6230 6229          spa->spa_ubsync = spa->spa_uberblock;
6231 6230  
6232 6231          dsl_pool_sync_done(dp, txg);
6233 6232  
6234 6233          /*
6235 6234           * Update usable space statistics.
6236 6235           */
6237 6236          while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
6238 6237                  vdev_sync_done(vd, txg);
6239 6238  
6240 6239          spa_update_dspace(spa);
6241 6240  
6242 6241          /*
6243 6242           * It had better be the case that we didn't dirty anything
6244 6243           * since vdev_config_sync().
6245 6244           */
6246 6245          ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
6247 6246          ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
6248 6247          ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
6249 6248  
6250 6249          spa->spa_sync_pass = 0;
6251 6250  
6252 6251          spa_config_exit(spa, SCL_CONFIG, FTAG);
6253 6252  
6254 6253          spa_handle_ignored_writes(spa);
6255 6254  
6256 6255          /*
6257 6256           * If any async tasks have been requested, kick them off.
6258 6257           */
6259 6258          spa_async_dispatch(spa);
6260 6259  }
6261 6260  
6262 6261  /*
6263 6262   * Sync all pools.  We don't want to hold the namespace lock across these
6264 6263   * operations, so we take a reference on the spa_t and drop the lock during the
6265 6264   * sync.
            *
            * Pools that are not POOL_STATE_ACTIVE, are not writeable, or are suspended
            * are skipped.  Takes no arguments and returns nothing.
6266 6265   */
6267 6266  void
6268 6267  spa_sync_allpools(void)
6269 6268  {
                   /* NULL seeds the spa_next() walk over the pools. */
6270 6269          spa_t *spa = NULL;
6271 6270          mutex_enter(&spa_namespace_lock);
6272 6271          while ((spa = spa_next(spa)) != NULL) {
6273 6272                  if (spa_state(spa) != POOL_STATE_ACTIVE ||
6274 6273                      !spa_writeable(spa) || spa_suspended(spa))
6275 6274                          continue;
                           /*
                            * Hold a reference for the window in which the
                            * namespace lock is dropped.
                            */
6276 6275                  spa_open_ref(spa, FTAG);
6277 6276                  mutex_exit(&spa_namespace_lock);
                           /*
                            * NOTE(review): a txg argument of 0 presumably means
                            * "whatever is currently pending" -- confirm against
                            * txg_wait_synced().
                            */
6278 6277                  txg_wait_synced(spa_get_dsl(spa), 0);
                           /*
                            * Retake the lock before calling spa_close() and
                            * advancing with spa_next().
                            */
6279 6278                  mutex_enter(&spa_namespace_lock);
6280 6279                  spa_close(spa, FTAG);
6281 6280          }
6282 6281          mutex_exit(&spa_namespace_lock);
6283 6282  }
6284 6283  
6285 6284  /*
6286 6285   * ==========================================================================
6287 6286   * Miscellaneous routines
6288 6287   * ==========================================================================
6289 6288   */
6290 6289  
6291 6290  /*
6292 6291   * Remove all pools in the system.
            *
            * For each pool: suspend its async tasks (with spa_namespace_lock dropped
            * around the call), unload and deactivate it unless its state is still
            * POOL_STATE_UNINITIALIZED, then spa_remove() it.  Takes no arguments and
            * returns nothing.
6293 6292   */
6294 6293  void
6295 6294  spa_evict_all(void)
6296 6295  {
6297 6296          spa_t *spa;
6298 6297  
6299 6298          /*
6300 6299           * Remove all cached state.  All pools should be closed now,
6301 6300           * so every spa in the AVL tree should be unreferenced.
6302 6301           */
6303 6302          mutex_enter(&spa_namespace_lock);
                   /*
                    * The walk restarts from spa_next(NULL) on every iteration
                    * because each pass ends by removing the pool it processed.
                    */
6304 6303          while ((spa = spa_next(NULL)) != NULL) {
6305 6304                  /*
6306 6305                   * Stop async tasks.  The async thread may need to detach
6307 6306                   * a device that's been replaced, which requires grabbing
6308 6307                   * spa_namespace_lock, so we must drop it here.
                            * A reference is taken first and dropped again once the
                            * lock has been reacquired.
6309 6308                   */
6310 6309                  spa_open_ref(spa, FTAG);
6311 6310                  mutex_exit(&spa_namespace_lock);
6312 6311                  spa_async_suspend(spa);
6313 6312                  mutex_enter(&spa_namespace_lock);
6314 6313                  spa_close(spa, FTAG);
6315 6314  
                           /*
                            * A pool still in POOL_STATE_UNINITIALIZED skips the
                            * unload/deactivate step and is only removed.
                            */
6316 6315                  if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
6317 6316                          spa_unload(spa);
6318 6317                          spa_deactivate(spa);
6319 6318                  }
6320 6319                  spa_remove(spa);
6321 6320          }
6322 6321          mutex_exit(&spa_namespace_lock);
6323 6322  }
6324 6323  
           /*
            * Find the vdev in 'spa' whose guid is 'guid'.  spa_root_vdev is searched
            * first with vdev_lookup_by_guid().  If that finds nothing and 'aux' is
            * set, the l2cache devices and then the spares are checked; for those only
            * each device's own vdev_guid is compared.
            * Returns the matching vdev, or NULL if there is none.
            */
6325 6324  vdev_t *
6326 6325  spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
6327 6326  {
6328 6327          vdev_t *vd;
6329 6328          int i;
6330 6329  
6331 6330          if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
6332 6331                  return (vd);
6333 6332  
                   /* Auxiliary devices are consulted only when the caller asks. */
6334 6333          if (aux) {
6335 6334                  for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
6336 6335                          vd = spa->spa_l2cache.sav_vdevs[i];
6337 6336                          if (vd->vdev_guid == guid)
6338 6337                                  return (vd);
6339 6338                  }
6340 6339  
6341 6340                  for (i = 0; i < spa->spa_spares.sav_count; i++) {
6342 6341                          vd = spa->spa_spares.sav_vdevs[i];
6343 6342                          if (vd->vdev_guid == guid)
6344 6343                                  return (vd);
6345 6344                  }
6346 6345          }
6347 6346  
6348 6347          return (NULL);
6349 6348  }
6350 6349  
           /*
            * Set the pool's version to 'version'.  While holding SCL_ALL as writer,
            * the new value is stored in spa_uberblock.ub_version and the root vdev's
            * config is marked dirty; once SCL_ALL is released we wait in
            * txg_wait_synced() before returning.  The pool must be writeable and
            * 'version' must not be lower than the current version (both are only
            * ASSERTed).  Nothing is returned.
            */
6351 6350  void
6352 6351  spa_upgrade(spa_t *spa, uint64_t version)
6353 6352  {
6354 6353          ASSERT(spa_writeable(spa));
6355 6354  
6356 6355          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6357 6356  
6358 6357          /*
6359 6358           * This should only be called for a non-faulted pool, and since a
6360 6359           * future version would result in an unopenable pool, this shouldn't be
6361 6360           * possible.
6362 6361           */
6363 6362          ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
6364 6363          ASSERT(version >= spa->spa_uberblock.ub_version);
6365 6364  
6366 6365          spa->spa_uberblock.ub_version = version;
6367 6366          vdev_config_dirty(spa->spa_root_vdev);
6368 6367  
6369 6368          spa_config_exit(spa, SCL_ALL, FTAG);
6370 6369  
                   /*
                    * NOTE(review): presumably this waits until the dirtied config,
                    * and with it the new version, has been written out -- confirm.
                    */
6371 6370          txg_wait_synced(spa_get_dsl(spa), 0);
6372 6371  }
6373 6372  
           /*
            * Report whether 'guid' identifies one of the pool's spares.  Both the
            * vdevs in spa_spares.sav_vdevs and the nvlists in sav_pending (matched on
            * their ZPOOL_CONFIG_GUID value) are consulted.  Returns B_TRUE on a match
            * and B_FALSE otherwise.
            */
6374 6373  boolean_t
6375 6374  spa_has_spare(spa_t *spa, uint64_t guid)
6376 6375  {
6377 6376          int i;
6378 6377          uint64_t spareguid;
6379 6378          spa_aux_vdev_t *sav = &spa->spa_spares;
6380 6379  
6381 6380          for (i = 0; i < sav->sav_count; i++)
6382 6381                  if (sav->sav_vdevs[i]->vdev_guid == guid)
6383 6382                          return (B_TRUE);
6384 6383  
                   /*
                    * NOTE(review): sav_pending presumably holds spares that are
                    * being added but are not in sav_vdevs yet -- confirm.  An entry
                    * for which the ZPOOL_CONFIG_GUID lookup fails is skipped.
                    */
6385 6384          for (i = 0; i < sav->sav_npending; i++) {
6386 6385                  if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
6387 6386                      &spareguid) == 0 && spareguid == guid)
6388 6387                          return (B_TRUE);
6389 6388          }
6390 6389  
6391 6390          return (B_FALSE);
6392 6391  }
6393 6392  
6394 6393  /*
6395 6394   * Check if a pool has an active shared spare device.
6396 6395   * Note: reference count of an active spare is 2, as a spare and as a replace
            *
            * Returns B_TRUE if, for any vdev in spa_spares.sav_vdevs,
            * spa_spare_exists() succeeds and reports a non-zero pool guid equal to
            * spa_guid(spa) together with a reference count greater than 2.  Returns
            * B_FALSE otherwise.
6397 6396   */
6398 6397  static boolean_t
6399 6398  spa_has_active_shared_spare(spa_t *spa)
6400 6399  {
6401 6400          int i, refcnt;
6402 6401          uint64_t pool;
6403 6402          spa_aux_vdev_t *sav = &spa->spa_spares;
6404 6403  
6405 6404          for (i = 0; i < sav->sav_count; i++) {
                           /*
                            * refcnt > 2 means more references than the two an
                            * active spare accounts for by itself (see above).
                            */
6406 6405                  if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
6407 6406                      &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
6408 6407                      refcnt > 2)
6409 6408                          return (B_TRUE);
6410 6409          }
6411 6410  
6412 6411          return (B_FALSE);
6413 6412  }
6414 6413  
6415 6414  /*
6416 6415   * Post a sysevent corresponding to the given event.  The 'name' must be one of
6417 6416   * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
6418 6417   * filled in from the spa and (optionally) the vdev.  This doesn't do anything
6419 6418   * in the userland libzpool, as we don't want consumers to misinterpret ztest
6420 6419   * or zdb as real changes.
            *
            * 'vd' may be NULL, in which case no vdev attributes are added.  Nothing
            * is returned; a failure while building the event is not reported to the
            * caller and the event is simply not posted.
6421 6420   */
6422 6421  void
6423 6422  spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
6424 6423  {
6425 6424  #ifdef _KERNEL
6426 6425          sysevent_t              *ev;
6427 6426          sysevent_attr_list_t    *attr = NULL;
6428 6427          sysevent_value_t        value;
6429 6428          sysevent_id_t           eid;
6430 6429  
                   /*
                    * NOTE(review): the cast drops const from 'name', presumably
                    * because sysevent_alloc() takes a plain char * -- confirm.
                    */
6431 6430          ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
6432 6431              SE_SLEEP);
6433 6432  
                   /*
                    * Build the attribute list: pool name and pool guid always,
                    * vdev guid (and path, when set) if a vdev was supplied.  If
                    * adding any attribute fails, we jump to cleanup without
                    * posting the event.
                    */
6434 6433          value.value_type = SE_DATA_TYPE_STRING;
6435 6434          value.value.sv_string = spa_name(spa);
6436 6435          if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
6437 6436                  goto done;
6438 6437  
6439 6438          value.value_type = SE_DATA_TYPE_UINT64;
6440 6439          value.value.sv_uint64 = spa_guid(spa);
6441 6440          if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
6442 6441                  goto done;
6443 6442  
6444 6443          if (vd) {
6445 6444                  value.value_type = SE_DATA_TYPE_UINT64;
6446 6445                  value.value.sv_uint64 = vd->vdev_guid;
6447 6446                  if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
6448 6447                      SE_SLEEP) != 0)
6449 6448                          goto done;
6450 6449  
6451 6450                  if (vd->vdev_path) {
6452 6451                          value.value_type = SE_DATA_TYPE_STRING;
6453 6452                          value.value.sv_string = vd->vdev_path;
6454 6453                          if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
6455 6454                              &value, SE_SLEEP) != 0)
6456 6455                                  goto done;
6457 6456                  }
6458 6457          }
6459 6458  
6460 6459          if (sysevent_attach_attributes(ev, attr) != 0)
6461 6460                  goto done;
                   /*
                    * Clear 'attr' so the cleanup below does not free the list
                    * separately now that it is attached to 'ev'.
                    */
6462 6461          attr = NULL;
6463 6462  
                   /* The result of posting is deliberately ignored. */
6464 6463          (void) log_sysevent(ev, SE_SLEEP, &eid);
6465 6464  
                   /* Common exit: free a still-unattached list, then the event. */
6466 6465  done:
6467 6466          if (attr)
6468 6467                  sysevent_free_attr(attr);
6469 6468          sysevent_free(ev);
6470 6469  #endif
6471 6470  }
  
    | ↓ open down ↓ | 1174 lines elided | ↑ open up ↑ | 
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX