illumos-gate Wdiff usr/src/uts/common/fs/zfs/vdev.c

Print this page

7938 disable LBA weighting on files and SSDs
Reviewed by: Yuri Pankov <yuripv@gmx.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/vdev.c
          +++ new/usr/src/uts/common/fs/zfs/vdev.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  25   25   * Copyright 2017 Nexenta Systems, Inc.
  26   26   * Copyright (c) 2014 Integros [integros.com]
  27   27   * Copyright 2016 Toomas Soome <tsoome@me.com>
  28   28   * Copyright 2017 Joyent, Inc.
  29   29   */
  30   30  
  31   31  #include <sys/zfs_context.h>
  32   32  #include <sys/fm/fs/zfs.h>
  33   33  #include <sys/spa.h>
  34   34  #include <sys/spa_impl.h>
  35   35  #include <sys/dmu.h>
  36   36  #include <sys/dmu_tx.h>
  37   37  #include <sys/vdev_impl.h>
  38   38  #include <sys/uberblock_impl.h>
  39   39  #include <sys/metaslab.h>
  40   40  #include <sys/metaslab_impl.h>
  41   41  #include <sys/space_map.h>
  42   42  #include <sys/space_reftree.h>
  43   43  #include <sys/zio.h>
  44   44  #include <sys/zap.h>
  45   45  #include <sys/fs/zfs.h>
  46   46  #include <sys/arc.h>
  47   47  #include <sys/zil.h>
  48   48  #include <sys/dsl_scan.h>
  49   49  #include <sys/abd.h>
  50   50  
  51   51  /*
  52   52   * Virtual device management.
  53   53   */
  54   54  
           /*
            * Table of the vdev ops vectors known to this file.  The array is
            * terminated by a NULL entry; vdev_getops() walks it and compares each
            * entry's vdev_op_type string against the requested type name.
            */
  55   55  static vdev_ops_t *vdev_ops_table[] = {
  56   56          &vdev_root_ops,
  57   57          &vdev_raidz_ops,
  58   58          &vdev_mirror_ops,
  59   59          &vdev_replacing_ops,
  60   60          &vdev_spare_ops,
  61   61          &vdev_disk_ops,
  62   62          &vdev_file_ops,
  63   63          &vdev_missing_ops,
  64   64          &vdev_hole_ops,
  65   65          NULL
  66   66  };
  67   67  
  68   68  /* maximum scrub/resilver I/O queue per leaf vdev */
  69   69  int zfs_scrub_limit = 10;
  70   70  
  71   71  /*
  72   72   * When a vdev is added, it will be divided into approximately (but no
  73   73   * more than) this number of metaslabs.
  74   74   */
  75   75  int metaslabs_per_vdev = 200;
  76   76  
  77   77  /*
  78   78   * Given a vdev type, return the appropriate ops vector.
  79   79   */
  80   80  static vdev_ops_t *
  81   81  vdev_getops(const char *type)
  82   82  {
  83   83          vdev_ops_t *ops, **opspp;
  84   84  
  85   85          for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
  86   86                  if (strcmp(ops->vdev_op_type, type) == 0)
  87   87                          break;
  88   88  
  89   89          return (ops);
  90   90  }
  91   91  
  92   92  /*
  93   93   * Default asize function: return the MAX of psize with the asize of
  94   94   * all children.  This is what's used by anything other than RAID-Z.
  95   95   */
  96   96  uint64_t
  97   97  vdev_default_asize(vdev_t *vd, uint64_t psize)
  98   98  {
  99   99          uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
 100  100          uint64_t csize;
 101  101  
 102  102          for (int c = 0; c < vd->vdev_children; c++) {
 103  103                  csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
 104  104                  asize = MAX(asize, csize);
 105  105          }
 106  106  
 107  107          return (asize);
 108  108  }
 109  109  
 110  110  /*
 111  111   * Get the minimum allocatable size. We define the allocatable size as
 112  112   * the vdev's asize rounded to the nearest metaslab. This allows us to
 113  113   * replace or attach devices which don't have the same physical size but
 114  114   * can still satisfy the same number of allocations.
 115  115   */
 116  116  uint64_t
 117  117  vdev_get_min_asize(vdev_t *vd)
 118  118  {
 119  119          vdev_t *pvd = vd->vdev_parent;
 120  120  
 121  121          /*
 122  122           * If our parent is NULL (inactive spare or cache) or is the root,
 123  123           * just return our own asize.
 124  124           */
 125  125          if (pvd == NULL)
 126  126                  return (vd->vdev_asize);
 127  127  
 128  128          /*
 129  129           * The top-level vdev just returns the allocatable size rounded
 130  130           * to the nearest metaslab.
 131  131           */
 132  132          if (vd == vd->vdev_top)
 133  133                  return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
 134  134  
 135  135          /*
 136  136           * The allocatable space for a raidz vdev is N * sizeof(smallest child),
 137  137           * so each child must provide at least 1/Nth of its asize.
 138  138           */
 139  139          if (pvd->vdev_ops == &vdev_raidz_ops)
 140  140                  return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
 141  141                      pvd->vdev_children);
 142  142  
 143  143          return (pvd->vdev_min_asize);
 144  144  }
 145  145  
 146  146  void
 147  147  vdev_set_min_asize(vdev_t *vd)
 148  148  {
 149  149          vd->vdev_min_asize = vdev_get_min_asize(vd);
 150  150  
 151  151          for (int c = 0; c < vd->vdev_children; c++)
 152  152                  vdev_set_min_asize(vd->vdev_child[c]);
 153  153  }
 154  154  
 155  155  vdev_t *
 156  156  vdev_lookup_top(spa_t *spa, uint64_t vdev)
 157  157  {
 158  158          vdev_t *rvd = spa->spa_root_vdev;
 159  159  
 160  160          ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 161  161  
 162  162          if (vdev < rvd->vdev_children) {
 163  163                  ASSERT(rvd->vdev_child[vdev] != NULL);
 164  164                  return (rvd->vdev_child[vdev]);
 165  165          }
 166  166  
 167  167          return (NULL);
 168  168  }
 169  169  
 170  170  vdev_t *
 171  171  vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
 172  172  {
 173  173          vdev_t *mvd;
 174  174  
 175  175          if (vd->vdev_guid == guid)
 176  176                  return (vd);
 177  177  
 178  178          for (int c = 0; c < vd->vdev_children; c++)
 179  179                  if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
 180  180                      NULL)
 181  181                          return (mvd);
 182  182  
 183  183          return (NULL);
 184  184  }
 185  185  
 186  186  static int
 187  187  vdev_count_leaves_impl(vdev_t *vd)
 188  188  {
 189  189          int n = 0;
 190  190  
 191  191          if (vd->vdev_ops->vdev_op_leaf)
 192  192                  return (1);
 193  193  
 194  194          for (int c = 0; c < vd->vdev_children; c++)
 195  195                  n += vdev_count_leaves_impl(vd->vdev_child[c]);
 196  196  
 197  197          return (n);
 198  198  }
 199  199  
 200  200  int
 201  201  vdev_count_leaves(spa_t *spa)
 202  202  {
 203  203          return (vdev_count_leaves_impl(spa->spa_root_vdev));
 204  204  }
 205  205  
 206  206  void
 207  207  vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 208  208  {
 209  209          size_t oldsize, newsize;
 210  210          uint64_t id = cvd->vdev_id;
 211  211          vdev_t **newchild;
 212  212          spa_t *spa = cvd->vdev_spa;
 213  213  
 214  214          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 215  215          ASSERT(cvd->vdev_parent == NULL);
 216  216  
 217  217          cvd->vdev_parent = pvd;
 218  218  
 219  219          if (pvd == NULL)
 220  220                  return;
 221  221  
 222  222          ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
 223  223  
 224  224          oldsize = pvd->vdev_children * sizeof (vdev_t *);
 225  225          pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
 226  226          newsize = pvd->vdev_children * sizeof (vdev_t *);
 227  227  
 228  228          newchild = kmem_zalloc(newsize, KM_SLEEP);
 229  229          if (pvd->vdev_child != NULL) {
 230  230                  bcopy(pvd->vdev_child, newchild, oldsize);
 231  231                  kmem_free(pvd->vdev_child, oldsize);
 232  232          }
 233  233  
 234  234          pvd->vdev_child = newchild;
 235  235          pvd->vdev_child[id] = cvd;
 236  236  
 237  237          cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
 238  238          ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
 239  239  
 240  240          /*
 241  241           * Walk up all ancestors to update guid sum.
 242  242           */
 243  243          for (; pvd != NULL; pvd = pvd->vdev_parent)
 244  244                  pvd->vdev_guid_sum += cvd->vdev_guid_sum;
 245  245  }
 246  246  
 247  247  void
 248  248  vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
 249  249  {
 250  250          int c;
 251  251          uint_t id = cvd->vdev_id;
 252  252  
 253  253          ASSERT(cvd->vdev_parent == pvd);
 254  254  
 255  255          if (pvd == NULL)
 256  256                  return;
 257  257  
 258  258          ASSERT(id < pvd->vdev_children);
 259  259          ASSERT(pvd->vdev_child[id] == cvd);
 260  260  
 261  261          pvd->vdev_child[id] = NULL;
 262  262          cvd->vdev_parent = NULL;
 263  263  
 264  264          for (c = 0; c < pvd->vdev_children; c++)
 265  265                  if (pvd->vdev_child[c])
 266  266                          break;
 267  267  
 268  268          if (c == pvd->vdev_children) {
 269  269                  kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
 270  270                  pvd->vdev_child = NULL;
 271  271                  pvd->vdev_children = 0;
 272  272          }
 273  273  
 274  274          /*
 275  275           * Walk up all ancestors to update guid sum.
 276  276           */
 277  277          for (; pvd != NULL; pvd = pvd->vdev_parent)
 278  278                  pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
 279  279  }
 280  280  
 281  281  /*
 282  282   * Remove any holes in the child array.
 283  283   */
 284  284  void
 285  285  vdev_compact_children(vdev_t *pvd)
 286  286  {
 287  287          vdev_t **newchild, *cvd;
 288  288          int oldc = pvd->vdev_children;
 289  289          int newc;
 290  290  
 291  291          ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 292  292  
 293  293          for (int c = newc = 0; c < oldc; c++)
 294  294                  if (pvd->vdev_child[c])
 295  295                          newc++;
 296  296  
 297  297          newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
 298  298  
 299  299          for (int c = newc = 0; c < oldc; c++) {
 300  300                  if ((cvd = pvd->vdev_child[c]) != NULL) {
 301  301                          newchild[newc] = cvd;
 302  302                          cvd->vdev_id = newc++;
 303  303                  }
 304  304          }
 305  305  
 306  306          kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
 307  307          pvd->vdev_child = newchild;
 308  308          pvd->vdev_children = newc;
 309  309  }
 310  310  
 311  311  /*
 312  312   * Allocate and minimally initialize a vdev_t.
 313  313   */
 314  314  vdev_t *
 315  315  vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 316  316  {
 317  317          vdev_t *vd;
 318  318  
 319  319          vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
 320  320  
 321  321          if (spa->spa_root_vdev == NULL) {
 322  322                  ASSERT(ops == &vdev_root_ops);
 323  323                  spa->spa_root_vdev = vd;
 324  324                  spa->spa_load_guid = spa_generate_guid(NULL);
 325  325          }
 326  326  
 327  327          if (guid == 0 && ops != &vdev_hole_ops) {
 328  328                  if (spa->spa_root_vdev == vd) {
 329  329                          /*
 330  330                           * The root vdev's guid will also be the pool guid,
 331  331                           * which must be unique among all pools.
 332  332                           */
 333  333                          guid = spa_generate_guid(NULL);
 334  334                  } else {
 335  335                          /*
 336  336                           * Any other vdev's guid must be unique within the pool.
 337  337                           */
 338  338                          guid = spa_generate_guid(spa);
 339  339                  }
 340  340                  ASSERT(!spa_guid_exists(spa_guid(spa), guid));
 341  341          }
 342  342  
 343  343          vd->vdev_spa = spa;
 344  344          vd->vdev_id = id;
 345  345          vd->vdev_guid = guid;
 346  346          vd->vdev_guid_sum = guid;
 347  347          vd->vdev_ops = ops;
 348  348          vd->vdev_state = VDEV_STATE_CLOSED;
 349  349          vd->vdev_ishole = (ops == &vdev_hole_ops);
 350  350  
 351  351          mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
 352  352          mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 353  353          mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 354  354          mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 355  355          for (int t = 0; t < DTL_TYPES; t++) {
 356  356                  vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
 357  357                      &vd->vdev_dtl_lock);
 358  358          }
 359  359          txg_list_create(&vd->vdev_ms_list, spa,
 360  360              offsetof(struct metaslab, ms_txg_node));
 361  361          txg_list_create(&vd->vdev_dtl_list, spa,
 362  362              offsetof(struct vdev, vdev_dtl_node));
 363  363          vd->vdev_stat.vs_timestamp = gethrtime();
 364  364          vdev_queue_init(vd);
 365  365          vdev_cache_init(vd);
 366  366  
 367  367          return (vd);
 368  368  }
 369  369  
 370  370  /*
 371  371   * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 372  372   * creating a new vdev or loading an existing one - the behavior is slightly
 373  373   * different for each case.
            *
            * The vdev is described by the nvlist 'nv'; 'parent' (may be NULL) is the
            * vdev it is attached to and 'id' is its index in the parent.  On success
            * the new vdev is stored in *vdp and 0 is returned.
            *
            * Errors (all detected before anything is allocated):
            *   EINVAL   - missing or unknown type; for VDEV_ALLOC_LOAD a missing or
            *              mismatched ZPOOL_CONFIG_ID; a missing guid for LOAD, SPARE,
            *              L2CACHE or ROOTPOOL; a non-root vdev allocated before the
            *              root; an invalid or (where required) missing RAID-Z
            *              parity.
            *   ENOTSUP  - the pool version is too old for log vdevs, hole vdevs, or
            *              the requested RAID-Z parity.
 374  374   */
 375  375  int
 376  376  vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 377  377      int alloctype)
 378  378  {
 379  379          vdev_ops_t *ops;
 380  380          char *type;
 381  381          uint64_t guid = 0, islog, nparity;
 382  382          vdev_t *vd;
 383  383  
 384  384          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 385  385  
 386  386          if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
 387  387                  return (SET_ERROR(EINVAL));
 388  388  
 389  389          if ((ops = vdev_getops(type)) == NULL)
 390  390                  return (SET_ERROR(EINVAL));
 391  391  
 392  392          /*
 393  393           * If this is a load, get the vdev guid from the nvlist.
 394  394           * Otherwise, vdev_alloc_common() will generate one for us.
                    * (Spares, l2cache devices and root-pool vdevs must also carry
                    * their guid in the nvlist; only LOAD additionally checks the id.)
 395  395           */
 396  396          if (alloctype == VDEV_ALLOC_LOAD) {
 397  397                  uint64_t label_id;
 398  398  
 399  399                  if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
 400  400                      label_id != id)
 401  401                          return (SET_ERROR(EINVAL));
 402  402  
 403  403                  if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 404  404                          return (SET_ERROR(EINVAL));
 405  405          } else if (alloctype == VDEV_ALLOC_SPARE) {
 406  406                  if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 407  407                          return (SET_ERROR(EINVAL));
 408  408          } else if (alloctype == VDEV_ALLOC_L2CACHE) {
 409  409                  if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 410  410                          return (SET_ERROR(EINVAL));
 411  411          } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 412  412                  if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 413  413                          return (SET_ERROR(EINVAL));
 414  414          }
 415  415  
 416  416          /*
 417  417           * The first allocated vdev must be of type 'root'.
 418  418           */
 419  419          if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
 420  420                  return (SET_ERROR(EINVAL));
 421  421  
 422  422          /*
 423  423           * Determine whether we're a log vdev.
 424  424           */
 425  425          islog = 0;
 426  426          (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
 427  427          if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
 428  428                  return (SET_ERROR(ENOTSUP));
 429  429  
 430  430          if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
 431  431                  return (SET_ERROR(ENOTSUP));
 432  432  
 433  433          /*
 434  434           * Set the nparity property for RAID-Z vdevs.
                    * -1ULL is only a sentinel: every path below that does not
                    * return an error overwrites it, which the ASSERT after the
                    * if/else verifies.
 435  435           */
 436  436          nparity = -1ULL;
 437  437          if (ops == &vdev_raidz_ops) {
 438  438                  if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
 439  439                      &nparity) == 0) {
 440  440                          if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
 441  441                                  return (SET_ERROR(EINVAL));
 442  442                          /*
 443  443                           * Previous versions could only support 1 or 2 parity
 444  444                           * devices.
 445  445                           */
 446  446                          if (nparity > 1 &&
 447  447                              spa_version(spa) < SPA_VERSION_RAIDZ2)
 448  448                                  return (SET_ERROR(ENOTSUP));
 449  449                          if (nparity > 2 &&
 450  450                              spa_version(spa) < SPA_VERSION_RAIDZ3)
 451  451                                  return (SET_ERROR(ENOTSUP));
 452  452                  } else {
 453  453                          /*
 454  454                           * We require the parity to be specified for SPAs that
 455  455                           * support multiple parity levels.
 456  456                           */
 457  457                          if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
 458  458                                  return (SET_ERROR(EINVAL));
 459  459                          /*
 460  460                           * Otherwise, we default to 1 parity device for RAID-Z.
 461  461                           */
 462  462                          nparity = 1;
 463  463                  }
 464  464          } else {
 465  465                  nparity = 0;
 466  466          }
 467  467          ASSERT(nparity != -1ULL);
 468  468  
                   /*
                    * All validation is done; there are no error returns past this
                    * point, so the vdev allocated here always reaches the caller.
                    */
 469  469          vd = vdev_alloc_common(spa, id, guid, ops);
 470  470  
 471  471          vd->vdev_islog = islog;
 472  472          vd->vdev_nparity = nparity;
 473  473  
                   /*
                    * For each optional string, the lookup result is replaced by a
                    * private spa_strdup() copy; vdev_free() releases these copies
                    * with spa_strfree().
                    */
 474  474          if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
 475  475                  vd->vdev_path = spa_strdup(vd->vdev_path);
 476  476          if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
 477  477                  vd->vdev_devid = spa_strdup(vd->vdev_devid);
 478  478          if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
 479  479              &vd->vdev_physpath) == 0)
 480  480                  vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
 481  481          if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
 482  482                  vd->vdev_fru = spa_strdup(vd->vdev_fru);
 483  483  
 484  484          /*
 485  485           * Set the whole_disk property.  If it's not specified, leave the value
 486  486           * as -1.
 487  487           */
 488  488          if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 489  489              &vd->vdev_wholedisk) != 0)
 490  490                  vd->vdev_wholedisk = -1ULL;
 491  491  
 492  492          /*
 493  493           * Look for the 'not present' flag.  This will only be set if the device
 494  494           * was not present at the time of import.
 495  495           */
 496  496          (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
 497  497              &vd->vdev_not_present);
 498  498  
 499  499          /*
 500  500           * Get the alignment requirement.
 501  501           */
 502  502          (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
 503  503  
 504  504          /*
 505  505           * Retrieve the vdev creation time.
 506  506           */
 507  507          (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
 508  508              &vd->vdev_crtxg);
 509  509  
 510  510          /*
 511  511           * If we're a top-level vdev, try to load the allocation parameters.
                    * "Top-level" here means that the parent exists and itself has no
                    * parent, i.e. the parent is the root vdev.
 512  512           */
 513  513          if (parent && !parent->vdev_parent &&
 514  514              (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 515  515                  (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
 516  516                      &vd->vdev_ms_array);
 517  517                  (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
 518  518                      &vd->vdev_ms_shift);
 519  519                  (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
 520  520                      &vd->vdev_asize);
 521  521                  (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
 522  522                      &vd->vdev_removing);
 523  523                  (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
 524  524                      &vd->vdev_top_zap);
 525  525          } else {
 526  526                  ASSERT0(vd->vdev_top_zap);
 527  527          }
 528  528  
                   /*
                    * Every top-level vdev except one created for an attach gets a
                    * metaslab group, in the log class for log vdevs and in the
                    * normal class otherwise.
                    */
 529  529          if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
 530  530                  ASSERT(alloctype == VDEV_ALLOC_LOAD ||
 531  531                      alloctype == VDEV_ALLOC_ADD ||
 532  532                      alloctype == VDEV_ALLOC_SPLIT ||
 533  533                      alloctype == VDEV_ALLOC_ROOTPOOL);
 534  534                  vd->vdev_mg = metaslab_group_create(islog ?
 535  535                      spa_log_class(spa) : spa_normal_class(spa), vd);
 536  536          }
 537  537  
                   /* Leaf vdevs pick up their per-leaf ZAP object on load or split. */
 538  538          if (vd->vdev_ops->vdev_op_leaf &&
 539  539              (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 540  540                  (void) nvlist_lookup_uint64(nv,
 541  541                      ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
 542  542          } else {
 543  543                  ASSERT0(vd->vdev_leaf_zap);
 544  544          }
 545  545  
 546  546          /*
 547  547           * If we're a leaf vdev, try to load the DTL object and other state.
 548  548           */
 549  549  
 550  550          if (vd->vdev_ops->vdev_op_leaf &&
 551  551              (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
 552  552              alloctype == VDEV_ALLOC_ROOTPOOL)) {
 553  553                  if (alloctype == VDEV_ALLOC_LOAD) {
 554  554                          (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 555  555                              &vd->vdev_dtl_object);
 556  556                          (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 557  557                              &vd->vdev_unspare);
 558  558                  }
 559  559  
 560  560                  if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 561  561                          uint64_t spare = 0;
 562  562  
 563  563                          if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
 564  564                              &spare) == 0 && spare)
 565  565                                  spa_spare_add(vd);
 566  566                  }
 567  567  
 568  568                  (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 569  569                      &vd->vdev_offline);
 570  570  
 571  571                  (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 572  572                      &vd->vdev_resilver_txg);
 573  573  
 574  574                  /*
 575  575                   * When importing a pool, we want to ignore the persistent fault
 576  576                   * state, as the diagnosis made on another system may not be
 577  577                   * valid in the current context.  Local vdevs will
 578  578                   * remain in the faulted state.
 579  579                   */
 580  580                  if (spa_load_state(spa) == SPA_LOAD_OPEN) {
 581  581                          (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
 582  582                              &vd->vdev_faulted);
 583  583                          (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
 584  584                              &vd->vdev_degraded);
 585  585                          (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
 586  586                              &vd->vdev_removed);
 587  587  
 588  588                          if (vd->vdev_faulted || vd->vdev_degraded) {
 589  589                                  char *aux;
 590  590  
                                           /*
                                            * Default to "errors exceeded" unless
                                            * the config records an external cause.
                                            */
 591  591                                  vd->vdev_label_aux =
 592  592                                      VDEV_AUX_ERR_EXCEEDED;
 593  593                                  if (nvlist_lookup_string(nv,
 594  594                                      ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
 595  595                                      strcmp(aux, "external") == 0)
 596  596                                          vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
 597  597                          }
 598  598                  }
 599  599          }
 600  600  
 601  601          /*
 602  602           * Add ourselves to the parent's list of children.
 603  603           */
 604  604          vdev_add_child(parent, vd);
 605  605  
 606  606          *vdp = vd;
 607  607  
 608  608          return (0);
 609  609  }
 610  610  
           /*
            * Close and destroy a vdev along with its whole subtree.  The vdev is
            * closed, its children are freed recursively, its metaslab state (if
            * any) is torn down, it is unlinked from its parent, and finally all
            * resources set up by vdev_alloc_common() and vdev_alloc() are released
            * before the vdev_t itself is freed.  If vd was the pool's root vdev,
            * spa_root_vdev is cleared.
            */
 611  611  void
 612  612  vdev_free(vdev_t *vd)
 613  613  {
 614  614          spa_t *spa = vd->vdev_spa;
 615  615  
 616  616          /*
 617  617           * vdev_free() implies closing the vdev first.  This is simpler than
 618  618           * trying to ensure complicated semantics for all callers.
 619  619           */
 620  620          vdev_close(vd);
 621  621  
 622  622          ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
 623  623          ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 624  624  
 625  625          /*
 626  626           * Free all children.
                    * Each recursive call unlinks the child through
                    * vdev_remove_child(), which frees the child array and resets
                    * vdev_children to 0 once the last slot becomes empty.  That is
                    * why vdev_child must be NULL, and the guid sum back to our own
                    * guid, when the loop is done.
 627  627           */
 628  628          for (int c = 0; c < vd->vdev_children; c++)
 629  629                  vdev_free(vd->vdev_child[c]);
 630  630  
 631  631          ASSERT(vd->vdev_child == NULL);
 632  632          ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
 633  633  
 634  634          /*
 635  635           * Discard allocation state.
 636  636           */
 637  637          if (vd->vdev_mg != NULL) {
 638  638                  vdev_metaslab_fini(vd);
 639  639                  metaslab_group_destroy(vd->vdev_mg);
 640  640          }
 641  641  
 642  642          ASSERT0(vd->vdev_stat.vs_space);
 643  643          ASSERT0(vd->vdev_stat.vs_dspace);
 644  644          ASSERT0(vd->vdev_stat.vs_alloc);
 645  645  
 646  646          /*
 647  647           * Remove this vdev from its parent's child list.
 648  648           */
 649  649          vdev_remove_child(vd->vdev_parent, vd);
 650  650  
 651  651          ASSERT(vd->vdev_parent == NULL);
 652  652  
 653  653          /*
 654  654           * Clean up vdev structure.
 655  655           */
 656  656          vdev_queue_fini(vd);
 657  657          vdev_cache_fini(vd);
 658  658  
                   /* These strings are the spa_strdup() copies made in vdev_alloc(). */
 659  659          if (vd->vdev_path)
 660  660                  spa_strfree(vd->vdev_path);
 661  661          if (vd->vdev_devid)
 662  662                  spa_strfree(vd->vdev_devid);
 663  663          if (vd->vdev_physpath)
 664  664                  spa_strfree(vd->vdev_physpath);
 665  665          if (vd->vdev_fru)
 666  666                  spa_strfree(vd->vdev_fru);
 667  667  
 668  668          if (vd->vdev_isspare)
 669  669                  spa_spare_remove(vd);
 670  670          if (vd->vdev_isl2cache)
 671  671                  spa_l2cache_remove(vd);
 672  672  
 673  673          txg_list_destroy(&vd->vdev_ms_list);
 674  674          txg_list_destroy(&vd->vdev_dtl_list);
 675  675  
                   /*
                    * The DTL space map and range trees are torn down while holding
                    * vdev_dtl_lock, the lock the trees were created with.
                    */
 676  676          mutex_enter(&vd->vdev_dtl_lock);
 677  677          space_map_close(vd->vdev_dtl_sm);
 678  678          for (int t = 0; t < DTL_TYPES; t++) {
 679  679                  range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
 680  680                  range_tree_destroy(vd->vdev_dtl[t]);
 681  681          }
 682  682          mutex_exit(&vd->vdev_dtl_lock);
 683  683  
 684  684          mutex_destroy(&vd->vdev_queue_lock);
 685  685          mutex_destroy(&vd->vdev_dtl_lock);
 686  686          mutex_destroy(&vd->vdev_stat_lock);
 687  687          mutex_destroy(&vd->vdev_probe_lock);
 688  688  
 689  689          if (vd == spa->spa_root_vdev)
 690  690                  spa->spa_root_vdev = NULL;
 691  691  
 692  692          kmem_free(vd, sizeof (vdev_t));
 693  693  }
 694  694  
 695  695  /*
 696  696   * Transfer top-level vdev state from svd to tvd.
            *
            * tvd must already be its own top-level vdev (asserted).  Everything
            * that is moved is cleared on svd afterwards: the metaslab array, shift
            * and count, the top-level ZAP, the metaslab group and metaslab array
            * pointer, the space statistics, the per-txg metaslab/DTL list entries,
            * the config/state dirty status, the deflate ratio and the islog flag.
 697  697   */
 698  698  static void
 699  699  vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 700  700  {
 701  701          spa_t *spa = svd->vdev_spa;
 702  702          metaslab_t *msp;
 703  703          vdev_t *vd;
 704  704          int t;
 705  705  
 706  706          ASSERT(tvd == tvd->vdev_top);
 707  707  
 708  708          tvd->vdev_ms_array = svd->vdev_ms_array;
 709  709          tvd->vdev_ms_shift = svd->vdev_ms_shift;
 710  710          tvd->vdev_ms_count = svd->vdev_ms_count;
 711  711          tvd->vdev_top_zap = svd->vdev_top_zap;
 712  712  
 713  713          svd->vdev_ms_array = 0;
 714  714          svd->vdev_ms_shift = 0;
 715  715          svd->vdev_ms_count = 0;
 716  716          svd->vdev_top_zap = 0;
 717  717  
                   /*
                    * If the target already has a metaslab group it must be the very
                    * group being handed over.
                    */
 718  718          if (tvd->vdev_mg)
 719  719                  ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
 720  720          tvd->vdev_mg = svd->vdev_mg;
 721  721          tvd->vdev_ms = svd->vdev_ms;
 722  722  
 723  723          svd->vdev_mg = NULL;
 724  724          svd->vdev_ms = NULL;
 725  725  
                   /* Point the group's back-reference at its new owner. */
 726  726          if (tvd->vdev_mg != NULL)
 727  727                  tvd->vdev_mg->mg_vd = tvd;
 728  728  
 729  729          tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
 730  730          tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
 731  731          tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
 732  732  
 733  733          svd->vdev_stat.vs_alloc = 0;
 734  734          svd->vdev_stat.vs_space = 0;
 735  735          svd->vdev_stat.vs_dspace = 0;
 736  736  
                   /*
                    * For every txg slot, drain svd's metaslab and DTL lists into
                    * tvd's, and if svd itself was on the pool's vdev txg list,
                    * put tvd there in its place.
                    */
 737  737          for (t = 0; t < TXG_SIZE; t++) {
 738  738                  while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
 739  739                          (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
 740  740                  while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
 741  741                          (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
 742  742                  if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
 743  743                          (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
 744  744          }
 745  745  
                   /* Dirty status follows the state: clean the source, dirty tvd. */
 746  746          if (list_link_active(&svd->vdev_config_dirty_node)) {
 747  747                  vdev_config_clean(svd);
 748  748                  vdev_config_dirty(tvd);
 749  749          }
 750  750  
 751  751          if (list_link_active(&svd->vdev_state_dirty_node)) {
 752  752                  vdev_state_clean(svd);
 753  753                  vdev_state_dirty(tvd);
 754  754          }
 755  755  
 756  756          tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
 757  757          svd->vdev_deflate_ratio = 0;
 758  758  
 759  759          tvd->vdev_islog = svd->vdev_islog;
 760  760          svd->vdev_islog = 0;
 761  761  }
 762  762  
 763  763  static void
 764  764  vdev_top_update(vdev_t *tvd, vdev_t *vd)
 765  765  {
 766  766          if (vd == NULL)
 767  767                  return;
 768  768  
 769  769          vd->vdev_top = tvd;
 770  770  
 771  771          for (int c = 0; c < vd->vdev_children; c++)
 772  772                  vdev_top_update(tvd, vd->vdev_child[c]);
 773  773  }
 774  774  
 775  775  /*
 776  776   * Add a mirror/replacing vdev above an existing vdev.
 777  777   */
 778  778  vdev_t *
 779  779  vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
 780  780  {
 781  781          spa_t *spa = cvd->vdev_spa;
 782  782          vdev_t *pvd = cvd->vdev_parent;
 783  783          vdev_t *mvd;
 784  784  
 785  785          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 786  786  
 787  787          mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
 788  788  
 789  789          mvd->vdev_asize = cvd->vdev_asize;
 790  790          mvd->vdev_min_asize = cvd->vdev_min_asize;
 791  791          mvd->vdev_max_asize = cvd->vdev_max_asize;
 792  792          mvd->vdev_ashift = cvd->vdev_ashift;
 793  793          mvd->vdev_state = cvd->vdev_state;
 794  794          mvd->vdev_crtxg = cvd->vdev_crtxg;
 795  795  
 796  796          vdev_remove_child(pvd, cvd);
 797  797          vdev_add_child(pvd, mvd);
 798  798          cvd->vdev_id = mvd->vdev_children;
 799  799          vdev_add_child(mvd, cvd);
 800  800          vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 801  801  
 802  802          if (mvd == mvd->vdev_top)
 803  803                  vdev_top_transfer(cvd, mvd);
 804  804  
 805  805          return (mvd);
 806  806  }
 807  807  
 808  808  /*
 809  809   * Remove a 1-way mirror/replacing vdev from the tree.
 810  810   */
 811  811  void
 812  812  vdev_remove_parent(vdev_t *cvd)
 813  813  {
 814  814          vdev_t *mvd = cvd->vdev_parent;
 815  815          vdev_t *pvd = mvd->vdev_parent;
 816  816  
 817  817          ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 818  818  
 819  819          ASSERT(mvd->vdev_children == 1);
 820  820          ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
 821  821              mvd->vdev_ops == &vdev_replacing_ops ||
 822  822              mvd->vdev_ops == &vdev_spare_ops);
 823  823          cvd->vdev_ashift = mvd->vdev_ashift;
 824  824  
 825  825          vdev_remove_child(mvd, cvd);
 826  826          vdev_remove_child(pvd, mvd);
 827  827  
 828  828          /*
 829  829           * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
 830  830           * Otherwise, we could have detached an offline device, and when we
 831  831           * go to import the pool we'll think we have two top-level vdevs,
 832  832           * instead of a different version of the same top-level vdev.
 833  833           */
 834  834          if (mvd->vdev_top == mvd) {
 835  835                  uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
 836  836                  cvd->vdev_orig_guid = cvd->vdev_guid;
 837  837                  cvd->vdev_guid += guid_delta;
 838  838                  cvd->vdev_guid_sum += guid_delta;
 839  839          }
 840  840          cvd->vdev_id = mvd->vdev_id;
 841  841          vdev_add_child(pvd, cvd);
 842  842          vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 843  843  
 844  844          if (cvd == cvd->vdev_top)
 845  845                  vdev_top_transfer(mvd, cvd);
 846  846  
 847  847          ASSERT(mvd->vdev_children == 0);
 848  848          vdev_free(mvd);
 849  849  }
 850  850  
 851  851  int
 852  852  vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 853  853  {
 854  854          spa_t *spa = vd->vdev_spa;
 855  855          objset_t *mos = spa->spa_meta_objset;
 856  856          uint64_t m;
 857  857          uint64_t oldc = vd->vdev_ms_count;
 858  858          uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
 859  859          metaslab_t **mspp;
 860  860          int error;
 861  861  
 862  862          ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 863  863  
 864  864          /*
 865  865           * This vdev is not being allocated from yet or is a hole.
 866  866           */
 867  867          if (vd->vdev_ms_shift == 0)
 868  868                  return (0);
 869  869  
 870  870          ASSERT(!vd->vdev_ishole);
 871  871  
 872  872          /*
 873  873           * Compute the raidz-deflation ratio.  Note, we hard-code
 874  874           * in 128k (1 << 17) because it is the "typical" blocksize.
 875  875           * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
 876  876           * otherwise it would inconsistently account for existing bp's.
 877  877           */
 878  878          vd->vdev_deflate_ratio = (1 << 17) /
 879  879              (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
 880  880  
 881  881          ASSERT(oldc <= newc);
 882  882  
 883  883          mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
 884  884  
 885  885          if (oldc != 0) {
 886  886                  bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
 887  887                  kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
 888  888          }
 889  889  
 890  890          vd->vdev_ms = mspp;
 891  891          vd->vdev_ms_count = newc;
 892  892  
 893  893          for (m = oldc; m < newc; m++) {
 894  894                  uint64_t object = 0;
 895  895  
 896  896                  if (txg == 0) {
 897  897                          error = dmu_read(mos, vd->vdev_ms_array,
 898  898                              m * sizeof (uint64_t), sizeof (uint64_t), &object,
 899  899                              DMU_READ_PREFETCH);
 900  900                          if (error)
 901  901                                  return (error);
 902  902                  }
 903  903  
 904  904                  error = metaslab_init(vd->vdev_mg, m, object, txg,
 905  905                      &(vd->vdev_ms[m]));
 906  906                  if (error)
 907  907                          return (error);
 908  908          }
 909  909  
 910  910          if (txg == 0)
 911  911                  spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 912  912  
 913  913          /*
 914  914           * If the vdev is being removed we don't activate
 915  915           * the metaslabs since we want to ensure that no new
 916  916           * allocations are performed on this device.
 917  917           */
 918  918          if (oldc == 0 && !vd->vdev_removing)
 919  919                  metaslab_group_activate(vd->vdev_mg);
 920  920  
 921  921          if (txg == 0)
 922  922                  spa_config_exit(spa, SCL_ALLOC, FTAG);
 923  923  
 924  924          return (0);
 925  925  }
 926  926  
 927  927  void
 928  928  vdev_metaslab_fini(vdev_t *vd)
 929  929  {
 930  930          uint64_t m;
 931  931          uint64_t count = vd->vdev_ms_count;
 932  932  
 933  933          if (vd->vdev_ms != NULL) {
 934  934                  metaslab_group_passivate(vd->vdev_mg);
 935  935                  for (m = 0; m < count; m++) {
 936  936                          metaslab_t *msp = vd->vdev_ms[m];
 937  937  
 938  938                          if (msp != NULL)
 939  939                                  metaslab_fini(msp);
 940  940                  }
 941  941                  kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
 942  942                  vd->vdev_ms = NULL;
 943  943          }
 944  944  }
 945  945  
 946  946  typedef struct vdev_probe_stats {
 947  947          boolean_t       vps_readable;
 948  948          boolean_t       vps_writeable;
 949  949          int             vps_flags;
 950  950  } vdev_probe_stats_t;
 951  951  
 952  952  static void
 953  953  vdev_probe_done(zio_t *zio)
 954  954  {
 955  955          spa_t *spa = zio->io_spa;
 956  956          vdev_t *vd = zio->io_vd;
 957  957          vdev_probe_stats_t *vps = zio->io_private;
 958  958  
 959  959          ASSERT(vd->vdev_probe_zio != NULL);
 960  960  
 961  961          if (zio->io_type == ZIO_TYPE_READ) {
 962  962                  if (zio->io_error == 0)
 963  963                          vps->vps_readable = 1;
 964  964                  if (zio->io_error == 0 && spa_writeable(spa)) {
 965  965                          zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
 966  966                              zio->io_offset, zio->io_size, zio->io_abd,
 967  967                              ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 968  968                              ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
 969  969                  } else {
 970  970                          abd_free(zio->io_abd);
 971  971                  }
 972  972          } else if (zio->io_type == ZIO_TYPE_WRITE) {
 973  973                  if (zio->io_error == 0)
 974  974                          vps->vps_writeable = 1;
 975  975                  abd_free(zio->io_abd);
 976  976          } else if (zio->io_type == ZIO_TYPE_NULL) {
 977  977                  zio_t *pio;
 978  978  
 979  979                  vd->vdev_cant_read |= !vps->vps_readable;
 980  980                  vd->vdev_cant_write |= !vps->vps_writeable;
 981  981  
 982  982                  if (vdev_readable(vd) &&
 983  983                      (vdev_writeable(vd) || !spa_writeable(spa))) {
 984  984                          zio->io_error = 0;
 985  985                  } else {
 986  986                          ASSERT(zio->io_error != 0);
 987  987                          zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
 988  988                              spa, vd, NULL, 0, 0);
 989  989                          zio->io_error = SET_ERROR(ENXIO);
 990  990                  }
 991  991  
 992  992                  mutex_enter(&vd->vdev_probe_lock);
 993  993                  ASSERT(vd->vdev_probe_zio == zio);
 994  994                  vd->vdev_probe_zio = NULL;
 995  995                  mutex_exit(&vd->vdev_probe_lock);
 996  996  
 997  997                  zio_link_t *zl = NULL;
 998  998                  while ((pio = zio_walk_parents(zio, &zl)) != NULL)
 999  999                          if (!vdev_accessible(vd, pio))
1000 1000                                  pio->io_error = SET_ERROR(ENXIO);
1001 1001  
1002 1002                  kmem_free(vps, sizeof (*vps));
1003 1003          }
1004 1004  }
1005 1005  
1006 1006  /*
1007 1007   * Determine whether this device is accessible.
1008 1008   *
1009 1009   * Read and write to several known locations: the pad regions of each
1010 1010   * vdev label but the first, which we leave alone in case it contains
1011 1011   * a VTOC.
1012 1012   */
1013 1013  zio_t *
1014 1014  vdev_probe(vdev_t *vd, zio_t *zio)
1015 1015  {
1016 1016          spa_t *spa = vd->vdev_spa;
1017 1017          vdev_probe_stats_t *vps = NULL;
1018 1018          zio_t *pio;
1019 1019  
1020 1020          ASSERT(vd->vdev_ops->vdev_op_leaf);
1021 1021  
1022 1022          /*
1023 1023           * Don't probe the probe.
1024 1024           */
1025 1025          if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
1026 1026                  return (NULL);
1027 1027  
1028 1028          /*
1029 1029           * To prevent 'probe storms' when a device fails, we create
1030 1030           * just one probe i/o at a time.  All zios that want to probe
1031 1031           * this vdev will become parents of the probe io.
1032 1032           */
1033 1033          mutex_enter(&vd->vdev_probe_lock);
1034 1034  
1035 1035          if ((pio = vd->vdev_probe_zio) == NULL) {
1036 1036                  vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
1037 1037  
1038 1038                  vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
1039 1039                      ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
1040 1040                      ZIO_FLAG_TRYHARD;
1041 1041  
1042 1042                  if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
1043 1043                          /*
1044 1044                           * vdev_cant_read and vdev_cant_write can only
1045 1045                           * transition from TRUE to FALSE when we have the
1046 1046                           * SCL_ZIO lock as writer; otherwise they can only
1047 1047                           * transition from FALSE to TRUE.  This ensures that
1048 1048                           * any zio looking at these values can assume that
1049 1049                           * failures persist for the life of the I/O.  That's
1050 1050                           * important because when a device has intermittent
1051 1051                           * connectivity problems, we want to ensure that
1052 1052                           * they're ascribed to the device (ENXIO) and not
1053 1053                           * the zio (EIO).
1054 1054                           *
1055 1055                           * Since we hold SCL_ZIO as writer here, clear both
1056 1056                           * values so the probe can reevaluate from first
1057 1057                           * principles.
1058 1058                           */
1059 1059                          vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
1060 1060                          vd->vdev_cant_read = B_FALSE;
1061 1061                          vd->vdev_cant_write = B_FALSE;
1062 1062                  }
1063 1063  
1064 1064                  vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
1065 1065                      vdev_probe_done, vps,
1066 1066                      vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
1067 1067  
1068 1068                  /*
1069 1069                   * We can't change the vdev state in this context, so we
1070 1070                   * kick off an async task to do it on our behalf.
1071 1071                   */
1072 1072                  if (zio != NULL) {
1073 1073                          vd->vdev_probe_wanted = B_TRUE;
1074 1074                          spa_async_request(spa, SPA_ASYNC_PROBE);
1075 1075                  }
1076 1076          }
1077 1077  
1078 1078          if (zio != NULL)
1079 1079                  zio_add_child(zio, pio);
1080 1080  
1081 1081          mutex_exit(&vd->vdev_probe_lock);
1082 1082  
1083 1083          if (vps == NULL) {
1084 1084                  ASSERT(zio != NULL);
1085 1085                  return (NULL);
1086 1086          }
1087 1087  
1088 1088          for (int l = 1; l < VDEV_LABELS; l++) {
1089 1089                  zio_nowait(zio_read_phys(pio, vd,
1090 1090                      vdev_label_offset(vd->vdev_psize, l,
1091 1091                      offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE,
1092 1092                      abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
1093 1093                      ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
1094 1094                      ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
1095 1095          }
1096 1096  
1097 1097          if (zio == NULL)
1098 1098                  return (pio);
1099 1099  
1100 1100          zio_nowait(pio);
1101 1101          return (NULL);

↓ open down ↓

1101 lines elided

↑ open up ↑

1102 1102  }
1103 1103  
1104 1104  static void
1105 1105  vdev_open_child(void *arg)
1106 1106  {
1107 1107          vdev_t *vd = arg;
1108 1108  
1109 1109          vd->vdev_open_thread = curthread;
1110 1110          vd->vdev_open_error = vdev_open(vd);
1111 1111          vd->vdev_open_thread = NULL;
     1112 +        vd->vdev_parent->vdev_nonrot &= vd->vdev_nonrot;
1112 1113  }
1113 1114  
1114 1115  boolean_t
1115 1116  vdev_uses_zvols(vdev_t *vd)
1116 1117  {
1117 1118          if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
1118 1119              strlen(ZVOL_DIR)) == 0)
1119 1120                  return (B_TRUE);
1120 1121          for (int c = 0; c < vd->vdev_children; c++)
1121 1122                  if (vdev_uses_zvols(vd->vdev_child[c]))
1122 1123                          return (B_TRUE);
1123 1124          return (B_FALSE);
1124 1125  }
1125 1126  
1126 1127  void
1127 1128  vdev_open_children(vdev_t *vd)
1128 1129  {
1129 1130          taskq_t *tq;
1130 1131          int children = vd->vdev_children;
1131 1132  
     1133 +        vd->vdev_nonrot = B_TRUE;
     1134 +
1132 1135          /*
1133 1136           * in order to handle pools on top of zvols, do the opens
1134 1137           * in a single thread so that the same thread holds the
1135 1138           * spa_namespace_lock
1136 1139           */
1137 1140          if (vdev_uses_zvols(vd)) {
1138      -                for (int c = 0; c < children; c++)
     1141 +                for (int c = 0; c < children; c++) {
1139 1142                          vd->vdev_child[c]->vdev_open_error =
1140 1143                              vdev_open(vd->vdev_child[c]);
     1144 +                        vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
     1145 +                }
1141 1146                  return;
1142 1147          }
1143 1148          tq = taskq_create("vdev_open", children, minclsyspri,
1144 1149              children, children, TASKQ_PREPOPULATE);
1145 1150  
1146 1151          for (int c = 0; c < children; c++)
1147 1152                  VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
1148 1153                      TQ_SLEEP) != NULL);
1149 1154  
1150 1155          taskq_destroy(tq);
     1156 +
     1157 +        for (int c = 0; c < children; c++)
     1158 +                vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
1151 1159  }
1152 1160  
1153 1161  /*
1154 1162   * Prepare a virtual device for access.
1155 1163   */
1156 1164  int
1157 1165  vdev_open(vdev_t *vd)
1158 1166  {
1159 1167          spa_t *spa = vd->vdev_spa;
1160 1168          int error;

1161 1169          uint64_t osize = 0;
1162 1170          uint64_t max_osize = 0;
1163 1171          uint64_t asize, max_asize, psize;
1164 1172          uint64_t ashift = 0;
1165 1173  
1166 1174          ASSERT(vd->vdev_open_thread == curthread ||
1167 1175              spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1168 1176          ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
1169 1177              vd->vdev_state == VDEV_STATE_CANT_OPEN ||
1170 1178              vd->vdev_state == VDEV_STATE_OFFLINE);
1171 1179  
1172 1180          vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
1173 1181          vd->vdev_cant_read = B_FALSE;
1174 1182          vd->vdev_cant_write = B_FALSE;
1175 1183          vd->vdev_min_asize = vdev_get_min_asize(vd);
1176 1184  
1177 1185          /*
1178 1186           * If this vdev is not removed, check its fault status.  If it's
1179 1187           * faulted, bail out of the open.
1180 1188           */
1181 1189          if (!vd->vdev_removed && vd->vdev_faulted) {
1182 1190                  ASSERT(vd->vdev_children == 0);
1183 1191                  ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
1184 1192                      vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
1185 1193                  vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1186 1194                      vd->vdev_label_aux);
1187 1195                  return (SET_ERROR(ENXIO));
1188 1196          } else if (vd->vdev_offline) {
1189 1197                  ASSERT(vd->vdev_children == 0);
1190 1198                  vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
1191 1199                  return (SET_ERROR(ENXIO));
1192 1200          }
1193 1201  
1194 1202          error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);
1195 1203  
1196 1204          /*
1197 1205           * Reset the vdev_reopening flag so that we actually close
1198 1206           * the vdev on error.
1199 1207           */
1200 1208          vd->vdev_reopening = B_FALSE;
1201 1209          if (zio_injection_enabled && error == 0)
1202 1210                  error = zio_handle_device_injection(vd, NULL, ENXIO);
1203 1211  
1204 1212          if (error) {
1205 1213                  if (vd->vdev_removed &&
1206 1214                      vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
1207 1215                          vd->vdev_removed = B_FALSE;
1208 1216  
1209 1217                  vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1210 1218                      vd->vdev_stat.vs_aux);
1211 1219                  return (error);
1212 1220          }
1213 1221  
1214 1222          vd->vdev_removed = B_FALSE;
1215 1223  
1216 1224          /*
1217 1225           * Recheck the faulted flag now that we have confirmed that
1218 1226           * the vdev is accessible.  If we're faulted, bail.
1219 1227           */
1220 1228          if (vd->vdev_faulted) {
1221 1229                  ASSERT(vd->vdev_children == 0);
1222 1230                  ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
1223 1231                      vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
1224 1232                  vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1225 1233                      vd->vdev_label_aux);
1226 1234                  return (SET_ERROR(ENXIO));
1227 1235          }
1228 1236  
1229 1237          if (vd->vdev_degraded) {
1230 1238                  ASSERT(vd->vdev_children == 0);
1231 1239                  vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
1232 1240                      VDEV_AUX_ERR_EXCEEDED);
1233 1241          } else {
1234 1242                  vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
1235 1243          }
1236 1244  
1237 1245          /*
1238 1246           * For hole or missing vdevs we just return success.
1239 1247           */
1240 1248          if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
1241 1249                  return (0);
1242 1250  
1243 1251          for (int c = 0; c < vd->vdev_children; c++) {
1244 1252                  if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
1245 1253                          vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
1246 1254                              VDEV_AUX_NONE);
1247 1255                          break;
1248 1256                  }
1249 1257          }
1250 1258  
1251 1259          osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
1252 1260          max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
1253 1261  
1254 1262          if (vd->vdev_children == 0) {
1255 1263                  if (osize < SPA_MINDEVSIZE) {
1256 1264                          vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1257 1265                              VDEV_AUX_TOO_SMALL);
1258 1266                          return (SET_ERROR(EOVERFLOW));
1259 1267                  }
1260 1268                  psize = osize;
1261 1269                  asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
1262 1270                  max_asize = max_osize - (VDEV_LABEL_START_SIZE +
1263 1271                      VDEV_LABEL_END_SIZE);
1264 1272          } else {
1265 1273                  if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
1266 1274                      (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
1267 1275                          vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1268 1276                              VDEV_AUX_TOO_SMALL);
1269 1277                          return (SET_ERROR(EOVERFLOW));
1270 1278                  }
1271 1279                  psize = 0;
1272 1280                  asize = osize;
1273 1281                  max_asize = max_osize;
1274 1282          }
1275 1283  
1276 1284          vd->vdev_psize = psize;
1277 1285  
1278 1286          /*
1279 1287           * Make sure the allocatable size hasn't shrunk too much.
1280 1288           */
1281 1289          if (asize < vd->vdev_min_asize) {
1282 1290                  vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1283 1291                      VDEV_AUX_BAD_LABEL);
1284 1292                  return (SET_ERROR(EINVAL));
1285 1293          }
1286 1294  
1287 1295          if (vd->vdev_asize == 0) {
1288 1296                  /*
1289 1297                   * This is the first-ever open, so use the computed values.
1290 1298                   * For testing purposes, a higher ashift can be requested.
1291 1299                   */
1292 1300                  vd->vdev_asize = asize;
1293 1301                  vd->vdev_max_asize = max_asize;
1294 1302                  vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
1295 1303          } else {
1296 1304                  /*
1297 1305                   * Detect if the alignment requirement has increased.
1298 1306                   * We don't want to make the pool unavailable, just
1299 1307                   * issue a warning instead.
1300 1308                   */
1301 1309                  if (ashift > vd->vdev_top->vdev_ashift &&
1302 1310                      vd->vdev_ops->vdev_op_leaf) {
1303 1311                          cmn_err(CE_WARN,
1304 1312                              "Disk, '%s', has a block alignment that is "
1305 1313                              "larger than the pool's alignment\n",
1306 1314                              vd->vdev_path);
1307 1315                  }
1308 1316                  vd->vdev_max_asize = max_asize;
1309 1317          }
1310 1318  
1311 1319          /*
1312 1320           * If all children are healthy we update asize if either:
1313 1321           * The asize has increased, due to a device expansion caused by dynamic
1314 1322           * LUN growth or vdev replacement, and automatic expansion is enabled;
1315 1323           * making the additional space available.
1316 1324           *
1317 1325           * The asize has decreased, due to a device shrink usually caused by a
1318 1326           * vdev replace with a smaller device. This ensures that calculations
1319 1327           * based of max_asize and asize e.g. esize are always valid. It's safe
1320 1328           * to do this as we've already validated that asize is greater than
1321 1329           * vdev_min_asize.
1322 1330           */
1323 1331          if (vd->vdev_state == VDEV_STATE_HEALTHY &&
1324 1332              ((asize > vd->vdev_asize &&
1325 1333              (vd->vdev_expanding || spa->spa_autoexpand)) ||
1326 1334              (asize < vd->vdev_asize)))
1327 1335                  vd->vdev_asize = asize;
1328 1336  
1329 1337          vdev_set_min_asize(vd);
1330 1338  
1331 1339          /*
1332 1340           * Ensure we can issue some IO before declaring the
1333 1341           * vdev open for business.
1334 1342           */
1335 1343          if (vd->vdev_ops->vdev_op_leaf &&
1336 1344              (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
1337 1345                  vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1338 1346                      VDEV_AUX_ERR_EXCEEDED);
1339 1347                  return (error);
1340 1348          }
1341 1349  
1342 1350          /*
1343 1351           * Track the min and max ashift values for normal data devices.
1344 1352           */
1345 1353          if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
1346 1354              !vd->vdev_islog && vd->vdev_aux == NULL) {
1347 1355                  if (vd->vdev_ashift > spa->spa_max_ashift)
1348 1356                          spa->spa_max_ashift = vd->vdev_ashift;
1349 1357                  if (vd->vdev_ashift < spa->spa_min_ashift)
1350 1358                          spa->spa_min_ashift = vd->vdev_ashift;
1351 1359          }
1352 1360  
1353 1361          /*
1354 1362           * If a leaf vdev has a DTL, and seems healthy, then kick off a
1355 1363           * resilver.  But don't do this if we are doing a reopen for a scrub,
1356 1364           * since this would just restart the scrub we are already doing.
1357 1365           */
1358 1366          if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
1359 1367              vdev_resilver_needed(vd, NULL, NULL))
1360 1368                  spa_async_request(spa, SPA_ASYNC_RESILVER);
1361 1369  
1362 1370          return (0);
1363 1371  }
1364 1372  
1365 1373  /*
1366 1374   * Called once the vdevs are all opened, this routine validates the label
1367 1375   * contents.  This needs to be done before vdev_load() so that we don't
1368 1376   * inadvertently do repair I/Os to the wrong device.
1369 1377   *
1370 1378   * If 'strict' is false ignore the spa guid check. This is necessary because
1371 1379   * if the machine crashed during a re-guid the new guid might have been written
1372 1380   * to all of the vdev labels, but not the cached config. The strict check
1373 1381   * will be performed when the pool is opened again using the mos config.
1374 1382   *
1375 1383   * This function will only return failure if one of the vdevs indicates that it
1376 1384   * has since been destroyed or exported.  This is only possible if
1377 1385   * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
1378 1386   * will be updated but the function will return 0.
1379 1387   */
1380 1388  int
1381 1389  vdev_validate(vdev_t *vd, boolean_t strict)
1382 1390  {
1383 1391          spa_t *spa = vd->vdev_spa;
1384 1392          nvlist_t *label;
1385 1393          uint64_t guid = 0, top_guid;
1386 1394          uint64_t state;
1387 1395  
                   /*
                    * Validate the children first.  Any child failure is returned
                    * to the caller as EBADF, whatever the child reported.
                    */
1388 1396          for (int c = 0; c < vd->vdev_children; c++)
1389 1397                  if (vdev_validate(vd->vdev_child[c], strict) != 0)
1390 1398                          return (SET_ERROR(EBADF));
1391 1399  
1392 1400          /*
1393 1401           * If the device has already failed, or was marked offline, don't do
1394 1402           * any further validation.  Otherwise, label I/O will fail and we will
1395 1403           * overwrite the previous state.
1396 1404           */
1397 1405          if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
1398 1406                  uint64_t aux_guid = 0;
1399 1407                  nvlist_t *nvl;
                           /*
                            * Read the label as of the last synced txg when that
                            * is nonzero; otherwise pass -1ULL.
                            */
1400 1408                  uint64_t txg = spa_last_synced_txg(spa) != 0 ?
1401 1409                      spa_last_synced_txg(spa) : -1ULL;
1402 1410  
                           /*
                            * No readable label: mark the vdev CANT_OPEN with
                            * BAD_LABEL, but still return 0 to the caller.
                            */
1403 1411                  if ((label = vdev_label_read_config(vd, txg)) == NULL) {
1404 1412                          vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1405 1413                              VDEV_AUX_BAD_LABEL);
1406 1414                          return (0);
1407 1415                  }
1408 1416  
1409 1417                  /*
1410 1418                   * Determine if this vdev has been split off into another
1411 1419                   * pool.  If so, then refuse to open it.
1412 1420                   */
1413 1421                  if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
1414 1422                      &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
1415 1423                          vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1416 1424                              VDEV_AUX_SPLIT_POOL);
1417 1425                          nvlist_free(label);
1418 1426                          return (0);
1419 1427                  }
1420 1428  
                           /*
                            * In strict mode the label must also carry a pool guid
                            * equal to this pool's guid.
                            */
1421 1429                  if (strict && (nvlist_lookup_uint64(label,
1422 1430                      ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
1423 1431                      guid != spa_guid(spa))) {
1424 1432                          vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1425 1433                              VDEV_AUX_CORRUPT_DATA);
1426 1434                          nvlist_free(label);
1427 1435                          return (0);
1428 1436                  }
1429 1437  
                           /*
                            * Fetch the original guid from the label's vdev tree.
                            * If either lookup fails, fall back to 0.
                            */
1430 1438                  if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
1431 1439                      != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
1432 1440                      &aux_guid) != 0)
1433 1441                          aux_guid = 0;
1434 1442  
1435 1443                  /*
1436 1444                   * If this vdev just became a top-level vdev because its
1437 1445                   * sibling was detached, it will have adopted the parent's
1438 1446                   * vdev guid -- but the label may or may not be on disk yet.
1439 1447                   * Fortunately, either version of the label will have the
1440 1448                   * same top guid, so if we're a top-level vdev, we can
1441 1449                   * safely compare to that instead.
1442 1450                   *
1443 1451                   * If we split this vdev off instead, then we also check the
1444 1452                   * original pool's guid.  We don't want to consider the vdev
1445 1453                   * corrupt if it is partway through a split operation.
1446 1454                   */
1447 1455                  if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
1448 1456                      &guid) != 0 ||
1449 1457                      nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
1450 1458                      &top_guid) != 0 ||
1451 1459                      ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
1452 1460                      (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
1453 1461                          vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1454 1462                              VDEV_AUX_CORRUPT_DATA);
1455 1463                          nvlist_free(label);
1456 1464                          return (0);
1457 1465                  }
1458 1466  
                           /* A label that records no pool state is corrupt. */
1459 1467                  if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
1460 1468                      &state) != 0) {
1461 1469                          vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1462 1470                              VDEV_AUX_CORRUPT_DATA);
1463 1471                          nvlist_free(label);
1464 1472                          return (0);
1465 1473                  }
1466 1474  
                           /* All needed values are now in locals; drop the label. */
1467 1475                  nvlist_free(label);
1468 1476  
1469 1477                  /*
1470 1478                   * If this is a verbatim import, no need to check the
1471 1479                   * state of the pool.
1472 1480                   */
1473 1481                  if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
1474 1482                      spa_load_state(spa) == SPA_LOAD_OPEN &&
1475 1483                      state != POOL_STATE_ACTIVE)
1476 1484                          return (SET_ERROR(EBADF));
1477 1485  
1478 1486                  /*
1479 1487                   * If we were able to open and validate a vdev that was
1480 1488                   * previously marked permanently unavailable, clear that state
1481 1489                   * now.
1482 1490                   */
1483 1491                  if (vd->vdev_not_present)
1484 1492                          vd->vdev_not_present = 0;
1485 1493          }
1486 1494  
1487 1495          return (0);
1488 1496  }
1489 1497  
1490 1498  /*
1491 1499   * Close a virtual device.
            *
            * Asserts that the caller holds SCL_STATE_ALL as writer.  The vdev is
            * closed through its ops vector and its cache is purged.  The current
            * state is saved in vdev_prevstate, then the state becomes OFFLINE (if
            * vdev_offline is set) or CLOSED, with the aux status reset to
            * VDEV_AUX_NONE.
1492 1500   */
1493 1501  void
1494 1502  vdev_close(vdev_t *vd)
1495 1503  {
1496 1504          spa_t *spa = vd->vdev_spa;
1497 1505          vdev_t *pvd = vd->vdev_parent;
1498 1506  
1499 1507          ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1500 1508  
1501 1509          /*
1502 1510           * If our parent is reopening, then we are as well, unless we are
1503 1511           * going offline.
1504 1512           */
1505 1513          if (pvd != NULL && pvd->vdev_reopening)
1506 1514                  vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
1507 1515  
1508 1516          vd->vdev_ops->vdev_op_close(vd);
1509 1517  
1510 1518          vdev_cache_purge(vd);
1511 1519  
1512 1520          /*
1513 1521           * We record the previous state before we close it, so that if we are
1514 1522           * doing a reopen(), we don't generate FMA ereports if we notice that
1515 1523           * it's still faulted.
1516 1524           */
1517 1525          vd->vdev_prevstate = vd->vdev_state;
1518 1526  
1519 1527          if (vd->vdev_offline)
1520 1528                  vd->vdev_state = VDEV_STATE_OFFLINE;
1521 1529          else
1522 1530                  vd->vdev_state = VDEV_STATE_CLOSED;
1523 1531          vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
1524 1532  }
1525 1533  
           /*
            * Recursively invoke the hold op on every leaf vdev at or below vd.
            *
            * Asserts spa_is_root(spa).  Returns without doing anything while the
            * pool state is POOL_STATE_UNINITIALIZED.
            */
1526 1534  void
1527 1535  vdev_hold(vdev_t *vd)
1528 1536  {
1529 1537          spa_t *spa = vd->vdev_spa;
1530 1538  
1531 1539          ASSERT(spa_is_root(spa));
1532 1540          if (spa->spa_state == POOL_STATE_UNINITIALIZED)
1533 1541                  return;
1534 1542  
                   /* Children first, then this vdev if it is a leaf. */
1535 1543          for (int c = 0; c < vd->vdev_children; c++)
1536 1544                  vdev_hold(vd->vdev_child[c]);
1537 1545  
1538 1546          if (vd->vdev_ops->vdev_op_leaf)
1539 1547                  vd->vdev_ops->vdev_op_hold(vd);
1540 1548  }
1541 1549  
           /*
            * Recursively invoke the rele op on every leaf vdev at or below vd.
            *
            * Asserts spa_is_root(spa).  Unlike vdev_hold(), there is no check of
            * the pool state here.
            */
1542 1550  void
1543 1551  vdev_rele(vdev_t *vd)
1544 1552  {
1545 1553          spa_t *spa = vd->vdev_spa;
1546 1554  
1547 1555          ASSERT(spa_is_root(spa));
                   /* Children first, then this vdev if it is a leaf. */
1548 1556          for (int c = 0; c < vd->vdev_children; c++)
1549 1557                  vdev_rele(vd->vdev_child[c]);
1550 1558  
1551 1559          if (vd->vdev_ops->vdev_op_leaf)
1552 1560                  vd->vdev_ops->vdev_op_rele(vd);
1553 1561  }
1554 1562  
1555 1563  /*
1556 1564   * Reopen all interior vdevs and any unopened leaves.  We don't actually
1557 1565   * reopen leaf vdevs which had previously been opened as they might deadlock
1558 1566   * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
1559 1567   * If the leaf has never been opened then open it, as usual.
            *
            * Asserts that the caller holds SCL_STATE_ALL as writer.  The results
            * of the open and of the validation are discarded; the outcome is
            * reflected only in vdev state, which is then propagated upward.
1560 1568   */
1561 1569  void
1562 1570  vdev_reopen(vdev_t *vd)
1563 1571  {
1564 1572          spa_t *spa = vd->vdev_spa;
1565 1573  
1566 1574          ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1567 1575  
1568 1576          /* set the reopening flag unless we're taking the vdev offline */
1569 1577          vd->vdev_reopening = !vd->vdev_offline;
1570 1578          vdev_close(vd);
1571 1579          (void) vdev_open(vd);
1572 1580  
1573 1581          /*
1574 1582           * Call vdev_validate() here to make sure we have the same device.
1575 1583           * Otherwise, a device with an invalid label could be successfully
1576 1584           * opened in response to vdev_reopen().
1577 1585           */
1578 1586          if (vd->vdev_aux) {
                           /*
                            * Aux vdevs use the aux validation.  A readable and
                            * writeable l2cache device that the L2ARC does not
                            * yet report as present is added to it.
                            */
1579 1587                  (void) vdev_validate_aux(vd);
1580 1588                  if (vdev_readable(vd) && vdev_writeable(vd) &&
1581 1589                      vd->vdev_aux == &spa->spa_l2cache &&
1582 1590                      !l2arc_vdev_present(vd))
1583 1591                          l2arc_add_vdev(spa, vd);
1584 1592          } else {
                           /* All other vdevs are validated in strict mode. */
1585 1593                  (void) vdev_validate(vd, B_TRUE);
1586 1594          }
1587 1595  
1588 1596          /*
1589 1597           * Reassess parent vdev's health.
1590 1598           */
1591 1599          vdev_propagate_state(vd);
1592 1600  }
1593 1601  
           /*
            * Open vd for a create or replace and initialize its labels.
            *
            * isreplacing selects VDEV_LABEL_REPLACE instead of VDEV_LABEL_CREATE
            * for the label initialization in txg.
            *
            * Returns 0 on success.  On failure the vdev is closed again and the
            * error from vdev_open(), vdev_dtl_load() or vdev_label_init() is
            * returned; ENXIO is returned when the open reported no error but the
            * vdev did not end up in VDEV_STATE_HEALTHY.
            */
1594 1602  int
1595 1603  vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
1596 1604  {
1597 1605          int error;
1598 1606  
1599 1607          /*
1600 1608           * Normally, partial opens (e.g. of a mirror) are allowed.
1601 1609           * For a create, however, we want to fail the request if
1602 1610           * there are any components we can't open.
1603 1611           */
1604 1612          error = vdev_open(vd);
1605 1613  
1606 1614          if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
1607 1615                  vdev_close(vd);
1608 1616                  return (error ? error : ENXIO);
1609 1617          }
1610 1618  
1611 1619          /*
1612 1620           * Recursively load DTLs and initialize all labels.
1613 1621           */
1614 1622          if ((error = vdev_dtl_load(vd)) != 0 ||
1615 1623              (error = vdev_label_init(vd, txg, isreplacing ?
1616 1624              VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
1617 1625                  vdev_close(vd);
1618 1626                  return (error);
1619 1627          }
1620 1628  
1621 1629          return (0);
1622 1630  }
1623 1631  
           /*
            * Set vd->vdev_ms_shift from the vdev's asize and the
            * metaslabs_per_vdev tunable.
            */
1624 1632  void
1625 1633  vdev_metaslab_set_size(vdev_t *vd)
1626 1634  {
1627 1635          /*
1628 1636           * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
1629 1637           */
1630 1638          vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
                   /* Never use a shift smaller than SPA_MAXBLOCKSHIFT. */
1631 1639          vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
1632 1640  }
1633 1641  
           /*
            * Mark top-level vdev vd dirty in txg.
            *
            * arg is added to the vdev's metaslab list when VDD_METASLAB is set in
            * flags, and to its DTL list when VDD_DTL is set.  In every case vd
            * itself is added to the pool's spa_vdev_txg_list for txg.
            *
            * Asserts that vd is a top-level vdev and not a hole, that flags
            * satisfies ISP2(), and that the pool is writeable.
            */
1634 1642  void
1635 1643  vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
1636 1644  {
1637 1645          ASSERT(vd == vd->vdev_top);
1638 1646          ASSERT(!vd->vdev_ishole);
1639 1647          ASSERT(ISP2(flags));
1640 1648          ASSERT(spa_writeable(vd->vdev_spa));
1641 1649  
1642 1650          if (flags & VDD_METASLAB)
1643 1651                  (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
1644 1652  
1645 1653          if (flags & VDD_DTL)
1646 1654                  (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
1647 1655  
1648 1656          (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
1649 1657  }
1650 1658  
           /*
            * For every leaf at or below vd, dirty that leaf's top-level vdev in
            * txg with the given flags, passing the leaf itself as the list
            * argument to vdev_dirty().
            */
1651 1659  void
1652 1660  vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
1653 1661  {
1654 1662          for (int c = 0; c < vd->vdev_children; c++)
1655 1663                  vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
1656 1664  
1657 1665          if (vd->vdev_ops->vdev_op_leaf)
1658 1666                  vdev_dirty(vd->vdev_top, flags, vd, txg);
1659 1667  }
1660 1668  
1661 1669  /*
1662 1670   * DTLs.
1663 1671   *
1664 1672   * A vdev's DTL (dirty time log) is the set of transaction groups for which
1665 1673   * the vdev has less than perfect replication.  There are four kinds of DTL:
1666 1674   *
1667 1675   * DTL_MISSING: txgs for which the vdev has no valid copies of the data
1668 1676   *
1669 1677   * DTL_PARTIAL: txgs for which data is available, but not fully replicated
1670 1678   *
1671 1679   * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
1672 1680   *      scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
1673 1681   *      txgs that was scrubbed.
1674 1682   *
1675 1683   * DTL_OUTAGE: txgs which cannot currently be read, whether due to
1676 1684   *      persistent errors or just some device being offline.
1677 1685   *      Unlike the other three, the DTL_OUTAGE map is not generally
1678 1686   *      maintained; it's only computed when needed, typically to
1679 1687   *      determine whether a device can be detached.
1680 1688   *
1681 1689   * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
1682 1690   * either has the data or it doesn't.
1683 1691   *
1684 1692   * For interior vdevs such as mirror and RAID-Z the picture is more complex.
1685 1693   * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
1686 1694   * if any child is less than fully replicated, then so is its parent.
1687 1695   * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
1688 1696   * comprising only those txgs which appear in more than 'maxfaults' children;
1689 1697   * those are the txgs we don't have enough replication to read.  For example,
1690 1698   * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
1691 1699   * thus, its DTL_MISSING consists of the set of txgs that appear in more than
1692 1700   * two child DTL_MISSING maps.
1693 1701   *
1694 1702   * It should be clear from the above that to compute the DTLs and outage maps
1695 1703   * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
1696 1704   * Therefore, that is all we keep on disk.  When loading the pool, or after
1697 1705   * a configuration change, we generate all other DTLs from first principles.
1698 1706   */
1699 1707  void
1700 1708  vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1701 1709  {
                   /*
                    * NOTE(review): vd->vdev_dtl[t] is indexed here, before the
                    * ASSERT(t < DTL_TYPES) below has checked t.
                    */
1702 1710          range_tree_t *rt = vd->vdev_dtl[t];
1703 1711  
1704 1712          ASSERT(t < DTL_TYPES);
1705 1713          ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1706 1714          ASSERT(spa_writeable(vd->vdev_spa));
1707 1715  
                   /*
                    * Under the tree's own lock, add (txg, size) to the DTL unless
                    * range_tree_contains() already reports it.
                    */
1708 1716          mutex_enter(rt->rt_lock);
1709 1717          if (!range_tree_contains(rt, txg, size))
1710 1718                  range_tree_add(rt, txg, size);
1711 1719          mutex_exit(rt->rt_lock);
1712 1720  }
1713 1721  
           /*
            * Report whether DTL t of vd contains (txg, size), as decided by
            * range_tree_contains().  A tree with no space in it yields B_FALSE
            * without calling range_tree_contains().
            */
1714 1722  boolean_t
1715 1723  vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1716 1724  {
                   /*
                    * NOTE(review): vd->vdev_dtl[t] is indexed here, before the
                    * ASSERT(t < DTL_TYPES) below has checked t.
                    */
1717 1725          range_tree_t *rt = vd->vdev_dtl[t];
1718 1726          boolean_t dirty = B_FALSE;
1719 1727  
1720 1728          ASSERT(t < DTL_TYPES);
1721 1729          ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1722 1730  
1723 1731          mutex_enter(rt->rt_lock);
1724 1732          if (range_tree_space(rt) != 0)
1725 1733                  dirty = range_tree_contains(rt, txg, size);
1726 1734          mutex_exit(rt->rt_lock);
1727 1735  
1728 1736          return (dirty);
1729 1737  }
1730 1738  
           /*
            * Return B_TRUE when DTL t of vd has no space in it, i.e. when
            * range_tree_space() is 0.  The tree's lock is taken for the check.
            */
1731 1739  boolean_t
1732 1740  vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
1733 1741  {
1734 1742          range_tree_t *rt = vd->vdev_dtl[t];
1735 1743          boolean_t empty;
1736 1744  
1737 1745          mutex_enter(rt->rt_lock);
1738 1746          empty = (range_tree_space(rt) == 0);
1739 1747          mutex_exit(rt->rt_lock);
1740 1748  
1741 1749          return (empty);
1742 1750  }
1743 1751  
1744 1752  /*
1745 1753   * Returns the lowest txg in the DTL range.
            *
            * The value returned is one less than the start of the first segment
            * in the DTL_MISSING tree.  Asserts that vdev_dtl_lock is held, that
            * DTL_MISSING is not empty, and that vd has no children.
1746 1754   */
1747 1755  static uint64_t
1748 1756  vdev_dtl_min(vdev_t *vd)
1749 1757  {
1750 1758          range_seg_t *rs;
1751 1759  
1752 1760          ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1753 1761          ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
1754 1762          ASSERT0(vd->vdev_children);
1755 1763  
1756 1764          rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
1757 1765          return (rs->rs_start - 1);
1758 1766  }
1759 1767  
1760 1768  /*
1761 1769   * Returns the highest txg in the DTL.
            *
            * The value returned is rs_end of the last segment in the DTL_MISSING
            * tree.  Asserts that vdev_dtl_lock is held, that DTL_MISSING is not
            * empty, and that vd has no children.
1762 1770   */
1763 1771  static uint64_t
1764 1772  vdev_dtl_max(vdev_t *vd)
1765 1773  {
1766 1774          range_seg_t *rs;
1767 1775  
1768 1776          ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1769 1777          ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
1770 1778          ASSERT0(vd->vdev_children);
1771 1779  
1772 1780          rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
1773 1781          return (rs->rs_end);
1774 1782  }
1775 1783  
1776 1784  /*
1777 1785   * Determine if a resilvering vdev should remove any DTL entries from
1778 1786   * its range. If the vdev was resilvering for the entire duration of the
1779 1787   * scan then it should excise that range from its DTLs. Otherwise, this
1780 1788   * vdev is considered partially resilvered and should leave its DTL
1781 1789   * entries intact. The comment in vdev_dtl_reassess() describes how we
1782 1790   * excise the DTLs.
            *
            * Asserts that the scan recorded no errors and that vd has no
            * children.  It calls vdev_dtl_max() and vdev_dtl_min(), which assert
            * that vdev_dtl_lock is held.
1783 1791   */
1784 1792  static boolean_t
1785 1793  vdev_dtl_should_excise(vdev_t *vd)
1786 1794  {
1787 1795          spa_t *spa = vd->vdev_spa;
1788 1796          dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1789 1797  
1790 1798          ASSERT0(scn->scn_phys.scn_errors);
1791 1799          ASSERT0(vd->vdev_children);
1792 1800  
                   /* A vdev whose state is below DEGRADED never excises. */
1793 1801          if (vd->vdev_state < VDEV_STATE_DEGRADED)
1794 1802                  return (B_FALSE);
1795 1803  
                   /*
                    * With no resilver txg recorded, or with an empty DTL_MISSING,
                    * excising is always allowed.
                    */
1796 1804          if (vd->vdev_resilver_txg == 0 ||
1797 1805              range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
1798 1806                  return (B_TRUE);
1799 1807  
1800 1808          /*
1801 1809           * When a resilver is initiated the scan will assign the scn_max_txg
1802 1810           * value to the highest txg value that exists in all DTLs. If this
1803 1811           * device's max DTL is not part of this scan (i.e. it is not in
1804 1812           * the range (scn_min_txg, scn_max_txg]) then it is not eligible
1805 1813           * for excision.
1806 1814           */
1807 1815          if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
1808 1816                  ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
1809 1817                  ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
1810 1818                  ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
1811 1819                  return (B_TRUE);
1812 1820          }
1813 1821          return (B_FALSE);
1814 1822  }
1815 1823  
1816 1824  /*
1817 1825   * Reassess DTLs after a config change or scrub completion.
            *
            * The children are reassessed first.  The root vdev, holes and aux
            * vdevs are then skipped.  A leaf rebuilds its DTL_PARTIAL and
            * DTL_OUTAGE from DTL_MISSING; an interior vdev rebuilds each of its
            * maps (except DTL_SCRUB) from its children's maps.
            *
            * txg:        when nonzero, each leaf dirties its top-level vdev with
            *             VDD_DTL in this txg.
            * scrub_txg:  when nonzero, a leaf may excise [0, scrub_txg) from its
            *             DTL_MISSING, subject to the checks below.
            * scrub_done: when set, each leaf's DTL_SCRUB is vacated.
1818 1826   */
1819 1827  void
1820 1828  vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
1821 1829  {
1822 1830          spa_t *spa = vd->vdev_spa;
1823 1831          avl_tree_t reftree;
1824 1832          int minref;
1825 1833  
1826 1834          ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1827 1835  
1828 1836          for (int c = 0; c < vd->vdev_children; c++)
1829 1837                  vdev_dtl_reassess(vd->vdev_child[c], txg,
1830 1838                      scrub_txg, scrub_done);
1831 1839  
1832 1840          if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
1833 1841                  return;
1834 1842  
1835 1843          if (vd->vdev_ops->vdev_op_leaf) {
1836 1844                  dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1837 1845  
1838 1846                  mutex_enter(&vd->vdev_dtl_lock);
1839 1847  
1840 1848                  /*
1841 1849                   * If we've completed a scan cleanly then determine
1842 1850                   * if this vdev should remove any DTLs. We only want to
1843 1851                   * excise regions on vdevs that were available during
1844 1852                   * the entire duration of this scan.
1845 1853                   */
1846 1854                  if (scrub_txg != 0 &&
1847 1855                      (spa->spa_scrub_started ||
1848 1856                      (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
1849 1857                      vdev_dtl_should_excise(vd)) {
1850 1858                          /*
1851 1859                           * We completed a scrub up to scrub_txg.  If we
1852 1860                           * did it without rebooting, then the scrub dtl
1853 1861                           * will be valid, so excise the old region and
1854 1862                           * fold in the scrub dtl.  Otherwise, leave the
1855 1863                           * dtl as-is if there was an error.
1856 1864                           *
1857 1865                           * There's a little trick here: to excise the beginning
1858 1866                           * of the DTL_MISSING map, we put it into a reference
1859 1867                           * tree and then add a segment with refcnt -1 that
1860 1868                           * covers the range [0, scrub_txg).  This means
1861 1869                           * that each txg in that range has refcnt -1 or 0.
1862 1870                           * We then add DTL_SCRUB with a refcnt of 2, so that
1863 1871                           * entries in the range [0, scrub_txg) will have a
1864 1872                           * positive refcnt -- either 1 or 2.  We then convert
1865 1873                           * the reference tree into the new DTL_MISSING map.
1866 1874                           */
1867 1875                          space_reftree_create(&reftree);
1868 1876                          space_reftree_add_map(&reftree,
1869 1877                              vd->vdev_dtl[DTL_MISSING], 1);
1870 1878                          space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
1871 1879                          space_reftree_add_map(&reftree,
1872 1880                              vd->vdev_dtl[DTL_SCRUB], 2);
1873 1881                          space_reftree_generate_map(&reftree,
1874 1882                              vd->vdev_dtl[DTL_MISSING], 1);
1875 1883                          space_reftree_destroy(&reftree);
1876 1884                  }
                           /* Rebuild DTL_PARTIAL as a copy of DTL_MISSING. */
1877 1885                  range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
1878 1886                  range_tree_walk(vd->vdev_dtl[DTL_MISSING],
1879 1887                      range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
1880 1888                  if (scrub_done)
1881 1889                          range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
                           /*
                            * Rebuild DTL_OUTAGE: one segment starting at 0 with
                            * size -1ULL when the vdev is unreadable, otherwise a
                            * copy of DTL_MISSING.
                            */
1882 1890                  range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
1883 1891                  if (!vdev_readable(vd))
1884 1892                          range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
1885 1893                  else
1886 1894                          range_tree_walk(vd->vdev_dtl[DTL_MISSING],
1887 1895                              range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
1888 1896  
1889 1897                  /*
1890 1898                   * If the vdev was resilvering and no longer has any
1891 1899                   * DTLs then reset its resilvering flag.
1892 1900                   */
1893 1901                  if (vd->vdev_resilver_txg != 0 &&
1894 1902                      range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
1895 1903                      range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0)
1896 1904                          vd->vdev_resilver_txg = 0;
1897 1905  
1898 1906                  mutex_exit(&vd->vdev_dtl_lock);
1899 1907  
                           /* Dirty the top-level vdev only when a txg was given. */
1900 1908                  if (txg != 0)
1901 1909                          vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
1902 1910                  return;
1903 1911          }
1904 1912  
                   /*
                    * Interior vdev: regenerate each map from the children's maps.
                    * minref is the number of children that must contain a txg for
                    * it to appear in this vdev's map.
                    */
1905 1913          mutex_enter(&vd->vdev_dtl_lock);
1906 1914          for (int t = 0; t < DTL_TYPES; t++) {
1907 1915                  /* account for child's outage in parent's missing map */
1908 1916                  int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
1909 1917                  if (t == DTL_SCRUB)
1910 1918                          continue;                       /* leaf vdevs only */
1911 1919                  if (t == DTL_PARTIAL)
1912 1920                          minref = 1;                     /* i.e. non-zero */
1913 1921                  else if (vd->vdev_nparity != 0)
1914 1922                          minref = vd->vdev_nparity + 1;  /* RAID-Z */
1915 1923                  else
1916 1924                          minref = vd->vdev_children;     /* any kind of mirror */
1917 1925                  space_reftree_create(&reftree);
1918 1926                  for (int c = 0; c < vd->vdev_children; c++) {
1919 1927                          vdev_t *cvd = vd->vdev_child[c];
                                   /* Each child's map is read under that child's lock. */
1920 1928                          mutex_enter(&cvd->vdev_dtl_lock);
1921 1929                          space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
1922 1930                          mutex_exit(&cvd->vdev_dtl_lock);
1923 1931                  }
1924 1932                  space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
1925 1933                  space_reftree_destroy(&reftree);
1926 1934          }
1927 1935          mutex_exit(&vd->vdev_dtl_lock);
1928 1936  }
1929 1937  
           /*
            * Load the on-disk DTL of every leaf at or below vd.
            *
            * A leaf with a nonzero vdev_dtl_object opens that object as a space
            * map and loads it into its in-core DTL_MISSING tree.  Any other vdev
            * recurses into its children and stops at the first child error.
            *
            * Returns 0 on success, or the error from space_map_open(),
            * space_map_load() or a child.
            */
1930 1938  int
1931 1939  vdev_dtl_load(vdev_t *vd)
1932 1940  {
1933 1941          spa_t *spa = vd->vdev_spa;
1934 1942          objset_t *mos = spa->spa_meta_objset;
1935 1943          int error = 0;
1936 1944  
1937 1945          if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
1938 1946                  ASSERT(!vd->vdev_ishole);
1939 1947  
1940 1948                  error = space_map_open(&vd->vdev_dtl_sm, mos,
1941 1949                      vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
1942 1950                  if (error)
1943 1951                          return (error);
1944 1952                  ASSERT(vd->vdev_dtl_sm != NULL);
1945 1953  
1946 1954                  mutex_enter(&vd->vdev_dtl_lock);
1947 1955  
1948 1956                  /*
1949 1957                   * Now that we've opened the space_map we need to update
1950 1958                   * the in-core DTL.
1951 1959                   */
1952 1960                  space_map_update(vd->vdev_dtl_sm);
1953 1961  
1954 1962                  error = space_map_load(vd->vdev_dtl_sm,
1955 1963                      vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
1956 1964                  mutex_exit(&vd->vdev_dtl_lock);
1957 1965  
1958 1966                  return (error);
1959 1967          }
1960 1968  
1961 1969          for (int c = 0; c < vd->vdev_children; c++) {
1962 1970                  error = vdev_dtl_load(vd->vdev_child[c]);
1963 1971                  if (error != 0)
1964 1972                          break;
1965 1973          }
1966 1974  
1967 1975          return (error);
1968 1976  }
1969 1977  
           /*
            * Destroy the ZAP object zapobj in the pool's meta objset and remove
            * its entry from spa_all_vdev_zaps, both in tx.  Either step failing
            * trips VERIFY0.
            */
1970 1978  void
1971 1979  vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
1972 1980  {
1973 1981          spa_t *spa = vd->vdev_spa;
1974 1982  
1975 1983          VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
1976 1984          VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
1977 1985              zapobj, tx));
1978 1986  }
1979 1987  
           /*
            * Create a new ZAP object in the pool's meta objset, record it in
            * spa_all_vdev_zaps, and return its object number.  The returned
            * number is asserted to be nonzero; failure to record it trips
            * VERIFY0.
            */
1980 1988  uint64_t
1981 1989  vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
1982 1990  {
1983 1991          spa_t *spa = vd->vdev_spa;
1984 1992          uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
1985 1993              DMU_OT_NONE, 0, tx);
1986 1994  
1987 1995          ASSERT(zap != 0);
1988 1996          VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
1989 1997              zap, tx));
1990 1998  
1991 1999          return (zap);
1992 2000  }
1993 2001  
           /*
            * Recursively create any per-vdev ZAPs that do not exist yet.
            *
            * A leaf gets a leaf ZAP when vdev_leaf_zap is 0, and a top-level
            * vdev gets a top ZAP when vdev_top_zap is 0.  Hole, missing and root
            * vdevs get nothing, nor does any vdev whose top-level vdev is being
            * removed.  Children are visited in every case.
            */
1994 2002  void
1995 2003  vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
1996 2004  {
1997 2005          if (vd->vdev_ops != &vdev_hole_ops &&
1998 2006              vd->vdev_ops != &vdev_missing_ops &&
1999 2007              vd->vdev_ops != &vdev_root_ops &&
2000 2008              !vd->vdev_top->vdev_removing) {
2001 2009                  if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
2002 2010                          vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
2003 2011                  }
2004 2012                  if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
2005 2013                          vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
2006 2014                  }
2007 2015          }
2008 2016          for (uint64_t i = 0; i < vd->vdev_children; i++) {
2009 2017                  vdev_construct_zaps(vd->vdev_child[i], tx);
2010 2018          }
2011 2019  }
2012 2020  
           /*
            * Sync the DTL_MISSING map of leaf vd to its on-disk space map in txg.
            *
            * For a detached leaf, or a leaf whose top-level vdev is being
            * removed, the space map is freed and closed instead.  Otherwise the
            * space map is created if it does not exist, truncated, and rewritten
            * from a private copy of DTL_MISSING.
            */
2013 2021  void
2014 2022  vdev_dtl_sync(vdev_t *vd, uint64_t txg)
2015 2023  {
2016 2024          spa_t *spa = vd->vdev_spa;
2017 2025          range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
2018 2026          objset_t *mos = spa->spa_meta_objset;
2019 2027          range_tree_t *rtsync;
2020 2028          kmutex_t rtlock;
2021 2029          dmu_tx_t *tx;
                   /*
                    * Remember the object number seen on entry so a change can be
                    * detected below.
                    * NOTE(review): vdev_dtl_sm may be NULL here (see the NULL
                    * check further down); presumably space_map_object()
                    * tolerates that -- confirm.
                    */
2022 2030          uint64_t object = space_map_object(vd->vdev_dtl_sm);
2023 2031  
2024 2032          ASSERT(!vd->vdev_ishole);
2025 2033          ASSERT(vd->vdev_ops->vdev_op_leaf);
2026 2034  
2027 2035          tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2028 2036  
2029 2037          if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
2030 2038                  mutex_enter(&vd->vdev_dtl_lock);
2031 2039                  space_map_free(vd->vdev_dtl_sm, tx);
2032 2040                  space_map_close(vd->vdev_dtl_sm);
2033 2041                  vd->vdev_dtl_sm = NULL;
2034 2042                  mutex_exit(&vd->vdev_dtl_lock);
2035 2043  
2036 2044                  /*
2037 2045                   * We only destroy the leaf ZAP for detached leaves or for
2038 2046                   * removed log devices. Removed data devices handle leaf ZAP
2039 2047                   * cleanup later, once cancellation is no longer possible.
2040 2048                   */
2041 2049                  if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
2042 2050                      vd->vdev_top->vdev_islog)) {
2043 2051                          vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
2044 2052                          vd->vdev_leaf_zap = 0;
2045 2053                  }
2046 2054  
2047 2055                  dmu_tx_commit(tx);
2048 2056                  return;
2049 2057          }
2050 2058  
                   /* No space map yet: allocate one in the MOS and open it. */
2051 2059          if (vd->vdev_dtl_sm == NULL) {
2052 2060                  uint64_t new_object;
2053 2061  
2054 2062                  new_object = space_map_alloc(mos, tx);
2055 2063                  VERIFY3U(new_object, !=, 0);
2056 2064  
2057 2065                  VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
2058 2066                      0, -1ULL, 0, &vd->vdev_dtl_lock));
2059 2067                  ASSERT(vd->vdev_dtl_sm != NULL);
2060 2068          }
2061 2069  
                   /*
                    * Copy DTL_MISSING into a temporary tree with its own lock.
                    * vdev_dtl_lock is held only for the copy and is dropped
                    * before the space map is written.
                    */
2062 2070          mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);
2063 2071  
2064 2072          rtsync = range_tree_create(NULL, NULL, &rtlock);
2065 2073  
2066 2074          mutex_enter(&rtlock);
2067 2075  
2068 2076          mutex_enter(&vd->vdev_dtl_lock);
2069 2077          range_tree_walk(rt, range_tree_add, rtsync);
2070 2078          mutex_exit(&vd->vdev_dtl_lock);
2071 2079  
                   /* Replace the on-disk contents with the copy, then discard it. */
2072 2080          space_map_truncate(vd->vdev_dtl_sm, tx);
2073 2081          space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
2074 2082          range_tree_vacate(rtsync, NULL, NULL);
2075 2083  
2076 2084          range_tree_destroy(rtsync);
2077 2085  
2078 2086          mutex_exit(&rtlock);
2079 2087          mutex_destroy(&rtlock);
2080 2088  
2081 2089          /*
2082 2090           * If the object for the space map has changed then dirty
2083 2091           * the top level so that we update the config.
2084 2092           */
2085 2093          if (object != space_map_object(vd->vdev_dtl_sm)) {
2086 2094                  zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
2087 2095                      "new object %llu", txg, spa_name(spa), object,
2088 2096                      space_map_object(vd->vdev_dtl_sm));
2089 2097                  vdev_config_dirty(vd->vdev_top);
2090 2098          }
2091 2099  
2092 2100          dmu_tx_commit(tx);
2093 2101  
2094 2102          mutex_enter(&vd->vdev_dtl_lock);
2095 2103          space_map_update(vd->vdev_dtl_sm);
2096 2104          mutex_exit(&vd->vdev_dtl_lock);
2097 2105  }
2098 2106  
2099 2107  /*
2100 2108   * Determine whether the specified vdev can be offlined/detached/removed
2101 2109   * without losing data.
            *
            * Returns B_TRUE when vd is required, i.e. when it cannot be taken
            * away safely.  Asserts that the caller holds SCL_STATE_ALL as writer.
2102 2110   */
2103 2111  boolean_t
2104 2112  vdev_dtl_required(vdev_t *vd)
2105 2113  {
2106 2114          spa_t *spa = vd->vdev_spa;
2107 2115          vdev_t *tvd = vd->vdev_top;
                   /* Saved so the flag can be restored after the probe below. */
2108 2116          uint8_t cant_read = vd->vdev_cant_read;
2109 2117          boolean_t required;
2110 2118  
2111 2119          ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2112 2120  
                   /* The root vdev and top-level vdevs are always required. */
2113 2121          if (vd == spa->spa_root_vdev || vd == tvd)
2114 2122                  return (B_TRUE);
2115 2123  
2116 2124          /*
2117 2125           * Temporarily mark the device as unreadable, and then determine
2118 2126           * whether this results in any DTL outages in the top-level vdev.
2119 2127           * If not, we can safely offline/detach/remove the device.
2120 2128           */
2121 2129          vd->vdev_cant_read = B_TRUE;
2122 2130          vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
2123 2131          required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
                   /* Restore the saved flag and recompute the DTLs to match. */
2124 2132          vd->vdev_cant_read = cant_read;
2125 2133          vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
2126 2134  
                   /*
                    * With zio injection enabled, a nonzero result from the
                    * device-injection handler for ECHILD makes the vdev
                    * required.
                    */
2127 2135          if (!required && zio_injection_enabled)
2128 2136                  required = !!zio_handle_device_injection(vd, NULL, ECHILD);
2129 2137  
2130 2138          return (required);
2131 2139  }
2132 2140  
2133 2141  /*
2134 2142   * Determine if resilver is needed, and if so the txg range.
            *
            * A leaf vdev needs a resilver when its DTL_MISSING range tree is
            * non-empty and the vdev is writeable.  An interior vdev needs one when
            * any child does; its range is then the smallest child minimum and the
            * largest child maximum.
            *
            * 'minp' and 'maxp' are optional and independent of each other: each
            * is written only when it is non-NULL and a resilver is needed.
            * Returns B_TRUE if a resilver is needed, B_FALSE otherwise.
2135 2143   */
2136 2144  boolean_t
2137 2145  vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
2138 2146  {
2139 2147          boolean_t needed = B_FALSE;
2140 2148          uint64_t thismin = UINT64_MAX;
2141 2149          uint64_t thismax = 0;
2142 2150  
2143 2151          if (vd->vdev_children == 0) {
2144 2152                  mutex_enter(&vd->vdev_dtl_lock);
2145 2153                  if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
2146 2154                      vdev_writeable(vd)) {
2147 2155  
2148 2156                          thismin = vdev_dtl_min(vd);
2149 2157                          thismax = vdev_dtl_max(vd);
2150 2158                          needed = B_TRUE;
2151 2159                  }
2152 2160                  mutex_exit(&vd->vdev_dtl_lock);
2153 2161          } else {
2154 2162                  for (int c = 0; c < vd->vdev_children; c++) {
2155 2163                          vdev_t *cvd = vd->vdev_child[c];
2156 2164                          uint64_t cmin, cmax;
2157 2165  
                                   /* cmin/cmax are only valid when this returns true. */
2158 2166                          if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
2159 2167                                  thismin = MIN(thismin, cmin);
2160 2168                                  thismax = MAX(thismax, cmax);
2161 2169                                  needed = B_TRUE;
2162 2170                          }
2163 2171                  }
2164 2172          }
2165 2173  
                   /*
                    * Check each output pointer on its own so that a caller asking
                    * for only one bound never causes a NULL dereference.
                    */
2166 2174          if (needed && minp != NULL)
2167 2175                  *minp = thismin;
2168 2176          if (needed && maxp != NULL)
2169 2177                  *maxp = thismax;
2170 2178          return (needed);
2171 2179  }
2172 2180  
           /*
            * Load the vdev tree rooted at 'vd'.  Children are loaded first; then a
            * top-level vdev has its metaslabs initialized and a leaf vdev has its
            * DTL loaded.  Nothing is returned: a failure is recorded by moving the
            * vdev to VDEV_STATE_CANT_OPEN with VDEV_AUX_CORRUPT_DATA.
            */
2173 2181  void
2174 2182  vdev_load(vdev_t *vd)
2175 2183  {
2176 2184          /*
2177 2185           * Recursively load all children.
2178 2186           */
2179 2187          for (int c = 0; c < vd->vdev_children; c++)
2180 2188                  vdev_load(vd->vdev_child[c]);
2181 2189  
2182 2190          /*
2183 2191           * If this is a top-level vdev, initialize its metaslabs.
                    * Holes are skipped.  A zero ashift or asize is treated as
                    * corruption without calling vdev_metaslab_init() at all.
2184 2192           */
2185 2193          if (vd == vd->vdev_top && !vd->vdev_ishole &&
2186 2194              (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
2187 2195              vdev_metaslab_init(vd, 0) != 0))
2188 2196                  vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2189 2197                      VDEV_AUX_CORRUPT_DATA);
2190 2198  
2191 2199          /*
2192 2200           * If this is a leaf vdev, load its DTL.
2193 2201           */
2194 2202          if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
2195 2203                  vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2196 2204                      VDEV_AUX_CORRUPT_DATA);
2197 2205  }
2198 2206  
2199 2207  /*
2200 2208   * The special vdev case is used for hot spares and l2cache devices.  Its
2201 2209   * sole purpose is to set the vdev state for the associated vdev.  To do this,
2202 2210   * we make sure that we can open the underlying device, then try to read the
2203 2211   * label, and make sure that the label is sane and that it hasn't been
2204 2212   * repurposed to another pool.
            *
            * Returns 0 if the vdev is not readable (nothing is checked) or if the
            * label passes validation.  Returns -1, after setting the vdev to
            * VDEV_STATE_CANT_OPEN / VDEV_AUX_CORRUPT_DATA, if the label cannot be
            * read, lacks a supported version, lacks a guid matching vd->vdev_guid,
            * or lacks a pool state entry.
2205 2213   */
2206 2214  int
2207 2215  vdev_validate_aux(vdev_t *vd)
2208 2216  {
2209 2217          nvlist_t *label;
2210 2218          uint64_t guid, version;
2211 2219          uint64_t state;
2212 2220  
2213 2221          if (!vdev_readable(vd))
2214 2222                  return (0);
2215 2223  
2216 2224          if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
2217 2225                  vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2218 2226                      VDEV_AUX_CORRUPT_DATA);
2219 2227                  return (-1);
2220 2228          }
2221 2229  
                   /*
                    * The pool state is looked up only to require its presence;
                    * its value is not examined (see the comment further down).
                    */
2222 2230          if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
2223 2231              !SPA_VERSION_IS_SUPPORTED(version) ||
2224 2232              nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
2225 2233              guid != vd->vdev_guid ||
2226 2234              nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
2227 2235                  vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2228 2236                      VDEV_AUX_CORRUPT_DATA);
2229 2237                  nvlist_free(label);
2230 2238                  return (-1);
2231 2239          }
2232 2240  
2233 2241          /*
2234 2242           * We don't actually check the pool state here.  If it's in fact in
2235 2243           * use by another pool, we update this fact on the fly when requested.
2236 2244           */
2237 2245          nvlist_free(label);
2238 2246          return (0);
2239 2247  }
2240 2248  
           /*
            * Release the metadata objects of a top-level vdev in the syncing txg:
            * every metaslab's space map, the metaslab array object and, for a log
            * device, its top-level ZAP.  All frees are charged to a single tx that
            * is assigned to 'txg' and committed before returning.
            *
            * 'vd' must be a top-level vdev and 'txg' must equal the currently
            * syncing txg (both asserted).
            */
2241 2249  void
2242 2250  vdev_remove(vdev_t *vd, uint64_t txg)
2243 2251  {
2244 2252          spa_t *spa = vd->vdev_spa;
2245 2253          objset_t *mos = spa->spa_meta_objset;
2246 2254          dmu_tx_t *tx;
2247 2255  
2248 2256          tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2249 2257          ASSERT(vd == vd->vdev_top);
2250 2258          ASSERT3U(txg, ==, spa_syncing_txg(spa));
2251 2259  
2252 2260          if (vd->vdev_ms != NULL) {
2253 2261                  metaslab_group_t *mg = vd->vdev_mg;
2254 2262  
2255 2263                  metaslab_group_histogram_verify(mg);
2256 2264                  metaslab_class_histogram_verify(mg->mg_class);
2257 2265  
2258 2266                  for (int m = 0; m < vd->vdev_ms_count; m++) {
2259 2267                          metaslab_t *msp = vd->vdev_ms[m];
2260 2268  
                                   /* Nothing to free for a slot without a space map. */
2261 2269                          if (msp == NULL || msp->ms_sm == NULL)
2262 2270                                  continue;
2263 2271  
2264 2272                          mutex_enter(&msp->ms_lock);
2265 2273                          /*
2266 2274                           * If the metaslab was not loaded when the vdev
2267 2275                           * was removed then the histogram accounting may
2268 2276                           * not be accurate. Update the histogram information
2269 2277                           * here so that we ensure that the metaslab group
2270 2278                           * and metaslab class are up-to-date.
2271 2279                           */
2272 2280                          metaslab_group_histogram_remove(mg, msp);
2273 2281  
                                   /* The space map must be empty before it is freed. */
2274 2282                          VERIFY0(space_map_allocated(msp->ms_sm));
2275 2283                          space_map_free(msp->ms_sm, tx);
2276 2284                          space_map_close(msp->ms_sm);
2277 2285                          msp->ms_sm = NULL;
2278 2286                          mutex_exit(&msp->ms_lock);
2279 2287                  }
2280 2288  
                           /*
                            * With every metaslab removed from the group, each
                            * histogram bucket of the group is expected to be zero.
                            */
2281 2289                  metaslab_group_histogram_verify(mg);
2282 2290                  metaslab_class_histogram_verify(mg->mg_class);
2283 2291                  for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
2284 2292                          ASSERT0(mg->mg_histogram[i]);
2285 2293  
2286 2294          }
2287 2295  
                   /* Free the metaslab array object and forget its object number. */
2288 2296          if (vd->vdev_ms_array) {
2289 2297                  (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
2290 2298                  vd->vdev_ms_array = 0;
2291 2299          }
2292 2300  
                   /* Only log devices have their top-level ZAP destroyed here. */
2293 2301          if (vd->vdev_islog && vd->vdev_top_zap != 0) {
2294 2302                  vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
2295 2303                  vd->vdev_top_zap = 0;
2296 2304          }
2297 2305          dmu_tx_commit(tx);
2298 2306  }
2299 2307  
           /*
            * Finish syncing 'vd' for 'txg': call metaslab_sync_done() on every
            * metaslab queued on vdev_ms_list under TXG_CLEAN(txg).  If that list
            * was non-empty on entry, the vdev's metaslab group is reassessed
            * afterwards.  Must not be called on a hole vdev (asserted).
            */
2300 2308  void
2301 2309  vdev_sync_done(vdev_t *vd, uint64_t txg)
2302 2310  {
2303 2311          metaslab_t *msp;
                   /* Sampled before the list is drained by the loop below. */
2304 2312          boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
2305 2313  
2306 2314          ASSERT(!vd->vdev_ishole);
2307 2315  
2308 2316          while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
                       != NULL)
2309 2317                  metaslab_sync_done(msp, txg);
2310 2318  
2311 2319          if (reassess)
2312 2320                  metaslab_sync_reassess(vd->vdev_mg);
2313 2321  }
2314 2322  
           /*
            * Sync 'vd' for 'txg'.  In order: allocate the metaslab array object if
            * the vdev has a metaslab shift but no array yet; remove the vdev's
            * metadata if it is being removed and has nothing allocated; sync every
            * metaslab and every DTL queued for this txg; and finally queue the vdev
            * on spa_vdev_txg_list under TXG_CLEAN(txg).  Must not be called on a
            * hole vdev (asserted).
            */
2315 2323  void
2316 2324  vdev_sync(vdev_t *vd, uint64_t txg)
2317 2325  {
2318 2326          spa_t *spa = vd->vdev_spa;
2319 2327          vdev_t *lvd;
2320 2328          metaslab_t *msp;
2321 2329          dmu_tx_t *tx;
2322 2330  
2323 2331          ASSERT(!vd->vdev_ishole);
2324 2332  
                   /*
                    * The new object number must end up in the config, hence the
                    * vdev_config_dirty() call.
                    */
2325 2333          if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
2326 2334                  ASSERT(vd == vd->vdev_top);
2327 2335                  tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2328 2336                  vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
2329 2337                      DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
2330 2338                  ASSERT(vd->vdev_ms_array != 0);
2331 2339                  vdev_config_dirty(vd);
2332 2340                  dmu_tx_commit(tx);
2333 2341          }
2334 2342  
2335 2343          /*
2336 2344           * Remove the metadata associated with this vdev once it's empty.
2337 2345           */
2338 2346          if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
2339 2347                  vdev_remove(vd, txg);
2340 2348  
                   /*
                    * Each synced metaslab is requeued under TXG_CLEAN(txg), which
                    * is the list vdev_sync_done() drains.
                    */
2341 2349          while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
2342 2350                  metaslab_sync(msp, txg);
2343 2351                  (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
2344 2352          }
2345 2353  
2346 2354          while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
2347 2355                  vdev_dtl_sync(lvd, txg);
2348 2356  
2349 2357          (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
2350 2358  }
2351 2359  
           /*
            * Translate 'psize' into an asize by delegating to the vdev type's
            * vdev_op_asize operation and returning its result unchanged.
            * NOTE(review): presumably psize is the physical size and asize the
            * allocated size -- confirm against the vdev_ops definitions.
            */
2352 2360  uint64_t
2353 2361  vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
2354 2362  {
2355 2363          return (vd->vdev_ops->vdev_op_asize(vd, psize));
2356 2364  }
2357 2365  
2358 2366  /*
2359 2367   * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
2360 2368   * not be opened, and no I/O is attempted.
            *
            * The vdev is looked up by 'guid' and must be a leaf.  Every path leaves
            * through spa_vdev_state_exit(): with ENODEV if no vdev has that guid,
            * with ENOTSUP if it is not a leaf, and with 0 otherwise.  'aux' is the
            * reason recorded for the fault.
2361 2369   */
2362 2370  int
2363 2371  vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
2364 2372  {
2365 2373          vdev_t *vd, *tvd;
2366 2374  
2367 2375          spa_vdev_state_enter(spa, SCL_NONE);
2368 2376  
2369 2377          if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2370 2378                  return (spa_vdev_state_exit(spa, NULL, ENODEV));
2371 2379  
2372 2380          if (!vd->vdev_ops->vdev_op_leaf)
2373 2381                  return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2374 2382  
2375 2383          tvd = vd->vdev_top;
2376 2384  
2377 2385          /*
2378 2386           * We don't directly use the aux state here, but if we do a
2379 2387           * vdev_reopen(), we need this value to be present to remember why we
2380 2388           * were faulted.
2381 2389           */
2382 2390          vd->vdev_label_aux = aux;
2383 2391  
2384 2392          /*
2385 2393           * Faulted state takes precedence over degraded.
2386 2394           */
2387 2395          vd->vdev_delayed_close = B_FALSE;
2388 2396          vd->vdev_faulted = 1ULL;
2389 2397          vd->vdev_degraded = 0ULL;
2390 2398          vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
2391 2399  
2392 2400          /*
2393 2401           * If this device has the only valid copy of the data, then
2394 2402           * back off and simply mark the vdev as degraded instead.
                    * Log top-levels and aux devices are exempt from this check.
2395 2403           */
2396 2404          if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
2397 2405                  vd->vdev_degraded = 1ULL;
2398 2406                  vd->vdev_faulted = 0ULL;
2399 2407  
2400 2408                  /*
2401 2409                   * If we reopen the device and it's not dead, only then do we
2402 2410                   * mark it degraded.
2403 2411                   */
2404 2412                  vdev_reopen(tvd);
2405 2413  
2406 2414                  if (vdev_readable(vd))
2407 2415                          vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
2408 2416          }
2409 2417  
2410 2418          return (spa_vdev_state_exit(spa, vd, 0));
2411 2419  }
2412 2420  
2413 2421  /*
2414 2422   * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
2415 2423   * user that something is wrong.  The vdev continues to operate as normal as far
2416 2424   * as I/O is concerned.
            *
            * The vdev is looked up by 'guid' and must be a leaf.  Every path leaves
            * through spa_vdev_state_exit(): with ENODEV if no vdev has that guid,
            * with ENOTSUP if it is not a leaf, and with 0 otherwise (including when
            * the vdev was already faulted or degraded and nothing was changed).
2417 2425   */
2418 2426  int
2419 2427  vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
2420 2428  {
2421 2429          vdev_t *vd;
2422 2430  
2423 2431          spa_vdev_state_enter(spa, SCL_NONE);
2424 2432  
2425 2433          if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2426 2434                  return (spa_vdev_state_exit(spa, NULL, ENODEV));
2427 2435  
2428 2436          if (!vd->vdev_ops->vdev_op_leaf)
2429 2437                  return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2430 2438  
2431 2439          /*
2432 2440           * If the vdev is already faulted or degraded, then don't do anything.
2433 2441           */
2434 2442          if (vd->vdev_faulted || vd->vdev_degraded)
2435 2443                  return (spa_vdev_state_exit(spa, NULL, 0));
2436 2444  
                   /*
                    * The degraded flag is always recorded; the state itself is
                    * only changed when the vdev is not dead.
                    */
2437 2445          vd->vdev_degraded = 1ULL;
2438 2446          if (!vdev_is_dead(vd))
2439 2447                  vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
2440 2448                      aux);
2441 2449  
2442 2450          return (spa_vdev_state_exit(spa, vd, 0));
2443 2451  }
2444 2452  
2445 2453  /*
2446 2454   * Online the given vdev.
2447 2455   *
2448 2456   * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
2449 2457   * spare device should be detached when the device finishes resilvering.
2450 2458   * Second, the online should be treated like a 'test' online case, so no FMA
2451 2459   * events are generated if the device fails to open.
            *
            * The vdev is looked up by 'guid' and must be a leaf.  'newstate', if
            * non-NULL, receives the vdev's state after its top-level vdev has been
            * reopened.  Every path leaves through spa_vdev_state_exit(): with
            * ENODEV if no vdev has that guid; with ENOTSUP if it is not a leaf, or
            * if expansion applies (ZFS_ONLINE_EXPAND or spa_autoexpand) to an aux
            * device; and with 0 otherwise.
2452 2460   */
2453 2461  int
2454 2462  vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
2455 2463  {
2456 2464          vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
2457 2465          boolean_t wasoffline;
2458 2466          vdev_state_t oldstate;
2459 2467  
2460 2468          spa_vdev_state_enter(spa, SCL_NONE);
2461 2469  
2462 2470          if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2463 2471                  return (spa_vdev_state_exit(spa, NULL, ENODEV));
2464 2472  
2465 2473          if (!vd->vdev_ops->vdev_op_leaf)
2466 2474                  return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2467 2475  
                   /* Captured before the reopen to decide on the event below. */
2468 2476          wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
2469 2477          oldstate = vd->vdev_state;
2470 2478  
2471 2479          tvd = vd->vdev_top;
2472 2480          vd->vdev_offline = B_FALSE;
2473 2481          vd->vdev_tmpoffline = B_FALSE;
2474 2482          vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
2475 2483          vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
2476 2484  
2477 2485          /* XXX - L2ARC 1.0 does not support expansion */
                   /* Flag the leaf and each ancestor below the root as expanding. */
2478 2486          if (!vd->vdev_aux) {
2479 2487                  for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2480 2488                          pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
2481 2489          }
2482 2490  
                   /*
                    * checkremove, forcefault and expanding only apply for the
                    * duration of this reopen and are cleared right after it.
                    */
2483 2491          vdev_reopen(tvd);
2484 2492          vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
2485 2493  
2486 2494          if (!vd->vdev_aux) {
2487 2495                  for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2488 2496                          pvd->vdev_expanding = B_FALSE;
2489 2497          }
2490 2498  
2491 2499          if (newstate)
2492 2500                  *newstate = vd->vdev_state;
                   /*
                    * Unspare is only requested for a live vdev that is the first
                    * child of a spare vdev.
                    */
2493 2501          if ((flags & ZFS_ONLINE_UNSPARE) &&
2494 2502              !vdev_is_dead(vd) && vd->vdev_parent &&
2495 2503              vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2496 2504              vd->vdev_parent->vdev_child[0] == vd)
2497 2505                  vd->vdev_unspare = B_TRUE;
2498 2506  
2499 2507          if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
2500 2508  
2501 2509                  /* XXX - L2ARC 1.0 does not support expansion */
2502 2510                  if (vd->vdev_aux)
2503 2511                          return (spa_vdev_state_exit(spa, vd, ENOTSUP));
2504 2512                  spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2505 2513          }
2506 2514  
                   /*
                    * Post the online event if the vdev had been offline, or if its
                    * state rose from below DEGRADED to DEGRADED or better.
                    */
2507 2515          if (wasoffline ||
2508 2516              (oldstate < VDEV_STATE_DEGRADED &&
2509 2517              vd->vdev_state >= VDEV_STATE_DEGRADED))
2510 2518                  spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
2511 2519  
2512 2520          return (spa_vdev_state_exit(spa, vd, 0));
2513 2521  }
2514 2522  
           /*
            * Offline the leaf vdev identified by 'guid'.  vdev_offline() calls this
            * with spa_vdev_top_lock held.
            *
            * Every path leaves through spa_vdev_state_exit(): with ENODEV if no
            * vdev has that guid; with ENOTSUP if it is not a leaf; with EBUSY if
            * the device holds the only valid copy of some data or if offlining it
            * would leave its top-level vdev dead; with the error from
            * spa_offline_log() if that fails; and with 0 otherwise.  If the vdev
            * is already offline, only vdev_tmpoffline is updated from 'flags'.
            */
2515 2523  static int
2516 2524  vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
2517 2525  {
2518 2526          vdev_t *vd, *tvd;
2519 2527          int error = 0;
2520 2528          uint64_t generation;
2521 2529          metaslab_group_t *mg;
2522 2530  
           /* Restart point used when the config changed underneath us. */
2523 2531  top:
2524 2532          spa_vdev_state_enter(spa, SCL_ALLOC);
2525 2533  
2526 2534          if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2527 2535                  return (spa_vdev_state_exit(spa, NULL, ENODEV));
2528 2536  
2529 2537          if (!vd->vdev_ops->vdev_op_leaf)
2530 2538                  return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2531 2539  
2532 2540          tvd = vd->vdev_top;
2533 2541          mg = tvd->vdev_mg;
                   /*
                    * The generation expected after the state lock is dropped and
                    * retaken below.  NOTE(review): presumably the intervening
                    * spa_vdev_state_exit() bumps it exactly once -- confirm.
                    */
2534 2542          generation = spa->spa_config_generation + 1;
2535 2543  
2536 2544          /*
2537 2545           * If the device isn't already offline, try to offline it.
2538 2546           */
2539 2547          if (!vd->vdev_offline) {
2540 2548                  /*
2541 2549                   * If this device has the only valid copy of some data,
2542 2550                   * don't allow it to be offlined. Log devices are always
2543 2551                   * expendable.
2544 2552                   */
2545 2553                  if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
2546 2554                      vdev_dtl_required(vd))
2547 2555                          return (spa_vdev_state_exit(spa, NULL, EBUSY));
2548 2556  
2549 2557                  /*
2550 2558                   * If the top-level is a slog and it has had allocations
2551 2559                   * then proceed.  We check that the vdev's metaslab group
2552 2560                   * is not NULL since it's possible that we may have just
2553 2561                   * added this vdev but not yet initialized its metaslabs.
2554 2562                   */
2555 2563                  if (tvd->vdev_islog && mg != NULL) {
2556 2564                          /*
2557 2565                           * Prevent any future allocations.
2558 2566                           */
2559 2567                          metaslab_group_passivate(mg);
                                   /* spa_offline_log() runs without the state lock. */
2560 2568                          (void) spa_vdev_state_exit(spa, vd, 0);
2561 2569  
2562 2570                          error = spa_offline_log(spa);
2563 2571  
2564 2572                          spa_vdev_state_enter(spa, SCL_ALLOC);
2565 2573  
2566 2574                          /*
2567 2575                           * Check to see if the config has changed.
                                    * Either way the group is reactivated; an
                                    * error is returned, while a generation
                                    * mismatch alone restarts from 'top'.
2568 2576                           */
2569 2577                          if (error || generation != spa->spa_config_generation) {
2570 2578                                  metaslab_group_activate(mg);
2571 2579                                  if (error)
2572 2580                                          return (spa_vdev_state_exit(spa,
2573 2581                                              vd, error));
2574 2582                                  (void) spa_vdev_state_exit(spa, vd, 0);
2575 2583                                  goto top;
2576 2584                          }
2577 2585                          ASSERT0(tvd->vdev_stat.vs_alloc);
2578 2586                  }
2579 2587  
2580 2588                  /*
2581 2589                   * Offline this device and reopen its top-level vdev.
2582 2590                   * If the top-level vdev is a log device then just offline
2583 2591                   * it. Otherwise, if this action results in the top-level
2584 2592                   * vdev becoming unusable, undo it and fail the request.
2585 2593                   */
2586 2594                  vd->vdev_offline = B_TRUE;
2587 2595                  vdev_reopen(tvd);
2588 2596  
2589 2597                  if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
2590 2598                      vdev_is_dead(tvd)) {
2591 2599                          vd->vdev_offline = B_FALSE;
2592 2600                          vdev_reopen(tvd);
2593 2601                          return (spa_vdev_state_exit(spa, NULL, EBUSY));
2594 2602                  }
2595 2603  
2596 2604                  /*
2597 2605                   * Add the device back into the metaslab rotor so that
2598 2606                   * once we online the device it's open for business.
2599 2607                   */
2600 2608                  if (tvd->vdev_islog && mg != NULL)
2601 2609                          metaslab_group_activate(mg);
2602 2610          }
2603 2611  
                   /* Set or cleared on every call, even if already offline. */
2604 2612          vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
2605 2613  
2606 2614          return (spa_vdev_state_exit(spa, vd, 0));
2607 2615  }
2608 2616  
           /*
            * Offline the vdev identified by 'guid'.  This is a thin wrapper that
            * holds spa_vdev_top_lock around vdev_offline_locked() and returns its
            * result unchanged.
            */
2609 2617  int
2610 2618  vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
2611 2619  {
2612 2620          int error;
2613 2621  
2614 2622          mutex_enter(&spa->spa_vdev_top_lock);
2615 2623          error = vdev_offline_locked(spa, guid, flags);
2616 2624          mutex_exit(&spa->spa_vdev_top_lock);
2617 2625  
2618 2626          return (error);
2619 2627  }
2620 2628  
2621 2629  /*
2622 2630   * Clear the error counts associated with this vdev.  Unlike vdev_online() and
2623 2631   * vdev_offline(), we assume the spa config is locked.  We also clear all
2624 2632   * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
            *
            * The caller must hold SCL_STATE_ALL as writer (asserted).  A NULL 'vd'
            * is replaced by the root vdev, and the function recurses into every
            * child before handling 'vd' itself.
2625 2633   */
2626 2634  void
2627 2635  vdev_clear(spa_t *spa, vdev_t *vd)
2628 2636  {
2629 2637          vdev_t *rvd = spa->spa_root_vdev;
2630 2638  
2631 2639          ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2632 2640  
2633 2641          if (vd == NULL)
2634 2642                  vd = rvd;
2635 2643  
2636 2644          vd->vdev_stat.vs_read_errors = 0;
2637 2645          vd->vdev_stat.vs_write_errors = 0;
2638 2646          vd->vdev_stat.vs_checksum_errors = 0;
2639 2647  
2640 2648          for (int c = 0; c < vd->vdev_children; c++)
2641 2649                  vdev_clear(spa, vd->vdev_child[c]);
2642 2650  
2643 2651          /*
2644 2652           * If we're in the FAULTED state or have experienced failed I/O, then
2645 2653           * clear the persistent state and attempt to reopen the device.  We
2646 2654           * also mark the vdev config dirty, so that the new faulted state is
2647 2655           * written out to disk.
2648 2656           */
2649 2657          if (vd->vdev_faulted || vd->vdev_degraded ||
2650 2658              !vdev_readable(vd) || !vdev_writeable(vd)) {
2651 2659  
2652 2660                  /*
2653 2661                   * When reopening in response to a clear event, it may be due to
2654 2662                   * a fmadm repair request.  In this case, if the device is
2655 2663                   * still broken, we want to still post the ereport again.
2656 2664                   */
2657 2665                  vd->vdev_forcefault = B_TRUE;
2658 2666  
2659 2667                  vd->vdev_faulted = vd->vdev_degraded = 0ULL;
2660 2668                  vd->vdev_cant_read = B_FALSE;
2661 2669                  vd->vdev_cant_write = B_FALSE;
2662 2670  
                           /* The root is reopened itself; others via their top-level. */
2663 2671                  vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
2664 2672  
2665 2673                  vd->vdev_forcefault = B_FALSE;
2666 2674  
2667 2675                  if (vd != rvd && vdev_writeable(vd->vdev_top))
2668 2676                          vdev_state_dirty(vd->vdev_top);
2669 2677  
                           /* A live, non-aux vdev gets an async resilver request. */
2670 2678                  if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
2671 2679                          spa_async_request(spa, SPA_ASYNC_RESILVER);
2672 2680  
2673 2681                  spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
2674 2682          }
2675 2683  
2676 2684          /*
2677 2685           * When clearing a FMA-diagnosed fault, we always want to
2678 2686           * unspare the device, as we assume that the original spare was
2679 2687           * done in response to the FMA fault.
2680 2688           */
2681 2689          if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
2682 2690              vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2683 2691              vd->vdev_parent->vdev_child[0] == vd)
2684 2692                  vd->vdev_unspare = B_TRUE;
2685 2693  }
2686 2694  
           /*
            * Return B_TRUE if 'vd' is "dead": its state is below
            * VDEV_STATE_DEGRADED, it is a hole, or it uses vdev_missing_ops.
            */
2687 2695  boolean_t
2688 2696  vdev_is_dead(vdev_t *vd)
2689 2697  {
2690 2698          /*
2691 2699           * Holes and missing devices are always considered "dead".
2692 2700           * This simplifies the code since we don't have to check for
2693 2701           * these types of devices in the various code paths.
2694 2702           * Instead we rely on the fact that we skip over dead devices
2695 2703           * before issuing I/O to them.
2696 2704           */
2697 2705          return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
2698 2706              vd->vdev_ops == &vdev_missing_ops);
2699 2707  }
2700 2708  
           /*
            * Return B_TRUE if 'vd' is not dead and is not flagged vdev_cant_read.
            */
2701 2709  boolean_t
2702 2710  vdev_readable(vdev_t *vd)
2703 2711  {
2704 2712          return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
2705 2713  }
2706 2714  
           /*
            * Return B_TRUE if 'vd' is not dead and is not flagged vdev_cant_write.
            */
2707 2715  boolean_t
2708 2716  vdev_writeable(vdev_t *vd)
2709 2717  {
2710 2718          return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
2711 2719  }
2712 2720  
           /*
            * Return B_TRUE if 'vd' may be allocated from: its state is at least
            * VDEV_STATE_DEGRADED or is VDEV_STATE_CLOSED, it is not flagged
            * vdev_cant_write, it is not a hole, and its metaslab group is
            * initialized.
            */
2713 2721  boolean_t
2714 2722  vdev_allocatable(vdev_t *vd)
2715 2723  {
2716 2724          uint64_t state = vd->vdev_state;
2717 2725  
2718 2726          /*
2719 2727           * We currently allow allocations from vdevs which may be in the
2720 2728           * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
2721 2729           * fails to reopen then we'll catch it later when we're holding
2722 2730           * the proper locks.  Note that we have to get the vdev state
2723 2731           * in a local variable because although it changes atomically,
2724 2732           * we're asking two separate questions about it.
2725 2733           */
2726 2734          return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
2727 2735              !vd->vdev_cant_write && !vd->vdev_ishole &&
2728 2736              vd->vdev_mg->mg_initialized);
2729 2737  }
2730 2738  
           /*
            * Return B_TRUE if 'zio' can be serviced by 'vd', which must be the
            * zio's own vdev (asserted).  A dead vdev, or one with a removal
            * pending, is never accessible.  Otherwise reads depend on
            * vdev_cant_read, writes depend on vdev_cant_write, and every other
            * I/O type is allowed.
            */
2731 2739  boolean_t
2732 2740  vdev_accessible(vdev_t *vd, zio_t *zio)
2733 2741  {
2734 2742          ASSERT(zio->io_vd == vd);
2735 2743  
2736 2744          if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
2737 2745                  return (B_FALSE);
2738 2746  
2739 2747          if (zio->io_type == ZIO_TYPE_READ)
2740 2748                  return (!vd->vdev_cant_read);
2741 2749  
2742 2750          if (zio->io_type == ZIO_TYPE_WRITE)
2743 2751                  return (!vd->vdev_cant_write);
2744 2752  
2745 2753          return (B_TRUE);
2746 2754  }
2747 2755  
2748 2756  /*
2749 2757   * Get statistics for the given vdev.
            *
            * Copies vd->vdev_stat into the caller's 'vs' while holding
            * vd->vdev_stat_lock, then adjusts the copy: timestamp, state, rsize,
            * expandable size, fragmentation and, for the root vdev, I/O counts
            * summed over its children.  The function asserts that
            * spa_config_held(spa, SCL_ALL, RW_READER) is non-zero.
2750 2758   */
2751 2759  void
2752 2760  vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
2753 2761  {
2754 2762          spa_t *spa = vd->vdev_spa;
2755 2763          vdev_t *rvd = spa->spa_root_vdev;
2756 2764          vdev_t *tvd = vd->vdev_top;
2757 2765  
2758 2766          ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
2759 2767  
2760 2768          mutex_enter(&vd->vdev_stat_lock);
2761 2769          bcopy(&vd->vdev_stat, vs, sizeof (*vs));
                   /* Replace the copied timestamp with the time elapsed since it. */
2762 2770          vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
2763 2771          vs->vs_state = vd->vdev_state;
2764 2772          vs->vs_rsize = vdev_get_min_asize(vd);
                   /* For a leaf, the front and back label areas are added in. */
2765 2773          if (vd->vdev_ops->vdev_op_leaf)
2766 2774                  vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
2767 2775          /*
2768 2776           * Report expandable space on top-level, non-auxiliary devices only.
2769 2777           * The expandable space is reported in terms of metaslab sized units
2770 2778           * since that determines how much space the pool can expand.
2771 2779           */
2772 2780          if (vd->vdev_aux == NULL && tvd != NULL) {
2773 2781                  vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize -
2774 2782                      spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift);
2775 2783          }
                   /* Fragmentation is only filled in for non-hole top-level vdevs. */
2776 2784          if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
2777 2785                  vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
2778 2786          }
2779 2787  
2780 2788          /*
2781 2789           * If we're getting stats on the root vdev, aggregate the I/O counts
2782 2790           * over all top-level vdevs (i.e. the direct children of the root).
2783 2791           */
2784 2792          if (vd == rvd) {
2785 2793                  for (int c = 0; c < rvd->vdev_children; c++) {
2786 2794                          vdev_t *cvd = rvd->vdev_child[c];
2787 2795                          vdev_stat_t *cvs = &cvd->vdev_stat;
2788 2796  
2789 2797                          for (int t = 0; t < ZIO_TYPES; t++) {
2790 2798                                  vs->vs_ops[t] += cvs->vs_ops[t];
2791 2799                                  vs->vs_bytes[t] += cvs->vs_bytes[t];
2792 2800                          }
                                   /*
                                    * NOTE(review): this stores into the child's
                                    * own vdev_stat, not into the caller's 'vs',
                                    * and only the root's vdev_stat_lock is held
                                    * here -- confirm that this is intended.
                                    */
2793 2801                          cvs->vs_scan_removing = cvd->vdev_removing;
2794 2802                  }
2795 2803          }
2796 2804          mutex_exit(&vd->vdev_stat_lock);
2797 2805  }
2798 2806  
           /*
            * Zero the vs_space, vs_dspace and vs_alloc counters of 'vd' while
            * holding its vdev_stat_lock.  No other statistics are touched.
            */
2799 2807  void
2800 2808  vdev_clear_stats(vdev_t *vd)
2801 2809  {
2802 2810          mutex_enter(&vd->vdev_stat_lock);
2803 2811          vd->vdev_stat.vs_space = 0;
2804 2812          vd->vdev_stat.vs_dspace = 0;
2805 2813          vd->vdev_stat.vs_alloc = 0;
2806 2814          mutex_exit(&vd->vdev_stat_lock);
2807 2815  }
2808 2816  
           /*
            * Reset vs_scan_processed to zero for 'vd' and, recursively, for every
            * vdev beneath it.  Children are reset first; each vdev's counter is
            * written under that vdev's own vdev_stat_lock.
            */
2809 2817  void
2810 2818  vdev_scan_stat_init(vdev_t *vd)
2811 2819  {
2812 2820          vdev_stat_t *vs = &vd->vdev_stat;
2813 2821  
2814 2822          for (int c = 0; c < vd->vdev_children; c++)
2815 2823                  vdev_scan_stat_init(vd->vdev_child[c]);
2816 2824  
2817 2825          mutex_enter(&vd->vdev_stat_lock);
2818 2826          vs->vs_scan_processed = 0;
2819 2827          mutex_exit(&vd->vdev_stat_lock);
2820 2828  }
2821 2829  
2822 2830  void
2823 2831  vdev_stat_update(zio_t *zio, uint64_t psize)
2824 2832  {
                   /*
                    * Account a completed zio in the stats of the vdev it targeted
                    * (the root vdev when zio->io_vd is NULL).
                    *
                    * Successful i/o adds one op and psize bytes to the per-type
                    * counters, plus psize to the scan and self-heal byte counts
                    * when the matching repair flags are set.  Failed i/o bumps the
                    * read, write or checksum error counter and, for qualifying
                    * writes, dirties the DTLs for the zio's txg.
                    */
2825 2833          spa_t *spa = zio->io_spa;
2826 2834          vdev_t *rvd = spa->spa_root_vdev;
2827 2835          vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
2828 2836          vdev_t *pvd;
2829 2837          uint64_t txg = zio->io_txg;
2830 2838          vdev_stat_t *vs = &vd->vdev_stat;
2831 2839          zio_type_t type = zio->io_type;
2832 2840          int flags = zio->io_flags;
2833 2841  
2834 2842          /*
2835 2843           * If this i/o is a gang leader, it didn't do any actual work.
2836 2844           */
2837 2845          if (zio->io_gang_tree)
2838 2846                  return;
2839 2847  
2840 2848          if (zio->io_error == 0) {
2841 2849                  /*
2842 2850                   * If this is a root i/o, don't count it -- we've already
2843 2851                   * counted the top-level vdevs, and vdev_get_stats() will
2844 2852                   * aggregate them when asked.  This reduces contention on
2845 2853                   * the root vdev_stat_lock and implicitly handles blocks
2846 2854                   * that compress away to holes, for which there is no i/o.
2847 2855                   * (Holes never create vdev children, so all the counters
2848 2856                   * remain zero, which is what we want.)
2849 2857                   *
2850 2858                   * Note: this only applies to successful i/o (io_error == 0)
2851 2859                   * because unlike i/o counts, errors are not additive.
2852 2860                   * When reading a ditto block, for example, failure of
2853 2861                   * one top-level vdev does not imply a root-level error.
2854 2862                   */
2855 2863                  if (vd == rvd)
2856 2864                          return;
2857 2865  
2858 2866                  ASSERT(vd == zio->io_vd);
2859 2867  
                           /* Successful bypass i/o is not counted at all. */
2860 2868                  if (flags & ZIO_FLAG_IO_BYPASS)
2861 2869                          return;
2862 2870  
2863 2871                  mutex_enter(&vd->vdev_stat_lock);
2864 2872  
2865 2873                  if (flags & ZIO_FLAG_IO_REPAIR) {
2866 2874                          if (flags & ZIO_FLAG_SCAN_THREAD) {
2867 2875                                  dsl_scan_phys_t *scn_phys =
2868 2876                                      &spa->spa_dsl_pool->dp_scan->scn_phys;
2869 2877                                  uint64_t *processed = &scn_phys->scn_processed;
2870 2878  
                                           /*
                                            * The pool-wide scn_processed is bumped
                                            * (atomically) only for leaf vdevs, while
                                            * vs_scan_processed is bumped on whichever
                                            * vdev this zio targeted.
                                            */
2871 2879                                  /* XXX cleanup? */
2872 2880                                  if (vd->vdev_ops->vdev_op_leaf)
2873 2881                                          atomic_add_64(processed, psize);
2874 2882                                  vs->vs_scan_processed += psize;
2875 2883                          }
2876 2884  
2877 2885                          if (flags & ZIO_FLAG_SELF_HEAL)
2878 2886                                  vs->vs_self_healed += psize;
2879 2887                  }
2880 2888  
2881 2889                  vs->vs_ops[type]++;
2882 2890                  vs->vs_bytes[type] += psize;
2883 2891  
2884 2892                  mutex_exit(&vd->vdev_stat_lock);
2885 2893                  return;
2886 2894          }
2887 2895  
                   /* From here on the zio failed.  Speculative i/o is not counted. */
2888 2896          if (flags & ZIO_FLAG_SPECULATIVE)
2889 2897                  return;
2890 2898  
2891 2899          /*
2892 2900           * If this is an I/O error that is going to be retried, then ignore the
2893 2901           * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
2894 2902           * hard errors, when in reality they can happen for any number of
2895 2903           * innocuous reasons (bus resets, MPxIO link failure, etc).
                    *
                    * The check below treats an EIO on a zio that does not yet carry
                    * ZIO_FLAG_IO_RETRY as such an error.
2896 2904           */
2897 2905          if (zio->io_error == EIO &&
2898 2906              !(zio->io_flags & ZIO_FLAG_IO_RETRY))
2899 2907                  return;
2900 2908  
2901 2909          /*
2902 2910           * Intent log writes won't propagate their error to the root
2903 2911           * I/O so don't mark these types of failures as pool-level
2904 2912           * errors.
2905 2913           */
2906 2914          if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
2907 2915                  return;
2908 2916  
                   /*
                    * Bump the error counters.  Nothing is counted against a vdev
                    * that vdev_is_dead() reports as dead, and i/o types other than
                    * read and write have no error counter here.
                    */
2909 2917          mutex_enter(&vd->vdev_stat_lock);
2910 2918          if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
2911 2919                  if (zio->io_error == ECKSUM)
2912 2920                          vs->vs_checksum_errors++;
2913 2921                  else
2914 2922                          vs->vs_read_errors++;
2915 2923          }
2916 2924          if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
2917 2925                  vs->vs_write_errors++;
2918 2926          mutex_exit(&vd->vdev_stat_lock);
2919 2927  
2920 2928          if (type == ZIO_TYPE_WRITE && txg != 0 &&
2921 2929              (!(flags & ZIO_FLAG_IO_REPAIR) ||
2922 2930              (flags & ZIO_FLAG_SCAN_THREAD) ||
2923 2931              spa->spa_claiming)) {
2924 2932                  /*
2925 2933                   * This is either a normal write (not a repair), or it's
2926 2934                   * a repair induced by the scrub thread, or it's a repair
2927 2935                   * made by zil_claim() during spa_load() in the first txg.
2928 2936                   * In the normal case, we commit the DTL change in the same
2929 2937                   * txg as the block was born.  In the scrub-induced repair
2930 2938                   * case, we know that scrubs run in first-pass syncing context,
2931 2939                   * so we commit the DTL change in spa_syncing_txg(spa).
2932 2940                   * In the zil_claim() case, we commit in spa_first_txg(spa).
2933 2941                   *
2934 2942                   * We currently do not make DTL entries for failed spontaneous
2935 2943                   * self-healing writes triggered by normal (non-scrubbing)
2936 2944                   * reads, because we have no transactional context in which to
2937 2945                   * do so -- and it's not clear that it'd be desirable anyway.
2938 2946                   */
2939 2947                  if (vd->vdev_ops->vdev_op_leaf) {
2940 2948                          uint64_t commit_txg = txg;
2941 2949                          if (flags & ZIO_FLAG_SCAN_THREAD) {
2942 2950                                  ASSERT(flags & ZIO_FLAG_IO_REPAIR);
2943 2951                                  ASSERT(spa_sync_pass(spa) == 1);
2944 2952                                  vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
2945 2953                                  commit_txg = spa_syncing_txg(spa);
2946 2954                          } else if (spa->spa_claiming) {
2947 2955                                  ASSERT(flags & ZIO_FLAG_IO_REPAIR);
2948 2956                                  commit_txg = spa_first_txg(spa);
2949 2957                          }
2950 2958                          ASSERT(commit_txg >= spa_syncing_txg(spa));
                                   /*
                                    * If DTL_MISSING already contains this txg there
                                    * is nothing further to record.
                                    */
2951 2959                          if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
2952 2960                                  return;
                                   /*
                                    * Dirty DTL_PARTIAL on the leaf and on each of
                                    * its ancestors below the root, then dirty the
                                    * top-level vdev for commit_txg.
                                    */
2953 2961                          for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2954 2962                                  vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
2955 2963                          vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
2956 2964                  }
                           /* Leaf or interior, but never the root vdev. */
2957 2965                  if (vd != rvd)
2958 2966                          vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
2959 2967          }
2960 2968  }
2961 2969  
2962 2970  /*
2963 2971   * Update the in-core space usage stats for this vdev, its metaslab class,
2964 2972   * and the root vdev.
            *
            * vd must be a top-level vdev.  alloc_delta and space_delta are added to
            * vs_alloc and vs_space; the deflated form of space_delta is added to
            * vs_dspace.  The root vdev's counters are adjusted only when vd belongs
            * to the pool's normal metaslab class.  The class totals, which also
            * receive defer_delta, are adjusted only when vd has a metaslab group
            * with a class.
2965 2973   */
2966 2974  void
2967 2975  vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
2968 2976      int64_t space_delta)
2969 2977  {
2970 2978          int64_t dspace_delta = space_delta;
2971 2979          spa_t *spa = vd->vdev_spa;
2972 2980          vdev_t *rvd = spa->spa_root_vdev;
2973 2981          metaslab_group_t *mg = vd->vdev_mg;
2974 2982          metaslab_class_t *mc = mg ? mg->mg_class : NULL;
2975 2983  
2976 2984          ASSERT(vd == vd->vdev_top);
2977 2985  
2978 2986          /*
2979 2987           * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
2980 2988           * factor.  We must calculate this here and not at the root vdev
2981 2989           * because the root vdev's psize-to-asize is simply the max of its
2982 2990           * childrens', thus not accurate enough for us.
2983 2991           */
2984 2992          ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
2985 2993          ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
2986 2994          dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
2987 2995              vd->vdev_deflate_ratio;
2988 2996  
                   /* This vdev's own counters, under its own stat lock. */
2989 2997          mutex_enter(&vd->vdev_stat_lock);
2990 2998          vd->vdev_stat.vs_alloc += alloc_delta;
2991 2999          vd->vdev_stat.vs_space += space_delta;
2992 3000          vd->vdev_stat.vs_dspace += dspace_delta;
2993 3001          mutex_exit(&vd->vdev_stat_lock);
2994 3002  
                   /* The root's totals only track the normal class. */
2995 3003          if (mc == spa_normal_class(spa)) {
2996 3004                  mutex_enter(&rvd->vdev_stat_lock);
2997 3005                  rvd->vdev_stat.vs_alloc += alloc_delta;
2998 3006                  rvd->vdev_stat.vs_space += space_delta;
2999 3007                  rvd->vdev_stat.vs_dspace += dspace_delta;
3000 3008                  mutex_exit(&rvd->vdev_stat_lock);
3001 3009          }
3002 3010  
3003 3011          if (mc != NULL) {
3004 3012                  ASSERT(rvd == vd->vdev_parent);
3005 3013                  ASSERT(vd->vdev_ms_count != 0);
3006 3014  
3007 3015                  metaslab_class_space_update(mc,
3008 3016                      alloc_delta, defer_delta, space_delta, dspace_delta);
3009 3017          }
3010 3018  }
3011 3019  
3012 3020  /*
3013 3021   * Mark a top-level vdev's config as dirty, placing it on the dirty list
3014 3022   * so that it will be written out next time the vdev configuration is synced.
3015 3023   * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
            *
            * Aux vdevs (vd->vdev_aux != NULL) never go on the dirty list; their
            * entry in the aux config nvlist array is regenerated in place instead.
3016 3024   */
3017 3025  void
3018 3026  vdev_config_dirty(vdev_t *vd)
3019 3027  {
3020 3028          spa_t *spa = vd->vdev_spa;
3021 3029          vdev_t *rvd = spa->spa_root_vdev;
3022 3030          int c;
3023 3031  
3024 3032          ASSERT(spa_writeable(spa));
3025 3033  
3026 3034          /*
3027 3035           * If this is an aux vdev (as with l2cache and spare devices), then we
3028 3036           * update the vdev config manually and set the sync flag.
3029 3037           */
3030 3038          if (vd->vdev_aux != NULL) {
3031 3039                  spa_aux_vdev_t *sav = vd->vdev_aux;
3032 3040                  nvlist_t **aux;
3033 3041                  uint_t naux;
3034 3042  
                           /* Find vd's index in the aux vdev array. */
3035 3043                  for (c = 0; c < sav->sav_count; c++) {
3036 3044                          if (sav->sav_vdevs[c] == vd)
3037 3045                                  break;
3038 3046                  }
3039 3047  
3040 3048                  if (c == sav->sav_count) {
3041 3049                          /*
3042 3050                           * We're being removed.  There's nothing more to do.
3043 3051                           */
3044 3052                          ASSERT(sav->sav_sync == B_TRUE);
3045 3053                          return;
3046 3054                  }
3047 3055  
3048 3056                  sav->sav_sync = B_TRUE;
3049 3057  
                           /*
                            * The array is stored under the l2cache key or, failing
                            * that, under the spares key; one of the two must exist.
                            */
3050 3058                  if (nvlist_lookup_nvlist_array(sav->sav_config,
3051 3059                      ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
3052 3060                          VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
3053 3061                              ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
3054 3062                  }
3055 3063  
3056 3064                  ASSERT(c < naux);
3057 3065  
3058 3066                  /*
3059 3067                   * Setting the nvlist in the middle of the array is a little
3060 3068                   * sketchy, but it will work.
3061 3069                   */
3062 3070                  nvlist_free(aux[c]);
3063 3071                  aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
3064 3072  
3065 3073                  return;
3066 3074          }
3067 3075  
3068 3076          /*
3069 3077           * The dirty list is protected by the SCL_CONFIG lock.  The caller
3070 3078           * must either hold SCL_CONFIG as writer, or must be the sync thread
3071 3079           * (which holds SCL_CONFIG as reader).  There's only one sync thread,
3072 3080           * so this is sufficient to ensure mutual exclusion.
3073 3081           */
3074 3082          ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
3075 3083              (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3076 3084              spa_config_held(spa, SCL_CONFIG, RW_READER)));
3077 3085  
3078 3086          if (vd == rvd) {
3079 3087                  for (c = 0; c < rvd->vdev_children; c++)
3080 3088                          vdev_config_dirty(rvd->vdev_child[c]);
3081 3089          } else {
3082 3090                  ASSERT(vd == vd->vdev_top);
3083 3091  
                           /* Skip holes and vdevs that are already on the list. */
3084 3092                  if (!list_link_active(&vd->vdev_config_dirty_node) &&
3085 3093                      !vd->vdev_ishole)
3086 3094                          list_insert_head(&spa->spa_config_dirty_list, vd);
3087 3095          }
3088 3096  }
3089 3097  
3090 3098  void
3091 3099  vdev_config_clean(vdev_t *vd)
3092 3100  {
                   /*
                    * Take vd off the pool's config dirty list.  The caller must
                    * hold SCL_CONFIG as writer, or be in sync context holding it
                    * as reader, and vd must currently be on the list.
                    */
3093 3101          spa_t *spa = vd->vdev_spa;
3094 3102  
3095 3103          ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
3096 3104              (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3097 3105              spa_config_held(spa, SCL_CONFIG, RW_READER)));
3098 3106  
3099 3107          ASSERT(list_link_active(&vd->vdev_config_dirty_node));
3100 3108          list_remove(&spa->spa_config_dirty_list, vd);
3101 3109  }
3102 3110  
3103 3111  /*
3104 3112   * Mark a top-level vdev's state as dirty, so that the next pass of
3105 3113   * spa_sync() can convert this into vdev_config_dirty().  We distinguish
3106 3114   * the state changes from larger config changes because they require
3107 3115   * much less locking, and are often needed for administrative actions.
            *
            * vd must be a top-level vdev of a writeable pool.
3108 3116   */
3109 3117  void
3110 3118  vdev_state_dirty(vdev_t *vd)
3111 3119  {
3112 3120          spa_t *spa = vd->vdev_spa;
3113 3121  
3114 3122          ASSERT(spa_writeable(spa));
3115 3123          ASSERT(vd == vd->vdev_top);
3116 3124  
3117 3125          /*
3118 3126           * The state list is protected by the SCL_STATE lock.  The caller
3119 3127           * must either hold SCL_STATE as writer, or must be the sync thread
3120 3128           * (which holds SCL_STATE as reader).  There's only one sync thread,
3121 3129           * so this is sufficient to ensure mutual exclusion.
3122 3130           */
3123 3131          ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
3124 3132              (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3125 3133              spa_config_held(spa, SCL_STATE, RW_READER)));
3126 3134  
                   /* Skip holes and vdevs that are already on the list. */
3127 3135          if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
3128 3136                  list_insert_head(&spa->spa_state_dirty_list, vd);
3129 3137  }
3130 3138  
3131 3139  void
3132 3140  vdev_state_clean(vdev_t *vd)
3133 3141  {
                   /*
                    * Take vd off the pool's state dirty list.  The caller must
                    * hold SCL_STATE as writer, or be in sync context holding it
                    * as reader, and vd must currently be on the list.
                    */
3134 3142          spa_t *spa = vd->vdev_spa;
3135 3143  
3136 3144          ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
3137 3145              (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3138 3146              spa_config_held(spa, SCL_STATE, RW_READER)));
3139 3147  
3140 3148          ASSERT(list_link_active(&vd->vdev_state_dirty_node));
3141 3149          list_remove(&spa->spa_state_dirty_list, vd);
3142 3150  }
3143 3151  
3144 3152  /*
3145 3153   * Propagate vdev state up from children to parent.
            *
            * For a vdev with children, count how many non-hole children are faulted
            * or degraded and hand the counts to the vdev's vdev_op_state_change
            * method.  The same is then done for each ancestor in turn.
3146 3154   */
3147 3155  void
3148 3156  vdev_propagate_state(vdev_t *vd)
3149 3157  {
3150 3158          spa_t *spa = vd->vdev_spa;
3151 3159          vdev_t *rvd = spa->spa_root_vdev;
3152 3160          int degraded = 0, faulted = 0;
3153 3161          int corrupted = 0;
3154 3162          vdev_t *child;
3155 3163  
3156 3164          if (vd->vdev_children > 0) {
3157 3165                  for (int c = 0; c < vd->vdev_children; c++) {
3158 3166                          child = vd->vdev_child[c];
3159 3167  
3160 3168                          /*
3161 3169                           * Don't factor holes into the decision.
3162 3170                           */
3163 3171                          if (child->vdev_ishole)
3164 3172                                  continue;
3165 3173  
                                   /*
                                    * A child counts as faulted when it cannot be
                                    * read, or cannot be written while the pool is
                                    * writeable.
                                    */
3166 3174                          if (!vdev_readable(child) ||
3167 3175                              (!vdev_writeable(child) && spa_writeable(spa))) {
3168 3176                                  /*
3169 3177                                   * Root special: if there is a top-level log
3170 3178                                   * device, treat the root vdev as if it were
3171 3179                                   * degraded.
3172 3180                                   */
3173 3181                                  if (child->vdev_islog && vd == rvd)
3174 3182                                          degraded++;
3175 3183                                  else
3176 3184                                          faulted++;
3177 3185                          } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
3178 3186                                  degraded++;
3179 3187                          }
3180 3188  
3181 3189                          if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
3182 3190                                  corrupted++;
3183 3191                  }
3184 3192  
3185 3193                  vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
3186 3194  
3187 3195                  /*
3188 3196                   * Root special: if there is a top-level vdev that cannot be
3189 3197                   * opened due to corrupted metadata, then propagate the root
3190 3198                   * vdev's aux state as 'corrupt' rather than 'insufficient
3191 3199                   * replicas'.
3192 3200                   */
3193 3201                  if (corrupted && vd == rvd &&
3194 3202                      rvd->vdev_state == VDEV_STATE_CANT_OPEN)
3195 3203                          vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
3196 3204                              VDEV_AUX_CORRUPT_DATA);
3197 3205          }
3198 3206  
                   /* Continue upward; the recursion stops at the root. */
3199 3207          if (vd->vdev_parent)
3200 3208                  vdev_propagate_state(vd->vdev_parent);
3201 3209  }
3202 3210  
3203 3211  /*
3204 3212   * Set a vdev's state.  If this is during an open, we don't update the parent
3205 3213   * state, because we're in the process of opening children depth-first.
3206 3214   * Otherwise, we propagate the change to the parent.
3207 3215   *
3208 3216   * If this routine places a device in a faulted state, an appropriate ereport is
3209 3217   * generated.
            *
            * state and aux are stored in vd->vdev_state and vd->vdev_stat.vs_aux.
            * When isopen is true the parent's state is left alone.
3210 3218   */
3211 3219  void
3212 3220  vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
3213 3221  {
3214 3222          uint64_t save_state;
3215 3223          spa_t *spa = vd->vdev_spa;
3216 3224  
                   /*
                    * Same state as before: only refresh the aux reason.  Nothing
                    * below (close, ereport, propagation) runs in this case.
                    */
3217 3225          if (state == vd->vdev_state) {
3218 3226                  vd->vdev_stat.vs_aux = aux;
3219 3227                  return;
3220 3228          }
3221 3229  
                   /* Remember the old state; it is passed to the ereport below. */
3222 3230          save_state = vd->vdev_state;
3223 3231  
3224 3232          vd->vdev_state = state;
3225 3233          vd->vdev_stat.vs_aux = aux;
3226 3234  
3227 3235          /*
3228 3236           * If we are setting the vdev state to anything but an open state, then
3229 3237           * always close the underlying device unless the device has requested
3230 3238           * a delayed close (i.e. we're about to remove or fault the device).
3231 3239           * Otherwise, we keep accessible but invalid devices open forever.
3232 3240           * We don't call vdev_close() itself, because that implies some extra
3233 3241           * checks (offline, etc) that we don't want here.  This is limited to
3234 3242           * leaf devices, because otherwise closing the device will affect other
3235 3243           * children.
3236 3244           */
3237 3245          if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
3238 3246              vd->vdev_ops->vdev_op_leaf)
3239 3247                  vd->vdev_ops->vdev_op_close(vd);
3240 3248  
3241 3249          /*
3242 3250           * If we have brought this vdev back into service, we need
3243 3251           * to notify fmd so that it can gracefully repair any outstanding
3244 3252           * cases due to a missing device.  We do this in all cases, even those
3245 3253           * that probably don't correlate to a repaired fault.  This is sure to
3246 3254           * catch all cases, and we let the zfs-retire agent sort it out.  If
3247 3255           * this is a transient state it's OK, as the retire agent will
3248 3256           * double-check the state of the vdev before repairing it.
3249 3257           */
3250 3258          if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
3251 3259              vd->vdev_prevstate != state)
3252 3260                  zfs_post_state_change(spa, vd);
3253 3261  
3254 3262          if (vd->vdev_removed &&
3255 3263              state == VDEV_STATE_CANT_OPEN &&
3256 3264              (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
3257 3265                  /*
3258 3266                   * If the previous state is set to VDEV_STATE_REMOVED, then this
3259 3267                   * device was previously marked removed and someone attempted to
3260 3268                   * reopen it.  If this failed due to a nonexistent device, then
3261 3269                   * keep the device in the REMOVED state.  We also let this be if
3262 3270                   * it is one of our special test online cases, which is only
3263 3271                   * attempting to online the device and shouldn't generate an FMA
3264 3272                   * fault.
3265 3273                   */
3266 3274                  vd->vdev_state = VDEV_STATE_REMOVED;
3267 3275                  vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
3268 3276          } else if (state == VDEV_STATE_REMOVED) {
3269 3277                  vd->vdev_removed = B_TRUE;
3270 3278          } else if (state == VDEV_STATE_CANT_OPEN) {
3271 3279                  /*
3272 3280                   * If we fail to open a vdev during an import or recovery, we
3273 3281                   * mark it as "not available", which signifies that it was
3274 3282                   * never there to begin with.  Failure to open such a device
3275 3283                   * is not considered an error.
3276 3284                   */
3277 3285                  if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
3278 3286                      spa_load_state(spa) == SPA_LOAD_RECOVER) &&
3279 3287                      vd->vdev_ops->vdev_op_leaf)
3280 3288                          vd->vdev_not_present = 1;
3281 3289  
3282 3290                  /*
3283 3291                   * Post the appropriate ereport.  If the 'prevstate' field is
3284 3292                   * set to something other than VDEV_STATE_UNKNOWN, it indicates
3285 3293                   * that this is part of a vdev_reopen().  In this case, we don't
3286 3294                   * want to post the ereport if the device was already in the
3287 3295                   * CANT_OPEN state beforehand.
3288 3296                   *
3289 3297                   * If the 'checkremove' flag is set, then this is an attempt to
3290 3298                   * online the device in response to an insertion event.  If we
3291 3299                   * hit this case, then we have detected an insertion event for a
3292 3300                   * faulted or offline device that wasn't in the removed state.
3293 3301                   * In this scenario, we don't post an ereport because we are
3294 3302                   * about to replace the device, or attempt an online with
3295 3303                   * vdev_forcefault, which will generate the fault for us.
                            *
                            * No ereport is ever posted for the root vdev.
3296 3304                   */
3297 3305                  if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
3298 3306                      !vd->vdev_not_present && !vd->vdev_checkremove &&
3299 3307                      vd != spa->spa_root_vdev) {
3300 3308                          const char *class;
3301 3309  
                                   /* Map the aux reason to an ereport class. */
3302 3310                          switch (aux) {
3303 3311                          case VDEV_AUX_OPEN_FAILED:
3304 3312                                  class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
3305 3313                                  break;
3306 3314                          case VDEV_AUX_CORRUPT_DATA:
3307 3315                                  class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
3308 3316                                  break;
3309 3317                          case VDEV_AUX_NO_REPLICAS:
3310 3318                                  class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
3311 3319                                  break;
3312 3320                          case VDEV_AUX_BAD_GUID_SUM:
3313 3321                                  class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
3314 3322                                  break;
3315 3323                          case VDEV_AUX_TOO_SMALL:
3316 3324                                  class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
3317 3325                                  break;
3318 3326                          case VDEV_AUX_BAD_LABEL:
3319 3327                                  class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
3320 3328                                  break;
3321 3329                          default:
3322 3330                                  class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
3323 3331                          }
3324 3332  
3325 3333                          zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
3326 3334                  }
3327 3335  
3328 3336                  /* Erase any notion of persistent removed state */
3329 3337                  vd->vdev_removed = B_FALSE;
3330 3338          } else {
3331 3339                  vd->vdev_removed = B_FALSE;
3332 3340          }
3333 3341  
3334 3342          if (!isopen && vd->vdev_parent)
3335 3343                  vdev_propagate_state(vd->vdev_parent);
3336 3344  }
3337 3345  
3338 3346  /*
3339 3347   * Check the vdev configuration to ensure that it's capable of supporting
3340 3348   * a root pool. We do not support partial configuration.
3341 3349   * In addition, only a single top-level vdev is allowed.
            *
            * Returns B_TRUE only if neither vd nor any vdev below it fails these
            * checks.
3342 3350   */
3343 3351  boolean_t
3344 3352  vdev_is_bootable(vdev_t *vd)
3345 3353  {
3346 3354          if (!vd->vdev_ops->vdev_op_leaf) {
3347 3355                  char *vdev_type = vd->vdev_ops->vdev_op_type;
3348 3356  
                           /*
                            * Reject a root vdev with more than one child, and any
                            * interior vdev of the "missing" type.
                            */
3349 3357                  if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
3350 3358                      vd->vdev_children > 1) {
3351 3359                          return (B_FALSE);
3352 3360                  } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
3353 3361                          return (B_FALSE);
3354 3362                  }
3355 3363          }
3356 3364  
                   /* One unbootable descendant makes the whole subtree unbootable. */
3357 3365          for (int c = 0; c < vd->vdev_children; c++) {
3358 3366                  if (!vdev_is_bootable(vd->vdev_child[c]))
3359 3367                          return (B_FALSE);
3360 3368          }
3361 3369          return (B_TRUE);
3362 3370  }
3363 3371  
3364 3372  /*
3365 3373   * Load the state from the original vdev tree (ovd) which
3366 3374   * we've retrieved from the MOS config object. If the original
3367 3375   * vdev was offline or faulted then we transfer that state to the
3368 3376   * device in the current vdev tree (nvd).
            *
            * The two trees are walked in parallel, pairing children by index, and
            * each pair is asserted to share a guid.  Only leaf vdevs receive the
            * offline, faulted, degraded and removed flags.
3369 3377   */
3370 3378  void
3371 3379  vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
3372 3380  {
3373 3381          spa_t *spa = nvd->vdev_spa;
3374 3382  
3375 3383          ASSERT(nvd->vdev_top->vdev_islog);
3376 3384          ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
3377 3385          ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
3378 3386  
3379 3387          for (int c = 0; c < nvd->vdev_children; c++)
3380 3388                  vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
3381 3389  
3382 3390          if (nvd->vdev_ops->vdev_op_leaf) {
3383 3391                  /*
3384 3392                   * Restore the persistent vdev state
3385 3393                   */
3386 3394                  nvd->vdev_offline = ovd->vdev_offline;
3387 3395                  nvd->vdev_faulted = ovd->vdev_faulted;
3388 3396                  nvd->vdev_degraded = ovd->vdev_degraded;
3389 3397                  nvd->vdev_removed = ovd->vdev_removed;
3390 3398          }
3391 3399  }
3392 3400  
3393 3401  /*
3394 3402   * Determine if a log device has valid content.  If the vdev was
3395 3403   * removed or faulted in the MOS config then we know that
3396 3404   * the content on the log device has already been written to the pool.
            *
            * Returns B_TRUE if vd, or any vdev below it, is a leaf that is neither
            * faulted nor removed; B_FALSE otherwise.
3397 3405   */
3398 3406  boolean_t
3399 3407  vdev_log_state_valid(vdev_t *vd)
3400 3408  {
3401 3409          if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
3402 3410              !vd->vdev_removed)
3403 3411                  return (B_TRUE);
3404 3412  
                   /* A single valid descendant is enough. */
3405 3413          for (int c = 0; c < vd->vdev_children; c++)
3406 3414                  if (vdev_log_state_valid(vd->vdev_child[c]))
3407 3415                          return (B_TRUE);
3408 3416  
3409 3417          return (B_FALSE);
3410 3418  }
3411 3419  
3412 3420  /*
3413 3421   * Expand a vdev if possible.
            *
            * vd must be a top-level vdev and the caller must hold every config
            * lock (SCL_ALL) as writer.  txg is passed through to
            * vdev_metaslab_init().
3414 3422   */
3415 3423  void
3416 3424  vdev_expand(vdev_t *vd, uint64_t txg)
3417 3425  {
3418 3426          ASSERT(vd->vdev_top == vd);
3419 3427          ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3420 3428  
                   /*
                    * Act only when vdev_asize, in units of (1 << vdev_ms_shift),
                    * exceeds the current metaslab count.  Metaslab initialization
                    * must succeed (VERIFY); afterwards the config is marked dirty.
                    */
3421 3429          if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
3422 3430                  VERIFY(vdev_metaslab_init(vd, txg) == 0);
3423 3431                  vdev_config_dirty(vd);
3424 3432          }
3425 3433  }
3426 3434  
3427 3435  /*
3428 3436   * Split a vdev.
            *
            * Removes vd from its parent and compacts the parent's child array.  If
            * exactly one child then remains, vdev_remove_parent() is called on that
            * child and it is flagged as splitting.  State is finally propagated
            * upward starting from the parent's first remaining child.
3429 3437   */
3430 3438  void
3431 3439  vdev_split(vdev_t *vd)
3432 3440  {
3433 3441          vdev_t *cvd, *pvd = vd->vdev_parent;
3434 3442  
3435 3443          vdev_remove_child(pvd, vd);
3436 3444          vdev_compact_children(pvd);
3437 3445  
                   /*
                    * NOTE(review): vdev_child[0] is read unconditionally, so this
                    * presumably relies on the parent keeping at least one child
                    * after the removal -- confirm against callers.
                    */
3438 3446          cvd = pvd->vdev_child[0];
3439 3447          if (pvd->vdev_children == 1) {
3440 3448                  vdev_remove_parent(cvd);
3441 3449                  cvd->vdev_splitting = B_TRUE;
3442 3450          }
3443 3451          vdev_propagate_state(cvd);
3444 3452  }
3445 3453  
3446 3454  void
3447 3455  vdev_deadman(vdev_t *vd)
3448 3456  {
                   /*
                    * Walk the vdev tree rooted at vd.  For every leaf whose queue
                    * has active i/o, compare the age of the first zio in
                    * vq_active_tree against spa_deadman_synctime().  If it is
                    * older, log a debug message and panic via fm_panic().
                    */
3449 3457          for (int c = 0; c < vd->vdev_children; c++) {
3450 3458                  vdev_t *cvd = vd->vdev_child[c];
3451 3459  
3452 3460                  vdev_deadman(cvd);
3453 3461          }
3454 3462  
3455 3463          if (vd->vdev_ops->vdev_op_leaf) {
3456 3464                  vdev_queue_t *vq = &vd->vdev_queue;
3457 3465  
                           /* The queue lock is held for the whole inspection. */
3458 3466                  mutex_enter(&vq->vq_lock);
3459 3467                  if (avl_numnodes(&vq->vq_active_tree) > 0) {
3460 3468                          spa_t *spa = vd->vdev_spa;
3461 3469                          zio_t *fio;
3462 3470                          uint64_t delta;
3463 3471  
3464 3472                          /*
3465 3473                           * Look at the head of all the pending queues,
3466 3474                           * if any I/O has been outstanding for longer than
3467 3475                           * the spa_deadman_synctime we panic the system.
                                    *
                                    * Only the first entry of this leaf's active tree
                                    * is examined here; the recursion above covers the
                                    * other leaves.
3468 3476                           */
3469 3477                          fio = avl_first(&vq->vq_active_tree);
3470 3478                          delta = gethrtime() - fio->io_timestamp;
3471 3479                          if (delta > spa_deadman_synctime(spa)) {
3472 3480                                  zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
3473 3481                                      "delta %lluns, last io %lluns",
3474 3482                                      fio->io_timestamp, delta,
3475 3483                                      vq->vq_io_complete_ts);
3476 3484                                  fm_panic("I/O to pool '%s' appears to be "
3477 3485                                      "hung.", spa_name(spa));
3478 3486                          }
3479 3487                  }
3480 3488                  mutex_exit(&vq->vq_lock);
3481 3489          }
3482 3490  }

↓ open down ↓

2322 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX