5269 zfs: zpool import slow
PORTING: this code relies on the property of taskq_wait to wait
until no more tasks are queued and no more tasks are active. As
we always queue new tasks from within other tasks, taskq_wait
reliably waits for the full recursion to finish, even though we
enqueue new tasks after taskq_wait has been called.
On platforms other than illumos, taskq_wait may not have this
property.
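
For illustration only (this is not part of the webrev): a minimal sketch of the dispatch-from-within-a-task pattern the PORTING note describes, using the same illumos taskq calls that appear later in vdev.c (taskq_create, taskq_dispatch, taskq_wait, taskq_destroy). The names load_tq, vdev_load_task, and vdev_load_tree are hypothetical; the point is that the single taskq_wait() only returns once the whole recursion of dispatches has drained.

#include <sys/zfs_context.h>
#include <sys/vdev_impl.h>

static taskq_t *load_tq;	/* shared by every task in this sketch */

static void
vdev_load_task(void *arg)
{
	vdev_t *vd = arg;

	/* ... per-vdev work would go here ... */

	/*
	 * Queue the children from within this task.  On illumos the
	 * taskq_wait() in vdev_load_tree() does not return until no
	 * tasks are queued and none are active, so it also covers
	 * these later dispatches.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		VERIFY(taskq_dispatch(load_tq, vdev_load_task,
		    vd->vdev_child[c], TQ_SLEEP) != NULL);
}

static void
vdev_load_tree(vdev_t *rvd)
{
	load_tq = taskq_create("vdev_load", 8, minclsyspri, 8, INT_MAX,
	    TASKQ_PREPOPULATE);

	VERIFY(taskq_dispatch(load_tq, vdev_load_task, rvd,
	    TQ_SLEEP) != NULL);

	/*
	 * Waits for the entire recursion, including tasks enqueued
	 * after this call was made.  Other platforms may need an
	 * explicit completion count instead.
	 */
	taskq_wait(load_tq);
	taskq_destroy(load_tq);
}
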
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
--- old/usr/src/uts/common/fs/zfs/vdev.c
+++ new/usr/src/uts/common/fs/zfs/vdev.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
25 25 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
26 26 */
27 27
28 28 #include <sys/zfs_context.h>
29 29 #include <sys/fm/fs/zfs.h>
30 30 #include <sys/spa.h>
31 31 #include <sys/spa_impl.h>
32 32 #include <sys/dmu.h>
33 33 #include <sys/dmu_tx.h>
34 34 #include <sys/vdev_impl.h>
35 35 #include <sys/uberblock_impl.h>
36 36 #include <sys/metaslab.h>
37 37 #include <sys/metaslab_impl.h>
38 38 #include <sys/space_map.h>
39 39 #include <sys/space_reftree.h>
40 40 #include <sys/zio.h>
41 41 #include <sys/zap.h>
42 42 #include <sys/fs/zfs.h>
43 43 #include <sys/arc.h>
44 44 #include <sys/zil.h>
45 45 #include <sys/dsl_scan.h>
46 46
47 47 /*
48 48 * Virtual device management.
49 49 */
50 50
51 51 static vdev_ops_t *vdev_ops_table[] = {
52 52 &vdev_root_ops,
53 53 &vdev_raidz_ops,
54 54 &vdev_mirror_ops,
55 55 &vdev_replacing_ops,
56 56 &vdev_spare_ops,
57 57 &vdev_disk_ops,
58 58 &vdev_file_ops,
59 59 &vdev_missing_ops,
60 60 &vdev_hole_ops,
61 61 NULL
62 62 };
63 63
64 64 /* maximum scrub/resilver I/O queue per leaf vdev */
65 65 int zfs_scrub_limit = 10;
66 66
67 67 /*
68 68 * When a vdev is added, it will be divided into approximately (but no
69 69 * more than) this number of metaslabs.
70 70 */
71 71 int metaslabs_per_vdev = 200;
72 72
73 73 /*
74 74 * Given a vdev type, return the appropriate ops vector.
75 75 */
76 76 static vdev_ops_t *
77 77 vdev_getops(const char *type)
78 78 {
79 79 vdev_ops_t *ops, **opspp;
80 80
81 81 for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
82 82 if (strcmp(ops->vdev_op_type, type) == 0)
83 83 break;
84 84
85 85 return (ops);
86 86 }
87 87
88 88 /*
89 89 * Default asize function: return the MAX of psize with the asize of
90 90 * all children. This is what's used by anything other than RAID-Z.
91 91 */
92 92 uint64_t
93 93 vdev_default_asize(vdev_t *vd, uint64_t psize)
94 94 {
95 95 uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
96 96 uint64_t csize;
97 97
98 98 for (int c = 0; c < vd->vdev_children; c++) {
99 99 csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
100 100 asize = MAX(asize, csize);
101 101 }
102 102
103 103 return (asize);
104 104 }
105 105
106 106 /*
107 107 * Get the minimum allocatable size. We define the allocatable size as
108 108 * the vdev's asize rounded to the nearest metaslab. This allows us to
109 109 * replace or attach devices which don't have the same physical size but
110 110 * can still satisfy the same number of allocations.
111 111 */
112 112 uint64_t
113 113 vdev_get_min_asize(vdev_t *vd)
114 114 {
115 115 vdev_t *pvd = vd->vdev_parent;
116 116
117 117 /*
118 118 * If our parent is NULL (inactive spare or cache) or is the root,
119 119 * just return our own asize.
120 120 */
121 121 if (pvd == NULL)
122 122 return (vd->vdev_asize);
123 123
124 124 /*
125 125 * The top-level vdev just returns the allocatable size rounded
126 126 * to the nearest metaslab.
127 127 */
128 128 if (vd == vd->vdev_top)
129 129 return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
130 130
131 131 /*
132 132 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
133 133 * so each child must provide at least 1/Nth of its asize.
134 134 */
135 135 if (pvd->vdev_ops == &vdev_raidz_ops)
136 136 return (pvd->vdev_min_asize / pvd->vdev_children);
137 137
138 138 return (pvd->vdev_min_asize);
139 139 }
140 140
141 141 void
142 142 vdev_set_min_asize(vdev_t *vd)
143 143 {
144 144 vd->vdev_min_asize = vdev_get_min_asize(vd);
145 145
146 146 for (int c = 0; c < vd->vdev_children; c++)
147 147 vdev_set_min_asize(vd->vdev_child[c]);
148 148 }
149 149
150 150 vdev_t *
151 151 vdev_lookup_top(spa_t *spa, uint64_t vdev)
152 152 {
153 153 vdev_t *rvd = spa->spa_root_vdev;
154 154
155 155 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
156 156
157 157 if (vdev < rvd->vdev_children) {
158 158 ASSERT(rvd->vdev_child[vdev] != NULL);
159 159 return (rvd->vdev_child[vdev]);
160 160 }
161 161
162 162 return (NULL);
163 163 }
164 164
165 165 vdev_t *
166 166 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
167 167 {
168 168 vdev_t *mvd;
169 169
170 170 if (vd->vdev_guid == guid)
171 171 return (vd);
172 172
173 173 for (int c = 0; c < vd->vdev_children; c++)
174 174 if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
175 175 NULL)
176 176 return (mvd);
177 177
178 178 return (NULL);
179 179 }
180 180
181 +static int
182 +vdev_count_leaves_impl(vdev_t *vd)
183 +{
184 + int n = 0;
185 +
186 + if (vd->vdev_ops->vdev_op_leaf)
187 + return (1);
188 +
189 + for (int c = 0; c < vd->vdev_children; c++)
190 + n += vdev_count_leaves_impl(vd->vdev_child[c]);
191 +
192 + return (n);
193 +}
194 +
195 +int
196 +vdev_count_leaves(spa_t *spa)
197 +{
198 + return (vdev_count_leaves_impl(spa->spa_root_vdev));
199 +}
200 +
181 201 void
182 202 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
183 203 {
184 204 size_t oldsize, newsize;
185 205 uint64_t id = cvd->vdev_id;
186 206 vdev_t **newchild;
187 207
188 208 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
189 209 ASSERT(cvd->vdev_parent == NULL);
190 210
191 211 cvd->vdev_parent = pvd;
192 212
193 213 if (pvd == NULL)
194 214 return;
195 215
196 216 ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
197 217
198 218 oldsize = pvd->vdev_children * sizeof (vdev_t *);
199 219 pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
200 220 newsize = pvd->vdev_children * sizeof (vdev_t *);
201 221
202 222 newchild = kmem_zalloc(newsize, KM_SLEEP);
203 223 if (pvd->vdev_child != NULL) {
204 224 bcopy(pvd->vdev_child, newchild, oldsize);
205 225 kmem_free(pvd->vdev_child, oldsize);
206 226 }
207 227
208 228 pvd->vdev_child = newchild;
209 229 pvd->vdev_child[id] = cvd;
210 230
211 231 cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
212 232 ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
213 233
214 234 /*
215 235 * Walk up all ancestors to update guid sum.
216 236 */
217 237 for (; pvd != NULL; pvd = pvd->vdev_parent)
218 238 pvd->vdev_guid_sum += cvd->vdev_guid_sum;
219 239 }
220 240
221 241 void
222 242 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
223 243 {
224 244 int c;
225 245 uint_t id = cvd->vdev_id;
226 246
227 247 ASSERT(cvd->vdev_parent == pvd);
228 248
229 249 if (pvd == NULL)
230 250 return;
231 251
232 252 ASSERT(id < pvd->vdev_children);
233 253 ASSERT(pvd->vdev_child[id] == cvd);
234 254
235 255 pvd->vdev_child[id] = NULL;
236 256 cvd->vdev_parent = NULL;
237 257
238 258 for (c = 0; c < pvd->vdev_children; c++)
239 259 if (pvd->vdev_child[c])
240 260 break;
241 261
242 262 if (c == pvd->vdev_children) {
243 263 kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
244 264 pvd->vdev_child = NULL;
245 265 pvd->vdev_children = 0;
246 266 }
247 267
248 268 /*
249 269 * Walk up all ancestors to update guid sum.
250 270 */
251 271 for (; pvd != NULL; pvd = pvd->vdev_parent)
252 272 pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
253 273 }
254 274
255 275 /*
256 276 * Remove any holes in the child array.
257 277 */
258 278 void
259 279 vdev_compact_children(vdev_t *pvd)
260 280 {
261 281 vdev_t **newchild, *cvd;
262 282 int oldc = pvd->vdev_children;
263 283 int newc;
264 284
265 285 ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
266 286
267 287 for (int c = newc = 0; c < oldc; c++)
268 288 if (pvd->vdev_child[c])
269 289 newc++;
270 290
271 291 newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
272 292
273 293 for (int c = newc = 0; c < oldc; c++) {
274 294 if ((cvd = pvd->vdev_child[c]) != NULL) {
275 295 newchild[newc] = cvd;
276 296 cvd->vdev_id = newc++;
277 297 }
278 298 }
279 299
280 300 kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
281 301 pvd->vdev_child = newchild;
282 302 pvd->vdev_children = newc;
283 303 }
284 304
285 305 /*
286 306 * Allocate and minimally initialize a vdev_t.
287 307 */
288 308 vdev_t *
289 309 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
290 310 {
291 311 vdev_t *vd;
292 312
293 313 vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
294 314
295 315 if (spa->spa_root_vdev == NULL) {
296 316 ASSERT(ops == &vdev_root_ops);
297 317 spa->spa_root_vdev = vd;
298 318 spa->spa_load_guid = spa_generate_guid(NULL);
299 319 }
300 320
301 321 if (guid == 0 && ops != &vdev_hole_ops) {
302 322 if (spa->spa_root_vdev == vd) {
303 323 /*
304 324 * The root vdev's guid will also be the pool guid,
305 325 * which must be unique among all pools.
306 326 */
307 327 guid = spa_generate_guid(NULL);
308 328 } else {
309 329 /*
310 330 * Any other vdev's guid must be unique within the pool.
311 331 */
312 332 guid = spa_generate_guid(spa);
313 333 }
314 334 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
315 335 }
316 336
317 337 vd->vdev_spa = spa;
318 338 vd->vdev_id = id;
319 339 vd->vdev_guid = guid;
320 340 vd->vdev_guid_sum = guid;
321 341 vd->vdev_ops = ops;
322 342 vd->vdev_state = VDEV_STATE_CLOSED;
323 343 vd->vdev_ishole = (ops == &vdev_hole_ops);
324 344
325 345 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
326 346 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
327 347 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
328 348 for (int t = 0; t < DTL_TYPES; t++) {
329 349 vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
330 350 &vd->vdev_dtl_lock);
331 351 }
332 352 txg_list_create(&vd->vdev_ms_list,
333 353 offsetof(struct metaslab, ms_txg_node));
334 354 txg_list_create(&vd->vdev_dtl_list,
335 355 offsetof(struct vdev, vdev_dtl_node));
336 356 vd->vdev_stat.vs_timestamp = gethrtime();
337 357 vdev_queue_init(vd);
338 358 vdev_cache_init(vd);
339 359
340 360 return (vd);
341 361 }
342 362
343 363 /*
344 364 * Allocate a new vdev. The 'alloctype' is used to control whether we are
345 365 * creating a new vdev or loading an existing one - the behavior is slightly
346 366 * different for each case.
347 367 */
348 368 int
349 369 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
350 370 int alloctype)
351 371 {
352 372 vdev_ops_t *ops;
353 373 char *type;
354 374 uint64_t guid = 0, islog, nparity;
355 375 vdev_t *vd;
356 376
357 377 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
358 378
359 379 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
360 380 return (SET_ERROR(EINVAL));
361 381
362 382 if ((ops = vdev_getops(type)) == NULL)
363 383 return (SET_ERROR(EINVAL));
364 384
365 385 /*
366 386 * If this is a load, get the vdev guid from the nvlist.
367 387 * Otherwise, vdev_alloc_common() will generate one for us.
368 388 */
369 389 if (alloctype == VDEV_ALLOC_LOAD) {
370 390 uint64_t label_id;
371 391
372 392 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
373 393 label_id != id)
374 394 return (SET_ERROR(EINVAL));
375 395
376 396 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
377 397 return (SET_ERROR(EINVAL));
378 398 } else if (alloctype == VDEV_ALLOC_SPARE) {
379 399 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
380 400 return (SET_ERROR(EINVAL));
381 401 } else if (alloctype == VDEV_ALLOC_L2CACHE) {
382 402 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
383 403 return (SET_ERROR(EINVAL));
384 404 } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
385 405 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
386 406 return (SET_ERROR(EINVAL));
387 407 }
388 408
389 409 /*
390 410 * The first allocated vdev must be of type 'root'.
391 411 */
392 412 if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
393 413 return (SET_ERROR(EINVAL));
394 414
395 415 /*
396 416 * Determine whether we're a log vdev.
397 417 */
398 418 islog = 0;
399 419 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
400 420 if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
401 421 return (SET_ERROR(ENOTSUP));
402 422
403 423 if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
404 424 return (SET_ERROR(ENOTSUP));
405 425
406 426 /*
407 427 * Set the nparity property for RAID-Z vdevs.
408 428 */
409 429 nparity = -1ULL;
410 430 if (ops == &vdev_raidz_ops) {
411 431 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
412 432 &nparity) == 0) {
413 433 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
414 434 return (SET_ERROR(EINVAL));
415 435 /*
416 436 * Previous versions could only support 1 or 2 parity
417 437 * device.
418 438 */
419 439 if (nparity > 1 &&
420 440 spa_version(spa) < SPA_VERSION_RAIDZ2)
421 441 return (SET_ERROR(ENOTSUP));
422 442 if (nparity > 2 &&
423 443 spa_version(spa) < SPA_VERSION_RAIDZ3)
424 444 return (SET_ERROR(ENOTSUP));
425 445 } else {
426 446 /*
427 447 * We require the parity to be specified for SPAs that
428 448 * support multiple parity levels.
429 449 */
430 450 if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
431 451 return (SET_ERROR(EINVAL));
432 452 /*
433 453 * Otherwise, we default to 1 parity device for RAID-Z.
434 454 */
435 455 nparity = 1;
436 456 }
437 457 } else {
438 458 nparity = 0;
439 459 }
440 460 ASSERT(nparity != -1ULL);
441 461
442 462 vd = vdev_alloc_common(spa, id, guid, ops);
443 463
444 464 vd->vdev_islog = islog;
445 465 vd->vdev_nparity = nparity;
446 466
447 467 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
448 468 vd->vdev_path = spa_strdup(vd->vdev_path);
449 469 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
450 470 vd->vdev_devid = spa_strdup(vd->vdev_devid);
451 471 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
452 472 &vd->vdev_physpath) == 0)
453 473 vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
454 474 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
455 475 vd->vdev_fru = spa_strdup(vd->vdev_fru);
456 476
457 477 /*
458 478 * Set the whole_disk property. If it's not specified, leave the value
459 479 * as -1.
460 480 */
461 481 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
462 482 &vd->vdev_wholedisk) != 0)
463 483 vd->vdev_wholedisk = -1ULL;
464 484
465 485 /*
466 486 * Look for the 'not present' flag. This will only be set if the device
467 487 * was not present at the time of import.
468 488 */
469 489 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
470 490 &vd->vdev_not_present);
471 491
472 492 /*
473 493 * Get the alignment requirement.
474 494 */
475 495 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
476 496
477 497 /*
478 498 * Retrieve the vdev creation time.
479 499 */
480 500 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
481 501 &vd->vdev_crtxg);
482 502
483 503 /*
484 504 * If we're a top-level vdev, try to load the allocation parameters.
485 505 */
486 506 if (parent && !parent->vdev_parent &&
487 507 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
488 508 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
489 509 &vd->vdev_ms_array);
490 510 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
491 511 &vd->vdev_ms_shift);
492 512 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
493 513 &vd->vdev_asize);
494 514 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
495 515 &vd->vdev_removing);
496 516 }
497 517
498 518 if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
499 519 ASSERT(alloctype == VDEV_ALLOC_LOAD ||
500 520 alloctype == VDEV_ALLOC_ADD ||
501 521 alloctype == VDEV_ALLOC_SPLIT ||
502 522 alloctype == VDEV_ALLOC_ROOTPOOL);
503 523 vd->vdev_mg = metaslab_group_create(islog ?
504 524 spa_log_class(spa) : spa_normal_class(spa), vd);
505 525 }
506 526
507 527 /*
508 528 * If we're a leaf vdev, try to load the DTL object and other state.
509 529 */
510 530 if (vd->vdev_ops->vdev_op_leaf &&
511 531 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
512 532 alloctype == VDEV_ALLOC_ROOTPOOL)) {
513 533 if (alloctype == VDEV_ALLOC_LOAD) {
514 534 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
515 535 &vd->vdev_dtl_object);
516 536 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
517 537 &vd->vdev_unspare);
518 538 }
519 539
520 540 if (alloctype == VDEV_ALLOC_ROOTPOOL) {
521 541 uint64_t spare = 0;
522 542
523 543 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
524 544 &spare) == 0 && spare)
525 545 spa_spare_add(vd);
526 546 }
527 547
528 548 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
529 549 &vd->vdev_offline);
530 550
531 551 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
532 552 &vd->vdev_resilver_txg);
533 553
534 554 /*
535 555 * When importing a pool, we want to ignore the persistent fault
536 556 * state, as the diagnosis made on another system may not be
537 557 * valid in the current context. Local vdevs will
538 558 * remain in the faulted state.
539 559 */
540 560 if (spa_load_state(spa) == SPA_LOAD_OPEN) {
541 561 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
542 562 &vd->vdev_faulted);
543 563 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
544 564 &vd->vdev_degraded);
545 565 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
546 566 &vd->vdev_removed);
547 567
548 568 if (vd->vdev_faulted || vd->vdev_degraded) {
549 569 char *aux;
550 570
551 571 vd->vdev_label_aux =
552 572 VDEV_AUX_ERR_EXCEEDED;
553 573 if (nvlist_lookup_string(nv,
554 574 ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
555 575 strcmp(aux, "external") == 0)
556 576 vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
557 577 }
558 578 }
559 579 }
560 580
561 581 /*
562 582 * Add ourselves to the parent's list of children.
563 583 */
564 584 vdev_add_child(parent, vd);
565 585
566 586 *vdp = vd;
567 587
568 588 return (0);
569 589 }
570 590
571 591 void
572 592 vdev_free(vdev_t *vd)
573 593 {
574 594 spa_t *spa = vd->vdev_spa;
575 595
576 596 /*
577 597 * vdev_free() implies closing the vdev first. This is simpler than
578 598 * trying to ensure complicated semantics for all callers.
579 599 */
580 600 vdev_close(vd);
581 601
582 602 ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
583 603 ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
584 604
585 605 /*
586 606 * Free all children.
587 607 */
588 608 for (int c = 0; c < vd->vdev_children; c++)
589 609 vdev_free(vd->vdev_child[c]);
590 610
591 611 ASSERT(vd->vdev_child == NULL);
592 612 ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
593 613
594 614 /*
595 615 * Discard allocation state.
596 616 */
597 617 if (vd->vdev_mg != NULL) {
598 618 vdev_metaslab_fini(vd);
599 619 metaslab_group_destroy(vd->vdev_mg);
600 620 }
601 621
602 622 ASSERT0(vd->vdev_stat.vs_space);
603 623 ASSERT0(vd->vdev_stat.vs_dspace);
604 624 ASSERT0(vd->vdev_stat.vs_alloc);
605 625
606 626 /*
607 627 * Remove this vdev from its parent's child list.
608 628 */
609 629 vdev_remove_child(vd->vdev_parent, vd);
610 630
611 631 ASSERT(vd->vdev_parent == NULL);
612 632
613 633 /*
614 634 * Clean up vdev structure.
615 635 */
616 636 vdev_queue_fini(vd);
617 637 vdev_cache_fini(vd);
618 638
619 639 if (vd->vdev_path)
620 640 spa_strfree(vd->vdev_path);
621 641 if (vd->vdev_devid)
622 642 spa_strfree(vd->vdev_devid);
623 643 if (vd->vdev_physpath)
624 644 spa_strfree(vd->vdev_physpath);
625 645 if (vd->vdev_fru)
626 646 spa_strfree(vd->vdev_fru);
627 647
628 648 if (vd->vdev_isspare)
629 649 spa_spare_remove(vd);
630 650 if (vd->vdev_isl2cache)
631 651 spa_l2cache_remove(vd);
632 652
633 653 txg_list_destroy(&vd->vdev_ms_list);
634 654 txg_list_destroy(&vd->vdev_dtl_list);
635 655
636 656 mutex_enter(&vd->vdev_dtl_lock);
637 657 space_map_close(vd->vdev_dtl_sm);
638 658 for (int t = 0; t < DTL_TYPES; t++) {
639 659 range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
640 660 range_tree_destroy(vd->vdev_dtl[t]);
641 661 }
642 662 mutex_exit(&vd->vdev_dtl_lock);
643 663
644 664 mutex_destroy(&vd->vdev_dtl_lock);
645 665 mutex_destroy(&vd->vdev_stat_lock);
646 666 mutex_destroy(&vd->vdev_probe_lock);
647 667
648 668 if (vd == spa->spa_root_vdev)
649 669 spa->spa_root_vdev = NULL;
650 670
651 671 kmem_free(vd, sizeof (vdev_t));
652 672 }
653 673
654 674 /*
655 675 * Transfer top-level vdev state from svd to tvd.
656 676 */
657 677 static void
658 678 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
659 679 {
660 680 spa_t *spa = svd->vdev_spa;
661 681 metaslab_t *msp;
662 682 vdev_t *vd;
663 683 int t;
664 684
665 685 ASSERT(tvd == tvd->vdev_top);
666 686
667 687 tvd->vdev_ms_array = svd->vdev_ms_array;
668 688 tvd->vdev_ms_shift = svd->vdev_ms_shift;
669 689 tvd->vdev_ms_count = svd->vdev_ms_count;
670 690
671 691 svd->vdev_ms_array = 0;
672 692 svd->vdev_ms_shift = 0;
673 693 svd->vdev_ms_count = 0;
674 694
675 695 if (tvd->vdev_mg)
676 696 ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
677 697 tvd->vdev_mg = svd->vdev_mg;
678 698 tvd->vdev_ms = svd->vdev_ms;
679 699
680 700 svd->vdev_mg = NULL;
681 701 svd->vdev_ms = NULL;
682 702
683 703 if (tvd->vdev_mg != NULL)
684 704 tvd->vdev_mg->mg_vd = tvd;
685 705
686 706 tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
687 707 tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
688 708 tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
689 709
690 710 svd->vdev_stat.vs_alloc = 0;
691 711 svd->vdev_stat.vs_space = 0;
692 712 svd->vdev_stat.vs_dspace = 0;
693 713
694 714 for (t = 0; t < TXG_SIZE; t++) {
695 715 while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
696 716 (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
697 717 while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
698 718 (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
699 719 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
700 720 (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
701 721 }
702 722
703 723 if (list_link_active(&svd->vdev_config_dirty_node)) {
704 724 vdev_config_clean(svd);
705 725 vdev_config_dirty(tvd);
706 726 }
707 727
708 728 if (list_link_active(&svd->vdev_state_dirty_node)) {
709 729 vdev_state_clean(svd);
710 730 vdev_state_dirty(tvd);
711 731 }
712 732
713 733 tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
714 734 svd->vdev_deflate_ratio = 0;
715 735
716 736 tvd->vdev_islog = svd->vdev_islog;
717 737 svd->vdev_islog = 0;
718 738 }
719 739
720 740 static void
721 741 vdev_top_update(vdev_t *tvd, vdev_t *vd)
722 742 {
723 743 if (vd == NULL)
724 744 return;
725 745
726 746 vd->vdev_top = tvd;
727 747
728 748 for (int c = 0; c < vd->vdev_children; c++)
729 749 vdev_top_update(tvd, vd->vdev_child[c]);
730 750 }
731 751
732 752 /*
733 753 * Add a mirror/replacing vdev above an existing vdev.
734 754 */
735 755 vdev_t *
736 756 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
737 757 {
738 758 spa_t *spa = cvd->vdev_spa;
739 759 vdev_t *pvd = cvd->vdev_parent;
740 760 vdev_t *mvd;
741 761
742 762 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
743 763
744 764 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
745 765
746 766 mvd->vdev_asize = cvd->vdev_asize;
747 767 mvd->vdev_min_asize = cvd->vdev_min_asize;
748 768 mvd->vdev_max_asize = cvd->vdev_max_asize;
749 769 mvd->vdev_ashift = cvd->vdev_ashift;
750 770 mvd->vdev_state = cvd->vdev_state;
751 771 mvd->vdev_crtxg = cvd->vdev_crtxg;
752 772
753 773 vdev_remove_child(pvd, cvd);
754 774 vdev_add_child(pvd, mvd);
755 775 cvd->vdev_id = mvd->vdev_children;
756 776 vdev_add_child(mvd, cvd);
757 777 vdev_top_update(cvd->vdev_top, cvd->vdev_top);
758 778
759 779 if (mvd == mvd->vdev_top)
760 780 vdev_top_transfer(cvd, mvd);
761 781
762 782 return (mvd);
763 783 }
764 784
765 785 /*
766 786 * Remove a 1-way mirror/replacing vdev from the tree.
767 787 */
768 788 void
769 789 vdev_remove_parent(vdev_t *cvd)
770 790 {
771 791 vdev_t *mvd = cvd->vdev_parent;
772 792 vdev_t *pvd = mvd->vdev_parent;
773 793
774 794 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
775 795
776 796 ASSERT(mvd->vdev_children == 1);
777 797 ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
778 798 mvd->vdev_ops == &vdev_replacing_ops ||
779 799 mvd->vdev_ops == &vdev_spare_ops);
780 800 cvd->vdev_ashift = mvd->vdev_ashift;
781 801
782 802 vdev_remove_child(mvd, cvd);
783 803 vdev_remove_child(pvd, mvd);
784 804
785 805 /*
786 806 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
787 807 * Otherwise, we could have detached an offline device, and when we
788 808 * go to import the pool we'll think we have two top-level vdevs,
789 809 * instead of a different version of the same top-level vdev.
790 810 */
791 811 if (mvd->vdev_top == mvd) {
792 812 uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
793 813 cvd->vdev_orig_guid = cvd->vdev_guid;
794 814 cvd->vdev_guid += guid_delta;
795 815 cvd->vdev_guid_sum += guid_delta;
796 816 }
797 817 cvd->vdev_id = mvd->vdev_id;
798 818 vdev_add_child(pvd, cvd);
799 819 vdev_top_update(cvd->vdev_top, cvd->vdev_top);
800 820
801 821 if (cvd == cvd->vdev_top)
802 822 vdev_top_transfer(mvd, cvd);
803 823
804 824 ASSERT(mvd->vdev_children == 0);
805 825 vdev_free(mvd);
806 826 }
807 827
808 828 int
809 829 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
810 830 {
811 831 spa_t *spa = vd->vdev_spa;
812 832 objset_t *mos = spa->spa_meta_objset;
813 833 uint64_t m;
814 834 uint64_t oldc = vd->vdev_ms_count;
815 835 uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
816 836 metaslab_t **mspp;
817 837 int error;
818 838
819 839 ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
820 840
821 841 /*
822 842 * This vdev is not being allocated from yet or is a hole.
823 843 */
824 844 if (vd->vdev_ms_shift == 0)
825 845 return (0);
826 846
827 847 ASSERT(!vd->vdev_ishole);
828 848
829 849 /*
830 850 * Compute the raidz-deflation ratio. Note, we hard-code
831 851 * in 128k (1 << 17) because it is the "typical" blocksize.
832 852 * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
833 853 * otherwise it would inconsistently account for existing bp's.
834 854 */
835 855 vd->vdev_deflate_ratio = (1 << 17) /
836 856 (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
837 857
838 858 ASSERT(oldc <= newc);
839 859
840 860 mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
841 861
842 862 if (oldc != 0) {
843 863 bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
844 864 kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
845 865 }
846 866
847 867 vd->vdev_ms = mspp;
848 868 vd->vdev_ms_count = newc;
849 869
850 870 for (m = oldc; m < newc; m++) {
851 871 uint64_t object = 0;
852 872
853 873 if (txg == 0) {
854 874 error = dmu_read(mos, vd->vdev_ms_array,
855 875 m * sizeof (uint64_t), sizeof (uint64_t), &object,
856 876 DMU_READ_PREFETCH);
857 877 if (error)
858 878 return (error);
859 879 }
860 880
861 881 error = metaslab_init(vd->vdev_mg, m, object, txg,
862 882 &(vd->vdev_ms[m]));
863 883 if (error)
864 884 return (error);
865 885 }
866 886
867 887 if (txg == 0)
868 888 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
869 889
870 890 /*
871 891 * If the vdev is being removed we don't activate
872 892 * the metaslabs since we want to ensure that no new
873 893 * allocations are performed on this device.
874 894 */
875 895 if (oldc == 0 && !vd->vdev_removing)
876 896 metaslab_group_activate(vd->vdev_mg);
877 897
878 898 if (txg == 0)
879 899 spa_config_exit(spa, SCL_ALLOC, FTAG);
880 900
881 901 return (0);
882 902 }
883 903
884 904 void
885 905 vdev_metaslab_fini(vdev_t *vd)
886 906 {
887 907 uint64_t m;
888 908 uint64_t count = vd->vdev_ms_count;
889 909
890 910 if (vd->vdev_ms != NULL) {
891 911 metaslab_group_passivate(vd->vdev_mg);
892 912 for (m = 0; m < count; m++) {
893 913 metaslab_t *msp = vd->vdev_ms[m];
894 914
895 915 if (msp != NULL)
896 916 metaslab_fini(msp);
897 917 }
898 918 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
899 919 vd->vdev_ms = NULL;
900 920 }
901 921 }
902 922
903 923 typedef struct vdev_probe_stats {
904 924 boolean_t vps_readable;
905 925 boolean_t vps_writeable;
906 926 int vps_flags;
907 927 } vdev_probe_stats_t;
908 928
909 929 static void
910 930 vdev_probe_done(zio_t *zio)
911 931 {
912 932 spa_t *spa = zio->io_spa;
913 933 vdev_t *vd = zio->io_vd;
914 934 vdev_probe_stats_t *vps = zio->io_private;
915 935
916 936 ASSERT(vd->vdev_probe_zio != NULL);
917 937
918 938 if (zio->io_type == ZIO_TYPE_READ) {
919 939 if (zio->io_error == 0)
920 940 vps->vps_readable = 1;
921 941 if (zio->io_error == 0 && spa_writeable(spa)) {
922 942 zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
923 943 zio->io_offset, zio->io_size, zio->io_data,
924 944 ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
925 945 ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
926 946 } else {
927 947 zio_buf_free(zio->io_data, zio->io_size);
928 948 }
929 949 } else if (zio->io_type == ZIO_TYPE_WRITE) {
930 950 if (zio->io_error == 0)
931 951 vps->vps_writeable = 1;
932 952 zio_buf_free(zio->io_data, zio->io_size);
933 953 } else if (zio->io_type == ZIO_TYPE_NULL) {
934 954 zio_t *pio;
935 955
936 956 vd->vdev_cant_read |= !vps->vps_readable;
937 957 vd->vdev_cant_write |= !vps->vps_writeable;
938 958
939 959 if (vdev_readable(vd) &&
940 960 (vdev_writeable(vd) || !spa_writeable(spa))) {
941 961 zio->io_error = 0;
942 962 } else {
943 963 ASSERT(zio->io_error != 0);
944 964 zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
945 965 spa, vd, NULL, 0, 0);
946 966 zio->io_error = SET_ERROR(ENXIO);
947 967 }
948 968
949 969 mutex_enter(&vd->vdev_probe_lock);
950 970 ASSERT(vd->vdev_probe_zio == zio);
951 971 vd->vdev_probe_zio = NULL;
952 972 mutex_exit(&vd->vdev_probe_lock);
953 973
954 974 while ((pio = zio_walk_parents(zio)) != NULL)
955 975 if (!vdev_accessible(vd, pio))
956 976 pio->io_error = SET_ERROR(ENXIO);
957 977
958 978 kmem_free(vps, sizeof (*vps));
959 979 }
960 980 }
961 981
962 982 /*
963 983 * Determine whether this device is accessible.
964 984 *
965 985 * Read and write to several known locations: the pad regions of each
966 986 * vdev label but the first, which we leave alone in case it contains
967 987 * a VTOC.
968 988 */
969 989 zio_t *
970 990 vdev_probe(vdev_t *vd, zio_t *zio)
971 991 {
972 992 spa_t *spa = vd->vdev_spa;
973 993 vdev_probe_stats_t *vps = NULL;
974 994 zio_t *pio;
975 995
976 996 ASSERT(vd->vdev_ops->vdev_op_leaf);
977 997
978 998 /*
979 999 * Don't probe the probe.
980 1000 */
981 1001 if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
982 1002 return (NULL);
983 1003
984 1004 /*
985 1005 * To prevent 'probe storms' when a device fails, we create
986 1006 * just one probe i/o at a time. All zios that want to probe
987 1007 * this vdev will become parents of the probe io.
988 1008 */
989 1009 mutex_enter(&vd->vdev_probe_lock);
990 1010
991 1011 if ((pio = vd->vdev_probe_zio) == NULL) {
992 1012 vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
993 1013
994 1014 vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
995 1015 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
996 1016 ZIO_FLAG_TRYHARD;
997 1017
998 1018 if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
999 1019 /*
1000 1020 * vdev_cant_read and vdev_cant_write can only
1001 1021 * transition from TRUE to FALSE when we have the
1002 1022 * SCL_ZIO lock as writer; otherwise they can only
1003 1023 * transition from FALSE to TRUE. This ensures that
1004 1024 * any zio looking at these values can assume that
1005 1025 * failures persist for the life of the I/O. That's
1006 1026 * important because when a device has intermittent
1007 1027 * connectivity problems, we want to ensure that
1008 1028 * they're ascribed to the device (ENXIO) and not
1009 1029 * the zio (EIO).
1010 1030 *
1011 1031 * Since we hold SCL_ZIO as writer here, clear both
1012 1032 * values so the probe can reevaluate from first
1013 1033 * principles.
1014 1034 */
1015 1035 vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
1016 1036 vd->vdev_cant_read = B_FALSE;
1017 1037 vd->vdev_cant_write = B_FALSE;
1018 1038 }
1019 1039
1020 1040 vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
1021 1041 vdev_probe_done, vps,
1022 1042 vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
1023 1043
1024 1044 /*
1025 1045 * We can't change the vdev state in this context, so we
1026 1046 * kick off an async task to do it on our behalf.
1027 1047 */
1028 1048 if (zio != NULL) {
1029 1049 vd->vdev_probe_wanted = B_TRUE;
1030 1050 spa_async_request(spa, SPA_ASYNC_PROBE);
1031 1051 }
1032 1052 }
1033 1053
1034 1054 if (zio != NULL)
1035 1055 zio_add_child(zio, pio);
1036 1056
1037 1057 mutex_exit(&vd->vdev_probe_lock);
1038 1058
1039 1059 if (vps == NULL) {
1040 1060 ASSERT(zio != NULL);
1041 1061 return (NULL);
1042 1062 }
1043 1063
1044 1064 for (int l = 1; l < VDEV_LABELS; l++) {
1045 1065 zio_nowait(zio_read_phys(pio, vd,
1046 1066 vdev_label_offset(vd->vdev_psize, l,
1047 1067 offsetof(vdev_label_t, vl_pad2)),
1048 1068 VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
1049 1069 ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
1050 1070 ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
1051 1071 }
1052 1072
1053 1073 if (zio == NULL)
1054 1074 return (pio);
1055 1075
1056 1076 zio_nowait(pio);
1057 1077 return (NULL);
1058 1078 }
1059 1079
1060 1080 static void
1061 1081 vdev_open_child(void *arg)
1062 1082 {
1063 1083 vdev_t *vd = arg;
1064 1084
1065 1085 vd->vdev_open_thread = curthread;
1066 1086 vd->vdev_open_error = vdev_open(vd);
1067 1087 vd->vdev_open_thread = NULL;
1068 1088 }
1069 1089
1070 1090 boolean_t
1071 1091 vdev_uses_zvols(vdev_t *vd)
1072 1092 {
1073 1093 if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
1074 1094 strlen(ZVOL_DIR)) == 0)
1075 1095 return (B_TRUE);
1076 1096 for (int c = 0; c < vd->vdev_children; c++)
1077 1097 if (vdev_uses_zvols(vd->vdev_child[c]))
1078 1098 return (B_TRUE);
1079 1099 return (B_FALSE);
1080 1100 }
1081 1101
1082 1102 void
1083 1103 vdev_open_children(vdev_t *vd)
1084 1104 {
1085 1105 taskq_t *tq;
1086 1106 int children = vd->vdev_children;
1087 1107
1088 1108 /*
1089 1109 * in order to handle pools on top of zvols, do the opens
1090 1110 * in a single thread so that the same thread holds the
1091 1111 * spa_namespace_lock
1092 1112 */
1093 1113 if (vdev_uses_zvols(vd)) {
1094 1114 for (int c = 0; c < children; c++)
1095 1115 vd->vdev_child[c]->vdev_open_error =
1096 1116 vdev_open(vd->vdev_child[c]);
1097 1117 return;
1098 1118 }
1099 1119 tq = taskq_create("vdev_open", children, minclsyspri,
1100 1120 children, children, TASKQ_PREPOPULATE);
1101 1121
1102 1122 for (int c = 0; c < children; c++)
1103 1123 VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
1104 1124 TQ_SLEEP) != NULL);
1105 1125
1106 1126 taskq_destroy(tq);
1107 1127 }
1108 1128
1109 1129 /*
1110 1130 * Prepare a virtual device for access.
1111 1131 */
1112 1132 int
1113 1133 vdev_open(vdev_t *vd)
1114 1134 {
1115 1135 spa_t *spa = vd->vdev_spa;
1116 1136 int error;
1117 1137 uint64_t osize = 0;
1118 1138 uint64_t max_osize = 0;
1119 1139 uint64_t asize, max_asize, psize;
1120 1140 uint64_t ashift = 0;
1121 1141
1122 1142 ASSERT(vd->vdev_open_thread == curthread ||
1123 1143 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1124 1144 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
1125 1145 vd->vdev_state == VDEV_STATE_CANT_OPEN ||
1126 1146 vd->vdev_state == VDEV_STATE_OFFLINE);
1127 1147
1128 1148 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
1129 1149 vd->vdev_cant_read = B_FALSE;
1130 1150 vd->vdev_cant_write = B_FALSE;
1131 1151 vd->vdev_min_asize = vdev_get_min_asize(vd);
1132 1152
1133 1153 /*
1134 1154 * If this vdev is not removed, check its fault status. If it's
1135 1155 * faulted, bail out of the open.
1136 1156 */
1137 1157 if (!vd->vdev_removed && vd->vdev_faulted) {
1138 1158 ASSERT(vd->vdev_children == 0);
1139 1159 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
1140 1160 vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
1141 1161 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1142 1162 vd->vdev_label_aux);
1143 1163 return (SET_ERROR(ENXIO));
1144 1164 } else if (vd->vdev_offline) {
1145 1165 ASSERT(vd->vdev_children == 0);
1146 1166 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
1147 1167 return (SET_ERROR(ENXIO));
1148 1168 }
1149 1169
1150 1170 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);
1151 1171
1152 1172 /*
1153 1173 * Reset the vdev_reopening flag so that we actually close
1154 1174 * the vdev on error.
1155 1175 */
1156 1176 vd->vdev_reopening = B_FALSE;
1157 1177 if (zio_injection_enabled && error == 0)
1158 1178 error = zio_handle_device_injection(vd, NULL, ENXIO);
1159 1179
1160 1180 if (error) {
1161 1181 if (vd->vdev_removed &&
1162 1182 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
1163 1183 vd->vdev_removed = B_FALSE;
1164 1184
1165 1185 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1166 1186 vd->vdev_stat.vs_aux);
1167 1187 return (error);
1168 1188 }
1169 1189
1170 1190 vd->vdev_removed = B_FALSE;
1171 1191
1172 1192 /*
1173 1193 * Recheck the faulted flag now that we have confirmed that
1174 1194 * the vdev is accessible. If we're faulted, bail.
1175 1195 */
1176 1196 if (vd->vdev_faulted) {
1177 1197 ASSERT(vd->vdev_children == 0);
1178 1198 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
1179 1199 vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
1180 1200 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1181 1201 vd->vdev_label_aux);
1182 1202 return (SET_ERROR(ENXIO));
1183 1203 }
1184 1204
1185 1205 if (vd->vdev_degraded) {
1186 1206 ASSERT(vd->vdev_children == 0);
1187 1207 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
1188 1208 VDEV_AUX_ERR_EXCEEDED);
1189 1209 } else {
1190 1210 vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
1191 1211 }
1192 1212
1193 1213 /*
1194 1214 * For hole or missing vdevs we just return success.
1195 1215 */
1196 1216 if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
1197 1217 return (0);
1198 1218
1199 1219 for (int c = 0; c < vd->vdev_children; c++) {
1200 1220 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
1201 1221 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
1202 1222 VDEV_AUX_NONE);
1203 1223 break;
1204 1224 }
1205 1225 }
1206 1226
1207 1227 osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
1208 1228 max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
1209 1229
1210 1230 if (vd->vdev_children == 0) {
1211 1231 if (osize < SPA_MINDEVSIZE) {
1212 1232 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1213 1233 VDEV_AUX_TOO_SMALL);
1214 1234 return (SET_ERROR(EOVERFLOW));
1215 1235 }
1216 1236 psize = osize;
1217 1237 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
1218 1238 max_asize = max_osize - (VDEV_LABEL_START_SIZE +
1219 1239 VDEV_LABEL_END_SIZE);
1220 1240 } else {
1221 1241 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
1222 1242 (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
1223 1243 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1224 1244 VDEV_AUX_TOO_SMALL);
1225 1245 return (SET_ERROR(EOVERFLOW));
1226 1246 }
1227 1247 psize = 0;
1228 1248 asize = osize;
1229 1249 max_asize = max_osize;
1230 1250 }
1231 1251
1232 1252 vd->vdev_psize = psize;
1233 1253
1234 1254 /*
1235 1255 * Make sure the allocatable size hasn't shrunk.
1236 1256 */
1237 1257 if (asize < vd->vdev_min_asize) {
1238 1258 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1239 1259 VDEV_AUX_BAD_LABEL);
1240 1260 return (SET_ERROR(EINVAL));
1241 1261 }
1242 1262
1243 1263 if (vd->vdev_asize == 0) {
1244 1264 /*
1245 1265 * This is the first-ever open, so use the computed values.
1246 1266 * For testing purposes, a higher ashift can be requested.
1247 1267 */
1248 1268 vd->vdev_asize = asize;
1249 1269 vd->vdev_max_asize = max_asize;
1250 1270 vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
1251 1271 } else {
1252 1272 /*
1253 1273 * Detect if the alignment requirement has increased.
1254 1274 * We don't want to make the pool unavailable, just
1255 1275 * issue a warning instead.
1256 1276 */
1257 1277 if (ashift > vd->vdev_top->vdev_ashift &&
1258 1278 vd->vdev_ops->vdev_op_leaf) {
1259 1279 cmn_err(CE_WARN,
1260 1280 "Disk, '%s', has a block alignment that is "
1261 1281 "larger than the pool's alignment\n",
1262 1282 vd->vdev_path);
1263 1283 }
1264 1284 vd->vdev_max_asize = max_asize;
1265 1285 }
1266 1286
1267 1287 /*
1268 1288 * If all children are healthy and the asize has increased,
1269 1289 * then we've experienced dynamic LUN growth. If automatic
1270 1290 * expansion is enabled then use the additional space.
1271 1291 */
1272 1292 if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
1273 1293 (vd->vdev_expanding || spa->spa_autoexpand))
1274 1294 vd->vdev_asize = asize;
1275 1295
1276 1296 vdev_set_min_asize(vd);
1277 1297
1278 1298 /*
1279 1299 * Ensure we can issue some IO before declaring the
1280 1300 * vdev open for business.
1281 1301 */
1282 1302 if (vd->vdev_ops->vdev_op_leaf &&
1283 1303 (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
1284 1304 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1285 1305 VDEV_AUX_ERR_EXCEEDED);
1286 1306 return (error);
1287 1307 }
1288 1308
1289 1309 /*
1290 1310 * If a leaf vdev has a DTL, and seems healthy, then kick off a
1291 1311 * resilver. But don't do this if we are doing a reopen for a scrub,
1292 1312 * since this would just restart the scrub we are already doing.
1293 1313 */
1294 1314 if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
1295 1315 vdev_resilver_needed(vd, NULL, NULL))
1296 1316 spa_async_request(spa, SPA_ASYNC_RESILVER);
1297 1317
1298 1318 return (0);
1299 1319 }
1300 1320
1301 1321 /*
1302 1322 * Called once the vdevs are all opened, this routine validates the label
1303 1323 * contents. This needs to be done before vdev_load() so that we don't
1304 1324 * inadvertently do repair I/Os to the wrong device.
1305 1325 *
1306 1326 * If 'strict' is false ignore the spa guid check. This is necessary because
1307 1327 * if the machine crashed during a re-guid the new guid might have been written
1308 1328 * to all of the vdev labels, but not the cached config. The strict check
1309 1329 * will be performed when the pool is opened again using the mos config.
1310 1330 *
1311 1331 * This function will only return failure if one of the vdevs indicates that it
1312 1332 * has since been destroyed or exported. This is only possible if
1313 1333 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
1314 1334 * will be updated but the function will return 0.
1315 1335 */
1316 1336 int
1317 1337 vdev_validate(vdev_t *vd, boolean_t strict)
1318 1338 {
1319 1339 spa_t *spa = vd->vdev_spa;
1320 1340 nvlist_t *label;
1321 1341 uint64_t guid = 0, top_guid;
1322 1342 uint64_t state;
1323 1343
1324 1344 for (int c = 0; c < vd->vdev_children; c++)
1325 1345 if (vdev_validate(vd->vdev_child[c], strict) != 0)
1326 1346 return (SET_ERROR(EBADF));
1327 1347
1328 1348 /*
1329 1349 * If the device has already failed, or was marked offline, don't do
1330 1350 * any further validation. Otherwise, label I/O will fail and we will
1331 1351 * overwrite the previous state.
1332 1352 */
1333 1353 if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
1334 1354 uint64_t aux_guid = 0;
1335 1355 nvlist_t *nvl;
1336 1356 uint64_t txg = spa_last_synced_txg(spa) != 0 ?
1337 1357 spa_last_synced_txg(spa) : -1ULL;
1338 1358
1339 1359 if ((label = vdev_label_read_config(vd, txg)) == NULL) {
1340 1360 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1341 1361 VDEV_AUX_BAD_LABEL);
1342 1362 return (0);
1343 1363 }
1344 1364
1345 1365 /*
1346 1366 * Determine if this vdev has been split off into another
1347 1367 * pool. If so, then refuse to open it.
1348 1368 */
1349 1369 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
1350 1370 &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
1351 1371 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1352 1372 VDEV_AUX_SPLIT_POOL);
1353 1373 nvlist_free(label);
1354 1374 return (0);
1355 1375 }
1356 1376
1357 1377 if (strict && (nvlist_lookup_uint64(label,
1358 1378 ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
1359 1379 guid != spa_guid(spa))) {
1360 1380 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1361 1381 VDEV_AUX_CORRUPT_DATA);
1362 1382 nvlist_free(label);
1363 1383 return (0);
1364 1384 }
1365 1385
1366 1386 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
1367 1387 != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
1368 1388 &aux_guid) != 0)
1369 1389 aux_guid = 0;
1370 1390
1371 1391 /*
1372 1392 * If this vdev just became a top-level vdev because its
1373 1393 * sibling was detached, it will have adopted the parent's
1374 1394 * vdev guid -- but the label may or may not be on disk yet.
1375 1395 * Fortunately, either version of the label will have the
1376 1396 * same top guid, so if we're a top-level vdev, we can
1377 1397 * safely compare to that instead.
1378 1398 *
1379 1399 * If we split this vdev off instead, then we also check the
1380 1400 * original pool's guid. We don't want to consider the vdev
1381 1401 * corrupt if it is partway through a split operation.
1382 1402 */
1383 1403 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
1384 1404 &guid) != 0 ||
1385 1405 nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
1386 1406 &top_guid) != 0 ||
1387 1407 ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
1388 1408 (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
1389 1409 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1390 1410 VDEV_AUX_CORRUPT_DATA);
1391 1411 nvlist_free(label);
1392 1412 return (0);
1393 1413 }
1394 1414
1395 1415 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
1396 1416 &state) != 0) {
1397 1417 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1398 1418 VDEV_AUX_CORRUPT_DATA);
1399 1419 nvlist_free(label);
1400 1420 return (0);
1401 1421 }
1402 1422
1403 1423 nvlist_free(label);
1404 1424
1405 1425 /*
1406 1426 * If this is a verbatim import, no need to check the
1407 1427 * state of the pool.
1408 1428 */
1409 1429 if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
1410 1430 spa_load_state(spa) == SPA_LOAD_OPEN &&
1411 1431 state != POOL_STATE_ACTIVE)
1412 1432 return (SET_ERROR(EBADF));
1413 1433
1414 1434 /*
1415 1435 * If we were able to open and validate a vdev that was
1416 1436 * previously marked permanently unavailable, clear that state
1417 1437 * now.
1418 1438 */
1419 1439 if (vd->vdev_not_present)
1420 1440 vd->vdev_not_present = 0;
1421 1441 }
1422 1442
1423 1443 return (0);
1424 1444 }
1425 1445
1426 1446 /*
1427 1447 * Close a virtual device.
1428 1448 */
1429 1449 void
1430 1450 vdev_close(vdev_t *vd)
1431 1451 {
1432 1452 spa_t *spa = vd->vdev_spa;
1433 1453 vdev_t *pvd = vd->vdev_parent;
1434 1454
1435 1455 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1436 1456
1437 1457 /*
1438 1458 * If our parent is reopening, then we are as well, unless we are
1439 1459 * going offline.
1440 1460 */
1441 1461 if (pvd != NULL && pvd->vdev_reopening)
1442 1462 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
1443 1463
1444 1464 vd->vdev_ops->vdev_op_close(vd);
1445 1465
1446 1466 vdev_cache_purge(vd);
1447 1467
1448 1468 /*
1449 1469 * We record the previous state before we close it, so that if we are
1450 1470 * doing a reopen(), we don't generate FMA ereports if we notice that
1451 1471 * it's still faulted.
1452 1472 */
1453 1473 vd->vdev_prevstate = vd->vdev_state;
1454 1474
1455 1475 if (vd->vdev_offline)
1456 1476 vd->vdev_state = VDEV_STATE_OFFLINE;
1457 1477 else
1458 1478 vd->vdev_state = VDEV_STATE_CLOSED;
1459 1479 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
1460 1480 }
1461 1481
1462 1482 void
1463 1483 vdev_hold(vdev_t *vd)
1464 1484 {
1465 1485 spa_t *spa = vd->vdev_spa;
1466 1486
1467 1487 ASSERT(spa_is_root(spa));
1468 1488 if (spa->spa_state == POOL_STATE_UNINITIALIZED)
1469 1489 return;
1470 1490
1471 1491 for (int c = 0; c < vd->vdev_children; c++)
1472 1492 vdev_hold(vd->vdev_child[c]);
1473 1493
1474 1494 if (vd->vdev_ops->vdev_op_leaf)
1475 1495 vd->vdev_ops->vdev_op_hold(vd);
1476 1496 }
1477 1497
1478 1498 void
1479 1499 vdev_rele(vdev_t *vd)
1480 1500 {
1481 1501 spa_t *spa = vd->vdev_spa;
1482 1502
1483 1503 ASSERT(spa_is_root(spa));
1484 1504 for (int c = 0; c < vd->vdev_children; c++)
1485 1505 vdev_rele(vd->vdev_child[c]);
1486 1506
1487 1507 if (vd->vdev_ops->vdev_op_leaf)
1488 1508 vd->vdev_ops->vdev_op_rele(vd);
1489 1509 }
1490 1510
1491 1511 /*
1492 1512 * Reopen all interior vdevs and any unopened leaves. We don't actually
1493 1513 * reopen leaf vdevs which had previously been opened as they might deadlock
1494 1514 * on the spa_config_lock. Instead we only obtain the leaf's physical size.
1495 1515 * If the leaf has never been opened then open it, as usual.
1496 1516 */
1497 1517 void
1498 1518 vdev_reopen(vdev_t *vd)
1499 1519 {
1500 1520 spa_t *spa = vd->vdev_spa;
1501 1521
1502 1522 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1503 1523
1504 1524 /* set the reopening flag unless we're taking the vdev offline */
1505 1525 vd->vdev_reopening = !vd->vdev_offline;
1506 1526 vdev_close(vd);
1507 1527 (void) vdev_open(vd);
1508 1528
1509 1529 /*
1510 1530 * Call vdev_validate() here to make sure we have the same device.
1511 1531 * Otherwise, a device with an invalid label could be successfully
1512 1532 * opened in response to vdev_reopen().
1513 1533 */
1514 1534 if (vd->vdev_aux) {
1515 1535 (void) vdev_validate_aux(vd);
1516 1536 if (vdev_readable(vd) && vdev_writeable(vd) &&
1517 1537 vd->vdev_aux == &spa->spa_l2cache &&
1518 1538 !l2arc_vdev_present(vd))
1519 1539 l2arc_add_vdev(spa, vd);
1520 1540 } else {
1521 1541 (void) vdev_validate(vd, B_TRUE);
1522 1542 }
1523 1543
1524 1544 /*
1525 1545 * Reassess parent vdev's health.
1526 1546 */
1527 1547 vdev_propagate_state(vd);
1528 1548 }
1529 1549
1530 1550 int
1531 1551 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
1532 1552 {
1533 1553 int error;
1534 1554
1535 1555 /*
1536 1556 * Normally, partial opens (e.g. of a mirror) are allowed.
1537 1557 * For a create, however, we want to fail the request if
1538 1558 * there are any components we can't open.
1539 1559 */
1540 1560 error = vdev_open(vd);
1541 1561
1542 1562 if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
1543 1563 vdev_close(vd);
1544 1564 return (error ? error : ENXIO);
1545 1565 }
1546 1566
1547 1567 /*
1548 1568 * Recursively load DTLs and initialize all labels.
1549 1569 */
1550 1570 if ((error = vdev_dtl_load(vd)) != 0 ||
1551 1571 (error = vdev_label_init(vd, txg, isreplacing ?
1552 1572 VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
1553 1573 vdev_close(vd);
1554 1574 return (error);
1555 1575 }
1556 1576
1557 1577 return (0);
1558 1578 }
1559 1579
1560 1580 void
1561 1581 vdev_metaslab_set_size(vdev_t *vd)
1562 1582 {
1563 1583 /*
1564 1584 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
1565 1585 */
1566 1586 vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
1567 1587 vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
1568 1588 }
1569 1589
1570 1590 void
1571 1591 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
1572 1592 {
1573 1593 ASSERT(vd == vd->vdev_top);
1574 1594 ASSERT(!vd->vdev_ishole);
1575 1595 ASSERT(ISP2(flags));
1576 1596 ASSERT(spa_writeable(vd->vdev_spa));
1577 1597
1578 1598 if (flags & VDD_METASLAB)
1579 1599 (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
1580 1600
1581 1601 if (flags & VDD_DTL)
1582 1602 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
1583 1603
1584 1604 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
1585 1605 }
1586 1606
1587 1607 void
1588 1608 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
1589 1609 {
1590 1610 for (int c = 0; c < vd->vdev_children; c++)
1591 1611 vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
1592 1612
1593 1613 if (vd->vdev_ops->vdev_op_leaf)
1594 1614 vdev_dirty(vd->vdev_top, flags, vd, txg);
1595 1615 }
1596 1616
1597 1617 /*
1598 1618 * DTLs.
1599 1619 *
1600 1620 * A vdev's DTL (dirty time log) is the set of transaction groups for which
1601 1621 * the vdev has less than perfect replication. There are four kinds of DTL:
1602 1622 *
1603 1623 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
1604 1624 *
1605 1625 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
1606 1626 *
1607 1627 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
1608 1628 * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
1609 1629 * txgs that was scrubbed.
1610 1630 *
1611 1631 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
1612 1632 * persistent errors or just some device being offline.
1613 1633 * Unlike the other three, the DTL_OUTAGE map is not generally
1614 1634 * maintained; it's only computed when needed, typically to
1615 1635 * determine whether a device can be detached.
1616 1636 *
1617 1637 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
1618 1638 * either has the data or it doesn't.
1619 1639 *
1620 1640 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
1621 1641 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
1622 1642 * if any child is less than fully replicated, then so is its parent.
1623 1643 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
1624 1644 * comprising only those txgs which appear in 'maxfaults' or more children;
1625 1645 * those are the txgs we don't have enough replication to read. For example,
1626 1646 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
1627 1647 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
1628 1648 * two child DTL_MISSING maps.
1629 1649 *
1630 1650 * It should be clear from the above that to compute the DTLs and outage maps
1631 1651 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
1632 1652 * Therefore, that is all we keep on disk. When loading the pool, or after
1633 1653 * a configuration change, we generate all other DTLs from first principles.
1634 1654 */
1635 1655 void
1636 1656 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1637 1657 {
1638 1658 range_tree_t *rt = vd->vdev_dtl[t];
1639 1659
1640 1660 ASSERT(t < DTL_TYPES);
1641 1661 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1642 1662 ASSERT(spa_writeable(vd->vdev_spa));
1643 1663
1644 1664 mutex_enter(rt->rt_lock);
1645 1665 if (!range_tree_contains(rt, txg, size))
1646 1666 range_tree_add(rt, txg, size);
1647 1667 mutex_exit(rt->rt_lock);
1648 1668 }
1649 1669
1650 1670 boolean_t
1651 1671 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1652 1672 {
1653 1673 range_tree_t *rt = vd->vdev_dtl[t];
1654 1674 boolean_t dirty = B_FALSE;
1655 1675
1656 1676 ASSERT(t < DTL_TYPES);
1657 1677 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1658 1678
1659 1679 mutex_enter(rt->rt_lock);
1660 1680 if (range_tree_space(rt) != 0)
1661 1681 dirty = range_tree_contains(rt, txg, size);
1662 1682 mutex_exit(rt->rt_lock);
1663 1683
1664 1684 return (dirty);
1665 1685 }
1666 1686
1667 1687 boolean_t
1668 1688 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
1669 1689 {
1670 1690 range_tree_t *rt = vd->vdev_dtl[t];
1671 1691 boolean_t empty;
1672 1692
1673 1693 mutex_enter(rt->rt_lock);
1674 1694 empty = (range_tree_space(rt) == 0);
1675 1695 mutex_exit(rt->rt_lock);
1676 1696
1677 1697 return (empty);
1678 1698 }
1679 1699
1680 1700 /*
1681 1701 * Returns the lowest txg in the DTL range.
1682 1702 */
1683 1703 static uint64_t
1684 1704 vdev_dtl_min(vdev_t *vd)
1685 1705 {
1686 1706 range_seg_t *rs;
1687 1707
1688 1708 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1689 1709 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
1690 1710 ASSERT0(vd->vdev_children);
1691 1711
1692 1712 rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
1693 1713 return (rs->rs_start - 1);
1694 1714 }
1695 1715
1696 1716 /*
1697 1717 * Returns the highest txg in the DTL.
1698 1718 */
1699 1719 static uint64_t
1700 1720 vdev_dtl_max(vdev_t *vd)
1701 1721 {
1702 1722 range_seg_t *rs;
1703 1723
1704 1724 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1705 1725 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
1706 1726 ASSERT0(vd->vdev_children);
1707 1727
1708 1728 rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
1709 1729 return (rs->rs_end);
1710 1730 }
1711 1731
1712 1732 /*
1713 1733 * Determine if a resilvering vdev should remove any DTL entries from
1714 1734 * its range. If the vdev was resilvering for the entire duration of the
1715 1735 * scan then it should excise that range from its DTLs. Otherwise, this
1716 1736 * vdev is considered partially resilvered and should leave its DTL
1717 1737 * entries intact. The comment in vdev_dtl_reassess() describes how we
1718 1738 * excise the DTLs.
1719 1739 */
1720 1740 static boolean_t
1721 1741 vdev_dtl_should_excise(vdev_t *vd)
1722 1742 {
1723 1743 spa_t *spa = vd->vdev_spa;
1724 1744 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1725 1745
1726 1746 ASSERT0(scn->scn_phys.scn_errors);
1727 1747 ASSERT0(vd->vdev_children);
1728 1748
1729 1749 if (vd->vdev_resilver_txg == 0 ||
1730 1750 range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
1731 1751 return (B_TRUE);
1732 1752
1733 1753 /*
1734 1754 	 * When a resilver is initiated, the scan will assign the scn_max_txg
1735 1755 * value to the highest txg value that exists in all DTLs. If this
1736 1756 * device's max DTL is not part of this scan (i.e. it is not in
1737 1757 	 * the range (scn_min_txg, scn_max_txg]), then it is not eligible
1738 1758 * for excision.
1739 1759 */
1740 1760 if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
1741 1761 ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
1742 1762 ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
1743 1763 ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
1744 1764 return (B_TRUE);
1745 1765 }
1746 1766 return (B_FALSE);
1747 1767 }
1748 1768
1749 1769 /*
1750 1770 * Reassess DTLs after a config change or scrub completion.
1751 1771 */
1752 1772 void
1753 1773 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
1754 1774 {
1755 1775 spa_t *spa = vd->vdev_spa;
1756 1776 avl_tree_t reftree;
1757 1777 int minref;
1758 1778
1759 1779 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1760 1780
1761 1781 for (int c = 0; c < vd->vdev_children; c++)
1762 1782 vdev_dtl_reassess(vd->vdev_child[c], txg,
1763 1783 scrub_txg, scrub_done);
1764 1784
1765 1785 if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
1766 1786 return;
1767 1787
1768 1788 if (vd->vdev_ops->vdev_op_leaf) {
1769 1789 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1770 1790
1771 1791 mutex_enter(&vd->vdev_dtl_lock);
1772 1792
1773 1793 /*
1774 1794 * If we've completed a scan cleanly then determine
1775 1795 * if this vdev should remove any DTLs. We only want to
1776 1796 * excise regions on vdevs that were available during
1777 1797 * the entire duration of this scan.
1778 1798 */
1779 1799 if (scrub_txg != 0 &&
1780 1800 (spa->spa_scrub_started ||
1781 1801 (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
1782 1802 vdev_dtl_should_excise(vd)) {
1783 1803 /*
1784 1804 * We completed a scrub up to scrub_txg. If we
1785 1805 * did it without rebooting, then the scrub dtl
1786 1806 * will be valid, so excise the old region and
1787 1807 			 * fold in the scrub dtl. Otherwise (e.g. if there
1788 1808 			 * was an error), leave the dtl as-is.
1789 1809 *
1790 1810 			 * There's a little trick here: to excise the beginning
1791 1811 * of the DTL_MISSING map, we put it into a reference
1792 1812 * tree and then add a segment with refcnt -1 that
1793 1813 * covers the range [0, scrub_txg). This means
1794 1814 * that each txg in that range has refcnt -1 or 0.
1795 1815 * We then add DTL_SCRUB with a refcnt of 2, so that
1796 1816 * entries in the range [0, scrub_txg) will have a
1797 1817 * positive refcnt -- either 1 or 2. We then convert
1798 1818 * the reference tree into the new DTL_MISSING map.
1799 1819 */
1800 1820 space_reftree_create(&reftree);
1801 1821 space_reftree_add_map(&reftree,
1802 1822 vd->vdev_dtl[DTL_MISSING], 1);
1803 1823 space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
1804 1824 space_reftree_add_map(&reftree,
1805 1825 vd->vdev_dtl[DTL_SCRUB], 2);
1806 1826 space_reftree_generate_map(&reftree,
1807 1827 vd->vdev_dtl[DTL_MISSING], 1);
1808 1828 space_reftree_destroy(&reftree);
1809 1829 }
1810 1830 range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
1811 1831 range_tree_walk(vd->vdev_dtl[DTL_MISSING],
1812 1832 range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
1813 1833 if (scrub_done)
1814 1834 range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
1815 1835 range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
1816 1836 if (!vdev_readable(vd))
1817 1837 range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
1818 1838 else
1819 1839 range_tree_walk(vd->vdev_dtl[DTL_MISSING],
1820 1840 range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
1821 1841
1822 1842 /*
1823 1843 * If the vdev was resilvering and no longer has any
1824 1844 * DTLs then reset its resilvering flag.
1825 1845 */
1826 1846 if (vd->vdev_resilver_txg != 0 &&
1827 1847 range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
1828 1848 range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0)
1829 1849 vd->vdev_resilver_txg = 0;
1830 1850
1831 1851 mutex_exit(&vd->vdev_dtl_lock);
1832 1852
1833 1853 if (txg != 0)
1834 1854 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
1835 1855 return;
1836 1856 }
1837 1857
1838 1858 mutex_enter(&vd->vdev_dtl_lock);
1839 1859 for (int t = 0; t < DTL_TYPES; t++) {
1840 1860 /* account for child's outage in parent's missing map */
1841 1861 int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
1842 1862 if (t == DTL_SCRUB)
1843 1863 continue; /* leaf vdevs only */
1844 1864 if (t == DTL_PARTIAL)
1845 1865 minref = 1; /* i.e. non-zero */
1846 1866 else if (vd->vdev_nparity != 0)
1847 1867 minref = vd->vdev_nparity + 1; /* RAID-Z */
1848 1868 else
1849 1869 minref = vd->vdev_children; /* any kind of mirror */
1850 1870 space_reftree_create(&reftree);
1851 1871 for (int c = 0; c < vd->vdev_children; c++) {
1852 1872 vdev_t *cvd = vd->vdev_child[c];
1853 1873 mutex_enter(&cvd->vdev_dtl_lock);
1854 1874 space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
1855 1875 mutex_exit(&cvd->vdev_dtl_lock);
1856 1876 }
1857 1877 space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
1858 1878 space_reftree_destroy(&reftree);
1859 1879 }
1860 1880 mutex_exit(&vd->vdev_dtl_lock);
1861 1881 }
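
The reference-tree arithmetic described in the comment inside vdev_dtl_reassess() (+1 for the old DTL_MISSING, -1 over [0, scrub_txg), +2 for DTL_SCRUB, keep refcnt >= 1) can be sanity-checked with the standalone sketch below. It is an illustration only; the arrays stand in for range trees and the names and contents are invented.

/*
 * Illustrative sketch only.  Models the refcount trick used to excise
 * the scrubbed portion of DTL_MISSING:
 *	+1 for txgs in the old DTL_MISSING
 *	-1 for txgs in [0, scrub_txg)
 *	+2 for txgs in DTL_SCRUB (txgs the scrub could not repair)
 * The new DTL_MISSING is every txg whose refcount ends up >= 1.
 */
#include <stdio.h>

#define	MAX_TXG		12

int
main(void)
{
	int missing[MAX_TXG] = { 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0 };
	int scrub[MAX_TXG]   = { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
	int scrub_txg = 6;			/* scrub covered txgs [0, 6) */

	for (int t = 0; t < MAX_TXG; t++) {
		int refcnt = 0;

		if (missing[t])
			refcnt += 1;		/* old DTL_MISSING */
		if (t < scrub_txg)
			refcnt -= 1;		/* scrubbed range */
		if (scrub[t])
			refcnt += 2;		/* still unrepaired */

		/*
		 * txgs 2 and 4 drop out (scrubbed and repaired); txg 3
		 * stays (unrepaired); txgs 7 and 8 stay (never scrubbed).
		 */
		if (refcnt >= 1)
			printf("txg %d stays in DTL_MISSING (refcnt %d)\n",
			    t, refcnt);
	}
	return (0);
}
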
1862 1882
1863 1883 int
1864 1884 vdev_dtl_load(vdev_t *vd)
1865 1885 {
1866 1886 spa_t *spa = vd->vdev_spa;
1867 1887 objset_t *mos = spa->spa_meta_objset;
1868 1888 int error = 0;
1869 1889
1870 1890 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
1871 1891 ASSERT(!vd->vdev_ishole);
1872 1892
1873 1893 error = space_map_open(&vd->vdev_dtl_sm, mos,
1874 1894 vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
1875 1895 if (error)
1876 1896 return (error);
1877 1897 ASSERT(vd->vdev_dtl_sm != NULL);
1878 1898
1879 1899 mutex_enter(&vd->vdev_dtl_lock);
1880 1900
1881 1901 /*
1882 1902 * Now that we've opened the space_map we need to update
1883 1903 * the in-core DTL.
1884 1904 */
1885 1905 space_map_update(vd->vdev_dtl_sm);
1886 1906
1887 1907 error = space_map_load(vd->vdev_dtl_sm,
1888 1908 vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
1889 1909 mutex_exit(&vd->vdev_dtl_lock);
1890 1910
1891 1911 return (error);
1892 1912 }
1893 1913
1894 1914 for (int c = 0; c < vd->vdev_children; c++) {
1895 1915 error = vdev_dtl_load(vd->vdev_child[c]);
1896 1916 if (error != 0)
1897 1917 break;
1898 1918 }
1899 1919
1900 1920 return (error);
1901 1921 }
1902 1922
1903 1923 void
1904 1924 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
1905 1925 {
1906 1926 spa_t *spa = vd->vdev_spa;
1907 1927 range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
1908 1928 objset_t *mos = spa->spa_meta_objset;
1909 1929 range_tree_t *rtsync;
1910 1930 kmutex_t rtlock;
1911 1931 dmu_tx_t *tx;
1912 1932 uint64_t object = space_map_object(vd->vdev_dtl_sm);
1913 1933
1914 1934 ASSERT(!vd->vdev_ishole);
1915 1935 ASSERT(vd->vdev_ops->vdev_op_leaf);
1916 1936
1917 1937 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1918 1938
1919 1939 if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
1920 1940 mutex_enter(&vd->vdev_dtl_lock);
1921 1941 space_map_free(vd->vdev_dtl_sm, tx);
1922 1942 space_map_close(vd->vdev_dtl_sm);
1923 1943 vd->vdev_dtl_sm = NULL;
1924 1944 mutex_exit(&vd->vdev_dtl_lock);
1925 1945 dmu_tx_commit(tx);
1926 1946 return;
1927 1947 }
1928 1948
1929 1949 if (vd->vdev_dtl_sm == NULL) {
1930 1950 uint64_t new_object;
1931 1951
1932 1952 new_object = space_map_alloc(mos, tx);
1933 1953 VERIFY3U(new_object, !=, 0);
1934 1954
1935 1955 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
1936 1956 0, -1ULL, 0, &vd->vdev_dtl_lock));
1937 1957 ASSERT(vd->vdev_dtl_sm != NULL);
1938 1958 }
1939 1959
1940 1960 mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);
1941 1961
1942 1962 rtsync = range_tree_create(NULL, NULL, &rtlock);
1943 1963
1944 1964 mutex_enter(&rtlock);
1945 1965
1946 1966 mutex_enter(&vd->vdev_dtl_lock);
1947 1967 range_tree_walk(rt, range_tree_add, rtsync);
1948 1968 mutex_exit(&vd->vdev_dtl_lock);
1949 1969
1950 1970 space_map_truncate(vd->vdev_dtl_sm, tx);
1951 1971 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
1952 1972 range_tree_vacate(rtsync, NULL, NULL);
1953 1973
1954 1974 range_tree_destroy(rtsync);
1955 1975
1956 1976 mutex_exit(&rtlock);
1957 1977 mutex_destroy(&rtlock);
1958 1978
1959 1979 /*
1960 1980 * If the object for the space map has changed then dirty
1961 1981 * the top level so that we update the config.
1962 1982 */
1963 1983 if (object != space_map_object(vd->vdev_dtl_sm)) {
1964 1984 zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
1965 1985 "new object %llu", txg, spa_name(spa), object,
1966 1986 space_map_object(vd->vdev_dtl_sm));
1967 1987 vdev_config_dirty(vd->vdev_top);
1968 1988 }
1969 1989
1970 1990 dmu_tx_commit(tx);
1971 1991
1972 1992 mutex_enter(&vd->vdev_dtl_lock);
1973 1993 space_map_update(vd->vdev_dtl_sm);
1974 1994 mutex_exit(&vd->vdev_dtl_lock);
1975 1995 }
1976 1996
1977 1997 /*
1978 1998 * Determine whether the specified vdev can be offlined/detached/removed
1979 1999 * without losing data.
1980 2000 */
1981 2001 boolean_t
1982 2002 vdev_dtl_required(vdev_t *vd)
1983 2003 {
1984 2004 spa_t *spa = vd->vdev_spa;
1985 2005 vdev_t *tvd = vd->vdev_top;
1986 2006 uint8_t cant_read = vd->vdev_cant_read;
1987 2007 boolean_t required;
1988 2008
1989 2009 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1990 2010
1991 2011 if (vd == spa->spa_root_vdev || vd == tvd)
1992 2012 return (B_TRUE);
1993 2013
1994 2014 /*
1995 2015 * Temporarily mark the device as unreadable, and then determine
1996 2016 * whether this results in any DTL outages in the top-level vdev.
1997 2017 * If not, we can safely offline/detach/remove the device.
1998 2018 */
1999 2019 vd->vdev_cant_read = B_TRUE;
2000 2020 vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
2001 2021 required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
2002 2022 vd->vdev_cant_read = cant_read;
2003 2023 vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
2004 2024
2005 2025 if (!required && zio_injection_enabled)
2006 2026 required = !!zio_handle_device_injection(vd, NULL, ECHILD);
2007 2027
2008 2028 return (required);
2009 2029 }
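
The probe performed by vdev_dtl_required() (temporarily mark the device unreadable, recompute the outage state, record the answer, then restore the flag) is sketched below against a toy mirror rule in which an outage exists only when no child is readable. Illustration only; the real computation is vdev_dtl_reassess() on DTL_OUTAGE, and every name here is invented.

/*
 * Illustrative sketch only.  Models the "what if" probe in
 * vdev_dtl_required(): pretend the child is unreadable, see whether
 * that creates an outage, then restore the real state.
 */
#include <stdio.h>

#define	NCHILDREN	3

static int cant_read[NCHILDREN];	/* toy stand-in for vdev_cant_read */

/* Toy mirror rule: an outage exists only if no child is readable. */
static int
outage(void)
{
	for (int c = 0; c < NCHILDREN; c++)
		if (!cant_read[c])
			return (0);
	return (1);
}

static int
child_required(int c)
{
	int saved = cant_read[c];
	int required;

	cant_read[c] = 1;		/* temporarily mark it unreadable */
	required = outage();		/* would its loss cause an outage? */
	cant_read[c] = saved;		/* restore the real state */

	return (required);
}

int
main(void)
{
	cant_read[1] = 1;
	cant_read[2] = 1;		/* two of three children already gone */

	/* Only child 0 is required: losing it would leave no readable copy. */
	for (int c = 0; c < NCHILDREN; c++)
		printf("child %d required: %s\n", c,
		    child_required(c) ? "yes" : "no");
	return (0);
}
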
2010 2030
2011 2031 /*
2012 2032 * Determine if resilver is needed, and if so the txg range.
2013 2033 */
2014 2034 boolean_t
2015 2035 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
2016 2036 {
2017 2037 boolean_t needed = B_FALSE;
2018 2038 uint64_t thismin = UINT64_MAX;
2019 2039 uint64_t thismax = 0;
2020 2040
2021 2041 if (vd->vdev_children == 0) {
2022 2042 mutex_enter(&vd->vdev_dtl_lock);
2023 2043 if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
2024 2044 vdev_writeable(vd)) {
2025 2045
2026 2046 thismin = vdev_dtl_min(vd);
2027 2047 thismax = vdev_dtl_max(vd);
2028 2048 needed = B_TRUE;
2029 2049 }
2030 2050 mutex_exit(&vd->vdev_dtl_lock);
2031 2051 } else {
2032 2052 for (int c = 0; c < vd->vdev_children; c++) {
2033 2053 vdev_t *cvd = vd->vdev_child[c];
2034 2054 uint64_t cmin, cmax;
2035 2055
2036 2056 if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
2037 2057 thismin = MIN(thismin, cmin);
2038 2058 thismax = MAX(thismax, cmax);
2039 2059 needed = B_TRUE;
2040 2060 }
2041 2061 }
2042 2062 }
2043 2063
2044 2064 if (needed && minp) {
2045 2065 *minp = thismin;
2046 2066 *maxp = thismax;
2047 2067 }
2048 2068 return (needed);
2049 2069 }
2050 2070
2051 2071 void
2052 2072 vdev_load(vdev_t *vd)
2053 2073 {
2054 2074 /*
2055 2075 * Recursively load all children.
2056 2076 */
2057 2077 for (int c = 0; c < vd->vdev_children; c++)
2058 2078 vdev_load(vd->vdev_child[c]);
2059 2079
2060 2080 /*
2061 2081 * If this is a top-level vdev, initialize its metaslabs.
2062 2082 */
2063 2083 if (vd == vd->vdev_top && !vd->vdev_ishole &&
2064 2084 (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
2065 2085 vdev_metaslab_init(vd, 0) != 0))
2066 2086 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2067 2087 VDEV_AUX_CORRUPT_DATA);
2068 2088
2069 2089 /*
2070 2090 * If this is a leaf vdev, load its DTL.
2071 2091 */
2072 2092 if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
2073 2093 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2074 2094 VDEV_AUX_CORRUPT_DATA);
2075 2095 }
2076 2096
2077 2097 /*
2078 2098 * The special vdev case is used for hot spares and l2cache devices. Its
2079 2099  * sole purpose is to set the vdev state for the associated vdev. To do this,
2080 2100 * we make sure that we can open the underlying device, then try to read the
2081 2101 * label, and make sure that the label is sane and that it hasn't been
2082 2102 * repurposed to another pool.
2083 2103 */
2084 2104 int
2085 2105 vdev_validate_aux(vdev_t *vd)
2086 2106 {
2087 2107 nvlist_t *label;
2088 2108 uint64_t guid, version;
2089 2109 uint64_t state;
2090 2110
2091 2111 if (!vdev_readable(vd))
2092 2112 return (0);
2093 2113
2094 2114 if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
2095 2115 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2096 2116 VDEV_AUX_CORRUPT_DATA);
2097 2117 return (-1);
2098 2118 }
2099 2119
2100 2120 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
2101 2121 !SPA_VERSION_IS_SUPPORTED(version) ||
2102 2122 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
2103 2123 guid != vd->vdev_guid ||
2104 2124 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
2105 2125 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2106 2126 VDEV_AUX_CORRUPT_DATA);
2107 2127 nvlist_free(label);
2108 2128 return (-1);
2109 2129 }
2110 2130
2111 2131 /*
2112 2132 * We don't actually check the pool state here. If it's in fact in
2113 2133 * use by another pool, we update this fact on the fly when requested.
2114 2134 */
2115 2135 nvlist_free(label);
2116 2136 return (0);
2117 2137 }
2118 2138
2119 2139 void
2120 2140 vdev_remove(vdev_t *vd, uint64_t txg)
2121 2141 {
2122 2142 spa_t *spa = vd->vdev_spa;
2123 2143 objset_t *mos = spa->spa_meta_objset;
2124 2144 dmu_tx_t *tx;
2125 2145
2126 2146 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2127 2147
2128 2148 if (vd->vdev_ms != NULL) {
2129 2149 metaslab_group_t *mg = vd->vdev_mg;
2130 2150
2131 2151 metaslab_group_histogram_verify(mg);
2132 2152 metaslab_class_histogram_verify(mg->mg_class);
2133 2153
2134 2154 for (int m = 0; m < vd->vdev_ms_count; m++) {
2135 2155 metaslab_t *msp = vd->vdev_ms[m];
2136 2156
2137 2157 if (msp == NULL || msp->ms_sm == NULL)
2138 2158 continue;
2139 2159
2140 2160 mutex_enter(&msp->ms_lock);
2141 2161 /*
2142 2162 * If the metaslab was not loaded when the vdev
2143 2163 * was removed then the histogram accounting may
2144 2164 * not be accurate. Update the histogram information
2145 2165 			 * here to ensure that the metaslab group
2146 2166 * and metaslab class are up-to-date.
2147 2167 */
2148 2168 metaslab_group_histogram_remove(mg, msp);
2149 2169
2150 2170 VERIFY0(space_map_allocated(msp->ms_sm));
2151 2171 space_map_free(msp->ms_sm, tx);
2152 2172 space_map_close(msp->ms_sm);
2153 2173 msp->ms_sm = NULL;
2154 2174 mutex_exit(&msp->ms_lock);
2155 2175 }
2156 2176
2157 2177 metaslab_group_histogram_verify(mg);
2158 2178 metaslab_class_histogram_verify(mg->mg_class);
2159 2179 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
2160 2180 ASSERT0(mg->mg_histogram[i]);
2161 2181
2162 2182 }
2163 2183
2164 2184 if (vd->vdev_ms_array) {
2165 2185 (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
2166 2186 vd->vdev_ms_array = 0;
2167 2187 }
2168 2188 dmu_tx_commit(tx);
2169 2189 }
2170 2190
2171 2191 void
2172 2192 vdev_sync_done(vdev_t *vd, uint64_t txg)
2173 2193 {
2174 2194 metaslab_t *msp;
2175 2195 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
2176 2196
2177 2197 ASSERT(!vd->vdev_ishole);
2178 2198
2179 2199 while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
2180 2200 metaslab_sync_done(msp, txg);
2181 2201
2182 2202 if (reassess)
2183 2203 metaslab_sync_reassess(vd->vdev_mg);
2184 2204 }
2185 2205
2186 2206 void
2187 2207 vdev_sync(vdev_t *vd, uint64_t txg)
2188 2208 {
2189 2209 spa_t *spa = vd->vdev_spa;
2190 2210 vdev_t *lvd;
2191 2211 metaslab_t *msp;
2192 2212 dmu_tx_t *tx;
2193 2213
2194 2214 ASSERT(!vd->vdev_ishole);
2195 2215
2196 2216 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
2197 2217 ASSERT(vd == vd->vdev_top);
2198 2218 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2199 2219 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
2200 2220 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
2201 2221 ASSERT(vd->vdev_ms_array != 0);
2202 2222 vdev_config_dirty(vd);
2203 2223 dmu_tx_commit(tx);
2204 2224 }
2205 2225
2206 2226 /*
2207 2227 * Remove the metadata associated with this vdev once it's empty.
2208 2228 */
2209 2229 if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
2210 2230 vdev_remove(vd, txg);
2211 2231
2212 2232 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
2213 2233 metaslab_sync(msp, txg);
2214 2234 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
2215 2235 }
2216 2236
2217 2237 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
2218 2238 vdev_dtl_sync(lvd, txg);
2219 2239
2220 2240 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
2221 2241 }
2222 2242
2223 2243 uint64_t
2224 2244 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
2225 2245 {
2226 2246 return (vd->vdev_ops->vdev_op_asize(vd, psize));
2227 2247 }
2228 2248
2229 2249 /*
2230 2250 * Mark the given vdev faulted. A faulted vdev behaves as if the device could
2231 2251 * not be opened, and no I/O is attempted.
2232 2252 */
2233 2253 int
2234 2254 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
2235 2255 {
2236 2256 vdev_t *vd, *tvd;
2237 2257
2238 2258 spa_vdev_state_enter(spa, SCL_NONE);
2239 2259
2240 2260 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2241 2261 return (spa_vdev_state_exit(spa, NULL, ENODEV));
2242 2262
2243 2263 if (!vd->vdev_ops->vdev_op_leaf)
2244 2264 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2245 2265
2246 2266 tvd = vd->vdev_top;
2247 2267
2248 2268 /*
2249 2269 * We don't directly use the aux state here, but if we do a
2250 2270 * vdev_reopen(), we need this value to be present to remember why we
2251 2271 * were faulted.
2252 2272 */
2253 2273 vd->vdev_label_aux = aux;
2254 2274
2255 2275 /*
2256 2276 * Faulted state takes precedence over degraded.
2257 2277 */
2258 2278 vd->vdev_delayed_close = B_FALSE;
2259 2279 vd->vdev_faulted = 1ULL;
2260 2280 vd->vdev_degraded = 0ULL;
2261 2281 vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
2262 2282
2263 2283 /*
2264 2284 * If this device has the only valid copy of the data, then
2265 2285 * back off and simply mark the vdev as degraded instead.
2266 2286 */
2267 2287 if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
2268 2288 vd->vdev_degraded = 1ULL;
2269 2289 vd->vdev_faulted = 0ULL;
2270 2290
2271 2291 /*
2272 2292 * If we reopen the device and it's not dead, only then do we
2273 2293 * mark it degraded.
2274 2294 */
2275 2295 vdev_reopen(tvd);
2276 2296
2277 2297 if (vdev_readable(vd))
2278 2298 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
2279 2299 }
2280 2300
2281 2301 return (spa_vdev_state_exit(spa, vd, 0));
2282 2302 }
2283 2303
2284 2304 /*
2285 2305 * Mark the given vdev degraded. A degraded vdev is purely an indication to the
2286 2306 * user that something is wrong. The vdev continues to operate as normal as far
2287 2307 * as I/O is concerned.
2288 2308 */
2289 2309 int
2290 2310 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
2291 2311 {
2292 2312 vdev_t *vd;
2293 2313
2294 2314 spa_vdev_state_enter(spa, SCL_NONE);
2295 2315
2296 2316 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2297 2317 return (spa_vdev_state_exit(spa, NULL, ENODEV));
2298 2318
2299 2319 if (!vd->vdev_ops->vdev_op_leaf)
2300 2320 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2301 2321
2302 2322 /*
2303 2323 * If the vdev is already faulted, then don't do anything.
2304 2324 */
2305 2325 if (vd->vdev_faulted || vd->vdev_degraded)
2306 2326 return (spa_vdev_state_exit(spa, NULL, 0));
2307 2327
2308 2328 vd->vdev_degraded = 1ULL;
2309 2329 if (!vdev_is_dead(vd))
2310 2330 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
2311 2331 aux);
2312 2332
2313 2333 return (spa_vdev_state_exit(spa, vd, 0));
2314 2334 }
2315 2335
2316 2336 /*
2317 2337 * Online the given vdev.
2318 2338 *
2319 2339 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached
2320 2340 * spare device should be detached when the device finishes resilvering.
2321 2341 * Second, the online should be treated like a 'test' online case, so no FMA
2322 2342 * events are generated if the device fails to open.
2323 2343 */
2324 2344 int
2325 2345 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
2326 2346 {
2327 2347 vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
2328 2348
2329 2349 spa_vdev_state_enter(spa, SCL_NONE);
2330 2350
2331 2351 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2332 2352 return (spa_vdev_state_exit(spa, NULL, ENODEV));
2333 2353
2334 2354 if (!vd->vdev_ops->vdev_op_leaf)
2335 2355 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2336 2356
2337 2357 tvd = vd->vdev_top;
2338 2358 vd->vdev_offline = B_FALSE;
2339 2359 vd->vdev_tmpoffline = B_FALSE;
2340 2360 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
2341 2361 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
2342 2362
2343 2363 /* XXX - L2ARC 1.0 does not support expansion */
2344 2364 if (!vd->vdev_aux) {
2345 2365 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2346 2366 pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
2347 2367 }
2348 2368
2349 2369 vdev_reopen(tvd);
2350 2370 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
2351 2371
2352 2372 if (!vd->vdev_aux) {
2353 2373 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2354 2374 pvd->vdev_expanding = B_FALSE;
2355 2375 }
2356 2376
2357 2377 if (newstate)
2358 2378 *newstate = vd->vdev_state;
2359 2379 if ((flags & ZFS_ONLINE_UNSPARE) &&
2360 2380 !vdev_is_dead(vd) && vd->vdev_parent &&
2361 2381 vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2362 2382 vd->vdev_parent->vdev_child[0] == vd)
2363 2383 vd->vdev_unspare = B_TRUE;
2364 2384
2365 2385 if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
2366 2386
2367 2387 /* XXX - L2ARC 1.0 does not support expansion */
2368 2388 if (vd->vdev_aux)
2369 2389 return (spa_vdev_state_exit(spa, vd, ENOTSUP));
2370 2390 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2371 2391 }
2372 2392 return (spa_vdev_state_exit(spa, vd, 0));
2373 2393 }
2374 2394
2375 2395 static int
2376 2396 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
2377 2397 {
2378 2398 vdev_t *vd, *tvd;
2379 2399 int error = 0;
2380 2400 uint64_t generation;
2381 2401 metaslab_group_t *mg;
2382 2402
2383 2403 top:
2384 2404 spa_vdev_state_enter(spa, SCL_ALLOC);
2385 2405
2386 2406 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2387 2407 return (spa_vdev_state_exit(spa, NULL, ENODEV));
2388 2408
2389 2409 if (!vd->vdev_ops->vdev_op_leaf)
2390 2410 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2391 2411
2392 2412 tvd = vd->vdev_top;
2393 2413 mg = tvd->vdev_mg;
2394 2414 generation = spa->spa_config_generation + 1;
2395 2415
2396 2416 /*
2397 2417 * If the device isn't already offline, try to offline it.
2398 2418 */
2399 2419 if (!vd->vdev_offline) {
2400 2420 /*
2401 2421 * If this device has the only valid copy of some data,
2402 2422 * don't allow it to be offlined. Log devices are always
2403 2423 * expendable.
2404 2424 */
2405 2425 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
2406 2426 vdev_dtl_required(vd))
2407 2427 return (spa_vdev_state_exit(spa, NULL, EBUSY));
2408 2428
2409 2429 /*
2410 2430 * If the top-level is a slog and it has had allocations
2411 2431 * then proceed. We check that the vdev's metaslab group
2412 2432 * is not NULL since it's possible that we may have just
2413 2433 * added this vdev but not yet initialized its metaslabs.
2414 2434 */
2415 2435 if (tvd->vdev_islog && mg != NULL) {
2416 2436 /*
2417 2437 * Prevent any future allocations.
2418 2438 */
2419 2439 metaslab_group_passivate(mg);
2420 2440 (void) spa_vdev_state_exit(spa, vd, 0);
2421 2441
2422 2442 error = spa_offline_log(spa);
2423 2443
2424 2444 spa_vdev_state_enter(spa, SCL_ALLOC);
2425 2445
2426 2446 /*
2427 2447 * Check to see if the config has changed.
2428 2448 */
2429 2449 if (error || generation != spa->spa_config_generation) {
2430 2450 metaslab_group_activate(mg);
2431 2451 if (error)
2432 2452 return (spa_vdev_state_exit(spa,
2433 2453 vd, error));
2434 2454 (void) spa_vdev_state_exit(spa, vd, 0);
2435 2455 goto top;
2436 2456 }
2437 2457 ASSERT0(tvd->vdev_stat.vs_alloc);
2438 2458 }
2439 2459
2440 2460 /*
2441 2461 * Offline this device and reopen its top-level vdev.
2442 2462 * If the top-level vdev is a log device then just offline
2443 2463 * it. Otherwise, if this action results in the top-level
2444 2464 * vdev becoming unusable, undo it and fail the request.
2445 2465 */
2446 2466 vd->vdev_offline = B_TRUE;
2447 2467 vdev_reopen(tvd);
2448 2468
2449 2469 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
2450 2470 vdev_is_dead(tvd)) {
2451 2471 vd->vdev_offline = B_FALSE;
2452 2472 vdev_reopen(tvd);
2453 2473 return (spa_vdev_state_exit(spa, NULL, EBUSY));
2454 2474 }
2455 2475
2456 2476 /*
2457 2477 * Add the device back into the metaslab rotor so that
2458 2478 * once we online the device it's open for business.
2459 2479 */
2460 2480 if (tvd->vdev_islog && mg != NULL)
2461 2481 metaslab_group_activate(mg);
2462 2482 }
2463 2483
2464 2484 vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
2465 2485
2466 2486 return (spa_vdev_state_exit(spa, vd, 0));
2467 2487 }
2468 2488
2469 2489 int
2470 2490 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
2471 2491 {
2472 2492 int error;
2473 2493
2474 2494 mutex_enter(&spa->spa_vdev_top_lock);
2475 2495 error = vdev_offline_locked(spa, guid, flags);
2476 2496 mutex_exit(&spa->spa_vdev_top_lock);
2477 2497
2478 2498 return (error);
2479 2499 }
2480 2500
2481 2501 /*
2482 2502 * Clear the error counts associated with this vdev. Unlike vdev_online() and
2483 2503 * vdev_offline(), we assume the spa config is locked. We also clear all
2484 2504 * children. If 'vd' is NULL, then the user wants to clear all vdevs.
2485 2505 */
2486 2506 void
2487 2507 vdev_clear(spa_t *spa, vdev_t *vd)
2488 2508 {
2489 2509 vdev_t *rvd = spa->spa_root_vdev;
2490 2510
2491 2511 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2492 2512
2493 2513 if (vd == NULL)
2494 2514 vd = rvd;
2495 2515
2496 2516 vd->vdev_stat.vs_read_errors = 0;
2497 2517 vd->vdev_stat.vs_write_errors = 0;
2498 2518 vd->vdev_stat.vs_checksum_errors = 0;
2499 2519
2500 2520 for (int c = 0; c < vd->vdev_children; c++)
2501 2521 vdev_clear(spa, vd->vdev_child[c]);
2502 2522
2503 2523 /*
2504 2524 * If we're in the FAULTED state or have experienced failed I/O, then
2505 2525 * clear the persistent state and attempt to reopen the device. We
2506 2526 * also mark the vdev config dirty, so that the new faulted state is
2507 2527 * written out to disk.
2508 2528 */
2509 2529 if (vd->vdev_faulted || vd->vdev_degraded ||
2510 2530 !vdev_readable(vd) || !vdev_writeable(vd)) {
2511 2531
2512 2532 /*
2513 2533 		 * When reopening in response to a clear event, it may be due to
2514 2534 * a fmadm repair request. In this case, if the device is
2515 2535 * still broken, we want to still post the ereport again.
2516 2536 */
2517 2537 vd->vdev_forcefault = B_TRUE;
2518 2538
2519 2539 vd->vdev_faulted = vd->vdev_degraded = 0ULL;
2520 2540 vd->vdev_cant_read = B_FALSE;
2521 2541 vd->vdev_cant_write = B_FALSE;
2522 2542
2523 2543 vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
2524 2544
2525 2545 vd->vdev_forcefault = B_FALSE;
2526 2546
2527 2547 if (vd != rvd && vdev_writeable(vd->vdev_top))
2528 2548 vdev_state_dirty(vd->vdev_top);
2529 2549
2530 2550 if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
2531 2551 spa_async_request(spa, SPA_ASYNC_RESILVER);
2532 2552
2533 2553 spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
2534 2554 }
2535 2555
2536 2556 /*
2537 2557 * When clearing a FMA-diagnosed fault, we always want to
2538 2558 * unspare the device, as we assume that the original spare was
2539 2559 * done in response to the FMA fault.
2540 2560 */
2541 2561 if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
2542 2562 vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2543 2563 vd->vdev_parent->vdev_child[0] == vd)
2544 2564 vd->vdev_unspare = B_TRUE;
2545 2565 }
2546 2566
2547 2567 boolean_t
2548 2568 vdev_is_dead(vdev_t *vd)
2549 2569 {
2550 2570 /*
2551 2571 * Holes and missing devices are always considered "dead".
2552 2572 * This simplifies the code since we don't have to check for
2553 2573 * these types of devices in the various code paths.
2554 2574 * Instead we rely on the fact that we skip over dead devices
2555 2575 * before issuing I/O to them.
2556 2576 */
2557 2577 return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
2558 2578 vd->vdev_ops == &vdev_missing_ops);
2559 2579 }
2560 2580
2561 2581 boolean_t
2562 2582 vdev_readable(vdev_t *vd)
2563 2583 {
2564 2584 return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
2565 2585 }
2566 2586
2567 2587 boolean_t
2568 2588 vdev_writeable(vdev_t *vd)
2569 2589 {
2570 2590 return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
2571 2591 }
2572 2592
2573 2593 boolean_t
2574 2594 vdev_allocatable(vdev_t *vd)
2575 2595 {
2576 2596 uint64_t state = vd->vdev_state;
2577 2597
2578 2598 /*
2579 2599 * We currently allow allocations from vdevs which may be in the
2580 2600 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
2581 2601 * fails to reopen then we'll catch it later when we're holding
2582 2602 * the proper locks. Note that we have to get the vdev state
2583 2603 * in a local variable because although it changes atomically,
2584 2604 * we're asking two separate questions about it.
2585 2605 */
2586 2606 return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
2587 2607 !vd->vdev_cant_write && !vd->vdev_ishole);
2588 2608 }
2589 2609
2590 2610 boolean_t
2591 2611 vdev_accessible(vdev_t *vd, zio_t *zio)
2592 2612 {
2593 2613 ASSERT(zio->io_vd == vd);
2594 2614
2595 2615 if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
2596 2616 return (B_FALSE);
2597 2617
2598 2618 if (zio->io_type == ZIO_TYPE_READ)
2599 2619 return (!vd->vdev_cant_read);
2600 2620
2601 2621 if (zio->io_type == ZIO_TYPE_WRITE)
2602 2622 return (!vd->vdev_cant_write);
2603 2623
2604 2624 return (B_TRUE);
2605 2625 }
2606 2626
2607 2627 /*
2608 2628 * Get statistics for the given vdev.
2609 2629 */
2610 2630 void
2611 2631 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
2612 2632 {
2613 2633 spa_t *spa = vd->vdev_spa;
2614 2634 vdev_t *rvd = spa->spa_root_vdev;
2615 2635
2616 2636 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
2617 2637
2618 2638 mutex_enter(&vd->vdev_stat_lock);
2619 2639 bcopy(&vd->vdev_stat, vs, sizeof (*vs));
2620 2640 vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
2621 2641 vs->vs_state = vd->vdev_state;
2622 2642 vs->vs_rsize = vdev_get_min_asize(vd);
2623 2643 if (vd->vdev_ops->vdev_op_leaf)
2624 2644 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
2625 2645 vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
2626 2646 if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
2627 2647 vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
2628 2648 }
2629 2649
2630 2650 /*
2631 2651 * If we're getting stats on the root vdev, aggregate the I/O counts
2632 2652 * over all top-level vdevs (i.e. the direct children of the root).
2633 2653 */
2634 2654 if (vd == rvd) {
2635 2655 for (int c = 0; c < rvd->vdev_children; c++) {
2636 2656 vdev_t *cvd = rvd->vdev_child[c];
2637 2657 vdev_stat_t *cvs = &cvd->vdev_stat;
2638 2658
2639 2659 for (int t = 0; t < ZIO_TYPES; t++) {
2640 2660 vs->vs_ops[t] += cvs->vs_ops[t];
2641 2661 vs->vs_bytes[t] += cvs->vs_bytes[t];
2642 2662 }
2643 2663 cvs->vs_scan_removing = cvd->vdev_removing;
2644 2664 }
2645 2665 }
2646 2666 mutex_exit(&vd->vdev_stat_lock);
2647 2667 }
2648 2668
2649 2669 void
2650 2670 vdev_clear_stats(vdev_t *vd)
2651 2671 {
2652 2672 mutex_enter(&vd->vdev_stat_lock);
2653 2673 vd->vdev_stat.vs_space = 0;
2654 2674 vd->vdev_stat.vs_dspace = 0;
2655 2675 vd->vdev_stat.vs_alloc = 0;
2656 2676 mutex_exit(&vd->vdev_stat_lock);
2657 2677 }
2658 2678
2659 2679 void
2660 2680 vdev_scan_stat_init(vdev_t *vd)
2661 2681 {
2662 2682 vdev_stat_t *vs = &vd->vdev_stat;
2663 2683
2664 2684 for (int c = 0; c < vd->vdev_children; c++)
2665 2685 vdev_scan_stat_init(vd->vdev_child[c]);
2666 2686
2667 2687 mutex_enter(&vd->vdev_stat_lock);
2668 2688 vs->vs_scan_processed = 0;
2669 2689 mutex_exit(&vd->vdev_stat_lock);
2670 2690 }
2671 2691
2672 2692 void
2673 2693 vdev_stat_update(zio_t *zio, uint64_t psize)
2674 2694 {
2675 2695 spa_t *spa = zio->io_spa;
2676 2696 vdev_t *rvd = spa->spa_root_vdev;
2677 2697 vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
2678 2698 vdev_t *pvd;
2679 2699 uint64_t txg = zio->io_txg;
2680 2700 vdev_stat_t *vs = &vd->vdev_stat;
2681 2701 zio_type_t type = zio->io_type;
2682 2702 int flags = zio->io_flags;
2683 2703
2684 2704 /*
2685 2705 * If this i/o is a gang leader, it didn't do any actual work.
2686 2706 */
2687 2707 if (zio->io_gang_tree)
2688 2708 return;
2689 2709
2690 2710 if (zio->io_error == 0) {
2691 2711 /*
2692 2712 * If this is a root i/o, don't count it -- we've already
2693 2713 * counted the top-level vdevs, and vdev_get_stats() will
2694 2714 * aggregate them when asked. This reduces contention on
2695 2715 * the root vdev_stat_lock and implicitly handles blocks
2696 2716 * that compress away to holes, for which there is no i/o.
2697 2717 * (Holes never create vdev children, so all the counters
2698 2718 * remain zero, which is what we want.)
2699 2719 *
2700 2720 * Note: this only applies to successful i/o (io_error == 0)
2701 2721 * because unlike i/o counts, errors are not additive.
2702 2722 * When reading a ditto block, for example, failure of
2703 2723 * one top-level vdev does not imply a root-level error.
2704 2724 */
2705 2725 if (vd == rvd)
2706 2726 return;
2707 2727
2708 2728 ASSERT(vd == zio->io_vd);
2709 2729
2710 2730 if (flags & ZIO_FLAG_IO_BYPASS)
2711 2731 return;
2712 2732
2713 2733 mutex_enter(&vd->vdev_stat_lock);
2714 2734
2715 2735 if (flags & ZIO_FLAG_IO_REPAIR) {
2716 2736 if (flags & ZIO_FLAG_SCAN_THREAD) {
2717 2737 dsl_scan_phys_t *scn_phys =
2718 2738 &spa->spa_dsl_pool->dp_scan->scn_phys;
2719 2739 uint64_t *processed = &scn_phys->scn_processed;
2720 2740
2721 2741 /* XXX cleanup? */
2722 2742 if (vd->vdev_ops->vdev_op_leaf)
2723 2743 atomic_add_64(processed, psize);
2724 2744 vs->vs_scan_processed += psize;
2725 2745 }
2726 2746
2727 2747 if (flags & ZIO_FLAG_SELF_HEAL)
2728 2748 vs->vs_self_healed += psize;
2729 2749 }
2730 2750
2731 2751 vs->vs_ops[type]++;
2732 2752 vs->vs_bytes[type] += psize;
2733 2753
2734 2754 mutex_exit(&vd->vdev_stat_lock);
2735 2755 return;
2736 2756 }
2737 2757
2738 2758 if (flags & ZIO_FLAG_SPECULATIVE)
2739 2759 return;
2740 2760
2741 2761 /*
2742 2762 * If this is an I/O error that is going to be retried, then ignore the
2743 2763 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
2744 2764 * hard errors, when in reality they can happen for any number of
2745 2765 * innocuous reasons (bus resets, MPxIO link failure, etc).
2746 2766 */
2747 2767 if (zio->io_error == EIO &&
2748 2768 !(zio->io_flags & ZIO_FLAG_IO_RETRY))
2749 2769 return;
2750 2770
2751 2771 /*
2752 2772 	 * Intent log writes won't propagate their error to the root
2753 2773 * I/O so don't mark these types of failures as pool-level
2754 2774 * errors.
2755 2775 */
2756 2776 if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
2757 2777 return;
2758 2778
2759 2779 mutex_enter(&vd->vdev_stat_lock);
2760 2780 if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
2761 2781 if (zio->io_error == ECKSUM)
2762 2782 vs->vs_checksum_errors++;
2763 2783 else
2764 2784 vs->vs_read_errors++;
2765 2785 }
2766 2786 if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
2767 2787 vs->vs_write_errors++;
2768 2788 mutex_exit(&vd->vdev_stat_lock);
2769 2789
2770 2790 if (type == ZIO_TYPE_WRITE && txg != 0 &&
2771 2791 (!(flags & ZIO_FLAG_IO_REPAIR) ||
2772 2792 (flags & ZIO_FLAG_SCAN_THREAD) ||
2773 2793 spa->spa_claiming)) {
2774 2794 /*
2775 2795 * This is either a normal write (not a repair), or it's
2776 2796 * a repair induced by the scrub thread, or it's a repair
2777 2797 * made by zil_claim() during spa_load() in the first txg.
2778 2798 * In the normal case, we commit the DTL change in the same
2779 2799 * txg as the block was born. In the scrub-induced repair
2780 2800 * case, we know that scrubs run in first-pass syncing context,
2781 2801 * so we commit the DTL change in spa_syncing_txg(spa).
2782 2802 * In the zil_claim() case, we commit in spa_first_txg(spa).
2783 2803 *
2784 2804 * We currently do not make DTL entries for failed spontaneous
2785 2805 * self-healing writes triggered by normal (non-scrubbing)
2786 2806 * reads, because we have no transactional context in which to
2787 2807 * do so -- and it's not clear that it'd be desirable anyway.
2788 2808 */
2789 2809 if (vd->vdev_ops->vdev_op_leaf) {
2790 2810 uint64_t commit_txg = txg;
2791 2811 if (flags & ZIO_FLAG_SCAN_THREAD) {
2792 2812 ASSERT(flags & ZIO_FLAG_IO_REPAIR);
2793 2813 ASSERT(spa_sync_pass(spa) == 1);
2794 2814 vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
2795 2815 commit_txg = spa_syncing_txg(spa);
2796 2816 } else if (spa->spa_claiming) {
2797 2817 ASSERT(flags & ZIO_FLAG_IO_REPAIR);
2798 2818 commit_txg = spa_first_txg(spa);
2799 2819 }
2800 2820 ASSERT(commit_txg >= spa_syncing_txg(spa));
2801 2821 if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
2802 2822 return;
2803 2823 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2804 2824 vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
2805 2825 vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
2806 2826 }
2807 2827 if (vd != rvd)
2808 2828 vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
2809 2829 }
2810 2830 }
2811 2831
2812 2832 /*
2813 2833 * Update the in-core space usage stats for this vdev, its metaslab class,
2814 2834 * and the root vdev.
2815 2835 */
2816 2836 void
2817 2837 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
2818 2838 int64_t space_delta)
2819 2839 {
2820 2840 int64_t dspace_delta = space_delta;
2821 2841 spa_t *spa = vd->vdev_spa;
2822 2842 vdev_t *rvd = spa->spa_root_vdev;
2823 2843 metaslab_group_t *mg = vd->vdev_mg;
2824 2844 metaslab_class_t *mc = mg ? mg->mg_class : NULL;
2825 2845
2826 2846 ASSERT(vd == vd->vdev_top);
2827 2847
2828 2848 /*
2829 2849 	 * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
2830 2850 * factor. We must calculate this here and not at the root vdev
2831 2851 * because the root vdev's psize-to-asize is simply the max of its
2832 2852 	 * children's, thus not accurate enough for us.
2833 2853 */
2834 2854 ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
2835 2855 ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
2836 2856 dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
2837 2857 vd->vdev_deflate_ratio;
2838 2858
2839 2859 mutex_enter(&vd->vdev_stat_lock);
2840 2860 vd->vdev_stat.vs_alloc += alloc_delta;
2841 2861 vd->vdev_stat.vs_space += space_delta;
2842 2862 vd->vdev_stat.vs_dspace += dspace_delta;
2843 2863 mutex_exit(&vd->vdev_stat_lock);
2844 2864
2845 2865 if (mc == spa_normal_class(spa)) {
2846 2866 mutex_enter(&rvd->vdev_stat_lock);
2847 2867 rvd->vdev_stat.vs_alloc += alloc_delta;
2848 2868 rvd->vdev_stat.vs_space += space_delta;
2849 2869 rvd->vdev_stat.vs_dspace += dspace_delta;
2850 2870 mutex_exit(&rvd->vdev_stat_lock);
2851 2871 }
2852 2872
2853 2873 if (mc != NULL) {
2854 2874 ASSERT(rvd == vd->vdev_parent);
2855 2875 ASSERT(vd->vdev_ms_count != 0);
2856 2876
2857 2877 metaslab_class_space_update(mc,
2858 2878 alloc_delta, defer_delta, space_delta, dspace_delta);
2859 2879 }
2860 2880 }
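
The deflate step in vdev_space_update() reduces an allocated-size (asize) delta to 512-byte units and scales it by the per-vdev deflate ratio to undo the RAID-Z expansion. The arithmetic-only sketch below uses an invented ratio of 409, roughly what a 4+1 RAID-Z1 layout would produce; it does not model how vdev_deflate_ratio is actually initialized.

/*
 * Illustrative sketch only.  Shows the deflate arithmetic from
 * vdev_space_update(): convert an asize delta into 512-byte
 * (SPA_MINBLOCKSIZE) units and scale by the vdev's deflate ratio to
 * get the deflated (psize-equivalent) delta.  The ratio is invented.
 */
#include <stdio.h>
#include <stdint.h>

#define	SPA_MINBLOCKSHIFT	9	/* 512-byte units */

int
main(void)
{
	int64_t space_delta = 163840;	/* 160K of asize allocated */
	int64_t deflate_ratio = 409;	/* hypothetical, ~512 * psize/asize */
	int64_t dspace_delta;

	dspace_delta = (space_delta >> SPA_MINBLOCKSHIFT) * deflate_ratio;

	/* Prints: asize delta 163840 -> dspace delta 130880 (~128K) */
	printf("asize delta %lld -> dspace delta %lld\n",
	    (long long)space_delta, (long long)dspace_delta);
	return (0);
}
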
2861 2881
2862 2882 /*
2863 2883 * Mark a top-level vdev's config as dirty, placing it on the dirty list
2864 2884 * so that it will be written out next time the vdev configuration is synced.
2865 2885 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
2866 2886 */
2867 2887 void
2868 2888 vdev_config_dirty(vdev_t *vd)
2869 2889 {
2870 2890 spa_t *spa = vd->vdev_spa;
2871 2891 vdev_t *rvd = spa->spa_root_vdev;
2872 2892 int c;
2873 2893
2874 2894 ASSERT(spa_writeable(spa));
2875 2895
2876 2896 /*
2877 2897 * If this is an aux vdev (as with l2cache and spare devices), then we
2878 2898 * update the vdev config manually and set the sync flag.
2879 2899 */
2880 2900 if (vd->vdev_aux != NULL) {
2881 2901 spa_aux_vdev_t *sav = vd->vdev_aux;
2882 2902 nvlist_t **aux;
2883 2903 uint_t naux;
2884 2904
2885 2905 for (c = 0; c < sav->sav_count; c++) {
2886 2906 if (sav->sav_vdevs[c] == vd)
2887 2907 break;
2888 2908 }
2889 2909
2890 2910 if (c == sav->sav_count) {
2891 2911 /*
2892 2912 * We're being removed. There's nothing more to do.
2893 2913 */
2894 2914 ASSERT(sav->sav_sync == B_TRUE);
2895 2915 return;
2896 2916 }
2897 2917
2898 2918 sav->sav_sync = B_TRUE;
2899 2919
2900 2920 if (nvlist_lookup_nvlist_array(sav->sav_config,
2901 2921 ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
2902 2922 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
2903 2923 ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
2904 2924 }
2905 2925
2906 2926 ASSERT(c < naux);
2907 2927
2908 2928 /*
2909 2929 		 * Setting the nvlist in the middle of the array is a little
2910 2930 * sketchy, but it will work.
2911 2931 */
2912 2932 nvlist_free(aux[c]);
2913 2933 aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
2914 2934
2915 2935 return;
2916 2936 }
2917 2937
2918 2938 /*
2919 2939 * The dirty list is protected by the SCL_CONFIG lock. The caller
2920 2940 * must either hold SCL_CONFIG as writer, or must be the sync thread
2921 2941 * (which holds SCL_CONFIG as reader). There's only one sync thread,
2922 2942 * so this is sufficient to ensure mutual exclusion.
2923 2943 */
2924 2944 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
2925 2945 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2926 2946 spa_config_held(spa, SCL_CONFIG, RW_READER)));
2927 2947
2928 2948 if (vd == rvd) {
2929 2949 for (c = 0; c < rvd->vdev_children; c++)
2930 2950 vdev_config_dirty(rvd->vdev_child[c]);
2931 2951 } else {
2932 2952 ASSERT(vd == vd->vdev_top);
2933 2953
2934 2954 if (!list_link_active(&vd->vdev_config_dirty_node) &&
2935 2955 !vd->vdev_ishole)
2936 2956 list_insert_head(&spa->spa_config_dirty_list, vd);
2937 2957 }
2938 2958 }
2939 2959
2940 2960 void
2941 2961 vdev_config_clean(vdev_t *vd)
2942 2962 {
2943 2963 spa_t *spa = vd->vdev_spa;
2944 2964
2945 2965 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
2946 2966 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2947 2967 spa_config_held(spa, SCL_CONFIG, RW_READER)));
2948 2968
2949 2969 ASSERT(list_link_active(&vd->vdev_config_dirty_node));
2950 2970 list_remove(&spa->spa_config_dirty_list, vd);
2951 2971 }
2952 2972
2953 2973 /*
2954 2974 * Mark a top-level vdev's state as dirty, so that the next pass of
2955 2975 * spa_sync() can convert this into vdev_config_dirty(). We distinguish
2956 2976 * the state changes from larger config changes because they require
2957 2977 * much less locking, and are often needed for administrative actions.
2958 2978 */
2959 2979 void
2960 2980 vdev_state_dirty(vdev_t *vd)
2961 2981 {
2962 2982 spa_t *spa = vd->vdev_spa;
2963 2983
2964 2984 ASSERT(spa_writeable(spa));
2965 2985 ASSERT(vd == vd->vdev_top);
2966 2986
2967 2987 /*
2968 2988 * The state list is protected by the SCL_STATE lock. The caller
2969 2989 * must either hold SCL_STATE as writer, or must be the sync thread
2970 2990 * (which holds SCL_STATE as reader). There's only one sync thread,
2971 2991 * so this is sufficient to ensure mutual exclusion.
2972 2992 */
2973 2993 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
2974 2994 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2975 2995 spa_config_held(spa, SCL_STATE, RW_READER)));
2976 2996
2977 2997 if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
2978 2998 list_insert_head(&spa->spa_state_dirty_list, vd);
2979 2999 }
2980 3000
2981 3001 void
2982 3002 vdev_state_clean(vdev_t *vd)
2983 3003 {
2984 3004 spa_t *spa = vd->vdev_spa;
2985 3005
2986 3006 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
2987 3007 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2988 3008 spa_config_held(spa, SCL_STATE, RW_READER)));
2989 3009
2990 3010 ASSERT(list_link_active(&vd->vdev_state_dirty_node));
2991 3011 list_remove(&spa->spa_state_dirty_list, vd);
2992 3012 }
2993 3013
2994 3014 /*
2995 3015 * Propagate vdev state up from children to parent.
2996 3016 */
2997 3017 void
2998 3018 vdev_propagate_state(vdev_t *vd)
2999 3019 {
3000 3020 spa_t *spa = vd->vdev_spa;
3001 3021 vdev_t *rvd = spa->spa_root_vdev;
3002 3022 int degraded = 0, faulted = 0;
3003 3023 int corrupted = 0;
3004 3024 vdev_t *child;
3005 3025
3006 3026 if (vd->vdev_children > 0) {
3007 3027 for (int c = 0; c < vd->vdev_children; c++) {
3008 3028 child = vd->vdev_child[c];
3009 3029
3010 3030 /*
3011 3031 * Don't factor holes into the decision.
3012 3032 */
3013 3033 if (child->vdev_ishole)
3014 3034 continue;
3015 3035
3016 3036 if (!vdev_readable(child) ||
3017 3037 (!vdev_writeable(child) && spa_writeable(spa))) {
3018 3038 /*
3019 3039 * Root special: if there is a top-level log
3020 3040 * device, treat the root vdev as if it were
3021 3041 * degraded.
3022 3042 */
3023 3043 if (child->vdev_islog && vd == rvd)
3024 3044 degraded++;
3025 3045 else
3026 3046 faulted++;
3027 3047 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
3028 3048 degraded++;
3029 3049 }
3030 3050
3031 3051 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
3032 3052 corrupted++;
3033 3053 }
3034 3054
3035 3055 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
3036 3056
3037 3057 /*
3038 3058 * Root special: if there is a top-level vdev that cannot be
3039 3059 * opened due to corrupted metadata, then propagate the root
3040 3060 * vdev's aux state as 'corrupt' rather than 'insufficient
3041 3061 * replicas'.
3042 3062 */
3043 3063 if (corrupted && vd == rvd &&
3044 3064 rvd->vdev_state == VDEV_STATE_CANT_OPEN)
3045 3065 vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
3046 3066 VDEV_AUX_CORRUPT_DATA);
3047 3067 }
3048 3068
3049 3069 if (vd->vdev_parent)
3050 3070 vdev_propagate_state(vd->vdev_parent);
3051 3071 }
3052 3072
3053 3073 /*
3054 3074 * Set a vdev's state. If this is during an open, we don't update the parent
3055 3075 * state, because we're in the process of opening children depth-first.
3056 3076 * Otherwise, we propagate the change to the parent.
3057 3077 *
3058 3078 * If this routine places a device in a faulted state, an appropriate ereport is
3059 3079 * generated.
3060 3080 */
3061 3081 void
3062 3082 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
3063 3083 {
3064 3084 uint64_t save_state;
3065 3085 spa_t *spa = vd->vdev_spa;
3066 3086
3067 3087 if (state == vd->vdev_state) {
3068 3088 vd->vdev_stat.vs_aux = aux;
3069 3089 return;
3070 3090 }
3071 3091
3072 3092 save_state = vd->vdev_state;
3073 3093
3074 3094 vd->vdev_state = state;
3075 3095 vd->vdev_stat.vs_aux = aux;
3076 3096
3077 3097 /*
3078 3098 * If we are setting the vdev state to anything but an open state, then
3079 3099 * always close the underlying device unless the device has requested
3080 3100 * a delayed close (i.e. we're about to remove or fault the device).
3081 3101 * Otherwise, we keep accessible but invalid devices open forever.
3082 3102 * We don't call vdev_close() itself, because that implies some extra
3083 3103 * checks (offline, etc) that we don't want here. This is limited to
3084 3104 * leaf devices, because otherwise closing the device will affect other
3085 3105 * children.
3086 3106 */
3087 3107 if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
3088 3108 vd->vdev_ops->vdev_op_leaf)
3089 3109 vd->vdev_ops->vdev_op_close(vd);
3090 3110
3091 3111 /*
3092 3112 * If we have brought this vdev back into service, we need
3093 3113 * to notify fmd so that it can gracefully repair any outstanding
3094 3114 * cases due to a missing device. We do this in all cases, even those
3095 3115 * that probably don't correlate to a repaired fault. This is sure to
3096 3116 * catch all cases, and we let the zfs-retire agent sort it out. If
3097 3117 * this is a transient state it's OK, as the retire agent will
3098 3118 * double-check the state of the vdev before repairing it.
3099 3119 */
3100 3120 if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
3101 3121 vd->vdev_prevstate != state)
3102 3122 zfs_post_state_change(spa, vd);
3103 3123
3104 3124 if (vd->vdev_removed &&
3105 3125 state == VDEV_STATE_CANT_OPEN &&
3106 3126 (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
3107 3127 /*
3108 3128 * If the previous state is set to VDEV_STATE_REMOVED, then this
3109 3129 * device was previously marked removed and someone attempted to
3110 3130 * reopen it. If this failed due to a nonexistent device, then
3111 3131 * keep the device in the REMOVED state. We also let this be if
3112 3132 * it is one of our special test online cases, which is only
3113 3133 * attempting to online the device and shouldn't generate an FMA
3114 3134 * fault.
3115 3135 */
3116 3136 vd->vdev_state = VDEV_STATE_REMOVED;
3117 3137 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
3118 3138 } else if (state == VDEV_STATE_REMOVED) {
3119 3139 vd->vdev_removed = B_TRUE;
3120 3140 } else if (state == VDEV_STATE_CANT_OPEN) {
3121 3141 /*
3122 3142 * If we fail to open a vdev during an import or recovery, we
3123 3143 * mark it as "not available", which signifies that it was
3124 3144 * never there to begin with. Failure to open such a device
3125 3145 * is not considered an error.
3126 3146 */
3127 3147 if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
3128 3148 spa_load_state(spa) == SPA_LOAD_RECOVER) &&
3129 3149 vd->vdev_ops->vdev_op_leaf)
3130 3150 vd->vdev_not_present = 1;
3131 3151
3132 3152 /*
3133 3153 * Post the appropriate ereport. If the 'prevstate' field is
3134 3154 * set to something other than VDEV_STATE_UNKNOWN, it indicates
3135 3155 * that this is part of a vdev_reopen(). In this case, we don't
3136 3156 * want to post the ereport if the device was already in the
3137 3157 * CANT_OPEN state beforehand.
3138 3158 *
3139 3159 * If the 'checkremove' flag is set, then this is an attempt to
3140 3160 * online the device in response to an insertion event. If we
3141 3161 * hit this case, then we have detected an insertion event for a
3142 3162 * faulted or offline device that wasn't in the removed state.
3143 3163 * In this scenario, we don't post an ereport because we are
3144 3164 * about to replace the device, or attempt an online with
3145 3165 * vdev_forcefault, which will generate the fault for us.
3146 3166 */
3147 3167 if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
3148 3168 !vd->vdev_not_present && !vd->vdev_checkremove &&
3149 3169 vd != spa->spa_root_vdev) {
3150 3170 const char *class;
3151 3171
3152 3172 switch (aux) {
3153 3173 case VDEV_AUX_OPEN_FAILED:
3154 3174 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
3155 3175 break;
3156 3176 case VDEV_AUX_CORRUPT_DATA:
3157 3177 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
3158 3178 break;
3159 3179 case VDEV_AUX_NO_REPLICAS:
3160 3180 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
3161 3181 break;
3162 3182 case VDEV_AUX_BAD_GUID_SUM:
3163 3183 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
3164 3184 break;
3165 3185 case VDEV_AUX_TOO_SMALL:
3166 3186 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
3167 3187 break;
3168 3188 case VDEV_AUX_BAD_LABEL:
3169 3189 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
3170 3190 break;
3171 3191 default:
3172 3192 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
3173 3193 }
3174 3194
3175 3195 zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
3176 3196 }
3177 3197
3178 3198 /* Erase any notion of persistent removed state */
3179 3199 vd->vdev_removed = B_FALSE;
3180 3200 } else {
3181 3201 vd->vdev_removed = B_FALSE;
3182 3202 }
3183 3203
3184 3204 if (!isopen && vd->vdev_parent)
3185 3205 vdev_propagate_state(vd->vdev_parent);
3186 3206 }
3187 3207
3188 3208 /*
3189 3209 * Check the vdev configuration to ensure that it's capable of supporting
3190 3210 * a root pool. Currently, we do not support RAID-Z or partial configuration.
3191 3211 * In addition, only a single top-level vdev is allowed and none of the leaves
3192 3212 * can be wholedisks.
3193 3213 */
3194 3214 boolean_t
3195 3215 vdev_is_bootable(vdev_t *vd)
3196 3216 {
3197 3217 if (!vd->vdev_ops->vdev_op_leaf) {
3198 3218 char *vdev_type = vd->vdev_ops->vdev_op_type;
3199 3219
3200 3220 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
3201 3221 vd->vdev_children > 1) {
3202 3222 return (B_FALSE);
3203 3223 } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
3204 3224 strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
3205 3225 return (B_FALSE);
3206 3226 }
3207 3227 }
3208 3228
3209 3229 for (int c = 0; c < vd->vdev_children; c++) {
3210 3230 if (!vdev_is_bootable(vd->vdev_child[c]))
3211 3231 return (B_FALSE);
3212 3232 }
3213 3233 return (B_TRUE);
3214 3234 }
3215 3235
3216 3236 /*
3217 3237 * Load the state from the original vdev tree (ovd) which
3218 3238 * we've retrieved from the MOS config object. If the original
3219 3239 * vdev was offline or faulted then we transfer that state to the
3220 3240 * device in the current vdev tree (nvd).
3221 3241 */
3222 3242 void
3223 3243 vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
3224 3244 {
3225 3245 spa_t *spa = nvd->vdev_spa;
3226 3246
3227 3247 ASSERT(nvd->vdev_top->vdev_islog);
3228 3248 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
3229 3249 ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
3230 3250
3231 3251 for (int c = 0; c < nvd->vdev_children; c++)
3232 3252 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
3233 3253
3234 3254 if (nvd->vdev_ops->vdev_op_leaf) {
3235 3255 /*
3236 3256 * Restore the persistent vdev state
3237 3257 */
3238 3258 nvd->vdev_offline = ovd->vdev_offline;
3239 3259 nvd->vdev_faulted = ovd->vdev_faulted;
3240 3260 nvd->vdev_degraded = ovd->vdev_degraded;
3241 3261 nvd->vdev_removed = ovd->vdev_removed;
3242 3262 }
3243 3263 }
3244 3264
3245 3265 /*
3246 3266 * Determine if a log device has valid content. If the vdev was
3247 3267 * removed or faulted in the MOS config then we know that
3248 3268 * the content on the log device has already been written to the pool.
3249 3269 */
3250 3270 boolean_t
3251 3271 vdev_log_state_valid(vdev_t *vd)
3252 3272 {
3253 3273 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
3254 3274 !vd->vdev_removed)
3255 3275 return (B_TRUE);
3256 3276
3257 3277 for (int c = 0; c < vd->vdev_children; c++)
3258 3278 if (vdev_log_state_valid(vd->vdev_child[c]))
3259 3279 return (B_TRUE);
3260 3280
3261 3281 return (B_FALSE);
3262 3282 }
3263 3283
3264 3284 /*
3265 3285 * Expand a vdev if possible.
3266 3286 */
3267 3287 void
3268 3288 vdev_expand(vdev_t *vd, uint64_t txg)
3269 3289 {
3270 3290 ASSERT(vd->vdev_top == vd);
3271 3291 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3272 3292
3273 3293 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
3274 3294 VERIFY(vdev_metaslab_init(vd, txg) == 0);
3275 3295 vdev_config_dirty(vd);
3276 3296 }
3277 3297 }
3278 3298
3279 3299 /*
3280 3300 * Split a vdev.
3281 3301 */
3282 3302 void
3283 3303 vdev_split(vdev_t *vd)
3284 3304 {
3285 3305 vdev_t *cvd, *pvd = vd->vdev_parent;
3286 3306
3287 3307 vdev_remove_child(pvd, vd);
3288 3308 vdev_compact_children(pvd);
3289 3309
3290 3310 cvd = pvd->vdev_child[0];
3291 3311 if (pvd->vdev_children == 1) {
3292 3312 vdev_remove_parent(cvd);
3293 3313 cvd->vdev_splitting = B_TRUE;
3294 3314 }
3295 3315 vdev_propagate_state(cvd);
3296 3316 }
3297 3317
3298 3318 void
3299 3319 vdev_deadman(vdev_t *vd)
3300 3320 {
3301 3321 for (int c = 0; c < vd->vdev_children; c++) {
3302 3322 vdev_t *cvd = vd->vdev_child[c];
3303 3323
3304 3324 vdev_deadman(cvd);
3305 3325 }
3306 3326
3307 3327 if (vd->vdev_ops->vdev_op_leaf) {
3308 3328 vdev_queue_t *vq = &vd->vdev_queue;
3309 3329
3310 3330 mutex_enter(&vq->vq_lock);
3311 3331 if (avl_numnodes(&vq->vq_active_tree) > 0) {
3312 3332 spa_t *spa = vd->vdev_spa;
3313 3333 zio_t *fio;
3314 3334 uint64_t delta;
3315 3335
3316 3336 /*
3317 3337 			 * Look at the head of all the pending queues;
3318 3338 			 * if any I/O has been outstanding for longer than
3319 3339 			 * the spa_deadman_synctime, we panic the system.
3320 3340 */
3321 3341 fio = avl_first(&vq->vq_active_tree);
3322 3342 delta = gethrtime() - fio->io_timestamp;
3323 3343 if (delta > spa_deadman_synctime(spa)) {
3324 3344 zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
3325 3345 "delta %lluns, last io %lluns",
3326 3346 fio->io_timestamp, delta,
3327 3347 vq->vq_io_complete_ts);
3328 3348 fm_panic("I/O to pool '%s' appears to be "
3329 3349 "hung.", spa_name(spa));
3330 3350 }
3331 3351 }
3332 3352 mutex_exit(&vq->vq_lock);
3333 3353 }
3334 3354 }
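
The deadman check in vdev_deadman() is, at its core, a comparison of the age of the oldest outstanding I/O against a threshold. The standalone sketch below shows that comparison using a monotonic clock in place of gethrtime(); the threshold value and all names are invented for illustration.

/*
 * Illustrative sketch only.  Reduces the deadman check to its core:
 * how long has the oldest outstanding request been pending, and does
 * that exceed a threshold?  The threshold value is invented.
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define	NANOSEC		1000000000ULL
#define	DEADMAN_NSEC	(1000ULL * NANOSEC)	/* hypothetical: 1000 seconds */

static uint64_t
now_ns(void)
{
	struct timespec ts;

	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * NANOSEC + (uint64_t)ts.tv_nsec);
}

int
main(void)
{
	uint64_t io_timestamp = now_ns();	/* oldest active I/O started */
	uint64_t delta = now_ns() - io_timestamp;

	if (delta > DEADMAN_NSEC)
		printf("SLOW IO: pending for %llu ns\n",
		    (unsigned long long)delta);
	else
		printf("oldest I/O pending for %llu ns, under the %llu ns "
		    "threshold\n", (unsigned long long)delta,
		    (unsigned long long)DEADMAN_NSEC);
	return (0);
}
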
3144 lines elided