4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4104 ::spa_space no longer works
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>


  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright (c) 2013 by Delphix. All rights reserved.
  26  */
  27 
  28 #include <sys/zfs_context.h>
  29 #include <sys/fm/fs/zfs.h>
  30 #include <sys/spa.h>
  31 #include <sys/spa_impl.h>
  32 #include <sys/dmu.h>
  33 #include <sys/dmu_tx.h>
  34 #include <sys/vdev_impl.h>
  35 #include <sys/uberblock_impl.h>
  36 #include <sys/metaslab.h>
  37 #include <sys/metaslab_impl.h>
  38 #include <sys/space_map.h>

  39 #include <sys/zio.h>
  40 #include <sys/zap.h>
  41 #include <sys/fs/zfs.h>
  42 #include <sys/arc.h>
  43 #include <sys/zil.h>
  44 #include <sys/dsl_scan.h>
  45 
  46 /*
  47  * Virtual device management.
  48  */
  49 
  50 static vdev_ops_t *vdev_ops_table[] = {
  51         &vdev_root_ops,
  52         &vdev_raidz_ops,
  53         &vdev_mirror_ops,
  54         &vdev_replacing_ops,
  55         &vdev_spare_ops,
  56         &vdev_disk_ops,
  57         &vdev_file_ops,
  58         &vdev_missing_ops,


 302                         /*
 303                          * Any other vdev's guid must be unique within the pool.
 304                          */
 305                         guid = spa_generate_guid(spa);
 306                 }
 307                 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
 308         }
 309 
 310         vd->vdev_spa = spa;
 311         vd->vdev_id = id;
 312         vd->vdev_guid = guid;
 313         vd->vdev_guid_sum = guid;
 314         vd->vdev_ops = ops;
 315         vd->vdev_state = VDEV_STATE_CLOSED;
 316         vd->vdev_ishole = (ops == &vdev_hole_ops);
 317 
 318         mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
 319         mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 320         mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 321         for (int t = 0; t < DTL_TYPES; t++) {
 322                 space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
 323                     &vd->vdev_dtl_lock);
 324         }
 325         txg_list_create(&vd->vdev_ms_list,
 326             offsetof(struct metaslab, ms_txg_node));
 327         txg_list_create(&vd->vdev_dtl_list,
 328             offsetof(struct vdev, vdev_dtl_node));
 329         vd->vdev_stat.vs_timestamp = gethrtime();
 330         vdev_queue_init(vd);
 331         vdev_cache_init(vd);
 332 
 333         return (vd);
 334 }
 335 
 336 /*
 337  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 338  * creating a new vdev or loading an existing one - the behavior is slightly
 339  * different for each case.
 340  */
 341 int
 342 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,


 488                     &vd->vdev_removing);
 489         }
 490 
 491         if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
 492                 ASSERT(alloctype == VDEV_ALLOC_LOAD ||
 493                     alloctype == VDEV_ALLOC_ADD ||
 494                     alloctype == VDEV_ALLOC_SPLIT ||
 495                     alloctype == VDEV_ALLOC_ROOTPOOL);
 496                 vd->vdev_mg = metaslab_group_create(islog ?
 497                     spa_log_class(spa) : spa_normal_class(spa), vd);
 498         }
 499 
 500         /*
 501          * If we're a leaf vdev, try to load the DTL object and other state.
 502          */
 503         if (vd->vdev_ops->vdev_op_leaf &&
 504             (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
 505             alloctype == VDEV_ALLOC_ROOTPOOL)) {
 506                 if (alloctype == VDEV_ALLOC_LOAD) {
 507                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 508                             &vd->vdev_dtl_smo.smo_object);
 509                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 510                             &vd->vdev_unspare);
 511                 }
 512 
 513                 if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 514                         uint64_t spare = 0;
 515 
 516                         if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
 517                             &spare) == 0 && spare)
 518                                 spa_spare_add(vd);
 519                 }
 520 
 521                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 522                     &vd->vdev_offline);
 523 
 524                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 525                     &vd->vdev_resilver_txg);
 526 
 527                 /*
 528                  * When importing a pool, we want to ignore the persistent fault


 610         vdev_cache_fini(vd);
 611 
 612         if (vd->vdev_path)
 613                 spa_strfree(vd->vdev_path);
 614         if (vd->vdev_devid)
 615                 spa_strfree(vd->vdev_devid);
 616         if (vd->vdev_physpath)
 617                 spa_strfree(vd->vdev_physpath);
 618         if (vd->vdev_fru)
 619                 spa_strfree(vd->vdev_fru);
 620 
 621         if (vd->vdev_isspare)
 622                 spa_spare_remove(vd);
 623         if (vd->vdev_isl2cache)
 624                 spa_l2cache_remove(vd);
 625 
 626         txg_list_destroy(&vd->vdev_ms_list);
 627         txg_list_destroy(&vd->vdev_dtl_list);
 628 
 629         mutex_enter(&vd->vdev_dtl_lock);

 630         for (int t = 0; t < DTL_TYPES; t++) {
 631                 space_map_unload(&vd->vdev_dtl[t]);
 632                 space_map_destroy(&vd->vdev_dtl[t]);
 633         }
 634         mutex_exit(&vd->vdev_dtl_lock);
 635 
 636         mutex_destroy(&vd->vdev_dtl_lock);
 637         mutex_destroy(&vd->vdev_stat_lock);
 638         mutex_destroy(&vd->vdev_probe_lock);
 639 
 640         if (vd == spa->spa_root_vdev)
 641                 spa->spa_root_vdev = NULL;
 642 
 643         kmem_free(vd, sizeof (vdev_t));
 644 }
 645 
 646 /*
 647  * Transfer top-level vdev state from svd to tvd.
 648  */
 649 static void
 650 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 651 {
 652         spa_t *spa = svd->vdev_spa;


 823          * in 128k (1 << 17) because it is the current "typical" blocksize.
 824          * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
 825          * or we will inconsistently account for existing bp's.
 826          */
 827         vd->vdev_deflate_ratio = (1 << 17) /
 828             (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
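        /*
         * Hypothetical worked example (numbers are illustrative, not from
         * this webrev): on a plain disk where asize == psize,
         * vdev_psize_to_asize(vd, 1 << 17) returns 131072, so
         *
         *	vdev_deflate_ratio = 131072 / (131072 >> 9)
         *	                   = 131072 / 256 = 512.
         *
         * On a raidz vdev that inflates a 128k psize to, say, 144k of
         * asize, the ratio becomes 131072 / (147456 >> 9) = 131072 / 288
         * = 455 (integer division), so accounted sizes shrink to reflect
         * the parity overhead.
         */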
 829 
 830         ASSERT(oldc <= newc);
 831 
 832         mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
 833 
 834         if (oldc != 0) {
 835                 bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
 836                 kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
 837         }
 838 
 839         vd->vdev_ms = mspp;
 840         vd->vdev_ms_count = newc;
 841 
 842         for (m = oldc; m < newc; m++) {
 843                 space_map_obj_t smo = { 0, 0, 0 };
 844                 if (txg == 0) {
 845                         uint64_t object = 0;


 846                         error = dmu_read(mos, vd->vdev_ms_array,
 847                             m * sizeof (uint64_t), sizeof (uint64_t), &object,
 848                             DMU_READ_PREFETCH);
 849                         if (error)
 850                                 return (error);
 851                         if (object != 0) {
 852                                 dmu_buf_t *db;
 853                                 error = dmu_bonus_hold(mos, object, FTAG, &db);
 854                                 if (error)
 855                                         return (error);
 856                                 ASSERT3U(db->db_size, >=, sizeof (smo));
 857                                 bcopy(db->db_data, &smo, sizeof (smo));
 858                                 ASSERT3U(smo.smo_object, ==, object);
 859                                 dmu_buf_rele(db, FTAG);
 860                         }

 861                 }
 862                 vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
 863                     m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
 864         }
 865 
 866         if (txg == 0)
 867                 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 868 
 869         /*
 870          * If the vdev is being removed, we don't activate
 871          * the metaslabs since we want to ensure that no new
 872          * allocations are performed on this device.
 873          */
 874         if (oldc == 0 && !vd->vdev_removing)
 875                 metaslab_group_activate(vd->vdev_mg);
 876 
 877         if (txg == 0)
 878                 spa_config_exit(spa, SCL_ALLOC, FTAG);
 879 
 880         return (0);
 881 }
 882 
 883 void
 884 vdev_metaslab_fini(vdev_t *vd)
 885 {
 886         uint64_t m;
 887         uint64_t count = vd->vdev_ms_count;
 888 
 889         if (vd->vdev_ms != NULL) {
 890                 metaslab_group_passivate(vd->vdev_mg);
 891                 for (m = 0; m < count; m++)
 892                         if (vd->vdev_ms[m] != NULL)
 893                                 metaslab_fini(vd->vdev_ms[m]);



 894                 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
 895                 vd->vdev_ms = NULL;
 896         }
 897 }
 898 
 899 typedef struct vdev_probe_stats {
 900         boolean_t       vps_readable;
 901         boolean_t       vps_writeable;
 902         int             vps_flags;
 903 } vdev_probe_stats_t;
 904 
 905 static void
 906 vdev_probe_done(zio_t *zio)
 907 {
 908         spa_t *spa = zio->io_spa;
 909         vdev_t *vd = zio->io_vd;
 910         vdev_probe_stats_t *vps = zio->io_private;
 911 
 912         ASSERT(vd->vdev_probe_zio != NULL);
 913 


1524 }
1525 
1526 int
1527 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
1528 {
1529         int error;
1530 
1531         /*
1532          * Normally, partial opens (e.g. of a mirror) are allowed.
1533          * For a create, however, we want to fail the request if
1534          * there are any components we can't open.
1535          */
1536         error = vdev_open(vd);
1537 
1538         if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
1539                 vdev_close(vd);
1540                 return (error ? error : ENXIO);
1541         }
1542 
1543         /*
1544          * Recursively initialize all labels.
1545          */
1546         if ((error = vdev_label_init(vd, txg, isreplacing ?

1547             VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
1548                 vdev_close(vd);
1549                 return (error);
1550         }
1551 
1552         return (0);
1553 }
1554 
1555 void
1556 vdev_metaslab_set_size(vdev_t *vd)
1557 {
1558         /*
1559          * Aim for roughly 200 metaslabs per vdev.
1560          */
1561         vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
1562         vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
1563 }
1564 
1565 void
1566 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
1567 {
1568         ASSERT(vd == vd->vdev_top);
1569         ASSERT(!vd->vdev_ishole);
1570         ASSERT(ISP2(flags));
1571         ASSERT(spa_writeable(vd->vdev_spa));
1572 
1573         if (flags & VDD_METASLAB)
1574                 (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
1575 
1576         if (flags & VDD_DTL)
1577                 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
1578 
1579         (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
1580 }
1581 










1582 /*
1583  * DTLs.
1584  *
1585  * A vdev's DTL (dirty time log) is the set of transaction groups for which
1586  * the vdev has less than perfect replication.  There are four kinds of DTL:
1587  *
1588  * DTL_MISSING: txgs for which the vdev has no valid copies of the data
1589  *
1590  * DTL_PARTIAL: txgs for which data is available, but not fully replicated
1591  *
1592  * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
1593  *      scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
1594  *      txgs that was scrubbed.
1595  *
1596  * DTL_OUTAGE: txgs which cannot currently be read, whether due to
1597  *      persistent errors or just some device being offline.
1598  *      Unlike the other three, the DTL_OUTAGE map is not generally
1599  *      maintained; it's only computed when needed, typically to
1600  *      determine whether a device can be detached.
1601  *


1603  * either has the data or it doesn't.
1604  *
1605  * For interior vdevs such as mirror and RAID-Z the picture is more complex.
1606  * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
1607  * if any child is less than fully replicated, then so is its parent.
1608  * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 1609  * comprising only those txgs which appear in more than 'maxfaults' children;
1610  * those are the txgs we don't have enough replication to read.  For example,
1611  * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
1612  * thus, its DTL_MISSING consists of the set of txgs that appear in more than
1613  * two child DTL_MISSING maps.
1614  *
1615  * It should be clear from the above that to compute the DTLs and outage maps
1616  * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
1617  * Therefore, that is all we keep on disk.  When loading the pool, or after
1618  * a configuration change, we generate all other DTLs from first principles.
1619  */
1620 void
1621 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1622 {
1623         space_map_t *sm = &vd->vdev_dtl[t];
1624 
1625         ASSERT(t < DTL_TYPES);
1626         ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1627         ASSERT(spa_writeable(vd->vdev_spa));
1628 
1629         mutex_enter(sm->sm_lock);
1630         if (!space_map_contains(sm, txg, size))
1631                 space_map_add(sm, txg, size);
1632         mutex_exit(sm->sm_lock);
1633 }
1634 
1635 boolean_t
1636 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1637 {
1638         space_map_t *sm = &vd->vdev_dtl[t];
1639         boolean_t dirty = B_FALSE;
1640 
1641         ASSERT(t < DTL_TYPES);
1642         ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1643 
1644         mutex_enter(sm->sm_lock);
1645         if (sm->sm_space != 0)
1646                 dirty = space_map_contains(sm, txg, size);
1647         mutex_exit(sm->sm_lock);
1648 
1649         return (dirty);
1650 }
1651 
1652 boolean_t
1653 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
1654 {
1655         space_map_t *sm = &vd->vdev_dtl[t];
1656         boolean_t empty;
1657 
1658         mutex_enter(sm->sm_lock);
1659         empty = (sm->sm_space == 0);
1660         mutex_exit(sm->sm_lock);
1661 
1662         return (empty);
1663 }
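/*
 * A hypothetical caller of the queries above (the helper name is
 * illustrative): because DTLs are keyed by txg, a single missing txg
 * is tested as a segment of length 1.
 */
static boolean_t
example_txg_needs_repair(vdev_t *vd, uint64_t txg)
{
	return (vdev_dtl_contains(vd, DTL_MISSING, txg, 1));
}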
1664 
1665 /*
1666  * Returns the lowest txg in the DTL range.
1667  */
1668 static uint64_t
1669 vdev_dtl_min(vdev_t *vd)
1670 {
1671         space_seg_t *ss;
1672 
1673         ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1674         ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0);
1675         ASSERT0(vd->vdev_children);
1676 
1677         ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
1678         return (ss->ss_start - 1);
1679 }
1680 
1681 /*
1682  * Returns the highest txg in the DTL.
1683  */
1684 static uint64_t
1685 vdev_dtl_max(vdev_t *vd)
1686 {
1687         space_seg_t *ss;
1688 
1689         ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1690         ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0);
1691         ASSERT0(vd->vdev_children);
1692 
1693         ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
1694         return (ss->ss_end);
1695 }
1696 
1697 /*
1698  * Determine if a resilvering vdev should remove any DTL entries from
1699  * its range. If the vdev was resilvering for the entire duration of the
1700  * scan then it should excise that range from its DTLs. Otherwise, this
1701  * vdev is considered partially resilvered and should leave its DTL
1702  * entries intact. The comment in vdev_dtl_reassess() describes how we
1703  * excise the DTLs.
1704  */
1705 static boolean_t
1706 vdev_dtl_should_excise(vdev_t *vd)
1707 {
1708         spa_t *spa = vd->vdev_spa;
1709         dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1710 
1711         ASSERT0(scn->scn_phys.scn_errors);
1712         ASSERT0(vd->vdev_children);
1713 
1714         if (vd->vdev_resilver_txg == 0 ||
1715             vd->vdev_dtl[DTL_MISSING].sm_space == 0)
1716                 return (B_TRUE);
1717 
1718         /*
1719          * When a resilver is initiated the scan will assign the scn_max_txg
1720          * value to the highest txg value that exists in all DTLs. If this
1721          * device's max DTL is not part of this scan (i.e. it is not in
 1722  *      the range (scn_min_txg, scn_max_txg]) then it is not eligible
1723          * for excision.
1724          */
1725         if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
1726                 ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
1727                 ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
1728                 ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
1729                 return (B_TRUE);
1730         }
1731         return (B_FALSE);
1732 }
1733 
1734 /*
1735  * Reassess DTLs after a config change or scrub completion.


1765                     (spa->spa_scrub_started ||
1766                     (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
1767                     vdev_dtl_should_excise(vd)) {
1768                         /*
1769                          * We completed a scrub up to scrub_txg.  If we
1770                          * did it without rebooting, then the scrub dtl
1771                          * will be valid, so excise the old region and
1772                          * fold in the scrub dtl.  Otherwise, leave the
1773                          * dtl as-is if there was an error.
1774                          *
 1775                          * There's a little trick here: to excise the beginning
1776                          * of the DTL_MISSING map, we put it into a reference
1777                          * tree and then add a segment with refcnt -1 that
1778                          * covers the range [0, scrub_txg).  This means
1779                          * that each txg in that range has refcnt -1 or 0.
1780                          * We then add DTL_SCRUB with a refcnt of 2, so that
1781                          * entries in the range [0, scrub_txg) will have a
1782                          * positive refcnt -- either 1 or 2.  We then convert
1783                          * the reference tree into the new DTL_MISSING map.
1784                          */
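                        /*
                         * Worked example with hypothetical txgs: suppose
                         * scrub_txg == 100, DTL_MISSING covers [40, 60),
                         * and the scrub failed to repair [50, 52), so
                         * DTL_SCRUB covers [50, 52).  The refcnts become:
                         *
                         *	[0, 40)    -1	(the -1 segment alone)
                         *	[40, 50)    0	(+1 missing, -1 segment)
                         *	[50, 52)    2	(+1, -1, +2 scrub)
                         *	[52, 60)    0
                         *	[60, 100)  -1
                         *
                         * Generating the map with minref 1 keeps only
                         * [50, 52): the txgs the scrub could not fix.
                         */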
1785                         space_map_ref_create(&reftree);
1786                         space_map_ref_add_map(&reftree,
1787                             &vd->vdev_dtl[DTL_MISSING], 1);
1788                         space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
1789                         space_map_ref_add_map(&reftree,
1790                             &vd->vdev_dtl[DTL_SCRUB], 2);
1791                         space_map_ref_generate_map(&reftree,
1792                             &vd->vdev_dtl[DTL_MISSING], 1);
1793                         space_map_ref_destroy(&reftree);
1794                 }
1795                 space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
1796                 space_map_walk(&vd->vdev_dtl[DTL_MISSING],
1797                     space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
1798                 if (scrub_done)
1799                         space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
1800                 space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
1801                 if (!vdev_readable(vd))
1802                         space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
1803                 else
1804                         space_map_walk(&vd->vdev_dtl[DTL_MISSING],
1805                             space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
1806 
1807                 /*
1808                  * If the vdev was resilvering and no longer has any
1809                  * DTLs then reset its resilvering flag.
1810                  */
1811                 if (vd->vdev_resilver_txg != 0 &&
1812                     vd->vdev_dtl[DTL_MISSING].sm_space == 0 &&
1813                     vd->vdev_dtl[DTL_OUTAGE].sm_space == 0)
1814                         vd->vdev_resilver_txg = 0;
1815 
1816                 mutex_exit(&vd->vdev_dtl_lock);
1817 
1818                 if (txg != 0)
1819                         vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
1820                 return;
1821         }
1822 
1823         mutex_enter(&vd->vdev_dtl_lock);
1824         for (int t = 0; t < DTL_TYPES; t++) {
1825                 /* account for child's outage in parent's missing map */
1826                 int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
1827                 if (t == DTL_SCRUB)
1828                         continue;                       /* leaf vdevs only */
1829                 if (t == DTL_PARTIAL)
1830                         minref = 1;                     /* i.e. non-zero */
1831                 else if (vd->vdev_nparity != 0)
1832                         minref = vd->vdev_nparity + 1;       /* RAID-Z */
1833                 else
1834                         minref = vd->vdev_children;  /* any kind of mirror */
1835                 space_map_ref_create(&reftree);
1836                 for (int c = 0; c < vd->vdev_children; c++) {
1837                         vdev_t *cvd = vd->vdev_child[c];
1838                         mutex_enter(&cvd->vdev_dtl_lock);
1839                         space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
1840                         mutex_exit(&cvd->vdev_dtl_lock);
1841                 }
1842                 space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
1843                 space_map_ref_destroy(&reftree);
1844         }
1845         mutex_exit(&vd->vdev_dtl_lock);
1846 }
1847 
1848 static int
1849 vdev_dtl_load(vdev_t *vd)
1850 {
1851         spa_t *spa = vd->vdev_spa;
1852         space_map_obj_t *smo = &vd->vdev_dtl_smo;
1853         objset_t *mos = spa->spa_meta_objset;
1854         dmu_buf_t *db;
1855         int error;
1856 
1857         ASSERT(vd->vdev_children == 0);
1858 
1859         if (smo->smo_object == 0)
1860                 return (0);
1861 
1862         ASSERT(!vd->vdev_ishole);
1863 
1864         if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)


1865                 return (error);

1866 
1867         ASSERT3U(db->db_size, >=, sizeof (*smo));
1868         bcopy(db->db_data, smo, sizeof (*smo));
1869         dmu_buf_rele(db, FTAG);
1870 
1871         mutex_enter(&vd->vdev_dtl_lock);
1872         error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
1873             NULL, SM_ALLOC, smo, mos);







1874         mutex_exit(&vd->vdev_dtl_lock);
1875 
1876         return (error);









1877 }
1878 
1879 void
1880 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
1881 {
1882         spa_t *spa = vd->vdev_spa;
1883         space_map_obj_t *smo = &vd->vdev_dtl_smo;
1884         space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
1885         objset_t *mos = spa->spa_meta_objset;
1886         space_map_t smsync;
1887         kmutex_t smlock;
1888         dmu_buf_t *db;
1889         dmu_tx_t *tx;

1890 
1891         ASSERT(!vd->vdev_ishole);

1892 
1893         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1894 
1895         if (vd->vdev_detached) {
1896                 if (smo->smo_object != 0) {
1897                         int err = dmu_object_free(mos, smo->smo_object, tx);
1898                         ASSERT0(err);
1899                         smo->smo_object = 0;
1900                 }
1901                 dmu_tx_commit(tx);
1902                 return;
1903         }
1904 
1905         if (smo->smo_object == 0) {
1906                 ASSERT(smo->smo_objsize == 0);
1907                 ASSERT(smo->smo_alloc == 0);
1908                 smo->smo_object = dmu_object_alloc(mos,
1909                     DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
1910                     DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
1911                 ASSERT(smo->smo_object != 0);
1912                 vdev_config_dirty(vd->vdev_top);

1913         }
1914 
1915         mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
1916 
1917         space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
1918             &smlock);
1919 
1920         mutex_enter(&smlock);
1921 
1922         mutex_enter(&vd->vdev_dtl_lock);
1923         space_map_walk(sm, space_map_add, &smsync);
1924         mutex_exit(&vd->vdev_dtl_lock);
1925 
1926         space_map_truncate(smo, mos, tx);
1927         space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
1928         space_map_vacate(&smsync, NULL, NULL);
1929 
1930         space_map_destroy(&smsync);
1931 
1932         mutex_exit(&smlock);
1933         mutex_destroy(&smlock);
1934 
1935         VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
1936         dmu_buf_will_dirty(db, tx);
1937         ASSERT3U(db->db_size, >=, sizeof (*smo));
1938         bcopy(smo, db->db_data, sizeof (*smo));
1939         dmu_buf_rele(db, FTAG);





1940 
1941         dmu_tx_commit(tx);




1942 }
1943 
1944 /*
1945  * Determine whether the specified vdev can be offlined/detached/removed
1946  * without losing data.
1947  */
1948 boolean_t
1949 vdev_dtl_required(vdev_t *vd)
1950 {
1951         spa_t *spa = vd->vdev_spa;
1952         vdev_t *tvd = vd->vdev_top;
1953         uint8_t cant_read = vd->vdev_cant_read;
1954         boolean_t required;
1955 
1956         ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1957 
1958         if (vd == spa->spa_root_vdev || vd == tvd)
1959                 return (B_TRUE);
1960 
1961         /*


1970         vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
1971 
1972         if (!required && zio_injection_enabled)
1973                 required = !!zio_handle_device_injection(vd, NULL, ECHILD);
1974 
1975         return (required);
1976 }
1977 
1978 /*
1979  * Determine if resilver is needed, and if so the txg range.
1980  */
1981 boolean_t
1982 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
1983 {
1984         boolean_t needed = B_FALSE;
1985         uint64_t thismin = UINT64_MAX;
1986         uint64_t thismax = 0;
1987 
1988         if (vd->vdev_children == 0) {
1989                 mutex_enter(&vd->vdev_dtl_lock);
1990                 if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
1991                     vdev_writeable(vd)) {
1992 
1993                         thismin = vdev_dtl_min(vd);
1994                         thismax = vdev_dtl_max(vd);
1995                         needed = B_TRUE;
1996                 }
1997                 mutex_exit(&vd->vdev_dtl_lock);
1998         } else {
1999                 for (int c = 0; c < vd->vdev_children; c++) {
2000                         vdev_t *cvd = vd->vdev_child[c];
2001                         uint64_t cmin, cmax;
2002 
2003                         if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
2004                                 thismin = MIN(thismin, cmin);
2005                                 thismax = MAX(thismax, cmax);
2006                                 needed = B_TRUE;
2007                         }
2008                 }
2009         }
2010 


2075                 return (-1);
2076         }
2077 
2078         /*
2079          * We don't actually check the pool state here.  If it's in fact in
2080          * use by another pool, we update this fact on the fly when requested.
2081          */
2082         nvlist_free(label);
2083         return (0);
2084 }
2085 
2086 void
2087 vdev_remove(vdev_t *vd, uint64_t txg)
2088 {
2089         spa_t *spa = vd->vdev_spa;
2090         objset_t *mos = spa->spa_meta_objset;
2091         dmu_tx_t *tx;
2092 
2093         tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2094 
2095         if (vd->vdev_dtl_smo.smo_object) {
2096                 ASSERT0(vd->vdev_dtl_smo.smo_alloc);
2097                 (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
2098                 vd->vdev_dtl_smo.smo_object = 0;
2099         }
2100 
2101         if (vd->vdev_ms != NULL) {
2102                 for (int m = 0; m < vd->vdev_ms_count; m++) {
2103                         metaslab_t *msp = vd->vdev_ms[m];
2104 
2105                         if (msp == NULL || msp->ms_smo.smo_object == 0)
2106                                 continue;
2107 
2108                         ASSERT0(msp->ms_smo.smo_alloc);
2109                         (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
2110                         msp->ms_smo.smo_object = 0;



2111                 }
2112         }
2113 
2114         if (vd->vdev_ms_array) {
2115                 (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
2116                 vd->vdev_ms_array = 0;
2117                 vd->vdev_ms_shift = 0;
2118         }
2119         dmu_tx_commit(tx);
2120 }
2121 
2122 void
2123 vdev_sync_done(vdev_t *vd, uint64_t txg)
2124 {
2125         metaslab_t *msp;
2126         boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
2127 
2128         ASSERT(!vd->vdev_ishole);
2129 
2130         while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
2131                 metaslab_sync_done(msp, txg);
2132 
2133         if (reassess)
2134                 metaslab_sync_reassess(vd->vdev_mg);
2135 }
2136 
2137 void




  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright (c) 2013 by Delphix. All rights reserved.
  26  */
  27 
  28 #include <sys/zfs_context.h>
  29 #include <sys/fm/fs/zfs.h>
  30 #include <sys/spa.h>
  31 #include <sys/spa_impl.h>
  32 #include <sys/dmu.h>
  33 #include <sys/dmu_tx.h>
  34 #include <sys/vdev_impl.h>
  35 #include <sys/uberblock_impl.h>
  36 #include <sys/metaslab.h>
  37 #include <sys/metaslab_impl.h>
  38 #include <sys/space_map.h>
  39 #include <sys/space_reftree.h>
  40 #include <sys/zio.h>
  41 #include <sys/zap.h>
  42 #include <sys/fs/zfs.h>
  43 #include <sys/arc.h>
  44 #include <sys/zil.h>
  45 #include <sys/dsl_scan.h>
  46 
  47 /*
  48  * Virtual device management.
  49  */
  50 
  51 static vdev_ops_t *vdev_ops_table[] = {
  52         &vdev_root_ops,
  53         &vdev_raidz_ops,
  54         &vdev_mirror_ops,
  55         &vdev_replacing_ops,
  56         &vdev_spare_ops,
  57         &vdev_disk_ops,
  58         &vdev_file_ops,
  59         &vdev_missing_ops,


 303                         /*
 304                          * Any other vdev's guid must be unique within the pool.
 305                          */
 306                         guid = spa_generate_guid(spa);
 307                 }
 308                 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
 309         }
 310 
 311         vd->vdev_spa = spa;
 312         vd->vdev_id = id;
 313         vd->vdev_guid = guid;
 314         vd->vdev_guid_sum = guid;
 315         vd->vdev_ops = ops;
 316         vd->vdev_state = VDEV_STATE_CLOSED;
 317         vd->vdev_ishole = (ops == &vdev_hole_ops);
 318 
 319         mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
 320         mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 321         mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 322         for (int t = 0; t < DTL_TYPES; t++) {
 323                 vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
 324                     &vd->vdev_dtl_lock);
 325         }
 326         txg_list_create(&vd->vdev_ms_list,
 327             offsetof(struct metaslab, ms_txg_node));
 328         txg_list_create(&vd->vdev_dtl_list,
 329             offsetof(struct vdev, vdev_dtl_node));
 330         vd->vdev_stat.vs_timestamp = gethrtime();
 331         vdev_queue_init(vd);
 332         vdev_cache_init(vd);
 333 
 334         return (vd);
 335 }
 336 
 337 /*
 338  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 339  * creating a new vdev or loading an existing one - the behavior is slightly
 340  * different for each case.
 341  */
 342 int
 343 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,


 489                     &vd->vdev_removing);
 490         }
 491 
 492         if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
 493                 ASSERT(alloctype == VDEV_ALLOC_LOAD ||
 494                     alloctype == VDEV_ALLOC_ADD ||
 495                     alloctype == VDEV_ALLOC_SPLIT ||
 496                     alloctype == VDEV_ALLOC_ROOTPOOL);
 497                 vd->vdev_mg = metaslab_group_create(islog ?
 498                     spa_log_class(spa) : spa_normal_class(spa), vd);
 499         }
 500 
 501         /*
 502          * If we're a leaf vdev, try to load the DTL object and other state.
 503          */
 504         if (vd->vdev_ops->vdev_op_leaf &&
 505             (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
 506             alloctype == VDEV_ALLOC_ROOTPOOL)) {
 507                 if (alloctype == VDEV_ALLOC_LOAD) {
 508                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 509                             &vd->vdev_dtl_object);
 510                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 511                             &vd->vdev_unspare);
 512                 }
 513 
 514                 if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 515                         uint64_t spare = 0;
 516 
 517                         if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
 518                             &spare) == 0 && spare)
 519                                 spa_spare_add(vd);
 520                 }
 521 
 522                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 523                     &vd->vdev_offline);
 524 
 525                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 526                     &vd->vdev_resilver_txg);
 527 
 528                 /*
 529                  * When importing a pool, we want to ignore the persistent fault


 611         vdev_cache_fini(vd);
 612 
 613         if (vd->vdev_path)
 614                 spa_strfree(vd->vdev_path);
 615         if (vd->vdev_devid)
 616                 spa_strfree(vd->vdev_devid);
 617         if (vd->vdev_physpath)
 618                 spa_strfree(vd->vdev_physpath);
 619         if (vd->vdev_fru)
 620                 spa_strfree(vd->vdev_fru);
 621 
 622         if (vd->vdev_isspare)
 623                 spa_spare_remove(vd);
 624         if (vd->vdev_isl2cache)
 625                 spa_l2cache_remove(vd);
 626 
 627         txg_list_destroy(&vd->vdev_ms_list);
 628         txg_list_destroy(&vd->vdev_dtl_list);
 629 
 630         mutex_enter(&vd->vdev_dtl_lock);
 631         space_map_close(vd->vdev_dtl_sm);
 632         for (int t = 0; t < DTL_TYPES; t++) {
 633                 range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
 634                 range_tree_destroy(vd->vdev_dtl[t]);
 635         }
 636         mutex_exit(&vd->vdev_dtl_lock);
 637 
 638         mutex_destroy(&vd->vdev_dtl_lock);
 639         mutex_destroy(&vd->vdev_stat_lock);
 640         mutex_destroy(&vd->vdev_probe_lock);
 641 
 642         if (vd == spa->spa_root_vdev)
 643                 spa->spa_root_vdev = NULL;
 644 
 645         kmem_free(vd, sizeof (vdev_t));
 646 }
 647 
 648 /*
 649  * Transfer top-level vdev state from svd to tvd.
 650  */
 651 static void
 652 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 653 {
 654         spa_t *spa = svd->vdev_spa;


 825          * in 128k (1 << 17) because it is the current "typical" blocksize.
 826          * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
 827          * or we will inconsistently account for existing bp's.
 828          */
 829         vd->vdev_deflate_ratio = (1 << 17) /
 830             (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
 831 
 832         ASSERT(oldc <= newc);
 833 
 834         mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
 835 
 836         if (oldc != 0) {
 837                 bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
 838                 kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
 839         }
 840 
 841         vd->vdev_ms = mspp;
 842         vd->vdev_ms_count = newc;
 843 
 844         for (m = oldc; m < newc; m++) {


 845                 uint64_t object = 0;
 846 
 847                 if (txg == 0) {
 848                         error = dmu_read(mos, vd->vdev_ms_array,
 849                             m * sizeof (uint64_t), sizeof (uint64_t), &object,
 850                             DMU_READ_PREFETCH);
 851                         if (error)
 852                                 return (error);









 853                 }
 854                 vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, m, object, txg);
 855         }
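        /*
         * Note on the loop above, per bugs 4102/4106 in the header: the
         * old code read a space_map_obj_t out of the object's bonus
         * buffer here and passed it to metaslab_init(); now only the
         * object number is passed, and the metaslab opens and loads its
         * own space map (asynchronously, per 4106).
         */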



 856 
 857         if (txg == 0)
 858                 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 859 
 860         /*
 861          * If the vdev is being removed, we don't activate
 862          * the metaslabs since we want to ensure that no new
 863          * allocations are performed on this device.
 864          */
 865         if (oldc == 0 && !vd->vdev_removing)
 866                 metaslab_group_activate(vd->vdev_mg);
 867 
 868         if (txg == 0)
 869                 spa_config_exit(spa, SCL_ALLOC, FTAG);
 870 
 871         return (0);
 872 }
 873 
 874 void
 875 vdev_metaslab_fini(vdev_t *vd)
 876 {
 877         uint64_t m;
 878         uint64_t count = vd->vdev_ms_count;
 879 
 880         if (vd->vdev_ms != NULL) {
 881                 metaslab_group_passivate(vd->vdev_mg);
 882                 for (m = 0; m < count; m++) {
 883                         metaslab_t *msp = vd->vdev_ms[m];
 884 
 885                         if (msp != NULL)
 886                                 metaslab_fini(msp);
 887                 }
 888                 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
 889                 vd->vdev_ms = NULL;
 890         }
 891 }
 892 
 893 typedef struct vdev_probe_stats {
 894         boolean_t       vps_readable;
 895         boolean_t       vps_writeable;
 896         int             vps_flags;
 897 } vdev_probe_stats_t;
 898 
 899 static void
 900 vdev_probe_done(zio_t *zio)
 901 {
 902         spa_t *spa = zio->io_spa;
 903         vdev_t *vd = zio->io_vd;
 904         vdev_probe_stats_t *vps = zio->io_private;
 905 
 906         ASSERT(vd->vdev_probe_zio != NULL);
 907 


1518 }
1519 
1520 int
1521 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
1522 {
1523         int error;
1524 
1525         /*
1526          * Normally, partial opens (e.g. of a mirror) are allowed.
1527          * For a create, however, we want to fail the request if
1528          * there are any components we can't open.
1529          */
1530         error = vdev_open(vd);
1531 
1532         if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
1533                 vdev_close(vd);
1534                 return (error ? error : ENXIO);
1535         }
1536 
1537         /*
1538          * Recursively load DTLs and initialize all labels.
1539          */
1540         if ((error = vdev_dtl_load(vd)) != 0 ||
1541             (error = vdev_label_init(vd, txg, isreplacing ?
1542             VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
1543                 vdev_close(vd);
1544                 return (error);
1545         }
1546 
1547         return (0);
1548 }
1549 
1550 void
1551 vdev_metaslab_set_size(vdev_t *vd)
1552 {
1553         /*
1554          * Aim for roughly 200 metaslabs per vdev.
1555          */
1556         vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
1557         vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
1558 }
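/*
 * Hypothetical sizing example, assuming highbit() returns the 1-based
 * index of the highest set bit: for a 1 TiB vdev, asize / 200 is about
 * 5.1 GiB, whose highest set bit is 2^32, so highbit() returns 33 and
 * each metaslab spans 2^33 bytes (8 GiB) -- 128 metaslabs in all, at
 * or below the ~200 target since the metaslab size rounds up to a
 * power of two.  The MAX() with SPA_MAXBLOCKSHIFT (17) keeps small
 * vdevs from creating metaslabs smaller than a 128k block.
 */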
1559 
1560 void
1561 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
1562 {
1563         ASSERT(vd == vd->vdev_top);
1564         ASSERT(!vd->vdev_ishole);
1565         ASSERT(ISP2(flags));
1566         ASSERT(spa_writeable(vd->vdev_spa));
1567 
1568         if (flags & VDD_METASLAB)
1569                 (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
1570 
1571         if (flags & VDD_DTL)
1572                 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
1573 
1574         (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
1575 }
1576 
1577 void
1578 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
1579 {
1580         for (int c = 0; c < vd->vdev_children; c++)
1581                 vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
1582 
1583         if (vd->vdev_ops->vdev_op_leaf)
1584                 vdev_dirty(vd->vdev_top, flags, vd, txg);
1585 }
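/*
 * Hypothetical use of the new helper above: after an event that
 * affects every leaf under a top-level vdev 'tvd', dirty each leaf's
 * DTL for the current txg in one call:
 *
 *	vdev_dirty_leaves(tvd, VDD_DTL, txg);
 */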
1586 
1587 /*
1588  * DTLs.
1589  *
1590  * A vdev's DTL (dirty time log) is the set of transaction groups for which
1591  * the vdev has less than perfect replication.  There are four kinds of DTL:
1592  *
1593  * DTL_MISSING: txgs for which the vdev has no valid copies of the data
1594  *
1595  * DTL_PARTIAL: txgs for which data is available, but not fully replicated
1596  *
1597  * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
1598  *      scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
1599  *      txgs that was scrubbed.
1600  *
1601  * DTL_OUTAGE: txgs which cannot currently be read, whether due to
1602  *      persistent errors or just some device being offline.
1603  *      Unlike the other three, the DTL_OUTAGE map is not generally
1604  *      maintained; it's only computed when needed, typically to
1605  *      determine whether a device can be detached.
1606  *


1608  * either has the data or it doesn't.
1609  *
1610  * For interior vdevs such as mirror and RAID-Z the picture is more complex.
1611  * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
1612  * if any child is less than fully replicated, then so is its parent.
1613  * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 1614  * comprising only those txgs which appear in more than 'maxfaults' children;
1615  * those are the txgs we don't have enough replication to read.  For example,
1616  * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
1617  * thus, its DTL_MISSING consists of the set of txgs that appear in more than
1618  * two child DTL_MISSING maps.
1619  *
1620  * It should be clear from the above that to compute the DTLs and outage maps
1621  * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
1622  * Therefore, that is all we keep on disk.  When loading the pool, or after
1623  * a configuration change, we generate all other DTLs from first principles.
1624  */
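/*
 * Hypothetical worked example of the rules above: a two-way mirror
 * (maxfaults == 1) whose children have DTL_MISSING maps {10-20} and
 * {15-30} has DTL_MISSING {15-20} -- the txgs missing from more than
 * one child -- while its DTL_PARTIAL is the full union, {10-30}.
 */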
1625 void
1626 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1627 {
1628         range_tree_t *rt = vd->vdev_dtl[t];
1629 
1630         ASSERT(t < DTL_TYPES);
1631         ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1632         ASSERT(spa_writeable(vd->vdev_spa));
1633 
1634         mutex_enter(rt->rt_lock);
1635         if (!range_tree_contains(rt, txg, size))
1636                 range_tree_add(rt, txg, size);
1637         mutex_exit(rt->rt_lock);
1638 }
1639 
1640 boolean_t
1641 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1642 {
1643         range_tree_t *rt = vd->vdev_dtl[t];
1644         boolean_t dirty = B_FALSE;
1645 
1646         ASSERT(t < DTL_TYPES);
1647         ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1648 
1649         mutex_enter(rt->rt_lock);
1650         if (range_tree_space(rt) != 0)
1651                 dirty = range_tree_contains(rt, txg, size);
1652         mutex_exit(rt->rt_lock);
1653 
1654         return (dirty);
1655 }
1656 
1657 boolean_t
1658 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
1659 {
1660         range_tree_t *rt = vd->vdev_dtl[t];
1661         boolean_t empty;
1662 
1663         mutex_enter(rt->rt_lock);
1664         empty = (range_tree_space(rt) == 0);
1665         mutex_exit(rt->rt_lock);
1666 
1667         return (empty);
1668 }
1669 
1670 /*
1671  * Returns the lowest txg in the DTL range.
1672  */
1673 static uint64_t
1674 vdev_dtl_min(vdev_t *vd)
1675 {
1676         range_seg_t *rs;
1677 
1678         ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1679         ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
1680         ASSERT0(vd->vdev_children);
1681 
1682         rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
1683         return (rs->rs_start - 1);
1684 }
1685 
1686 /*
1687  * Returns the highest txg in the DTL.
1688  */
1689 static uint64_t
1690 vdev_dtl_max(vdev_t *vd)
1691 {
1692         range_seg_t *rs;
1693 
1694         ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1695         ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
1696         ASSERT0(vd->vdev_children);
1697 
1698         rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
1699         return (rs->rs_end);
1700 }
1701 
1702 /*
1703  * Determine if a resilvering vdev should remove any DTL entries from
1704  * its range. If the vdev was resilvering for the entire duration of the
1705  * scan then it should excise that range from its DTLs. Otherwise, this
1706  * vdev is considered partially resilvered and should leave its DTL
1707  * entries intact. The comment in vdev_dtl_reassess() describes how we
1708  * excise the DTLs.
1709  */
1710 static boolean_t
1711 vdev_dtl_should_excise(vdev_t *vd)
1712 {
1713         spa_t *spa = vd->vdev_spa;
1714         dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1715 
1716         ASSERT0(scn->scn_phys.scn_errors);
1717         ASSERT0(vd->vdev_children);
1718 
1719         if (vd->vdev_resilver_txg == 0 ||
1720             range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
1721                 return (B_TRUE);
1722 
1723         /*
1724          * When a resilver is initiated the scan will assign the scn_max_txg
1725          * value to the highest txg value that exists in all DTLs. If this
1726          * device's max DTL is not part of this scan (i.e. it is not in
 1727  *      the range (scn_min_txg, scn_max_txg]) then it is not eligible
1728          * for excision.
1729          */
1730         if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
1731                 ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
1732                 ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
1733                 ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
1734                 return (B_TRUE);
1735         }
1736         return (B_FALSE);
1737 }
1738 
1739 /*
1740  * Reassess DTLs after a config change or scrub completion.


1770                     (spa->spa_scrub_started ||
1771                     (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
1772                     vdev_dtl_should_excise(vd)) {
1773                         /*
1774                          * We completed a scrub up to scrub_txg.  If we
1775                          * did it without rebooting, then the scrub dtl
1776                          * will be valid, so excise the old region and
1777                          * fold in the scrub dtl.  Otherwise, leave the
1778                          * dtl as-is if there was an error.
1779                          *
 1780                          * There's a little trick here: to excise the beginning
1781                          * of the DTL_MISSING map, we put it into a reference
1782                          * tree and then add a segment with refcnt -1 that
1783                          * covers the range [0, scrub_txg).  This means
1784                          * that each txg in that range has refcnt -1 or 0.
1785                          * We then add DTL_SCRUB with a refcnt of 2, so that
1786                          * entries in the range [0, scrub_txg) will have a
1787                          * positive refcnt -- either 1 or 2.  We then convert
1788                          * the reference tree into the new DTL_MISSING map.
1789                          */
1790                         space_reftree_create(&reftree);
1791                         space_reftree_add_map(&reftree,
1792                             vd->vdev_dtl[DTL_MISSING], 1);
1793                         space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
1794                         space_reftree_add_map(&reftree,
1795                             vd->vdev_dtl[DTL_SCRUB], 2);
1796                         space_reftree_generate_map(&reftree,
1797                             vd->vdev_dtl[DTL_MISSING], 1);
1798                         space_reftree_destroy(&reftree);
1799                 }
1800                 range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
1801                 range_tree_walk(vd->vdev_dtl[DTL_MISSING],
1802                     range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
1803                 if (scrub_done)
1804                         range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
1805                 range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
1806                 if (!vdev_readable(vd))
1807                         range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
1808                 else
1809                         range_tree_walk(vd->vdev_dtl[DTL_MISSING],
1810                             range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
1811 
1812                 /*
1813                  * If the vdev was resilvering and no longer has any
1814                  * DTLs then reset its resilvering flag.
1815                  */
1816                 if (vd->vdev_resilver_txg != 0 &&
1817                     range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
1818                     range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0)
1819                         vd->vdev_resilver_txg = 0;
1820 
1821                 mutex_exit(&vd->vdev_dtl_lock);
1822 
1823                 if (txg != 0)
1824                         vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
1825                 return;
1826         }
1827 
1828         mutex_enter(&vd->vdev_dtl_lock);
1829         for (int t = 0; t < DTL_TYPES; t++) {
1830                 /* account for child's outage in parent's missing map */
1831                 int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
1832                 if (t == DTL_SCRUB)
1833                         continue;                       /* leaf vdevs only */
1834                 if (t == DTL_PARTIAL)
1835                         minref = 1;                     /* i.e. non-zero */
1836                 else if (vd->vdev_nparity != 0)
1837                         minref = vd->vdev_nparity + 1;       /* RAID-Z */
1838                 else
1839                         minref = vd->vdev_children;  /* any kind of mirror */
1840                 space_reftree_create(&reftree);
1841                 for (int c = 0; c < vd->vdev_children; c++) {
1842                         vdev_t *cvd = vd->vdev_child[c];
1843                         mutex_enter(&cvd->vdev_dtl_lock);
1844                         space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
1845                         mutex_exit(&cvd->vdev_dtl_lock);
1846                 }
1847                 space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
1848                 space_reftree_destroy(&reftree);
1849         }
1850         mutex_exit(&vd->vdev_dtl_lock);
1851 }
1852 
1853 int
1854 vdev_dtl_load(vdev_t *vd)
1855 {
1856         spa_t *spa = vd->vdev_spa;

1857         objset_t *mos = spa->spa_meta_objset;
1858         int error = 0;

1859 
1860         if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {




1861                 ASSERT(!vd->vdev_ishole);
1862 
1863                 error = space_map_open(&vd->vdev_dtl_sm, mos,
1864                     vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
1865                 if (error)
1866                         return (error);
1867                 ASSERT(vd->vdev_dtl_sm != NULL);
1868 




1869                 mutex_enter(&vd->vdev_dtl_lock);
1870 
1871                 /*
1872                  * Now that we've opened the space_map we need to update
1873                  * the in-core DTL.
1874                  */
1875                 space_map_update(vd->vdev_dtl_sm);
1876 
1877                 error = space_map_load(vd->vdev_dtl_sm,
1878                     vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
1879                 mutex_exit(&vd->vdev_dtl_lock);
1880 
1881                 return (error);
1882         }
1883 
1884         for (int c = 0; c < vd->vdev_children; c++) {
1885                 error = vdev_dtl_load(vd->vdev_child[c]);
1886                 if (error != 0)
1887                         break;
1888         }
1889 
1890         return (error);
1891 }
1892 
1893 void
1894 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
1895 {
1896         spa_t *spa = vd->vdev_spa;
1897         range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];

1898         objset_t *mos = spa->spa_meta_objset;
1899         range_tree_t *rtsync;
1900         kmutex_t rtlock;

1901         dmu_tx_t *tx;
1902         uint64_t object = space_map_object(vd->vdev_dtl_sm);
1903 
1904         ASSERT(!vd->vdev_ishole);
1905         ASSERT(vd->vdev_ops->vdev_op_leaf);
1906 
1907         tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1908 
1909         if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
1910                 mutex_enter(&vd->vdev_dtl_lock);
1911                 space_map_free(vd->vdev_dtl_sm, tx);
1912                 space_map_close(vd->vdev_dtl_sm);
1913                 vd->vdev_dtl_sm = NULL;
1914                 mutex_exit(&vd->vdev_dtl_lock);
1915                 dmu_tx_commit(tx);
1916                 return;
1917         }
1918 
1919         if (vd->vdev_dtl_sm == NULL) {
1920                 uint64_t new_object;
1921 
1922                 new_object = space_map_alloc(mos, tx);
1923                 VERIFY3U(new_object, !=, 0);
1924 
1925                 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
1926                     0, -1ULL, 0, &vd->vdev_dtl_lock));
1927                 ASSERT(vd->vdev_dtl_sm != NULL);
1928         }
1929 
1930         mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);
1931 
1932         rtsync = range_tree_create(NULL, NULL, &rtlock);

1933 
1934         mutex_enter(&rtlock);
1935 
1936         mutex_enter(&vd->vdev_dtl_lock);
1937         range_tree_walk(rt, range_tree_add, rtsync);
1938         mutex_exit(&vd->vdev_dtl_lock);
1939 
1940         space_map_truncate(vd->vdev_dtl_sm, tx);
1941         space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
1942         range_tree_vacate(rtsync, NULL, NULL);
1943 
1944         range_tree_destroy(rtsync);
1945 
1946         mutex_exit(&rtlock);
1947         mutex_destroy(&rtlock);
1948 
1949         /*
1950          * If the object for the space map has changed then dirty
1951          * the top level so that we update the config.
1952          */
1953         if (object != space_map_object(vd->vdev_dtl_sm)) {
1954                 zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
1955                     "new object %llu", txg, spa_name(spa), object,
1956                     space_map_object(vd->vdev_dtl_sm));
1957                 vdev_config_dirty(vd->vdev_top);
1958         }
1959 
1960         dmu_tx_commit(tx);
1961 
1962         mutex_enter(&vd->vdev_dtl_lock);
1963         space_map_update(vd->vdev_dtl_sm);
1964         mutex_exit(&vd->vdev_dtl_lock);
1965 }
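/*
 * On the locking shape of vdev_dtl_sync() above: the in-core DTL is
 * copied into the private 'rtsync' tree while vdev_dtl_lock is held,
 * and the on-disk space map is then truncated and rewritten from that
 * snapshot without holding the DTL lock.  The private 'rtlock' exists
 * only because range trees require a lock; nothing else can reach
 * rtsync.
 */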
1966 
1967 /*
1968  * Determine whether the specified vdev can be offlined/detached/removed
1969  * without losing data.
1970  */
1971 boolean_t
1972 vdev_dtl_required(vdev_t *vd)
1973 {
1974         spa_t *spa = vd->vdev_spa;
1975         vdev_t *tvd = vd->vdev_top;
1976         uint8_t cant_read = vd->vdev_cant_read;
1977         boolean_t required;
1978 
1979         ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1980 
1981         if (vd == spa->spa_root_vdev || vd == tvd)
1982                 return (B_TRUE);
1983 
1984         /*


1993         vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
1994 
1995         if (!required && zio_injection_enabled)
1996                 required = !!zio_handle_device_injection(vd, NULL, ECHILD);
1997 
1998         return (required);
1999 }
2000 
2001 /*
2002  * Determine if resilver is needed, and if so the txg range.
2003  */
2004 boolean_t
2005 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
2006 {
2007         boolean_t needed = B_FALSE;
2008         uint64_t thismin = UINT64_MAX;
2009         uint64_t thismax = 0;
2010 
2011         if (vd->vdev_children == 0) {
2012                 mutex_enter(&vd->vdev_dtl_lock);
2013                 if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
2014                     vdev_writeable(vd)) {
2015 
2016                         thismin = vdev_dtl_min(vd);
2017                         thismax = vdev_dtl_max(vd);
2018                         needed = B_TRUE;
2019                 }
2020                 mutex_exit(&vd->vdev_dtl_lock);
2021         } else {
2022                 for (int c = 0; c < vd->vdev_children; c++) {
2023                         vdev_t *cvd = vd->vdev_child[c];
2024                         uint64_t cmin, cmax;
2025 
2026                         if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
2027                                 thismin = MIN(thismin, cmin);
2028                                 thismax = MAX(thismax, cmax);
2029                                 needed = B_TRUE;
2030                         }
2031                 }
2032         }
2033 


2098                 return (-1);
2099         }
2100 
2101         /*
2102          * We don't actually check the pool state here.  If it's in fact in
2103          * use by another pool, we update this fact on the fly when requested.
2104          */
2105         nvlist_free(label);
2106         return (0);
2107 }
2108 
2109 void
2110 vdev_remove(vdev_t *vd, uint64_t txg)
2111 {
2112         spa_t *spa = vd->vdev_spa;
2113         objset_t *mos = spa->spa_meta_objset;
2114         dmu_tx_t *tx;
2115 
2116         tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2117 






2118         if (vd->vdev_ms != NULL) {
2119                 for (int m = 0; m < vd->vdev_ms_count; m++) {
2120                         metaslab_t *msp = vd->vdev_ms[m];
2121 
2122                         if (msp == NULL || msp->ms_sm == NULL)
2123                                 continue;
2124 
2125                         mutex_enter(&msp->ms_lock);
2126                         VERIFY0(space_map_allocated(msp->ms_sm));
2127                         space_map_free(msp->ms_sm, tx);
2128                         space_map_close(msp->ms_sm);
2129                         msp->ms_sm = NULL;
2130                         mutex_exit(&msp->ms_lock);
2131                 }
2132         }
2133 
2134         if (vd->vdev_ms_array) {
2135                 (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
2136                 vd->vdev_ms_array = 0;

2137         }
2138         dmu_tx_commit(tx);
2139 }
2140 
2141 void
2142 vdev_sync_done(vdev_t *vd, uint64_t txg)
2143 {
2144         metaslab_t *msp;
2145         boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
2146 
2147         ASSERT(!vd->vdev_ishole);
2148 
2149         while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
2150                 metaslab_sync_done(msp, txg);
2151 
2152         if (reassess)
2153                 metaslab_sync_reassess(vd->vdev_mg);
2154 }
2155 
2156 void