19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2013 by Delphix. All rights reserved.
26 */
27
28 #include <sys/zfs_context.h>
29 #include <sys/fm/fs/zfs.h>
30 #include <sys/spa.h>
31 #include <sys/spa_impl.h>
32 #include <sys/dmu.h>
33 #include <sys/dmu_tx.h>
34 #include <sys/vdev_impl.h>
35 #include <sys/uberblock_impl.h>
36 #include <sys/metaslab.h>
37 #include <sys/metaslab_impl.h>
38 #include <sys/space_map.h>
39 #include <sys/zio.h>
40 #include <sys/zap.h>
41 #include <sys/fs/zfs.h>
42 #include <sys/arc.h>
43 #include <sys/zil.h>
44 #include <sys/dsl_scan.h>
45
46 /*
47 * Virtual device management.
48 */
49
50 static vdev_ops_t *vdev_ops_table[] = {
51 &vdev_root_ops,
52 &vdev_raidz_ops,
53 &vdev_mirror_ops,
54 &vdev_replacing_ops,
55 &vdev_spare_ops,
56 &vdev_disk_ops,
57 &vdev_file_ops,
58 &vdev_missing_ops,
302 /*
303 * Any other vdev's guid must be unique within the pool.
304 */
305 guid = spa_generate_guid(spa);
306 }
307 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
308 }
309
310 vd->vdev_spa = spa;
311 vd->vdev_id = id;
312 vd->vdev_guid = guid;
313 vd->vdev_guid_sum = guid;
314 vd->vdev_ops = ops;
315 vd->vdev_state = VDEV_STATE_CLOSED;
316 vd->vdev_ishole = (ops == &vdev_hole_ops);
317
318 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
319 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
320 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
321 for (int t = 0; t < DTL_TYPES; t++) {
322 space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
323 &vd->vdev_dtl_lock);
324 }
325 txg_list_create(&vd->vdev_ms_list,
326 offsetof(struct metaslab, ms_txg_node));
327 txg_list_create(&vd->vdev_dtl_list,
328 offsetof(struct vdev, vdev_dtl_node));
329 vd->vdev_stat.vs_timestamp = gethrtime();
330 vdev_queue_init(vd);
331 vdev_cache_init(vd);
332
333 return (vd);
334 }
335
336 /*
337 * Allocate a new vdev. The 'alloctype' is used to control whether we are
338 * creating a new vdev or loading an existing one - the behavior is slightly
339 * different for each case.
340 */
341 int
342 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
488 &vd->vdev_removing);
489 }
490
491 if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
492 ASSERT(alloctype == VDEV_ALLOC_LOAD ||
493 alloctype == VDEV_ALLOC_ADD ||
494 alloctype == VDEV_ALLOC_SPLIT ||
495 alloctype == VDEV_ALLOC_ROOTPOOL);
496 vd->vdev_mg = metaslab_group_create(islog ?
497 spa_log_class(spa) : spa_normal_class(spa), vd);
498 }
499
500 /*
501 * If we're a leaf vdev, try to load the DTL object and other state.
502 */
503 if (vd->vdev_ops->vdev_op_leaf &&
504 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
505 alloctype == VDEV_ALLOC_ROOTPOOL)) {
506 if (alloctype == VDEV_ALLOC_LOAD) {
507 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
508 &vd->vdev_dtl_smo.smo_object);
509 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
510 &vd->vdev_unspare);
511 }
512
513 if (alloctype == VDEV_ALLOC_ROOTPOOL) {
514 uint64_t spare = 0;
515
516 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
517 &spare) == 0 && spare)
518 spa_spare_add(vd);
519 }
520
521 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
522 &vd->vdev_offline);
523
524 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
525 &vd->vdev_resilver_txg);
526
527 /*
528 * When importing a pool, we want to ignore the persistent fault
610 vdev_cache_fini(vd);
611
612 if (vd->vdev_path)
613 spa_strfree(vd->vdev_path);
614 if (vd->vdev_devid)
615 spa_strfree(vd->vdev_devid);
616 if (vd->vdev_physpath)
617 spa_strfree(vd->vdev_physpath);
618 if (vd->vdev_fru)
619 spa_strfree(vd->vdev_fru);
620
621 if (vd->vdev_isspare)
622 spa_spare_remove(vd);
623 if (vd->vdev_isl2cache)
624 spa_l2cache_remove(vd);
625
626 txg_list_destroy(&vd->vdev_ms_list);
627 txg_list_destroy(&vd->vdev_dtl_list);
628
629 mutex_enter(&vd->vdev_dtl_lock);
630 for (int t = 0; t < DTL_TYPES; t++) {
631 space_map_unload(&vd->vdev_dtl[t]);
632 space_map_destroy(&vd->vdev_dtl[t]);
633 }
634 mutex_exit(&vd->vdev_dtl_lock);
635
636 mutex_destroy(&vd->vdev_dtl_lock);
637 mutex_destroy(&vd->vdev_stat_lock);
638 mutex_destroy(&vd->vdev_probe_lock);
639
640 if (vd == spa->spa_root_vdev)
641 spa->spa_root_vdev = NULL;
642
643 kmem_free(vd, sizeof (vdev_t));
644 }
645
646 /*
647 * Transfer top-level vdev state from svd to tvd.
648 */
649 static void
650 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
651 {
652 spa_t *spa = svd->vdev_spa;
823 * in 128k (1 << 17) because it is the current "typical" blocksize.
824 * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
825 * or we will inconsistently account for existing bp's.
826 */
827 vd->vdev_deflate_ratio = (1 << 17) /
828 (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
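/*
 * Illustrative arithmetic (hypothetical, for clarity): SPA_MINBLOCKSHIFT
 * is 9 (512-byte sectors), so for a plain disk where asize == psize this
 * works out to 131072 / (131072 >> 9) = 131072 / 256 = 512.  Any vdev
 * whose asize exceeds psize (e.g. RAID-Z parity overhead) yields a
 * ratio below 512.
 */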
829
830 ASSERT(oldc <= newc);
831
832 mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
833
834 if (oldc != 0) {
835 bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
836 kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
837 }
838
839 vd->vdev_ms = mspp;
840 vd->vdev_ms_count = newc;
841
842 for (m = oldc; m < newc; m++) {
843 space_map_obj_t smo = { 0, 0, 0 };
844 if (txg == 0) {
845 uint64_t object = 0;
846 error = dmu_read(mos, vd->vdev_ms_array,
847 m * sizeof (uint64_t), sizeof (uint64_t), &object,
848 DMU_READ_PREFETCH);
849 if (error)
850 return (error);
851 if (object != 0) {
852 dmu_buf_t *db;
853 error = dmu_bonus_hold(mos, object, FTAG, &db);
854 if (error)
855 return (error);
856 ASSERT3U(db->db_size, >=, sizeof (smo));
857 bcopy(db->db_data, &smo, sizeof (smo));
858 ASSERT3U(smo.smo_object, ==, object);
859 dmu_buf_rele(db, FTAG);
860 }
861 }
862 vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
863 m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
864 }
865
866 if (txg == 0)
867 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
868
869 /*
870 * If the vdev is being removed we don't activate
871 * the metaslabs since we want to ensure that no new
872 * allocations are performed on this device.
873 */
874 if (oldc == 0 && !vd->vdev_removing)
875 metaslab_group_activate(vd->vdev_mg);
876
877 if (txg == 0)
878 spa_config_exit(spa, SCL_ALLOC, FTAG);
879
880 return (0);
881 }
882
883 void
884 vdev_metaslab_fini(vdev_t *vd)
885 {
886 uint64_t m;
887 uint64_t count = vd->vdev_ms_count;
888
889 if (vd->vdev_ms != NULL) {
890 metaslab_group_passivate(vd->vdev_mg);
891 for (m = 0; m < count; m++)
892 if (vd->vdev_ms[m] != NULL)
893 metaslab_fini(vd->vdev_ms[m]);
894 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
895 vd->vdev_ms = NULL;
896 }
897 }
898
899 typedef struct vdev_probe_stats {
900 boolean_t vps_readable;
901 boolean_t vps_writeable;
902 int vps_flags;
903 } vdev_probe_stats_t;
904
905 static void
906 vdev_probe_done(zio_t *zio)
907 {
908 spa_t *spa = zio->io_spa;
909 vdev_t *vd = zio->io_vd;
910 vdev_probe_stats_t *vps = zio->io_private;
911
912 ASSERT(vd->vdev_probe_zio != NULL);
913
1524 }
1525
1526 int
1527 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
1528 {
1529 int error;
1530
1531 /*
1532 * Normally, partial opens (e.g. of a mirror) are allowed.
1533 * For a create, however, we want to fail the request if
1534 * there are any components we can't open.
1535 */
1536 error = vdev_open(vd);
1537
1538 if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
1539 vdev_close(vd);
1540 return (error ? error : ENXIO);
1541 }
1542
1543 /*
1544 * Recursively initialize all labels.
1545 */
1546 if ((error = vdev_label_init(vd, txg, isreplacing ?
1547 VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
1548 vdev_close(vd);
1549 return (error);
1550 }
1551
1552 return (0);
1553 }
1554
1555 void
1556 vdev_metaslab_set_size(vdev_t *vd)
1557 {
1558 /*
1559 * Aim for roughly 200 metaslabs per vdev.
1560 */
1561 vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
1562 vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
1563 }
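/*
 * Illustrative sizing example (hypothetical numbers): for a 1 TiB vdev,
 * asize / 200 is about 5 GiB, highbit() of that is 33, so each metaslab
 * spans 2^33 = 8 GiB and the vdev ends up with roughly 128 metaslabs.
 * The MAX() against SPA_MAXBLOCKSHIFT (17) only matters for very small
 * vdevs, where it enforces a 128 KB metaslab floor.
 */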
1564
1565 void
1566 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
1567 {
1568 ASSERT(vd == vd->vdev_top);
1569 ASSERT(!vd->vdev_ishole);
1570 ASSERT(ISP2(flags));
1571 ASSERT(spa_writeable(vd->vdev_spa));
1572
1573 if (flags & VDD_METASLAB)
1574 (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
1575
1576 if (flags & VDD_DTL)
1577 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
1578
1579 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
1580 }
1581
1582 /*
1583 * DTLs.
1584 *
1585 * A vdev's DTL (dirty time log) is the set of transaction groups for which
1586 * the vdev has less than perfect replication. There are four kinds of DTL:
1587 *
1588 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
1589 *
1590 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
1591 *
1592 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
1593 * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
1594 * txgs that was scrubbed.
1595 *
1596 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
1597 * persistent errors or just some device being offline.
1598 * Unlike the other three, the DTL_OUTAGE map is not generally
1599 * maintained; it's only computed when needed, typically to
1600 * determine whether a device can be detached.
1601 *
1602 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
1603 * either has the data or it doesn't.
1604 *
1605 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
1606 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
1607 * if any child is less than fully replicated, then so is its parent.
1608 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
1609 * comprising only those txgs which appear in more than 'maxfaults' children;
1610 * those are the txgs we don't have enough replication to read. For example,
1611 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
1612 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
1613 * two child DTL_MISSING maps.
1614 *
1615 * It should be clear from the above that to compute the DTLs and outage maps
1616 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
1617 * Therefore, that is all we keep on disk. When loading the pool, or after
1618 * a configuration change, we generate all other DTLs from first principles.
1619 */
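/*
 * Illustrative example (hypothetical txg ranges): consider a double-parity
 * RAID-Z vdev (maxfaults == 2, so minref == 3) with four children whose
 * leaf DTL_MISSING maps are:
 *
 *	child 0: [100, 200)
 *	child 1: [150, 250)
 *	child 2: [180, 220)
 *	child 3: empty
 *
 * Only the txgs in [180, 200) are missing from three or more children,
 * so the parent's DTL_MISSING is [180, 200); everything else can still
 * be reconstructed from parity.  The parent's DTL_PARTIAL, being a plain
 * union, is [100, 250).
 */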
1620 void
1621 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1622 {
1623 space_map_t *sm = &vd->vdev_dtl[t];
1624
1625 ASSERT(t < DTL_TYPES);
1626 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1627 ASSERT(spa_writeable(vd->vdev_spa));
1628
1629 mutex_enter(sm->sm_lock);
1630 if (!space_map_contains(sm, txg, size))
1631 space_map_add(sm, txg, size);
1632 mutex_exit(sm->sm_lock);
1633 }
1634
1635 boolean_t
1636 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1637 {
1638 space_map_t *sm = &vd->vdev_dtl[t];
1639 boolean_t dirty = B_FALSE;
1640
1641 ASSERT(t < DTL_TYPES);
1642 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1643
1644 mutex_enter(sm->sm_lock);
1645 if (sm->sm_space != 0)
1646 dirty = space_map_contains(sm, txg, size);
1647 mutex_exit(sm->sm_lock);
1648
1649 return (dirty);
1650 }
1651
1652 boolean_t
1653 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
1654 {
1655 space_map_t *sm = &vd->vdev_dtl[t];
1656 boolean_t empty;
1657
1658 mutex_enter(sm->sm_lock);
1659 empty = (sm->sm_space == 0);
1660 mutex_exit(sm->sm_lock);
1661
1662 return (empty);
1663 }
1664
1665 /*
1666 * Returns the lowest txg in the DTL range.
1667 */
1668 static uint64_t
1669 vdev_dtl_min(vdev_t *vd)
1670 {
1671 space_seg_t *ss;
1672
1673 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1674 ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0);
1675 ASSERT0(vd->vdev_children);
1676
1677 ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
1678 return (ss->ss_start - 1);
1679 }
1680
1681 /*
1682 * Returns the highest txg in the DTL.
1683 */
1684 static uint64_t
1685 vdev_dtl_max(vdev_t *vd)
1686 {
1687 space_seg_t *ss;
1688
1689 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1690 ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0);
1691 ASSERT0(vd->vdev_children);
1692
1693 ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
1694 return (ss->ss_end);
1695 }
1696
1697 /*
1698 * Determine if a resilvering vdev should remove any DTL entries from
1699 * its range. If the vdev was resilvering for the entire duration of the
1700 * scan then it should excise that range from its DTLs. Otherwise, this
1701 * vdev is considered partially resilvered and should leave its DTL
1702 * entries intact. The comment in vdev_dtl_reassess() describes how we
1703 * excise the DTLs.
1704 */
1705 static boolean_t
1706 vdev_dtl_should_excise(vdev_t *vd)
1707 {
1708 spa_t *spa = vd->vdev_spa;
1709 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1710
1711 ASSERT0(scn->scn_phys.scn_errors);
1712 ASSERT0(vd->vdev_children);
1713
1714 if (vd->vdev_resilver_txg == 0 ||
1715 vd->vdev_dtl[DTL_MISSING].sm_space == 0)
1716 return (B_TRUE);
1717
1718 /*
1719 * When a resilver is initiated the scan will assign the scn_max_txg
1720 * value to the highest txg value that exists in all DTLs. If this
1721 * device's max DTL is not part of this scan (i.e. it is not in
1722 * the range (scn_min_txg, scn_max_txg]) then it is not eligible
1723 * for excision.
1724 */
1725 if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
1726 ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
1727 ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
1728 ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
1729 return (B_TRUE);
1730 }
1731 return (B_FALSE);
1732 }
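/*
 * Illustrative example (hypothetical txgs): suppose a resilver started at
 * txg 1000 (vdev_resilver_txg == 1000) and the scan covers
 * (scn_min_txg == 900, scn_max_txg == 1500].  A leaf whose DTL_MISSING
 * tops out at txg 1400 is entirely covered by the scan, so this function
 * returns B_TRUE and the range may be excised; a leaf dirtied up to
 * txg 1600 extends past the scan and keeps its DTL entries for a later
 * resilver.
 */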
1733
1734 /*
1735 * Reassess DTLs after a config change or scrub completion.
1765 (spa->spa_scrub_started ||
1766 (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
1767 vdev_dtl_should_excise(vd)) {
1768 /*
1769 * We completed a scrub up to scrub_txg. If we
1770 * did it without rebooting, then the scrub dtl
1771 * will be valid, so excise the old region and
1772 * fold in the scrub dtl. Otherwise, leave the
1773 * dtl as-is if there was an error.
1774 *
1775 * There's a little trick here: to excise the beginning
1776 * of the DTL_MISSING map, we put it into a reference
1777 * tree and then add a segment with refcnt -1 that
1778 * covers the range [0, scrub_txg). This means
1779 * that each txg in that range has refcnt -1 or 0.
1780 * We then add DTL_SCRUB with a refcnt of 2, so that
1781 * entries in the range [0, scrub_txg) will have a
1782 * positive refcnt -- either 1 or 2. We then convert
1783 * the reference tree into the new DTL_MISSING map.
1784 */
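/*
 * A worked instance of the trick above (hypothetical numbers): with
 * scrub_txg == 500, DTL_MISSING == [300, 700) and DTL_SCRUB ==
 * [400, 450), the reference tree holds +1 over [300, 700), -1 over
 * [0, 500) and +2 over [400, 450).  Generating the result with
 * minref == 1 keeps [400, 450) (refcnt 2) and [500, 700) (refcnt 1),
 * and drops [300, 400) and [450, 500) (refcnt 0): the repaired portion
 * below scrub_txg is excised while the still-unrepaired DTL_SCRUB
 * range survives.
 */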
1785 space_map_ref_create(&reftree);
1786 space_map_ref_add_map(&reftree,
1787 &vd->vdev_dtl[DTL_MISSING], 1);
1788 space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
1789 space_map_ref_add_map(&reftree,
1790 &vd->vdev_dtl[DTL_SCRUB], 2);
1791 space_map_ref_generate_map(&reftree,
1792 &vd->vdev_dtl[DTL_MISSING], 1);
1793 space_map_ref_destroy(&reftree);
1794 }
1795 space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
1796 space_map_walk(&vd->vdev_dtl[DTL_MISSING],
1797 space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
1798 if (scrub_done)
1799 space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
1800 space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
1801 if (!vdev_readable(vd))
1802 space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
1803 else
1804 space_map_walk(&vd->vdev_dtl[DTL_MISSING],
1805 space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
1806
1807 /*
1808 * If the vdev was resilvering and no longer has any
1809 * DTLs then reset its resilvering flag.
1810 */
1811 if (vd->vdev_resilver_txg != 0 &&
1812 vd->vdev_dtl[DTL_MISSING].sm_space == 0 &&
1813 vd->vdev_dtl[DTL_OUTAGE].sm_space == 0)
1814 vd->vdev_resilver_txg = 0;
1815
1816 mutex_exit(&vd->vdev_dtl_lock);
1817
1818 if (txg != 0)
1819 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
1820 return;
1821 }
1822
1823 mutex_enter(&vd->vdev_dtl_lock);
1824 for (int t = 0; t < DTL_TYPES; t++) {
1825 /* account for child's outage in parent's missing map */
1826 int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
1827 if (t == DTL_SCRUB)
1828 continue; /* leaf vdevs only */
1829 if (t == DTL_PARTIAL)
1830 minref = 1; /* i.e. non-zero */
1831 else if (vd->vdev_nparity != 0)
1832 minref = vd->vdev_nparity + 1; /* RAID-Z */
1833 else
1834 minref = vd->vdev_children; /* any kind of mirror */
1835 space_map_ref_create(&reftree);
1836 for (int c = 0; c < vd->vdev_children; c++) {
1837 vdev_t *cvd = vd->vdev_child[c];
1838 mutex_enter(&cvd->vdev_dtl_lock);
1839 space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
1840 mutex_exit(&cvd->vdev_dtl_lock);
1841 }
1842 space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
1843 space_map_ref_destroy(&reftree);
1844 }
1845 mutex_exit(&vd->vdev_dtl_lock);
1846 }
1847
1848 static int
1849 vdev_dtl_load(vdev_t *vd)
1850 {
1851 spa_t *spa = vd->vdev_spa;
1852 space_map_obj_t *smo = &vd->vdev_dtl_smo;
1853 objset_t *mos = spa->spa_meta_objset;
1854 dmu_buf_t *db;
1855 int error;
1856
1857 ASSERT(vd->vdev_children == 0);
1858
1859 if (smo->smo_object == 0)
1860 return (0);
1861
1862 ASSERT(!vd->vdev_ishole);
1863
1864 if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
1865 return (error);
1866
1867 ASSERT3U(db->db_size, >=, sizeof (*smo));
1868 bcopy(db->db_data, smo, sizeof (*smo));
1869 dmu_buf_rele(db, FTAG);
1870
1871 mutex_enter(&vd->vdev_dtl_lock);
1872 error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
1873 NULL, SM_ALLOC, smo, mos);
1874 mutex_exit(&vd->vdev_dtl_lock);
1875
1876 return (error);
1877 }
1878
1879 void
1880 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
1881 {
1882 spa_t *spa = vd->vdev_spa;
1883 space_map_obj_t *smo = &vd->vdev_dtl_smo;
1884 space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
1885 objset_t *mos = spa->spa_meta_objset;
1886 space_map_t smsync;
1887 kmutex_t smlock;
1888 dmu_buf_t *db;
1889 dmu_tx_t *tx;
1890
1891 ASSERT(!vd->vdev_ishole);
1892
1893 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1894
1895 if (vd->vdev_detached) {
1896 if (smo->smo_object != 0) {
1897 int err = dmu_object_free(mos, smo->smo_object, tx);
1898 ASSERT0(err);
1899 smo->smo_object = 0;
1900 }
1901 dmu_tx_commit(tx);
1902 return;
1903 }
1904
1905 if (smo->smo_object == 0) {
1906 ASSERT(smo->smo_objsize == 0);
1907 ASSERT(smo->smo_alloc == 0);
1908 smo->smo_object = dmu_object_alloc(mos,
1909 DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
1910 DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
1911 ASSERT(smo->smo_object != 0);
1912 vdev_config_dirty(vd->vdev_top);
1913 }
1914
1915 mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
1916
1917 space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
1918 &smlock);
1919
1920 mutex_enter(&smlock);
1921
1922 mutex_enter(&vd->vdev_dtl_lock);
1923 space_map_walk(sm, space_map_add, &smsync);
1924 mutex_exit(&vd->vdev_dtl_lock);
1925
1926 space_map_truncate(smo, mos, tx);
1927 space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
1928 space_map_vacate(&smsync, NULL, NULL);
1929
1930 space_map_destroy(&smsync);
1931
1932 mutex_exit(&smlock);
1933 mutex_destroy(&smlock);
1934
1935 VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
1936 dmu_buf_will_dirty(db, tx);
1937 ASSERT3U(db->db_size, >=, sizeof (*smo));
1938 bcopy(smo, db->db_data, sizeof (*smo));
1939 dmu_buf_rele(db, FTAG);
1940
1941 dmu_tx_commit(tx);
1942 }
1943
1944 /*
1945 * Determine whether the specified vdev can be offlined/detached/removed
1946 * without losing data.
1947 */
1948 boolean_t
1949 vdev_dtl_required(vdev_t *vd)
1950 {
1951 spa_t *spa = vd->vdev_spa;
1952 vdev_t *tvd = vd->vdev_top;
1953 uint8_t cant_read = vd->vdev_cant_read;
1954 boolean_t required;
1955
1956 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1957
1958 if (vd == spa->spa_root_vdev || vd == tvd)
1959 return (B_TRUE);
1960
1961 /*
1970 vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
1971
1972 if (!required && zio_injection_enabled)
1973 required = !!zio_handle_device_injection(vd, NULL, ECHILD);
1974
1975 return (required);
1976 }
1977
1978 /*
1979 * Determine if resilver is needed, and if so the txg range.
1980 */
1981 boolean_t
1982 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
1983 {
1984 boolean_t needed = B_FALSE;
1985 uint64_t thismin = UINT64_MAX;
1986 uint64_t thismax = 0;
1987
1988 if (vd->vdev_children == 0) {
1989 mutex_enter(&vd->vdev_dtl_lock);
1990 if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
1991 vdev_writeable(vd)) {
1992
1993 thismin = vdev_dtl_min(vd);
1994 thismax = vdev_dtl_max(vd);
1995 needed = B_TRUE;
1996 }
1997 mutex_exit(&vd->vdev_dtl_lock);
1998 } else {
1999 for (int c = 0; c < vd->vdev_children; c++) {
2000 vdev_t *cvd = vd->vdev_child[c];
2001 uint64_t cmin, cmax;
2002
2003 if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
2004 thismin = MIN(thismin, cmin);
2005 thismax = MAX(thismax, cmax);
2006 needed = B_TRUE;
2007 }
2008 }
2009 }
2010
2075 return (-1);
2076 }
2077
2078 /*
2079 * We don't actually check the pool state here. If it's in fact in
2080 * use by another pool, we update this fact on the fly when requested.
2081 */
2082 nvlist_free(label);
2083 return (0);
2084 }
2085
2086 void
2087 vdev_remove(vdev_t *vd, uint64_t txg)
2088 {
2089 spa_t *spa = vd->vdev_spa;
2090 objset_t *mos = spa->spa_meta_objset;
2091 dmu_tx_t *tx;
2092
2093 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2094
2095 if (vd->vdev_dtl_smo.smo_object) {
2096 ASSERT0(vd->vdev_dtl_smo.smo_alloc);
2097 (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
2098 vd->vdev_dtl_smo.smo_object = 0;
2099 }
2100
2101 if (vd->vdev_ms != NULL) {
2102 for (int m = 0; m < vd->vdev_ms_count; m++) {
2103 metaslab_t *msp = vd->vdev_ms[m];
2104
2105 if (msp == NULL || msp->ms_smo.smo_object == 0)
2106 continue;
2107
2108 ASSERT0(msp->ms_smo.smo_alloc);
2109 (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
2110 msp->ms_smo.smo_object = 0;
2111 }
2112 }
2113
2114 if (vd->vdev_ms_array) {
2115 (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
2116 vd->vdev_ms_array = 0;
2117 vd->vdev_ms_shift = 0;
2118 }
2119 dmu_tx_commit(tx);
2120 }
2121
2122 void
2123 vdev_sync_done(vdev_t *vd, uint64_t txg)
2124 {
2125 metaslab_t *msp;
2126 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
2127
2128 ASSERT(!vd->vdev_ishole);
2129
2130 while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
2131 metaslab_sync_done(msp, txg);
2132
2133 if (reassess)
2134 metaslab_sync_reassess(vd->vdev_mg);
2135 }
2136
2137 void
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2013 by Delphix. All rights reserved.
26 */
27
28 #include <sys/zfs_context.h>
29 #include <sys/fm/fs/zfs.h>
30 #include <sys/spa.h>
31 #include <sys/spa_impl.h>
32 #include <sys/dmu.h>
33 #include <sys/dmu_tx.h>
34 #include <sys/vdev_impl.h>
35 #include <sys/uberblock_impl.h>
36 #include <sys/metaslab.h>
37 #include <sys/metaslab_impl.h>
38 #include <sys/space_map.h>
39 #include <sys/space_reftree.h>
40 #include <sys/zio.h>
41 #include <sys/zap.h>
42 #include <sys/fs/zfs.h>
43 #include <sys/arc.h>
44 #include <sys/zil.h>
45 #include <sys/dsl_scan.h>
46
47 /*
48 * Virtual device management.
49 */
50
51 static vdev_ops_t *vdev_ops_table[] = {
52 &vdev_root_ops,
53 &vdev_raidz_ops,
54 &vdev_mirror_ops,
55 &vdev_replacing_ops,
56 &vdev_spare_ops,
57 &vdev_disk_ops,
58 &vdev_file_ops,
59 &vdev_missing_ops,
303 /*
304 * Any other vdev's guid must be unique within the pool.
305 */
306 guid = spa_generate_guid(spa);
307 }
308 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
309 }
310
311 vd->vdev_spa = spa;
312 vd->vdev_id = id;
313 vd->vdev_guid = guid;
314 vd->vdev_guid_sum = guid;
315 vd->vdev_ops = ops;
316 vd->vdev_state = VDEV_STATE_CLOSED;
317 vd->vdev_ishole = (ops == &vdev_hole_ops);
318
319 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
320 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
321 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
322 for (int t = 0; t < DTL_TYPES; t++) {
323 vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
324 &vd->vdev_dtl_lock);
325 }
326 txg_list_create(&vd->vdev_ms_list,
327 offsetof(struct metaslab, ms_txg_node));
328 txg_list_create(&vd->vdev_dtl_list,
329 offsetof(struct vdev, vdev_dtl_node));
330 vd->vdev_stat.vs_timestamp = gethrtime();
331 vdev_queue_init(vd);
332 vdev_cache_init(vd);
333
334 return (vd);
335 }
336
337 /*
338 * Allocate a new vdev. The 'alloctype' is used to control whether we are
339 * creating a new vdev or loading an existing one - the behavior is slightly
340 * different for each case.
341 */
342 int
343 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
489 &vd->vdev_removing);
490 }
491
492 if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
493 ASSERT(alloctype == VDEV_ALLOC_LOAD ||
494 alloctype == VDEV_ALLOC_ADD ||
495 alloctype == VDEV_ALLOC_SPLIT ||
496 alloctype == VDEV_ALLOC_ROOTPOOL);
497 vd->vdev_mg = metaslab_group_create(islog ?
498 spa_log_class(spa) : spa_normal_class(spa), vd);
499 }
500
501 /*
502 * If we're a leaf vdev, try to load the DTL object and other state.
503 */
504 if (vd->vdev_ops->vdev_op_leaf &&
505 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
506 alloctype == VDEV_ALLOC_ROOTPOOL)) {
507 if (alloctype == VDEV_ALLOC_LOAD) {
508 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
509 &vd->vdev_dtl_object);
510 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
511 &vd->vdev_unspare);
512 }
513
514 if (alloctype == VDEV_ALLOC_ROOTPOOL) {
515 uint64_t spare = 0;
516
517 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
518 &spare) == 0 && spare)
519 spa_spare_add(vd);
520 }
521
522 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
523 &vd->vdev_offline);
524
525 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
526 &vd->vdev_resilver_txg);
527
528 /*
529 * When importing a pool, we want to ignore the persistent fault
611 vdev_cache_fini(vd);
612
613 if (vd->vdev_path)
614 spa_strfree(vd->vdev_path);
615 if (vd->vdev_devid)
616 spa_strfree(vd->vdev_devid);
617 if (vd->vdev_physpath)
618 spa_strfree(vd->vdev_physpath);
619 if (vd->vdev_fru)
620 spa_strfree(vd->vdev_fru);
621
622 if (vd->vdev_isspare)
623 spa_spare_remove(vd);
624 if (vd->vdev_isl2cache)
625 spa_l2cache_remove(vd);
626
627 txg_list_destroy(&vd->vdev_ms_list);
628 txg_list_destroy(&vd->vdev_dtl_list);
629
630 mutex_enter(&vd->vdev_dtl_lock);
631 space_map_close(vd->vdev_dtl_sm);
632 for (int t = 0; t < DTL_TYPES; t++) {
633 range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
634 range_tree_destroy(vd->vdev_dtl[t]);
635 }
636 mutex_exit(&vd->vdev_dtl_lock);
637
638 mutex_destroy(&vd->vdev_dtl_lock);
639 mutex_destroy(&vd->vdev_stat_lock);
640 mutex_destroy(&vd->vdev_probe_lock);
641
642 if (vd == spa->spa_root_vdev)
643 spa->spa_root_vdev = NULL;
644
645 kmem_free(vd, sizeof (vdev_t));
646 }
647
648 /*
649 * Transfer top-level vdev state from svd to tvd.
650 */
651 static void
652 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
653 {
654 spa_t *spa = svd->vdev_spa;
825 * in 128k (1 << 17) because it is the current "typical" blocksize.
826 * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
827 * or we will inconsistently account for existing bp's.
828 */
829 vd->vdev_deflate_ratio = (1 << 17) /
830 (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
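/*
 * Illustrative arithmetic (hypothetical geometry): on a single-parity
 * RAID-Z of five 512-byte-sector disks, a 128 KB block takes 256 data
 * sectors plus roughly 64 parity sectors, so vdev_psize_to_asize()
 * returns about 160 KB and the ratio comes out near 131072 / 320 = 409,
 * versus 512 for a plain disk where asize == psize.
 */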
831
832 ASSERT(oldc <= newc);
833
834 mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
835
836 if (oldc != 0) {
837 bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
838 kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
839 }
840
841 vd->vdev_ms = mspp;
842 vd->vdev_ms_count = newc;
843
844 for (m = oldc; m < newc; m++) {
845 uint64_t object = 0;
846
847 if (txg == 0) {
848 error = dmu_read(mos, vd->vdev_ms_array,
849 m * sizeof (uint64_t), sizeof (uint64_t), &object,
850 DMU_READ_PREFETCH);
851 if (error)
852 return (error);
853 }
854 vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, m, object, txg);
855 }
856
857 if (txg == 0)
858 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
859
860 /*
861 * If the vdev is being removed we don't activate
862 * the metaslabs since we want to ensure that no new
863 * allocations are performed on this device.
864 */
865 if (oldc == 0 && !vd->vdev_removing)
866 metaslab_group_activate(vd->vdev_mg);
867
868 if (txg == 0)
869 spa_config_exit(spa, SCL_ALLOC, FTAG);
870
871 return (0);
872 }
873
874 void
875 vdev_metaslab_fini(vdev_t *vd)
876 {
877 uint64_t m;
878 uint64_t count = vd->vdev_ms_count;
879
880 if (vd->vdev_ms != NULL) {
881 metaslab_group_passivate(vd->vdev_mg);
882 for (m = 0; m < count; m++) {
883 metaslab_t *msp = vd->vdev_ms[m];
884
885 if (msp != NULL)
886 metaslab_fini(msp);
887 }
888 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
889 vd->vdev_ms = NULL;
890 }
891 }
892
893 typedef struct vdev_probe_stats {
894 boolean_t vps_readable;
895 boolean_t vps_writeable;
896 int vps_flags;
897 } vdev_probe_stats_t;
898
899 static void
900 vdev_probe_done(zio_t *zio)
901 {
902 spa_t *spa = zio->io_spa;
903 vdev_t *vd = zio->io_vd;
904 vdev_probe_stats_t *vps = zio->io_private;
905
906 ASSERT(vd->vdev_probe_zio != NULL);
907
1518 }
1519
1520 int
1521 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
1522 {
1523 int error;
1524
1525 /*
1526 * Normally, partial opens (e.g. of a mirror) are allowed.
1527 * For a create, however, we want to fail the request if
1528 * there are any components we can't open.
1529 */
1530 error = vdev_open(vd);
1531
1532 if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
1533 vdev_close(vd);
1534 return (error ? error : ENXIO);
1535 }
1536
1537 /*
1538 * Recursively load DTLs and initialize all labels.
1539 */
1540 if ((error = vdev_dtl_load(vd)) != 0 ||
1541 (error = vdev_label_init(vd, txg, isreplacing ?
1542 VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
1543 vdev_close(vd);
1544 return (error);
1545 }
1546
1547 return (0);
1548 }
1549
1550 void
1551 vdev_metaslab_set_size(vdev_t *vd)
1552 {
1553 /*
1554 * Aim for roughly 200 metaslabs per vdev.
1555 */
1556 vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
1557 vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
1558 }
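/*
 * Illustrative sizing example (hypothetical numbers): a 10 GiB vdev
 * gives asize / 200 of about 51 MiB, highbit() of which is 26, so
 * metaslabs span 2^26 = 64 MiB and the vdev gets roughly 160 of them.
 * Only vdevs smaller than about 25 MiB hit the SPA_MAXBLOCKSHIFT (17)
 * floor of 128 KB metaslabs.
 */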
1559
1560 void
1561 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
1562 {
1563 ASSERT(vd == vd->vdev_top);
1564 ASSERT(!vd->vdev_ishole);
1565 ASSERT(ISP2(flags));
1566 ASSERT(spa_writeable(vd->vdev_spa));
1567
1568 if (flags & VDD_METASLAB)
1569 (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
1570
1571 if (flags & VDD_DTL)
1572 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
1573
1574 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
1575 }
1576
1577 void
1578 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
1579 {
1580 for (int c = 0; c < vd->vdev_children; c++)
1581 vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
1582
1583 if (vd->vdev_ops->vdev_op_leaf)
1584 vdev_dirty(vd->vdev_top, flags, vd, txg);
1585 }
1586
1587 /*
1588 * DTLs.
1589 *
1590 * A vdev's DTL (dirty time log) is the set of transaction groups for which
1591 * the vdev has less than perfect replication. There are four kinds of DTL:
1592 *
1593 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
1594 *
1595 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
1596 *
1597 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
1598 * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
1599 * txgs that was scrubbed.
1600 *
1601 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
1602 * persistent errors or just some device being offline.
1603 * Unlike the other three, the DTL_OUTAGE map is not generally
1604 * maintained; it's only computed when needed, typically to
1605 * determine whether a device can be detached.
1606 *
1607 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
1608 * either has the data or it doesn't.
1609 *
1610 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
1611 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
1612 * if any child is less than fully replicated, then so is its parent.
1613 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
1614 * comprising only those txgs which appear in more than 'maxfaults' children;
1615 * those are the txgs we don't have enough replication to read. For example,
1616 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
1617 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
1618 * two child DTL_MISSING maps.
1619 *
1620 * It should be clear from the above that to compute the DTLs and outage maps
1621 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
1622 * Therefore, that is all we keep on disk. When loading the pool, or after
1623 * a configuration change, we generate all other DTLs from first principles.
1624 */
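/*
 * Illustrative example (hypothetical txg ranges): for a three-way mirror,
 * minref equals the number of children, so a txg lands in the parent's
 * DTL_MISSING only if all three children are missing it.  If child 0 is
 * missing [100, 300), child 1 is missing [200, 400) and child 2 is
 * missing [250, 280), the parent's DTL_MISSING is [250, 280) while its
 * DTL_PARTIAL, the plain union, is [100, 400).
 */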
1625 void
1626 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1627 {
1628 range_tree_t *rt = vd->vdev_dtl[t];
1629
1630 ASSERT(t < DTL_TYPES);
1631 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1632 ASSERT(spa_writeable(vd->vdev_spa));
1633
1634 mutex_enter(rt->rt_lock);
1635 if (!range_tree_contains(rt, txg, size))
1636 range_tree_add(rt, txg, size);
1637 mutex_exit(rt->rt_lock);
1638 }
1639
1640 boolean_t
1641 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1642 {
1643 range_tree_t *rt = vd->vdev_dtl[t];
1644 boolean_t dirty = B_FALSE;
1645
1646 ASSERT(t < DTL_TYPES);
1647 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1648
1649 mutex_enter(rt->rt_lock);
1650 if (range_tree_space(rt) != 0)
1651 dirty = range_tree_contains(rt, txg, size);
1652 mutex_exit(rt->rt_lock);
1653
1654 return (dirty);
1655 }
1656
1657 boolean_t
1658 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
1659 {
1660 range_tree_t *rt = vd->vdev_dtl[t];
1661 boolean_t empty;
1662
1663 mutex_enter(rt->rt_lock);
1664 empty = (range_tree_space(rt) == 0);
1665 mutex_exit(rt->rt_lock);
1666
1667 return (empty);
1668 }
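/*
 * A minimal sketch (hypothetical helper, not part of the vdev interface):
 * a caller that wants to know whether a leaf vdev still needs the data
 * born in a particular txg could combine the two query functions above.
 */
static boolean_t
vdev_example_needs_txg(vdev_t *vd, uint64_t txg)
{
	/* An empty DTL_MISSING means the leaf is not missing anything. */
	if (vdev_dtl_empty(vd, DTL_MISSING))
		return (B_FALSE);

	/* Otherwise ask whether this specific txg is in the dirty set. */
	return (vdev_dtl_contains(vd, DTL_MISSING, txg, 1));
}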
1669
1670 /*
1671 * Returns the lowest txg in the DTL range.
1672 */
1673 static uint64_t
1674 vdev_dtl_min(vdev_t *vd)
1675 {
1676 range_seg_t *rs;
1677
1678 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1679 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
1680 ASSERT0(vd->vdev_children);
1681
1682 rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
1683 return (rs->rs_start - 1);
1684 }
1685
1686 /*
1687 * Returns the highest txg in the DTL.
1688 */
1689 static uint64_t
1690 vdev_dtl_max(vdev_t *vd)
1691 {
1692 range_seg_t *rs;
1693
1694 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1695 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
1696 ASSERT0(vd->vdev_children);
1697
1698 rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
1699 return (rs->rs_end);
1700 }
1701
1702 /*
1703 * Determine if a resilvering vdev should remove any DTL entries from
1704 * its range. If the vdev was resilvering for the entire duration of the
1705 * scan then it should excise that range from its DTLs. Otherwise, this
1706 * vdev is considered partially resilvered and should leave its DTL
1707 * entries intact. The comment in vdev_dtl_reassess() describes how we
1708 * excise the DTLs.
1709 */
1710 static boolean_t
1711 vdev_dtl_should_excise(vdev_t *vd)
1712 {
1713 spa_t *spa = vd->vdev_spa;
1714 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1715
1716 ASSERT0(scn->scn_phys.scn_errors);
1717 ASSERT0(vd->vdev_children);
1718
1719 if (vd->vdev_resilver_txg == 0 ||
1720 range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
1721 return (B_TRUE);
1722
1723 /*
1724 * When a resilver is initiated the scan will assign the scn_max_txg
1725 * value to the highest txg value that exists in all DTLs. If this
1726 * device's max DTL is not part of this scan (i.e. it is not in
1727 * the range (scn_min_txg, scn_max_txg]) then it is not eligible
1728 * for excision.
1729 */
1730 if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
1731 ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
1732 ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
1733 ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
1734 return (B_TRUE);
1735 }
1736 return (B_FALSE);
1737 }
1738
1739 /*
1740 * Reassess DTLs after a config change or scrub completion.
1770 (spa->spa_scrub_started ||
1771 (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
1772 vdev_dtl_should_excise(vd)) {
1773 /*
1774 * We completed a scrub up to scrub_txg. If we
1775 * did it without rebooting, then the scrub dtl
1776 * will be valid, so excise the old region and
1777 * fold in the scrub dtl. Otherwise, leave the
1778 * dtl as-is if there was an error.
1779 *
1780 * There's a little trick here: to excise the beginning
1781 * of the DTL_MISSING map, we put it into a reference
1782 * tree and then add a segment with refcnt -1 that
1783 * covers the range [0, scrub_txg). This means
1784 * that each txg in that range has refcnt -1 or 0.
1785 * We then add DTL_SCRUB with a refcnt of 2, so that
1786 * entries in the range [0, scrub_txg) will have a
1787 * positive refcnt -- either 1 or 2. We then convert
1788 * the reference tree into the new DTL_MISSING map.
1789 */
1790 space_reftree_create(&reftree);
1791 space_reftree_add_map(&reftree,
1792 vd->vdev_dtl[DTL_MISSING], 1);
1793 space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
1794 space_reftree_add_map(&reftree,
1795 vd->vdev_dtl[DTL_SCRUB], 2);
1796 space_reftree_generate_map(&reftree,
1797 vd->vdev_dtl[DTL_MISSING], 1);
1798 space_reftree_destroy(&reftree);
1799 }
1800 range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
1801 range_tree_walk(vd->vdev_dtl[DTL_MISSING],
1802 range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
1803 if (scrub_done)
1804 range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
1805 range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
1806 if (!vdev_readable(vd))
1807 range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
1808 else
1809 range_tree_walk(vd->vdev_dtl[DTL_MISSING],
1810 range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
1811
1812 /*
1813 * If the vdev was resilvering and no longer has any
1814 * DTLs then reset its resilvering flag.
1815 */
1816 if (vd->vdev_resilver_txg != 0 &&
1817 range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
1818 range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0)
1819 vd->vdev_resilver_txg = 0;
1820
1821 mutex_exit(&vd->vdev_dtl_lock);
1822
1823 if (txg != 0)
1824 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
1825 return;
1826 }
1827
1828 mutex_enter(&vd->vdev_dtl_lock);
1829 for (int t = 0; t < DTL_TYPES; t++) {
1830 /* account for child's outage in parent's missing map */
1831 int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
1832 if (t == DTL_SCRUB)
1833 continue; /* leaf vdevs only */
1834 if (t == DTL_PARTIAL)
1835 minref = 1; /* i.e. non-zero */
1836 else if (vd->vdev_nparity != 0)
1837 minref = vd->vdev_nparity + 1; /* RAID-Z */
1838 else
1839 minref = vd->vdev_children; /* any kind of mirror */
1840 space_reftree_create(&reftree);
1841 for (int c = 0; c < vd->vdev_children; c++) {
1842 vdev_t *cvd = vd->vdev_child[c];
1843 mutex_enter(&cvd->vdev_dtl_lock);
1844 space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
1845 mutex_exit(&cvd->vdev_dtl_lock);
1846 }
1847 space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
1848 space_reftree_destroy(&reftree);
1849 }
1850 mutex_exit(&vd->vdev_dtl_lock);
1851 }
1852
1853 int
1854 vdev_dtl_load(vdev_t *vd)
1855 {
1856 spa_t *spa = vd->vdev_spa;
1857 objset_t *mos = spa->spa_meta_objset;
1858 int error = 0;
1859
1860 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
1861 ASSERT(!vd->vdev_ishole);
1862
1863 error = space_map_open(&vd->vdev_dtl_sm, mos,
1864 vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
1865 if (error)
1866 return (error);
1867 ASSERT(vd->vdev_dtl_sm != NULL);
1868
1869 mutex_enter(&vd->vdev_dtl_lock);
1870
1871 /*
1872 * Now that we've opened the space_map we need to update
1873 * the in-core DTL.
1874 */
1875 space_map_update(vd->vdev_dtl_sm);
1876
1877 error = space_map_load(vd->vdev_dtl_sm,
1878 vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
1879 mutex_exit(&vd->vdev_dtl_lock);
1880
1881 return (error);
1882 }
1883
1884 for (int c = 0; c < vd->vdev_children; c++) {
1885 error = vdev_dtl_load(vd->vdev_child[c]);
1886 if (error != 0)
1887 break;
1888 }
1889
1890 return (error);
1891 }
1892
1893 void
1894 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
1895 {
1896 spa_t *spa = vd->vdev_spa;
1897 range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
1898 objset_t *mos = spa->spa_meta_objset;
1899 range_tree_t *rtsync;
1900 kmutex_t rtlock;
1901 dmu_tx_t *tx;
1902 uint64_t object = space_map_object(vd->vdev_dtl_sm);
1903
1904 ASSERT(!vd->vdev_ishole);
1905 ASSERT(vd->vdev_ops->vdev_op_leaf);
1906
1907 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1908
1909 if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
1910 mutex_enter(&vd->vdev_dtl_lock);
1911 space_map_free(vd->vdev_dtl_sm, tx);
1912 space_map_close(vd->vdev_dtl_sm);
1913 vd->vdev_dtl_sm = NULL;
1914 mutex_exit(&vd->vdev_dtl_lock);
1915 dmu_tx_commit(tx);
1916 return;
1917 }
1918
1919 if (vd->vdev_dtl_sm == NULL) {
1920 uint64_t new_object;
1921
1922 new_object = space_map_alloc(mos, tx);
1923 VERIFY3U(new_object, !=, 0);
1924
1925 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
1926 0, -1ULL, 0, &vd->vdev_dtl_lock));
1927 ASSERT(vd->vdev_dtl_sm != NULL);
1928 }
1929
1930 mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);
1931
1932 rtsync = range_tree_create(NULL, NULL, &rtlock);
1933
1934 mutex_enter(&rtlock);
1935
1936 mutex_enter(&vd->vdev_dtl_lock);
1937 range_tree_walk(rt, range_tree_add, rtsync);
1938 mutex_exit(&vd->vdev_dtl_lock);
1939
1940 space_map_truncate(vd->vdev_dtl_sm, tx);
1941 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
1942 range_tree_vacate(rtsync, NULL, NULL);
1943
1944 range_tree_destroy(rtsync);
1945
1946 mutex_exit(&rtlock);
1947 mutex_destroy(&rtlock);
1948
1949 /*
1950 * If the object for the space map has changed then dirty
1951 * the top level so that we update the config.
1952 */
1953 if (object != space_map_object(vd->vdev_dtl_sm)) {
1954 zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
1955 "new object %llu", txg, spa_name(spa), object,
1956 space_map_object(vd->vdev_dtl_sm));
1957 vdev_config_dirty(vd->vdev_top);
1958 }
1959
1960 dmu_tx_commit(tx);
1961
1962 mutex_enter(&vd->vdev_dtl_lock);
1963 space_map_update(vd->vdev_dtl_sm);
1964 mutex_exit(&vd->vdev_dtl_lock);
1965 }
1966
1967 /*
1968 * Determine whether the specified vdev can be offlined/detached/removed
1969 * without losing data.
1970 */
1971 boolean_t
1972 vdev_dtl_required(vdev_t *vd)
1973 {
1974 spa_t *spa = vd->vdev_spa;
1975 vdev_t *tvd = vd->vdev_top;
1976 uint8_t cant_read = vd->vdev_cant_read;
1977 boolean_t required;
1978
1979 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1980
1981 if (vd == spa->spa_root_vdev || vd == tvd)
1982 return (B_TRUE);
1983
1984 /*
1993 vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
1994
1995 if (!required && zio_injection_enabled)
1996 required = !!zio_handle_device_injection(vd, NULL, ECHILD);
1997
1998 return (required);
1999 }
2000
2001 /*
2002 * Determine if resilver is needed, and if so the txg range.
2003 */
2004 boolean_t
2005 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
2006 {
2007 boolean_t needed = B_FALSE;
2008 uint64_t thismin = UINT64_MAX;
2009 uint64_t thismax = 0;
2010
2011 if (vd->vdev_children == 0) {
2012 mutex_enter(&vd->vdev_dtl_lock);
2013 if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
2014 vdev_writeable(vd)) {
2015
2016 thismin = vdev_dtl_min(vd);
2017 thismax = vdev_dtl_max(vd);
2018 needed = B_TRUE;
2019 }
2020 mutex_exit(&vd->vdev_dtl_lock);
2021 } else {
2022 for (int c = 0; c < vd->vdev_children; c++) {
2023 vdev_t *cvd = vd->vdev_child[c];
2024 uint64_t cmin, cmax;
2025
2026 if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
2027 thismin = MIN(thismin, cmin);
2028 thismax = MAX(thismax, cmax);
2029 needed = B_TRUE;
2030 }
2031 }
2032 }
2033
2098 return (-1);
2099 }
2100
2101 /*
2102 * We don't actually check the pool state here. If it's in fact in
2103 * use by another pool, we update this fact on the fly when requested.
2104 */
2105 nvlist_free(label);
2106 return (0);
2107 }
2108
2109 void
2110 vdev_remove(vdev_t *vd, uint64_t txg)
2111 {
2112 spa_t *spa = vd->vdev_spa;
2113 objset_t *mos = spa->spa_meta_objset;
2114 dmu_tx_t *tx;
2115
2116 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2117
2118 if (vd->vdev_ms != NULL) {
2119 for (int m = 0; m < vd->vdev_ms_count; m++) {
2120 metaslab_t *msp = vd->vdev_ms[m];
2121
2122 if (msp == NULL || msp->ms_sm == NULL)
2123 continue;
2124
2125 mutex_enter(&msp->ms_lock);
2126 VERIFY0(space_map_allocated(msp->ms_sm));
2127 space_map_free(msp->ms_sm, tx);
2128 space_map_close(msp->ms_sm);
2129 msp->ms_sm = NULL;
2130 mutex_exit(&msp->ms_lock);
2131 }
2132 }
2133
2134 if (vd->vdev_ms_array) {
2135 (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
2136 vd->vdev_ms_array = 0;
2137 }
2138 dmu_tx_commit(tx);
2139 }
2140
2141 void
2142 vdev_sync_done(vdev_t *vd, uint64_t txg)
2143 {
2144 metaslab_t *msp;
2145 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
2146
2147 ASSERT(!vd->vdev_ishole);
2148
2149 while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
2150 metaslab_sync_done(msp, txg);
2151
2152 if (reassess)
2153 metaslab_sync_reassess(vd->vdev_mg);
2154 }
2155
2156 void