3956 ::vdev -r should work with pipelines
3957 ztest should update the cachefile before killing itself
3958 multiple scans can lead to partial resilvering
3959 ddt entries are not always resilvered
3960 dsl_scan can skip over dedup-ed blocks if physical birth != logical birth
3961 freed gang blocks are not resilvered and can cause pool to suspend
3962 ztest should print out zfs debug buffer before exiting
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>


 504             (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
 505             alloctype == VDEV_ALLOC_ROOTPOOL)) {
 506                 if (alloctype == VDEV_ALLOC_LOAD) {
 507                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 508                             &vd->vdev_dtl_smo.smo_object);
 509                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 510                             &vd->vdev_unspare);
 511                 }
 512 
 513                 if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 514                         uint64_t spare = 0;
 515 
 516                         if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
 517                             &spare) == 0 && spare)
 518                                 spa_spare_add(vd);
 519                 }
 520 
 521                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 522                     &vd->vdev_offline);
 523 
 524                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
 525                     &vd->vdev_resilvering);
 526 
 527                 /*
 528                  * When importing a pool, we want to ignore the persistent fault
 529                  * state, as the diagnosis made on another system may not be
 530                  * valid in the current context.  Local vdevs will
 531                  * remain in the faulted state.
 532                  */
 533                 if (spa_load_state(spa) == SPA_LOAD_OPEN) {
 534                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
 535                             &vd->vdev_faulted);
 536                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
 537                             &vd->vdev_degraded);
 538                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
 539                             &vd->vdev_removed);
 540 
 541                         if (vd->vdev_faulted || vd->vdev_degraded) {
 542                                 char *aux;
 543 
 544                                 vd->vdev_label_aux =
 545                                     VDEV_AUX_ERR_EXCEEDED;


1646                 dirty = space_map_contains(sm, txg, size);
1647         mutex_exit(sm->sm_lock);
1648 
1649         return (dirty);
1650 }
1651 
1652 boolean_t
1653 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
1654 {
1655         space_map_t *sm = &vd->vdev_dtl[t];
1656         boolean_t empty;
1657 
1658         mutex_enter(sm->sm_lock);
1659         empty = (sm->sm_space == 0);
1660         mutex_exit(sm->sm_lock);
1661 
1662         return (empty);
1663 }
1664 
1665 /*
1666  * Reassess DTLs after a config change or scrub completion.
1667  */
1668 void
1669 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
1670 {
1671         spa_t *spa = vd->vdev_spa;
1672         avl_tree_t reftree;
1673         int minref;
1674 
1675         ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1676 
1677         for (int c = 0; c < vd->vdev_children; c++)
1678                 vdev_dtl_reassess(vd->vdev_child[c], txg,
1679                     scrub_txg, scrub_done);
1680 
1681         if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
1682                 return;
1683 
1684         if (vd->vdev_ops->vdev_op_leaf) {
1685                 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1686 
1687                 mutex_enter(&vd->vdev_dtl_lock);
1688                 if (scrub_txg != 0 &&
1689                     (spa->spa_scrub_started ||
1690                     (scn && scn->scn_phys.scn_errors == 0))) {
1691                         /*
1692                          * We completed a scrub up to scrub_txg.  If we
1693                          * did it without rebooting, then the scrub dtl
1694                          * will be valid, so excise the old region and
1695                          * fold in the scrub dtl.  Otherwise, leave the
1696                          * dtl as-is if there was an error.
1697                          *
1698                  * There's a little trick here: to excise the beginning
1699                          * of the DTL_MISSING map, we put it into a reference
1700                          * tree and then add a segment with refcnt -1 that
1701                          * covers the range [0, scrub_txg).  This means
1702                          * that each txg in that range has refcnt -1 or 0.
1703                          * We then add DTL_SCRUB with a refcnt of 2, so that
1704                          * entries in the range [0, scrub_txg) will have a
1705                          * positive refcnt -- either 1 or 2.  We then convert
1706                          * the reference tree into the new DTL_MISSING map.
1707                          */
1708                         space_map_ref_create(&reftree);
1709                         space_map_ref_add_map(&reftree,
1710                             &vd->vdev_dtl[DTL_MISSING], 1);
1711                         space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
1712                         space_map_ref_add_map(&reftree,
1713                             &vd->vdev_dtl[DTL_SCRUB], 2);
1714                         space_map_ref_generate_map(&reftree,
1715                             &vd->vdev_dtl[DTL_MISSING], 1);
1716                         space_map_ref_destroy(&reftree);
1717                 }
1718                 space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
1719                 space_map_walk(&vd->vdev_dtl[DTL_MISSING],
1720                     space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
1721                 if (scrub_done)
1722                         space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
1723                 space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
1724                 if (!vdev_readable(vd))
1725                         space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
1726                 else
1727                         space_map_walk(&vd->vdev_dtl[DTL_MISSING],
1728                             space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
1729                 mutex_exit(&vd->vdev_dtl_lock);
1730 
1731                 if (txg != 0)
1732                         vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
1733                 return;
1734         }
1735 
1736         mutex_enter(&vd->vdev_dtl_lock);
1737         for (int t = 0; t < DTL_TYPES; t++) {
1738                 /* account for child's outage in parent's missing map */
1739                 int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
1740                 if (t == DTL_SCRUB)
1741                         continue;                       /* leaf vdevs only */
1742                 if (t == DTL_PARTIAL)
1743                         minref = 1;                     /* i.e. non-zero */
1744                 else if (vd->vdev_nparity != 0)
1745                         minref = vd->vdev_nparity + 1;       /* RAID-Z */
1746                 else
1747                         minref = vd->vdev_children;  /* any kind of mirror */
1748                 space_map_ref_create(&reftree);


1885         if (!required && zio_injection_enabled)
1886                 required = !!zio_handle_device_injection(vd, NULL, ECHILD);
1887 
1888         return (required);
1889 }
1890 
1891 /*
1892  * Determine if resilver is needed, and if so the txg range.
1893  */
1894 boolean_t
1895 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
1896 {
1897         boolean_t needed = B_FALSE;
1898         uint64_t thismin = UINT64_MAX;
1899         uint64_t thismax = 0;
1900 
1901         if (vd->vdev_children == 0) {
1902                 mutex_enter(&vd->vdev_dtl_lock);
1903                 if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
1904                     vdev_writeable(vd)) {
1905                         space_seg_t *ss;
1906 
1907                         ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
1908                         thismin = ss->ss_start - 1;
1909                         ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
1910                         thismax = ss->ss_end;
1911                         needed = B_TRUE;
1912                 }
1913                 mutex_exit(&vd->vdev_dtl_lock);
1914         } else {
1915                 for (int c = 0; c < vd->vdev_children; c++) {
1916                         vdev_t *cvd = vd->vdev_child[c];
1917                         uint64_t cmin, cmax;
1918 
1919                         if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
1920                                 thismin = MIN(thismin, cmin);
1921                                 thismax = MAX(thismax, cmax);
1922                                 needed = B_TRUE;
1923                         }
1924                 }
1925         }
1926 
1927         if (needed && minp) {
1928                 *minp = thismin;
1929                 *maxp = thismax;
1930         }




 504             (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
 505             alloctype == VDEV_ALLOC_ROOTPOOL)) {
 506                 if (alloctype == VDEV_ALLOC_LOAD) {
 507                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 508                             &vd->vdev_dtl_smo.smo_object);
 509                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 510                             &vd->vdev_unspare);
 511                 }
 512 
 513                 if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 514                         uint64_t spare = 0;
 515 
 516                         if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
 517                             &spare) == 0 && spare)
 518                                 spa_spare_add(vd);
 519                 }
 520 
 521                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 522                     &vd->vdev_offline);
 523 
 524                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 525                     &vd->vdev_resilver_txg);
 526 
 527                 /*
 528                  * When importing a pool, we want to ignore the persistent fault
 529                  * state, as the diagnosis made on another system may not be
 530                  * valid in the current context.  Local vdevs will
 531                  * remain in the faulted state.
 532                  */
 533                 if (spa_load_state(spa) == SPA_LOAD_OPEN) {
 534                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
 535                             &vd->vdev_faulted);
 536                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
 537                             &vd->vdev_degraded);
 538                         (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
 539                             &vd->vdev_removed);
 540 
 541                         if (vd->vdev_faulted || vd->vdev_degraded) {
 542                                 char *aux;
 543 
 544                                 vd->vdev_label_aux =
 545                                     VDEV_AUX_ERR_EXCEEDED;
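
Relative to the same hunk in the old file, the boolean vdev_resilvering /
ZPOOL_CONFIG_RESILVERING pair is replaced here by vdev_resilver_txg /
ZPOOL_CONFIG_RESILVER_TXG, which persists the txg at which this vdev's
resilver began so a later reassessment can tell whether the vdev was present
for a scan's entire range. A minimal libnvpair round-trip of such a value (a
sketch only; the "resilver_txg" key string and the surrounding label logic
are assumptions here, while the nvlist_* calls are the standard ones):

#include <libnvpair.h>
#include <assert.h>

int
main(void)
{
	nvlist_t *nv;
	uint64_t resilver_txg = 12345;	/* txg when the resilver started */
	uint64_t val = 0;

	/* build a config nvlist and store the per-vdev resilver txg */
	assert(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0);
	assert(nvlist_add_uint64(nv, "resilver_txg", resilver_txg) == 0);

	/* read it back, as the (void) nvlist_lookup_uint64() above does */
	(void) nvlist_lookup_uint64(nv, "resilver_txg", &val);
	assert(val == 12345);
	nvlist_free(nv);
	return (0);
}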


1646                 dirty = space_map_contains(sm, txg, size);
1647         mutex_exit(sm->sm_lock);
1648 
1649         return (dirty);
1650 }
1651 
1652 boolean_t
1653 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
1654 {
1655         space_map_t *sm = &vd->vdev_dtl[t];
1656         boolean_t empty;
1657 
1658         mutex_enter(sm->sm_lock);
1659         empty = (sm->sm_space == 0);
1660         mutex_exit(sm->sm_lock);
1661 
1662         return (empty);
1663 }
1664 
1665 /*
1666  * Returns the lowest txg in the DTL range.
1667  */
1668 static uint64_t
1669 vdev_dtl_min(vdev_t *vd)
1670 {
1671         space_seg_t *ss;
1672 
1673         ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1674         ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0);
1675         ASSERT0(vd->vdev_children);
1676 
1677         ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
1678         return (ss->ss_start - 1);
1679 }
1680 
1681 /*
1682  * Returns the highest txg in the DTL range.
1683  */
1684 static uint64_t
1685 vdev_dtl_max(vdev_t *vd)
1686 {
1687         space_seg_t *ss;
1688 
1689         ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1690         ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0);
1691         ASSERT0(vd->vdev_children);
1692 
1693         ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
1694         return (ss->ss_end);
1695 }
1696 
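A note on the two accessors above: DTLs record missing txgs as half-open
segments [txg, txg + 1), so a leaf missing txgs 8 and 9 holds the single
segment [8, 10). vdev_dtl_min() returns one txg before the first missing one
because the scan treats its lower bound as exclusive (the
(scn_min_txg, scn_max_txg] range cited below), and vdev_dtl_max() returns the
segment end, which already sits one past the last missing txg. A toy model of
that convention (not ZFS code; the struct and values are illustrative only):

#include <stdio.h>
#include <stdint.h>

struct seg { uint64_t start, end; };	/* half-open [start, end) */

int
main(void)
{
	struct seg dtl = { 8, 10 };	/* txgs 8 and 9 are missing */

	/* mirrors vdev_dtl_min(): exclusive lower bound for the scan */
	(void) printf("min = %llu\n", (unsigned long long)(dtl.start - 1));
	/* mirrors vdev_dtl_max(): end of the half-open last segment */
	(void) printf("max = %llu\n", (unsigned long long)dtl.end);
	return (0);	/* prints min = 7, max = 10 */
}
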
1697 /*
1698  * Determine if a resilvering vdev should remove any DTL entries from
1699  * its range. If the vdev was resilvering for the entire duration of the
1700  * scan then it should excise that range from its DTLs. Otherwise, this
1701  * vdev is considered partially resilvered and should leave its DTL
1702  * entries intact. The comment in vdev_dtl_reassess() describes how we
1703  * excise the DTLs.
1704  */
1705 static boolean_t
1706 vdev_dtl_should_excise(vdev_t *vd)
1707 {
1708         spa_t *spa = vd->vdev_spa;
1709         dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1710 
1711         ASSERT0(scn->scn_phys.scn_errors);
1712         ASSERT0(vd->vdev_children);
1713 
1714         if (vd->vdev_resilver_txg == 0 ||
1715             vd->vdev_dtl[DTL_MISSING].sm_space == 0)
1716                 return (B_TRUE);
1717 
1718         /*
1719          * When a resilver is initiated the scan will assign the scn_max_txg
1720          * value to the highest txg value that exists in all DTLs. If this
1721          * device's max DTL is not part of this scan (i.e. it is not in
1722          * the range (scn_min_txg, scn_max_txg]) then it is not eligible
1723          * for excision.
1724          */
1725         if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
1726                 ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
1727                 ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
1728                 ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
1729                 return (B_TRUE);
1730         }
1731         return (B_FALSE);
1732 }
1733 
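A worked example of vdev_dtl_should_excise() with assumed numbers: a scan
covering (100, 500] and a vdev that began resilvering at
vdev_resilver_txg = 200. If vdev_dtl_max() is 450, the entire hole lies
within the scan and the DTLs may be excised; if the device went missing again
and vdev_dtl_max() grew to 600, the check fails and the DTL entries survive
for the next resilver. The decision reduces to one range check (sketch only,
not ZFS code):

#include <stdio.h>
#include <stdint.h>

static int
should_excise(uint64_t dtl_max, uint64_t scn_max_txg)
{
	/* same comparison as vdev_dtl_should_excise() */
	return (dtl_max <= scn_max_txg);
}

int
main(void)
{
	(void) printf("%d\n", should_excise(450, 500));	/* 1: excise */
	(void) printf("%d\n", should_excise(600, 500));	/* 0: keep DTLs */
	return (0);
}
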
1734 /*
1735  * Reassess DTLs after a config change or scrub completion.
1736  */
1737 void
1738 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
1739 {
1740         spa_t *spa = vd->vdev_spa;
1741         avl_tree_t reftree;
1742         int minref;
1743 
1744         ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1745 
1746         for (int c = 0; c < vd->vdev_children; c++)
1747                 vdev_dtl_reassess(vd->vdev_child[c], txg,
1748                     scrub_txg, scrub_done);
1749 
1750         if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
1751                 return;
1752 
1753         if (vd->vdev_ops->vdev_op_leaf) {
1754                 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1755 
1756                 mutex_enter(&vd->vdev_dtl_lock);
1757 
1758                 /*
1759                  * If we've completed a scan cleanly then determine
1760                  * if this vdev should remove any DTLs. We only want to
1761                  * excise regions on vdevs that were available during
1762                  * the entire duration of this scan.
1763                  */
1764                 if (scrub_txg != 0 &&
1765                     (spa->spa_scrub_started ||
1766                     (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
1767                     vdev_dtl_should_excise(vd)) {
1768                         /*
1769                          * We completed a scrub up to scrub_txg.  If we
1770                          * did it without rebooting, then the scrub dtl
1771                          * will be valid, so excise the old region and
1772                          * fold in the scrub dtl.  Otherwise, leave the
1773                          * dtl as-is if there was an error.
1774                          *
1775                  * There's a little trick here: to excise the beginning
1776                          * of the DTL_MISSING map, we put it into a reference
1777                          * tree and then add a segment with refcnt -1 that
1778                          * covers the range [0, scrub_txg).  This means
1779                          * that each txg in that range has refcnt -1 or 0.
1780                          * We then add DTL_SCRUB with a refcnt of 2, so that
1781                          * entries in the range [0, scrub_txg) will have a
1782                          * positive refcnt -- either 1 or 2.  We then convert
1783                          * the reference tree into the new DTL_MISSING map.
1784                          */
1785                         space_map_ref_create(&reftree);
1786                         space_map_ref_add_map(&reftree,
1787                             &vd->vdev_dtl[DTL_MISSING], 1);
1788                         space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
1789                         space_map_ref_add_map(&reftree,
1790                             &vd->vdev_dtl[DTL_SCRUB], 2);
1791                         space_map_ref_generate_map(&reftree,
1792                             &vd->vdev_dtl[DTL_MISSING], 1);
1793                         space_map_ref_destroy(&reftree);
1794                 }
1795                 space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
1796                 space_map_walk(&vd->vdev_dtl[DTL_MISSING],
1797                     space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
1798                 if (scrub_done)
1799                         space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
1800                 space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
1801                 if (!vdev_readable(vd))
1802                         space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
1803                 else
1804                         space_map_walk(&vd->vdev_dtl[DTL_MISSING],
1805                             space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
1806 
1807                 /*
1808                  * If the vdev was resilvering and no longer has any
1809                  * DTLs then reset its resilvering flag.
1810                  */
1811                 if (vd->vdev_resilver_txg != 0 &&
1812                     vd->vdev_dtl[DTL_MISSING].sm_space == 0 &&
1813                     vd->vdev_dtl[DTL_OUTAGE].sm_space == 0)
1814                         vd->vdev_resilver_txg = 0;
1815 
1816                 mutex_exit(&vd->vdev_dtl_lock);
1817 
1818                 if (txg != 0)
1819                         vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
1820                 return;
1821         }
1822 
1823         mutex_enter(&vd->vdev_dtl_lock);
1824         for (int t = 0; t < DTL_TYPES; t++) {
1825                 /* account for child's outage in parent's missing map */
1826                 int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
1827                 if (t == DTL_SCRUB)
1828                         continue;                       /* leaf vdevs only */
1829                 if (t == DTL_PARTIAL)
1830                         minref = 1;                     /* i.e. non-zero */
1831                 else if (vd->vdev_nparity != 0)
1832                         minref = vd->vdev_nparity + 1;       /* RAID-Z */
1833                 else
1834                         minref = vd->vdev_children;  /* any kind of mirror */
1835                 space_map_ref_create(&reftree);
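
The minref values above set how many children must lack a txg before the
parent itself considers it missing: a mirror survives until every child is
out (minref = vdev_children), while RAID-Z survives until more than nparity
copies are gone (minref = nparity + 1). A toy check under those assumptions
(not ZFS code):

#include <stdio.h>

static int
parent_missing(int children_out, int minref)
{
	/* parent's map gets the txg iff its refcount >= minref */
	return (children_out >= minref);
}

int
main(void)
{
	/* 3-way mirror: minref = 3 */
	(void) printf("%d\n", parent_missing(2, 3));	/* 0: still readable */
	(void) printf("%d\n", parent_missing(3, 3));	/* 1: missing */

	/* raidz1: minref = nparity + 1 = 2 */
	(void) printf("%d\n", parent_missing(1, 2));	/* 0: parity covers */
	(void) printf("%d\n", parent_missing(2, 2));	/* 1: missing */
	return (0);
}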


1972         if (!required && zio_injection_enabled)
1973                 required = !!zio_handle_device_injection(vd, NULL, ECHILD);
1974 
1975         return (required);
1976 }
1977 
1978 /*
1979  * Determine if resilver is needed, and if so the txg range.
1980  */
1981 boolean_t
1982 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
1983 {
1984         boolean_t needed = B_FALSE;
1985         uint64_t thismin = UINT64_MAX;
1986         uint64_t thismax = 0;
1987 
1988         if (vd->vdev_children == 0) {
1989                 mutex_enter(&vd->vdev_dtl_lock);
1990                 if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
1991                     vdev_writeable(vd)) {
1992 
1993                         thismin = vdev_dtl_min(vd);
1994                         thismax = vdev_dtl_max(vd);
1995                         needed = B_TRUE;
1996                 }
1997                 mutex_exit(&vd->vdev_dtl_lock);
1998         } else {
1999                 for (int c = 0; c < vd->vdev_children; c++) {
2000                         vdev_t *cvd = vd->vdev_child[c];
2001                         uint64_t cmin, cmax;
2002 
2003                         if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
2004                                 thismin = MIN(thismin, cmin);
2005                                 thismax = MAX(thismax, cmax);
2006                                 needed = B_TRUE;
2007                         }
2008                 }
2009         }
2010 
2011         if (needed && minp) {
2012                 *minp = thismin;
2013                 *maxp = thismax;
2014         }
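
The child loop above folds each impaired leaf's range into a single pool-wide
window, so one resilver pass can cover every hole at once. A toy model of the
fold (not ZFS code; the sample ranges are made up):

#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	/* per-leaf DTL ranges: two disks that missed different txgs */
	uint64_t mins[] = { 120, 90 };
	uint64_t maxs[] = { 300, 150 };
	uint64_t thismin = UINT64_MAX, thismax = 0;

	for (int c = 0; c < 2; c++) {
		thismin = MIN(thismin, mins[c]);
		thismax = MAX(thismax, maxs[c]);
	}
	/* resulting scan window (90, 300] covers both children */
	(void) printf("(%llu, %llu]\n", (unsigned long long)thismin,
	    (unsigned long long)thismax);
	return (0);
}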