Print this page
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4104 ::spa_space no longer works
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>

@@ -244,11 +244,11 @@
 
 const char histo_stars[] = "****************************************";
 const int histo_width = sizeof (histo_stars) - 1;
 
 static void
-dump_histogram(const uint64_t *histo, int size)
+dump_histogram(const uint64_t *histo, int size, int offset)
 {
         int i;
         int minidx = size - 1;
         int maxidx = 0;
         uint64_t max = 0;

@@ -265,11 +265,11 @@
         if (max < histo_width)
                 max = histo_width;
 
         for (i = minidx; i <= maxidx; i++) {
                 (void) printf("\t\t\t%3u: %6llu %s\n",
-                    i, (u_longlong_t)histo[i],
+                    i + offset, (u_longlong_t)histo[i],
                     &histo_stars[(max - histo[i]) * histo_width / max]);
         }
 }
 
 static void

@@ -318,23 +318,23 @@
             (u_longlong_t)zs.zs_magic);
         (void) printf("\t\tzap_salt: 0x%llx\n",
             (u_longlong_t)zs.zs_salt);
 
         (void) printf("\t\tLeafs with 2^n pointers:\n");
-        dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE);
+        dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
 
         (void) printf("\t\tBlocks with n*5 entries:\n");
-        dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE);
+        dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
 
         (void) printf("\t\tBlocks n/10 full:\n");
-        dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE);
+        dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
 
         (void) printf("\t\tEntries with n chunks:\n");
-        dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE);
+        dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
 
         (void) printf("\t\tBuckets with n entries:\n");
-        dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE);
+        dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
 }
 
 /*ARGSUSED*/
 static void
 dump_none(objset_t *os, uint64_t object, void *data, size_t size)

@@ -519,30 +519,89 @@
                     typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
         }
         zap_cursor_fini(&zc);
 }
 
+int
+get_dtl_refcount(vdev_t *vd)
+{
+        int refcount = 0;
+        /*
+         * Count the leaf vdevs at or below vd whose DTL space map
+         * (vdev_dtl_sm) exists and whose dbuf is exactly
+         * sizeof (space_map_phys_t) bytes. A leaf contributes 1 or 0.
+         * NOTE(review): presumably that dbuf size identifies the larger
+         * space map layout tracked by the feature refcount -- confirm.
+         */
+        if (vd->vdev_ops->vdev_op_leaf) {
+                space_map_t *sm = vd->vdev_dtl_sm;
+
+                if (sm != NULL &&
+                    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
+                        return (1);
+                return (0);
+        }
+        /* Not a leaf: the result is the sum over all children. */
+        for (int c = 0; c < vd->vdev_children; c++)
+                refcount += get_dtl_refcount(vd->vdev_child[c]);
+        return (refcount);
+}
+/*
+ * Count the metaslabs, in vd and recursively in its children, whose space
+ * map (ms_sm) exists and whose dbuf is exactly sizeof (space_map_phys_t)
+ * bytes. The vdev_ms[] array is scanned only for a vdev that is its own
+ * vdev_top; the recursion into children happens for every vdev.
+ */
+int
+get_metaslab_refcount(vdev_t *vd)
+{
+        int refcount = 0;
+
+        if (vd->vdev_top == vd) {
+                for (int m = 0; m < vd->vdev_ms_count; m++) {
+                        space_map_t *sm = vd->vdev_ms[m]->ms_sm;
+
+                        if (sm != NULL &&
+                            sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
+                                refcount++;
+                }
+        }
+        for (int c = 0; c < vd->vdev_children; c++)
+                refcount += get_metaslab_refcount(vd->vdev_child[c]);
+
+        return (refcount);
+}
+/*
+ * Cross-check the SPA_FEATURE_SPACEMAP_HISTOGRAM refcount returned by
+ * spa_feature_get_refcount() against the number of space maps counted by
+ * get_dtl_refcount() plus get_metaslab_refcount(), both started at the
+ * root vdev. On mismatch a message is printed and 2 is returned; otherwise
+ * the function returns 0.
+ */
+static int
+verify_spacemap_refcounts(spa_t *spa)
+{
+        int expected_refcount, actual_refcount;
+
+        expected_refcount = spa_feature_get_refcount(spa,
+            &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM]);
+        actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
+        actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
+
+        if (expected_refcount != actual_refcount) {
+                (void) printf("space map refcount mismatch: expected %d != "
+                    "actual %d\n", expected_refcount, actual_refcount);
+                return (2);
+        }
+        return (0);
+}
+
 static void
-dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
+dump_spacemap(objset_t *os, space_map_t *sm)
 {
         uint64_t alloc, offset, entry;
-        uint8_t mapshift = sm->sm_shift;
-        uint64_t mapstart = sm->sm_start;
         char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
                             "INVALID", "INVALID", "INVALID", "INVALID" };
 
-        if (smo->smo_object == 0)
+        if (sm == NULL)
                 return;
 
         /*
          * Print out the freelist entries in both encoded and decoded form.
          */
         alloc = 0;
-        for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
-                VERIFY3U(0, ==, dmu_read(os, smo->smo_object, offset,
+        for (offset = 0; offset < space_map_length(sm);
+            offset += sizeof (entry)) {
+                uint8_t mapshift = sm->sm_shift;
+
+                VERIFY0(dmu_read(os, space_map_object(sm), offset,
                     sizeof (entry), &entry, DMU_READ_PREFETCH));
                 if (SM_DEBUG_DECODE(entry)) {
+
                         (void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
                             (u_longlong_t)(offset / sizeof (entry)),
                             ddata[SM_DEBUG_ACTION_DECODE(entry)],
                             (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
                             (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));

@@ -550,75 +609,89 @@
                         (void) printf("\t    [%6llu]    %c  range:"
                             " %010llx-%010llx  size: %06llx\n",
                             (u_longlong_t)(offset / sizeof (entry)),
                             SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
                             (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
-                            mapshift) + mapstart),
+                            mapshift) + sm->sm_start),
                             (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
-                            mapshift) + mapstart + (SM_RUN_DECODE(entry) <<
-                            mapshift)),
+                            mapshift) + sm->sm_start +
+                            (SM_RUN_DECODE(entry) << mapshift)),
                             (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
                         if (SM_TYPE_DECODE(entry) == SM_ALLOC)
                                 alloc += SM_RUN_DECODE(entry) << mapshift;
                         else
                                 alloc -= SM_RUN_DECODE(entry) << mapshift;
                 }
         }
-        if (alloc != smo->smo_alloc) {
+        if (alloc != space_map_allocated(sm)) {
                 (void) printf("space_map_object alloc (%llu) INCONSISTENT "
                     "with space map summary (%llu)\n",
-                    (u_longlong_t)smo->smo_alloc, (u_longlong_t)alloc);
+                    (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
         }
 }
 
 static void
 dump_metaslab_stats(metaslab_t *msp)
 {
         char maxbuf[32];
-        space_map_t *sm = msp->ms_map;
-        avl_tree_t *t = sm->sm_pp_root;
-        int free_pct = sm->sm_space * 100 / sm->sm_size;
+        range_tree_t *rt = msp->ms_tree;        /* supplies space and histogram */
+        avl_tree_t *t = &msp->ms_size_tree;     /* node count is "segments" */
+        int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
-        zdb_nicenum(space_map_maxsize(sm), maxbuf);
+        zdb_nicenum(metaslab_block_maxsize(msp), maxbuf);  /* "maxsize" column */
 
         (void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
             "segments", avl_numnodes(t), "maxsize", maxbuf,
             "freepct", free_pct);
+        (void) printf("\tIn-memory histogram:\n");     /* dumped with offset 0 */
+        dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 static void
 dump_metaslab(metaslab_t *msp)
 {
         vdev_t *vd = msp->ms_group->mg_vd;
         spa_t *spa = vd->vdev_spa;
-        space_map_t *sm = msp->ms_map;
-        space_map_obj_t *smo = &msp->ms_smo;
+        space_map_t *sm = msp->ms_sm;   /* checked against NULL further down */
         char freebuf[32];
 
-        zdb_nicenum(sm->sm_size - smo->smo_alloc, freebuf);
+        zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf);
 
         (void) printf(
             "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
-            (u_longlong_t)(sm->sm_start / sm->sm_size),
-            (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf);
+            (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
+            (u_longlong_t)space_map_object(sm), freebuf);
 
-        if (dump_opt['m'] > 1 && !dump_opt['L']) {
+        if (dump_opt['m'] > 2 && !dump_opt['L']) {      /* load + in-memory stats */
                 mutex_enter(&msp->ms_lock);
-                space_map_load_wait(sm);
-                if (!sm->sm_loaded)
-                        VERIFY(space_map_load(sm, zfs_metaslab_ops,
-                            SM_FREE, smo, spa->spa_meta_objset) == 0);
+                metaslab_load_wait(msp);
+                if (!msp->ms_loaded) {
+                        VERIFY0(metaslab_load(msp));
+                        range_tree_stat_verify(msp->ms_tree);
+                }
                 dump_metaslab_stats(msp);
-                space_map_unload(sm);
+                metaslab_unload(msp);   /* unloaded even if it was already loaded */
                 mutex_exit(&msp->ms_lock);
         }
 
-        if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
-                ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift));
+        if (dump_opt['m'] > 1 && sm != NULL &&
+            spa_feature_is_active(spa,
+            &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM])) {
+                /*
+                 * The space map histogram represents free space in chunks
+                 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
+                 * Passing sm_shift as the offset makes dump_histogram()
+                 * label bucket i as i + sm_shift.
+                 *
+                 * NOTE(review): the guard is pool-wide (feature active);
+                 * confirm smp_histogram is valid for every non-NULL sm,
+                 * including one whose sm_dbuf is smaller than
+                 * sizeof (space_map_phys_t).
+                 */
+                (void) printf("\tOn-disk histogram:\n");
+                dump_histogram(sm->sm_phys->smp_histogram,
+                    SPACE_MAP_HISTOGRAM_SIZE(sm), sm->sm_shift);
+        }
 
+        if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
+                ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
+                /* Print the individual space map entries under ms_lock. */
                 mutex_enter(&msp->ms_lock);
-                dump_spacemap(spa->spa_meta_objset, smo, sm);
+                dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
                 mutex_exit(&msp->ms_lock);
         }
 }
 
 static void

@@ -801,13 +874,13 @@
 
         dump_dedup_ratio(&dds_total);
 }
 
 static void
-dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
+dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
 {
-        char *prefix = (void *)sm;
+        char *prefix = arg;
 
         (void) printf("%s [%llu,%llu) length %llu\n",
             prefix,
             (u_longlong_t)start,
             (u_longlong_t)(start + size),

@@ -833,21 +906,20 @@
             vd->vdev_path ? vd->vdev_path :
             vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
             required ? "DTL-required" : "DTL-expendable");
 
         for (int t = 0; t < DTL_TYPES; t++) {
-                space_map_t *sm = &vd->vdev_dtl[t];
-                if (sm->sm_space == 0)
+                range_tree_t *rt = vd->vdev_dtl[t];
+                if (range_tree_space(rt) == 0)
                         continue;
                 (void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
                     indent + 2, "", name[t]);
-                mutex_enter(sm->sm_lock);
-                space_map_walk(sm, dump_dtl_seg, (void *)prefix);
-                mutex_exit(sm->sm_lock);
+                mutex_enter(rt->rt_lock);
+                range_tree_walk(rt, dump_dtl_seg, prefix);
+                mutex_exit(rt->rt_lock);
                 if (dump_opt['d'] > 5 && vd->vdev_children == 0)
-                        dump_spacemap(spa->spa_meta_objset,
-                            &vd->vdev_dtl_smo, sm);
+                        dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm);
         }
 
         for (int c = 0; c < vd->vdev_children; c++)
                 dump_dtl(vd->vdev_child[c], indent + 4);
 }

@@ -2219,43 +2291,21 @@
 
         return (0);
 }
 
 static void
-zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
+zdb_leak(void *arg, uint64_t start, uint64_t size)
 {
-        vdev_t *vd = sm->sm_ppd;
+        vdev_t *vd = arg;       /* range_tree_vacate() caller passes the vdev */
 
         (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
             (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
 }
 
-/* ARGSUSED */
-static void
-zdb_space_map_load(space_map_t *sm)
-{
-}
-
-static void
-zdb_space_map_unload(space_map_t *sm)
-{
-        space_map_vacate(sm, zdb_leak, sm);
-}
-
-/* ARGSUSED */
-static void
-zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
-{
-}
-
-static space_map_ops_t zdb_space_map_ops = {
-        zdb_space_map_load,
-        zdb_space_map_unload,
+static metaslab_ops_t zdb_metaslab_ops = {     /* set as ms_ops for leak detection */
         NULL,   /* alloc */
-        zdb_space_map_claim,
-        NULL,   /* free */
-        NULL    /* maxsize */
+        NULL    /* fragmented */
 };
 
 static void
 zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {

@@ -2306,15 +2356,25 @@
                 for (int c = 0; c < rvd->vdev_children; c++) {
                         vdev_t *vd = rvd->vdev_child[c];
                         for (int m = 0; m < vd->vdev_ms_count; m++) {
                                 metaslab_t *msp = vd->vdev_ms[m];
                                 mutex_enter(&msp->ms_lock);
-                                space_map_unload(msp->ms_map);
-                                VERIFY(space_map_load(msp->ms_map,
-                                    &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo,
-                                    spa->spa_meta_objset) == 0);
-                                msp->ms_map->sm_ppd = vd;
+                                metaslab_unload(msp);
+
+                                /*
+                                 * For leak detection, we overload the metaslab
+                                 * ms_tree to contain allocated segments
+                                 * instead of free segments. As a result,
+                                 * we can't use the normal metaslab_load/unload
+                                 * interfaces.
+                                 */
+                                if (msp->ms_sm != NULL) {
+                                        msp->ms_ops = &zdb_metaslab_ops;
+                                        VERIFY0(space_map_load(msp->ms_sm,
+                                            msp->ms_tree, SM_ALLOC));
+                                        msp->ms_loaded = B_TRUE;
+                                }
                                 mutex_exit(&msp->ms_lock);
                         }
                 }
         }
 

@@ -2333,11 +2393,24 @@
                 for (int c = 0; c < rvd->vdev_children; c++) {
                         vdev_t *vd = rvd->vdev_child[c];
                         for (int m = 0; m < vd->vdev_ms_count; m++) {
                                 metaslab_t *msp = vd->vdev_ms[m];
                                 mutex_enter(&msp->ms_lock);
-                                space_map_unload(msp->ms_map);
+
+                                /*
+                                 * The ms_tree has been overloaded to
+                                 * contain allocated segments. Now that we
+                                 * have finished traversing all blocks, any
+                                 * block that remains in the ms_tree
+                                 * represents an allocated block that we
+                                 * did not claim during the traversal.
+                                 * Claimed blocks would have been removed
+                                 * from the ms_tree.
+                                 */
+                                range_tree_vacate(msp->ms_tree, zdb_leak, vd);
+                                msp->ms_loaded = B_FALSE;
+
                                 mutex_exit(&msp->ms_lock);
                         }
                 }
         }
 }

@@ -2561,11 +2634,11 @@
                                 if (dump_opt['b'] >= 4) {
                                         (void) printf("psize "
                                             "(in 512-byte sectors): "
                                             "number of blocks\n");
                                         dump_histogram(zb->zb_psize_histogram,
-                                            PSIZE_HISTO_SIZE);
+                                            PSIZE_HISTO_SIZE, 0);
                                 }
                         }
                 }
         }
 

@@ -2731,10 +2804,13 @@
                     NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
         }
         if (dump_opt['b'] || dump_opt['c'])
                 rc = dump_block_stats(spa);
 
+        if (rc == 0)
+                rc = verify_spacemap_refcounts(spa);
+
         if (dump_opt['s'])
                 show_pool_stats(spa);
 
         if (dump_opt['h'])
                 dump_history(spa);