4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4104 ::spa_space no longer works
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>

          --- old/usr/src/uts/common/fs/zfs/metaslab.c
          +++ new/usr/src/uts/common/fs/zfs/metaslab.c
(23 lines elided)
  24   24   * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/zfs_context.h>
  28   28  #include <sys/dmu.h>
  29   29  #include <sys/dmu_tx.h>
  30   30  #include <sys/space_map.h>
  31   31  #include <sys/metaslab_impl.h>
  32   32  #include <sys/vdev_impl.h>
  33   33  #include <sys/zio.h>
       34 +#include <sys/spa_impl.h>
  34   35  
  35   36  /*
  36   37   * Allow allocations to switch to gang blocks quickly. We do this to
  37   38   * avoid having to load lots of space_maps in a given txg. There are,
  38   39   * however, some cases where we want to avoid "fast" ganging and instead
  39   40   * we want to do an exhaustive search of all metaslabs on this device.
  40   41   * Currently we don't allow any gang, zil, or dump device related allocations
  41   42   * to "fast" gang.
  42   43   */
  43   44  #define CAN_FASTGANG(flags) \
  44   45          (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
  45   46          METASLAB_GANG_AVOID)))
  46   47  
       48 +#define METASLAB_WEIGHT_PRIMARY         (1ULL << 63)
       49 +#define METASLAB_WEIGHT_SECONDARY       (1ULL << 62)
       50 +#define METASLAB_ACTIVE_MASK            \
       51 +        (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
       52 +
  47   53  uint64_t metaslab_aliquot = 512ULL << 10;
  48   54  uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;     /* force gang blocks */
  49   55  
  50   56  /*
  51   57   * The in-core space map representation is more compact than its on-disk form.
  52   58   * The zfs_condense_pct determines how much more compact the in-core
  53   59   * space_map representation must be before we compact it on-disk.
  54   60   * Values should be greater than or equal to 100.
  55   61   */
  56   62  int zfs_condense_pct = 200;
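For a concrete sense of what zfs_condense_pct = 200 means, the sketch below compares a space map object's on-disk size against the best-case minimal form of one 64-bit entry per free segment, the estimate metaslab_should_condense() is described as using later in this file. would_condense() is a hypothetical helper written only for this illustration; the real predicate also checks that the largest free segment would fit within the current object.

    /*
     * Hypothetical helper, written only for this illustration; it is not
     * the predicate used by metaslab_should_condense() later in this file.
     */
    #include <stdio.h>
    #include <stdint.h>

    static int zfs_condense_pct = 200;

    static int
    would_condense(uint64_t object_size, uint64_t nsegments)
    {
            /* best case: each free segment costs one 64-bit entry on disk */
            uint64_t minimal = nsegments * sizeof (uint64_t);

            return (object_size >= minimal * zfs_condense_pct / 100);
    }

    int
    main(void)
    {
            /* 10,000 free segments => minimal form of 80,000 bytes */
            printf("%d\n", would_condense(100000, 10000)); /* 0: under 2x minimal */
            printf("%d\n", would_condense(200000, 10000)); /* 1: at least 2x minimal */
            return (0);
    }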
(15 lines elided)
  72   78   * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  73   79   * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  74   80   * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  75   81   * groups are allowed to accept allocations. Gang blocks are always
  76   82   * eligible to allocate on any metaslab group. The default value of 0 means
  77   83   * no metaslab group will be excluded based on this criterion.
  78   84   */
  79   85  int zfs_mg_noalloc_threshold = 0;
  80   86  
  81   87  /*
  82      - * Metaslab debugging: when set, keeps all space maps in core to verify frees.
       88 + * When set will load all metaslabs when pool is first opened.
  83   89   */
  84      -static int metaslab_debug = 0;
       90 +int metaslab_debug_load = 0;
  85   91  
  86   92  /*
       93 + * When set will prevent metaslabs from being unloaded.
       94 + */
       95 +int metaslab_debug_unload = 0;
       96 +
       97 +/*
  87   98   * Minimum size which forces the dynamic allocator to change
  88   99   * its allocation strategy.  Once the space map cannot satisfy
  89  100   * an allocation of this size then it switches to using a more
  90  101   * aggressive strategy (i.e. search by size rather than offset).
  91  102   */
  92  103  uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
  93  104  
  94  105  /*
  95  106   * The minimum free space, in percent, which must be available
  96  107   * in a space map to continue allocations in a first-fit fashion.
(2 lines elided)
  99  110   */
 100  111  int metaslab_df_free_pct = 4;
 101  112  
 102  113  /*
 103  114   * A metaslab is considered "free" if it contains a contiguous
 104  115   * segment which is greater than metaslab_min_alloc_size.
 105  116   */
 106  117  uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
 107  118  
 108  119  /*
 109      - * Max number of space_maps to prefetch.
      120 + * Percentage of all cpus that can be used by the metaslab taskq.
 110  121   */
 111      -int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
      122 +int metaslab_load_pct = 50;
 112  123  
 113  124  /*
 114      - * Percentage bonus multiplier for metaslabs that are in the bonus area.
      125 + * Determines how many txgs a metaslab may remain loaded without having any
      126 + * allocations from it. As long as a metaslab continues to be used we will
      127 + * keep it loaded.
 115  128   */
 116      -int metaslab_smo_bonus_pct = 150;
      129 +int metaslab_unload_delay = TXG_SIZE * 2;
 117  130  
 118  131  /*
 119  132   * Should we be willing to write data to degraded vdevs?
 120  133   */
 121  134  boolean_t zfs_write_to_degraded = B_FALSE;
 122  135  
 123  136  /*
      137 + * Max number of metaslabs per group to preload.
      138 + */
      139 +int metaslab_preload_limit = SPA_DVAS_PER_BP;
      140 +
      141 +/*
       142 + * Enable/disable preloading of metaslabs.
      143 + */
      144 +boolean_t metaslab_preload_enabled = B_TRUE;
      145 +
      146 +/*
      147 + * Enable/disable additional weight factor for each metaslab.
      148 + */
      149 +boolean_t metaslab_weight_factor_enable = B_FALSE;
      150 +
      151 +
      152 +/*
 124  153   * ==========================================================================
 125  154   * Metaslab classes
 126  155   * ==========================================================================
 127  156   */
 128  157  metaslab_class_t *
 129      -metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
      158 +metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 130  159  {
 131  160          metaslab_class_t *mc;
 132  161  
 133  162          mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
 134  163  
 135  164          mc->mc_spa = spa;
 136  165          mc->mc_rotor = NULL;
 137  166          mc->mc_ops = ops;
 138  167  
 139  168          return (mc);
(83 lines elided)
 223  252          const metaslab_t *m2 = x2;
 224  253  
 225  254          if (m1->ms_weight < m2->ms_weight)
 226  255                  return (1);
 227  256          if (m1->ms_weight > m2->ms_weight)
 228  257                  return (-1);
 229  258  
 230  259          /*
 231  260           * If the weights are identical, use the offset to force uniqueness.
 232  261           */
 233      -        if (m1->ms_map->sm_start < m2->ms_map->sm_start)
      262 +        if (m1->ms_start < m2->ms_start)
 234  263                  return (-1);
 235      -        if (m1->ms_map->sm_start > m2->ms_map->sm_start)
      264 +        if (m1->ms_start > m2->ms_start)
 236  265                  return (1);
 237  266  
 238  267          ASSERT3P(m1, ==, m2);
 239  268  
 240  269          return (0);
 241  270  }
 242  271  
 243  272  /*
 244  273   * Update the allocatable flag and the metaslab group's capacity.
 245  274   * The allocatable flag is set to true if the capacity is below
(47 lines elided)
 293  322          metaslab_group_t *mg;
 294  323  
 295  324          mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 296  325          mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 297  326          avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 298  327              sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
 299  328          mg->mg_vd = vd;
 300  329          mg->mg_class = mc;
 301  330          mg->mg_activation_count = 0;
 302  331  
      332 +        mg->mg_taskq = taskq_create("metaslab_group_tasksq", metaslab_load_pct,
      333 +            minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
      334 +
 303  335          return (mg);
 304  336  }
 305  337  
 306  338  void
 307  339  metaslab_group_destroy(metaslab_group_t *mg)
 308  340  {
 309  341          ASSERT(mg->mg_prev == NULL);
 310  342          ASSERT(mg->mg_next == NULL);
 311  343          /*
 312  344           * We may have gone below zero with the activation count
(48 lines elided)
 361  393          ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
 362  394  
 363  395          if (--mg->mg_activation_count != 0) {
 364  396                  ASSERT(mc->mc_rotor != mg);
 365  397                  ASSERT(mg->mg_prev == NULL);
 366  398                  ASSERT(mg->mg_next == NULL);
 367  399                  ASSERT(mg->mg_activation_count < 0);
 368  400                  return;
 369  401          }
 370  402  
      403 +        taskq_wait(mg->mg_taskq);
      404 +
 371  405          mgprev = mg->mg_prev;
 372  406          mgnext = mg->mg_next;
 373  407  
 374  408          if (mg == mgnext) {
 375  409                  mc->mc_rotor = NULL;
 376  410          } else {
 377  411                  mc->mc_rotor = mgnext;
 378  412                  mgprev->mg_next = mgnext;
 379  413                  mgnext->mg_prev = mgprev;
 380  414          }
(59 lines elided)
 440  474           * is greater than the set value of zfs_mg_noalloc_threshold, it's
 441  475           * associated with a slog, or there are no other metaslab groups
 442  476           * with free capacity greater than zfs_mg_noalloc_threshold.
 443  477           */
 444  478          return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
 445  479              mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
 446  480  }
 447  481  
 448  482  /*
 449  483   * ==========================================================================
 450      - * Common allocator routines
      484 + * Range tree callbacks
 451  485   * ==========================================================================
 452  486   */
      487 +
      488 +/*
      489 + * Comparison function for the private size-ordered tree. Tree is sorted
      490 + * by size, larger sizes at the end of the tree.
      491 + */
 453  492  static int
 454      -metaslab_segsize_compare(const void *x1, const void *x2)
      493 +metaslab_rangesize_compare(const void *x1, const void *x2)
 455  494  {
 456      -        const space_seg_t *s1 = x1;
 457      -        const space_seg_t *s2 = x2;
 458      -        uint64_t ss_size1 = s1->ss_end - s1->ss_start;
 459      -        uint64_t ss_size2 = s2->ss_end - s2->ss_start;
      495 +        const range_seg_t *r1 = x1;
      496 +        const range_seg_t *r2 = x2;
      497 +        uint64_t rs_size1 = r1->rs_end - r1->rs_start;
      498 +        uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 460  499  
 461      -        if (ss_size1 < ss_size2)
      500 +        if (rs_size1 < rs_size2)
 462  501                  return (-1);
 463      -        if (ss_size1 > ss_size2)
      502 +        if (rs_size1 > rs_size2)
 464  503                  return (1);
 465  504  
 466      -        if (s1->ss_start < s2->ss_start)
      505 +        if (r1->rs_start < r2->rs_start)
 467  506                  return (-1);
 468      -        if (s1->ss_start > s2->ss_start)
      507 +
      508 +        if (r1->rs_start > r2->rs_start)
 469  509                  return (1);
 470  510  
 471  511          return (0);
 472  512  }
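The comparator above orders segments by size first and start offset second, so the largest free segment always sits at avl_last() of ms_size_tree; that is what metaslab_block_maxsize() relies on below. A self-contained sketch of the same ordering, using a stand-in seg_t struct rather than the real range_seg_t:

    /*
     * Stand-in struct and comparator; range_seg_t and the kernel AVL
     * plumbing are not pulled in here.
     */
    #include <stdio.h>
    #include <stdint.h>

    typedef struct seg {
            uint64_t rs_start;
            uint64_t rs_end;
    } seg_t;

    static int
    seg_compare(const void *x1, const void *x2)
    {
            const seg_t *r1 = x1;
            const seg_t *r2 = x2;
            uint64_t size1 = r1->rs_end - r1->rs_start;
            uint64_t size2 = r2->rs_end - r2->rs_start;

            if (size1 < size2)
                    return (-1);
            if (size1 > size2)
                    return (1);
            /* equal sizes: fall back to the start offset for uniqueness */
            if (r1->rs_start < r2->rs_start)
                    return (-1);
            if (r1->rs_start > r2->rs_start)
                    return (1);
            return (0);
    }

    int
    main(void)
    {
            seg_t small = { 4096, 4608 };   /* 512-byte segment */
            seg_t large = { 0, 1 << 20 };   /* 1MB segment */

            /* smaller segments sort first, so avl_last() is the largest */
            printf("%d\n", seg_compare(&small, &large));    /* prints -1 */
            return (0);
    }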
 473  513  
 474  514  /*
 475      - * This is a helper function that can be used by the allocator to find
 476      - * a suitable block to allocate. This will search the specified AVL
 477      - * tree looking for a block that matches the specified criteria.
      515 + * Create any block allocator specific components. The current allocators
      516 + * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
 478  517   */
 479      -static uint64_t
 480      -metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
 481      -    uint64_t align)
      518 +static void
      519 +metaslab_rt_create(range_tree_t *rt, void *arg)
 482  520  {
 483      -        space_seg_t *ss, ssearch;
 484      -        avl_index_t where;
      521 +        metaslab_t *msp = arg;
 485  522  
 486      -        ssearch.ss_start = *cursor;
 487      -        ssearch.ss_end = *cursor + size;
      523 +        ASSERT3P(rt->rt_arg, ==, msp);
      524 +        ASSERT(msp->ms_tree == NULL);
 488  525  
 489      -        ss = avl_find(t, &ssearch, &where);
 490      -        if (ss == NULL)
 491      -                ss = avl_nearest(t, where, AVL_AFTER);
 492      -
 493      -        while (ss != NULL) {
 494      -                uint64_t offset = P2ROUNDUP(ss->ss_start, align);
 495      -
 496      -                if (offset + size <= ss->ss_end) {
 497      -                        *cursor = offset + size;
 498      -                        return (offset);
 499      -                }
 500      -                ss = AVL_NEXT(t, ss);
 501      -        }
 502      -
 503      -        /*
 504      -         * If we know we've searched the whole map (*cursor == 0), give up.
 505      -         * Otherwise, reset the cursor to the beginning and try again.
 506      -         */
 507      -        if (*cursor == 0)
 508      -                return (-1ULL);
 509      -
 510      -        *cursor = 0;
 511      -        return (metaslab_block_picker(t, cursor, size, align));
      526 +        avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
      527 +            sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 512  528  }
 513  529  
      530 +/*
      531 + * Destroy the block allocator specific components.
      532 + */
 514  533  static void
 515      -metaslab_pp_load(space_map_t *sm)
      534 +metaslab_rt_destroy(range_tree_t *rt, void *arg)
 516  535  {
 517      -        space_seg_t *ss;
      536 +        metaslab_t *msp = arg;
 518  537  
 519      -        ASSERT(sm->sm_ppd == NULL);
 520      -        sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
      538 +        ASSERT3P(rt->rt_arg, ==, msp);
      539 +        ASSERT3P(msp->ms_tree, ==, rt);
      540 +        ASSERT0(avl_numnodes(&msp->ms_size_tree));
 521  541  
 522      -        sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
 523      -        avl_create(sm->sm_pp_root, metaslab_segsize_compare,
 524      -            sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
 525      -
 526      -        for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
 527      -                avl_add(sm->sm_pp_root, ss);
      542 +        avl_destroy(&msp->ms_size_tree);
 528  543  }
 529  544  
 530  545  static void
 531      -metaslab_pp_unload(space_map_t *sm)
      546 +metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
 532  547  {
 533      -        void *cookie = NULL;
      548 +        metaslab_t *msp = arg;
 534  549  
 535      -        kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
 536      -        sm->sm_ppd = NULL;
 537      -
 538      -        while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
 539      -                /* tear down the tree */
 540      -        }
 541      -
 542      -        avl_destroy(sm->sm_pp_root);
 543      -        kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
 544      -        sm->sm_pp_root = NULL;
      550 +        ASSERT3P(rt->rt_arg, ==, msp);
      551 +        ASSERT3P(msp->ms_tree, ==, rt);
      552 +        VERIFY(!msp->ms_condensing);
      553 +        avl_add(&msp->ms_size_tree, rs);
 545  554  }
 546  555  
 547      -/* ARGSUSED */
 548  556  static void
 549      -metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
      557 +metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
 550  558  {
 551      -        /* No need to update cursor */
      559 +        metaslab_t *msp = arg;
      560 +
      561 +        ASSERT3P(rt->rt_arg, ==, msp);
      562 +        ASSERT3P(msp->ms_tree, ==, rt);
      563 +        VERIFY(!msp->ms_condensing);
      564 +        avl_remove(&msp->ms_size_tree, rs);
 552  565  }
 553  566  
 554      -/* ARGSUSED */
 555  567  static void
 556      -metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
      568 +metaslab_rt_vacate(range_tree_t *rt, void *arg)
 557  569  {
 558      -        /* No need to update cursor */
      570 +        metaslab_t *msp = arg;
      571 +
      572 +        ASSERT3P(rt->rt_arg, ==, msp);
      573 +        ASSERT3P(msp->ms_tree, ==, rt);
      574 +
      575 +        /*
      576 +         * Normally one would walk the tree freeing nodes along the way.
      577 +         * Since the nodes are shared with the range trees we can avoid
      578 +         * walking all nodes and just reinitialize the avl tree. The nodes
      579 +         * will be freed by the range tree, so we don't want to free them here.
      580 +         */
      581 +        avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
      582 +            sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 559  583  }
 560  584  
      585 +static range_tree_ops_t metaslab_rt_ops = {
      586 +        metaslab_rt_create,
      587 +        metaslab_rt_destroy,
      588 +        metaslab_rt_add,
      589 +        metaslab_rt_remove,
      590 +        metaslab_rt_vacate
      591 +};
      592 +
 561  593  /*
      594 + * ==========================================================================
      595 + * Metaslab block operations
      596 + * ==========================================================================
      597 + */
      598 +
      599 +/*
 562  600   * Return the maximum contiguous segment within the metaslab.
 563  601   */
 564  602  uint64_t
 565      -metaslab_pp_maxsize(space_map_t *sm)
      603 +metaslab_block_maxsize(metaslab_t *msp)
 566  604  {
 567      -        avl_tree_t *t = sm->sm_pp_root;
 568      -        space_seg_t *ss;
      605 +        avl_tree_t *t = &msp->ms_size_tree;
      606 +        range_seg_t *rs;
 569  607  
 570      -        if (t == NULL || (ss = avl_last(t)) == NULL)
      608 +        if (t == NULL || (rs = avl_last(t)) == NULL)
 571  609                  return (0ULL);
 572  610  
 573      -        return (ss->ss_end - ss->ss_start);
      611 +        return (rs->rs_end - rs->rs_start);
 574  612  }
 575  613  
      614 +uint64_t
      615 +metaslab_block_alloc(metaslab_t *msp, uint64_t size)
      616 +{
      617 +        uint64_t start;
      618 +        range_tree_t *rt = msp->ms_tree;
      619 +
      620 +        VERIFY(!msp->ms_condensing);
      621 +
      622 +        start = msp->ms_ops->msop_alloc(msp, size);
      623 +        if (start != -1ULL) {
      624 +                vdev_t *vd = msp->ms_group->mg_vd;
      625 +
      626 +                VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
      627 +                VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
      628 +                VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
      629 +                range_tree_remove(rt, start, size);
      630 +        }
      631 +        return (start);
      632 +}
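metaslab_block_alloc() verifies that both the returned offset and the size are whole multiples of the vdev's sector size (1 << vdev_ashift). The demo below shows what those P2PHASE()/P2ROUNDUP() checks compute; the macros are redefined locally only so the example stands alone (the kernel versions come from <sys/sysmacros.h>), and the 513-byte offset is an arbitrary example value.

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    #define P2PHASE(x, align)       ((x) & ((align) - 1))
    #define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

    int
    main(void)
    {
            uint64_t ashift = 9;                    /* 512-byte sectors */
            uint64_t sector = 1ULL << ashift;
            uint64_t start = 513, size = 4096;

            /* 513 is not sector aligned, so its phase is non-zero */
            printf("phase(start)   = %" PRIu64 "\n", P2PHASE(start, sector));
            /* 4096 is a whole number of sectors, so its phase is zero */
            printf("phase(size)    = %" PRIu64 "\n", P2PHASE(size, sector));
            /* the block picker rounds candidate offsets up to the alignment */
            printf("roundup(start) = %" PRIu64 "\n", P2ROUNDUP(start, sector));
            return (0);
    }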
      633 +
 576  634  /*
 577  635   * ==========================================================================
      636 + * Common allocator routines
      637 + * ==========================================================================
      638 + */
      639 +
      640 +/*
      641 + * This is a helper function that can be used by the allocator to find
      642 + * a suitable block to allocate. This will search the specified AVL
      643 + * tree looking for a block that matches the specified criteria.
      644 + */
      645 +static uint64_t
      646 +metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
      647 +    uint64_t align)
      648 +{
      649 +        range_seg_t *rs, rsearch;
      650 +        avl_index_t where;
      651 +
      652 +        rsearch.rs_start = *cursor;
      653 +        rsearch.rs_end = *cursor + size;
      654 +
      655 +        rs = avl_find(t, &rsearch, &where);
      656 +        if (rs == NULL)
      657 +                rs = avl_nearest(t, where, AVL_AFTER);
      658 +
      659 +        while (rs != NULL) {
      660 +                uint64_t offset = P2ROUNDUP(rs->rs_start, align);
      661 +
      662 +                if (offset + size <= rs->rs_end) {
      663 +                        *cursor = offset + size;
      664 +                        return (offset);
      665 +                }
      666 +                rs = AVL_NEXT(t, rs);
      667 +        }
      668 +
      669 +        /*
      670 +         * If we know we've searched the whole map (*cursor == 0), give up.
      671 +         * Otherwise, reset the cursor to the beginning and try again.
      672 +         */
      673 +        if (*cursor == 0)
      674 +                return (-1ULL);
      675 +
      676 +        *cursor = 0;
      677 +        return (metaslab_block_picker(t, cursor, size, align));
      678 +}
      679 +
      680 +/*
      681 + * ==========================================================================
 578  682   * The first-fit block allocator
 579  683   * ==========================================================================
 580  684   */
 581  685  static uint64_t
 582      -metaslab_ff_alloc(space_map_t *sm, uint64_t size)
      686 +metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
 583  687  {
 584      -        avl_tree_t *t = &sm->sm_root;
      688 +        /*
      689 +         * Find the largest power of 2 block size that evenly divides the
      690 +         * requested size. This is used to try to allocate blocks with similar
      691 +         * alignment from the same area of the metaslab (i.e. same cursor
       692 + * bucket) but it does not guarantee that other allocation sizes
      693 +         * may exist in the same region.
      694 +         */
 585  695          uint64_t align = size & -size;
 586      -        uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
      696 +        uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
      697 +        avl_tree_t *t = &msp->ms_tree->rt_root;
 587  698  
 588  699          return (metaslab_block_picker(t, cursor, size, align));
 589  700  }
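The cursor-bucket selection above hinges on size & -size, which isolates the lowest set bit of the request and hence the largest power of two that evenly divides it; each such alignment class gets its own cursor in ms_lbas[]. A quick user-land demo, assuming GCC/Clang for __builtin_ctzll(), which stands in for the kernel's highbit(align) - 1:

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    int
    main(void)
    {
            uint64_t sizes[] = { 512, 4096, 12288, 131072 };

            for (int i = 0; i < 4; i++) {
                    uint64_t size = sizes[i];
                    uint64_t align = size & -size;  /* lowest set bit */
                    int bucket = __builtin_ctzll(align);

                    printf("size %6" PRIu64 " -> align %6" PRIu64
                        " (cursor bucket %d)\n", size, align, bucket);
            }
            return (0);
    }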
 590  701  
 591  702  /* ARGSUSED */
 592      -boolean_t
 593      -metaslab_ff_fragmented(space_map_t *sm)
      703 +static boolean_t
      704 +metaslab_ff_fragmented(metaslab_t *msp)
 594  705  {
 595  706          return (B_TRUE);
 596  707  }
 597  708  
 598      -static space_map_ops_t metaslab_ff_ops = {
 599      -        metaslab_pp_load,
 600      -        metaslab_pp_unload,
      709 +static metaslab_ops_t metaslab_ff_ops = {
 601  710          metaslab_ff_alloc,
 602      -        metaslab_pp_claim,
 603      -        metaslab_pp_free,
 604      -        metaslab_pp_maxsize,
 605  711          metaslab_ff_fragmented
 606  712  };
 607  713  
 608  714  /*
 609  715   * ==========================================================================
 610  716   * Dynamic block allocator -
 611  717   * Uses the first fit allocation scheme until space get low and then
 612  718   * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 613  719   * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 614  720   * ==========================================================================
 615  721   */
 616  722  static uint64_t
 617      -metaslab_df_alloc(space_map_t *sm, uint64_t size)
      723 +metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 618  724  {
 619      -        avl_tree_t *t = &sm->sm_root;
      725 +        /*
      726 +         * Find the largest power of 2 block size that evenly divides the
      727 +         * requested size. This is used to try to allocate blocks with similar
      728 +         * alignment from the same area of the metaslab (i.e. same cursor
       729 + * bucket) but it does not guarantee that other allocation sizes
      730 +         * may exist in the same region.
      731 +         */
 620  732          uint64_t align = size & -size;
 621      -        uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
 622      -        uint64_t max_size = metaslab_pp_maxsize(sm);
 623      -        int free_pct = sm->sm_space * 100 / sm->sm_size;
      733 +        uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
      734 +        range_tree_t *rt = msp->ms_tree;
      735 +        avl_tree_t *t = &rt->rt_root;
      736 +        uint64_t max_size = metaslab_block_maxsize(msp);
      737 +        int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 624  738  
 625      -        ASSERT(MUTEX_HELD(sm->sm_lock));
 626      -        ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
      739 +        ASSERT(MUTEX_HELD(&msp->ms_lock));
      740 +        ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 627  741  
 628  742          if (max_size < size)
 629  743                  return (-1ULL);
 630  744  
 631  745          /*
 632  746           * If we're running low on space switch to using the size
 633  747           * sorted AVL tree (best-fit).
 634  748           */
 635  749          if (max_size < metaslab_df_alloc_threshold ||
 636  750              free_pct < metaslab_df_free_pct) {
 637      -                t = sm->sm_pp_root;
      751 +                t = &msp->ms_size_tree;
 638  752                  *cursor = 0;
 639  753          }
 640  754  
 641  755          return (metaslab_block_picker(t, cursor, size, 1ULL));
 642  756  }
 643  757  
 644  758  static boolean_t
 645      -metaslab_df_fragmented(space_map_t *sm)
      759 +metaslab_df_fragmented(metaslab_t *msp)
 646  760  {
 647      -        uint64_t max_size = metaslab_pp_maxsize(sm);
 648      -        int free_pct = sm->sm_space * 100 / sm->sm_size;
      761 +        range_tree_t *rt = msp->ms_tree;
      762 +        uint64_t max_size = metaslab_block_maxsize(msp);
      763 +        int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 649  764  
 650  765          if (max_size >= metaslab_df_alloc_threshold &&
 651  766              free_pct >= metaslab_df_free_pct)
 652  767                  return (B_FALSE);
 653  768  
 654  769          return (B_TRUE);
 655  770  }
 656  771  
 657      -static space_map_ops_t metaslab_df_ops = {
 658      -        metaslab_pp_load,
 659      -        metaslab_pp_unload,
      772 +static metaslab_ops_t metaslab_df_ops = {
 660  773          metaslab_df_alloc,
 661      -        metaslab_pp_claim,
 662      -        metaslab_pp_free,
 663      -        metaslab_pp_maxsize,
 664  774          metaslab_df_fragmented
 665  775  };
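A rough numeric example of the fallback condition in metaslab_df_alloc(): once the free percentage drops below metaslab_df_free_pct (or the largest segment falls below metaslab_df_alloc_threshold, which this sketch does not model), allocation switches from the offset-ordered tree to the size-ordered tree. The 16GB/512MB figures are invented for illustration.

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            uint64_t ms_size = 1ULL << 34;          /* 16GB metaslab */
            uint64_t free_space = 1ULL << 29;       /* 512MB still free */
            int metaslab_df_free_pct = 4;

            int free_pct = (int)(free_space * 100 / ms_size);

            /* 512MB of 16GB is ~3% free, so first-fit gives way to best-fit */
            printf("free_pct = %d -> %s\n", free_pct,
                free_pct < metaslab_df_free_pct ? "best-fit" : "first-fit");
            return (0);
    }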
 666  776  
 667  777  /*
 668  778   * ==========================================================================
 669      - * Other experimental allocators
      779 + * Cursor fit block allocator -
      780 + * Select the largest region in the metaslab, set the cursor to the beginning
      781 + * of the range and the cursor_end to the end of the range. As allocations
      782 + * are made advance the cursor. Continue allocating from the cursor until
      783 + * the range is exhausted and then find a new range.
 670  784   * ==========================================================================
 671  785   */
 672  786  static uint64_t
 673      -metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
      787 +metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 674  788  {
 675      -        avl_tree_t *t = &sm->sm_root;
 676      -        uint64_t *cursor = (uint64_t *)sm->sm_ppd;
 677      -        uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
 678      -        uint64_t max_size = metaslab_pp_maxsize(sm);
 679      -        uint64_t rsize = size;
      789 +        range_tree_t *rt = msp->ms_tree;
      790 +        avl_tree_t *t = &msp->ms_size_tree;
      791 +        uint64_t *cursor = &msp->ms_lbas[0];
      792 +        uint64_t *cursor_end = &msp->ms_lbas[1];
 680  793          uint64_t offset = 0;
 681  794  
 682      -        ASSERT(MUTEX_HELD(sm->sm_lock));
 683      -        ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
      795 +        ASSERT(MUTEX_HELD(&msp->ms_lock));
      796 +        ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
 684  797  
 685      -        if (max_size < size)
 686      -                return (-1ULL);
      798 +        ASSERT3U(*cursor_end, >=, *cursor);
 687  799  
 688      -        ASSERT3U(*extent_end, >=, *cursor);
      800 +        if ((*cursor + size) > *cursor_end) {
      801 +                range_seg_t *rs;
 689  802  
 690      -        /*
 691      -         * If we're running low on space switch to using the size
 692      -         * sorted AVL tree (best-fit).
 693      -         */
 694      -        if ((*cursor + size) > *extent_end) {
      803 +                rs = avl_last(&msp->ms_size_tree);
      804 +                if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
      805 +                        return (-1ULL);
 695  806  
 696      -                t = sm->sm_pp_root;
 697      -                *cursor = *extent_end = 0;
 698      -
 699      -                if (max_size > 2 * SPA_MAXBLOCKSIZE)
 700      -                        rsize = MIN(metaslab_min_alloc_size, max_size);
 701      -                offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
 702      -                if (offset != -1)
 703      -                        *cursor = offset + size;
 704      -        } else {
 705      -                offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
      807 +                *cursor = rs->rs_start;
      808 +                *cursor_end = rs->rs_end;
 706  809          }
 707      -        ASSERT3U(*cursor, <=, *extent_end);
      810 +
      811 +        offset = *cursor;
      812 +        *cursor += size;
      813 +
 708  814          return (offset);
 709  815  }
 710  816  
 711  817  static boolean_t
 712      -metaslab_cdf_fragmented(space_map_t *sm)
      818 +metaslab_cf_fragmented(metaslab_t *msp)
 713  819  {
 714      -        uint64_t max_size = metaslab_pp_maxsize(sm);
 715      -
 716      -        if (max_size > (metaslab_min_alloc_size * 10))
 717      -                return (B_FALSE);
 718      -        return (B_TRUE);
      820 +        return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
 719  821  }
 720  822  
 721      -static space_map_ops_t metaslab_cdf_ops = {
 722      -        metaslab_pp_load,
 723      -        metaslab_pp_unload,
 724      -        metaslab_cdf_alloc,
 725      -        metaslab_pp_claim,
 726      -        metaslab_pp_free,
 727      -        metaslab_pp_maxsize,
 728      -        metaslab_cdf_fragmented
      823 +static metaslab_ops_t metaslab_cf_ops = {
      824 +        metaslab_cf_alloc,
      825 +        metaslab_cf_fragmented
 729  826  };
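A toy model of the cursor-fit scheme: allocations are carved out of the currently selected range by advancing a cursor, and a new range is chosen only when the current one can no longer satisfy the request. The seg_start/seg_end arrays and cf_alloc() helper are invented for this sketch; the real allocator re-selects the largest remaining segment from the size-ordered tree (avl_last()) each time, and segments shrink as space is removed from the range tree.

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    static uint64_t cursor, cursor_end;

    /* two made-up free segments: 192K at offset 0, then 32K at 1MB */
    static uint64_t seg_start[] = { 0, 1 << 20 };
    static uint64_t seg_end[] = { 3 << 16, (1 << 20) + (1 << 15) };
    static int seg_next;

    static uint64_t
    cf_alloc(uint64_t size)
    {
            if (cursor + size > cursor_end) {
                    /* current range exhausted; move on, if the next one fits */
                    if (seg_next == 2 ||
                        seg_end[seg_next] - seg_start[seg_next] < size)
                            return ((uint64_t)-1);
                    cursor = seg_start[seg_next];
                    cursor_end = seg_end[seg_next];
                    seg_next++;
            }
            cursor += size;
            return (cursor - size);
    }

    int
    main(void)
    {
            for (int i = 0; i < 5; i++)
                    printf("alloc 64K -> 0x%" PRIx64 "\n", cf_alloc(64 * 1024));
            return (0);
    }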
 730  827  
      828 +/*
      829 + * ==========================================================================
      830 + * New dynamic fit allocator -
      831 + * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
      832 + * contiguous blocks. If no region is found then just use the largest segment
      833 + * that remains.
      834 + * ==========================================================================
      835 + */
      836 +
      837 +/*
      838 + * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
      839 + * to request from the allocator.
      840 + */
 731  841  uint64_t metaslab_ndf_clump_shift = 4;
 732  842  
 733  843  static uint64_t
 734      -metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
      844 +metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 735  845  {
 736      -        avl_tree_t *t = &sm->sm_root;
      846 +        avl_tree_t *t = &msp->ms_tree->rt_root;
 737  847          avl_index_t where;
 738      -        space_seg_t *ss, ssearch;
      848 +        range_seg_t *rs, rsearch;
 739  849          uint64_t hbit = highbit(size);
 740      -        uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
 741      -        uint64_t max_size = metaslab_pp_maxsize(sm);
      850 +        uint64_t *cursor = &msp->ms_lbas[hbit - 1];
      851 +        uint64_t max_size = metaslab_block_maxsize(msp);
 742  852  
 743      -        ASSERT(MUTEX_HELD(sm->sm_lock));
 744      -        ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
      853 +        ASSERT(MUTEX_HELD(&msp->ms_lock));
      854 +        ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 745  855  
 746  856          if (max_size < size)
 747  857                  return (-1ULL);
 748  858  
 749      -        ssearch.ss_start = *cursor;
 750      -        ssearch.ss_end = *cursor + size;
      859 +        rsearch.rs_start = *cursor;
      860 +        rsearch.rs_end = *cursor + size;
 751  861  
 752      -        ss = avl_find(t, &ssearch, &where);
 753      -        if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
 754      -                t = sm->sm_pp_root;
      862 +        rs = avl_find(t, &rsearch, &where);
      863 +        if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
      864 +                t = &msp->ms_size_tree;
 755  865  
 756      -                ssearch.ss_start = 0;
 757      -                ssearch.ss_end = MIN(max_size,
      866 +                rsearch.rs_start = 0;
      867 +                rsearch.rs_end = MIN(max_size,
 758  868                      1ULL << (hbit + metaslab_ndf_clump_shift));
 759      -                ss = avl_find(t, &ssearch, &where);
 760      -                if (ss == NULL)
 761      -                        ss = avl_nearest(t, where, AVL_AFTER);
 762      -                ASSERT(ss != NULL);
      869 +                rs = avl_find(t, &rsearch, &where);
      870 +                if (rs == NULL)
      871 +                        rs = avl_nearest(t, where, AVL_AFTER);
      872 +                ASSERT(rs != NULL);
 763  873          }
 764  874  
 765      -        if (ss != NULL) {
 766      -                if (ss->ss_start + size <= ss->ss_end) {
 767      -                        *cursor = ss->ss_start + size;
 768      -                        return (ss->ss_start);
 769      -                }
      875 +        if ((rs->rs_end - rs->rs_start) >= size) {
      876 +                *cursor = rs->rs_start + size;
      877 +                return (rs->rs_start);
 770  878          }
 771  879          return (-1ULL);
 772  880  }
 773  881  
 774  882  static boolean_t
 775      -metaslab_ndf_fragmented(space_map_t *sm)
      883 +metaslab_ndf_fragmented(metaslab_t *msp)
 776  884  {
 777      -        uint64_t max_size = metaslab_pp_maxsize(sm);
 778      -
 779      -        if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
 780      -                return (B_FALSE);
 781      -        return (B_TRUE);
      885 +        return (metaslab_block_maxsize(msp) <=
      886 +            (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
 782  887  }
 783  888  
 784      -
 785      -static space_map_ops_t metaslab_ndf_ops = {
 786      -        metaslab_pp_load,
 787      -        metaslab_pp_unload,
      889 +static metaslab_ops_t metaslab_ndf_ops = {
 788  890          metaslab_ndf_alloc,
 789      -        metaslab_pp_claim,
 790      -        metaslab_pp_free,
 791      -        metaslab_pp_maxsize,
 792  891          metaslab_ndf_fragmented
 793  892  };
 794  893  
 795      -space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
      894 +metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 796  895  
 797  896  /*
 798  897   * ==========================================================================
 799  898   * Metaslabs
 800  899   * ==========================================================================
 801  900   */
      901 +
      902 +/*
      903 + * Wait for any in-progress metaslab loads to complete.
      904 + */
      905 +void
      906 +metaslab_load_wait(metaslab_t *msp)
      907 +{
      908 +        ASSERT(MUTEX_HELD(&msp->ms_lock));
      909 +
      910 +        while (msp->ms_loading) {
      911 +                ASSERT(!msp->ms_loaded);
      912 +                cv_wait(&msp->ms_load_cv, &msp->ms_lock);
      913 +        }
      914 +}
      915 +
      916 +int
      917 +metaslab_load(metaslab_t *msp)
      918 +{
      919 +        int error = 0;
      920 +
      921 +        ASSERT(MUTEX_HELD(&msp->ms_lock));
      922 +        ASSERT(!msp->ms_loaded);
      923 +        ASSERT(!msp->ms_loading);
      924 +
      925 +        msp->ms_loading = B_TRUE;
      926 +
      927 +        /*
      928 +         * If the space map has not been allocated yet, then treat
      929 +         * all the space in the metaslab as free and add it to the
      930 +         * ms_tree.
      931 +         */
      932 +        if (msp->ms_sm != NULL)
      933 +                error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
      934 +        else
      935 +                range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
      936 +
      937 +        msp->ms_loaded = (error == 0);
      938 +        msp->ms_loading = B_FALSE;
      939 +
      940 +        if (msp->ms_loaded) {
      941 +                for (int t = 0; t < TXG_DEFER_SIZE; t++) {
      942 +                        range_tree_walk(msp->ms_defertree[t],
      943 +                            range_tree_remove, msp->ms_tree);
      944 +                }
      945 +        }
      946 +        cv_broadcast(&msp->ms_load_cv);
      947 +        return (error);
      948 +}
      949 +
      950 +void
      951 +metaslab_unload(metaslab_t *msp)
      952 +{
      953 +        ASSERT(MUTEX_HELD(&msp->ms_lock));
      954 +        range_tree_vacate(msp->ms_tree, NULL, NULL);
      955 +        msp->ms_loaded = B_FALSE;
      956 +        msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
      957 +}
      958 +
 802  959  metaslab_t *
 803      -metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
 804      -        uint64_t start, uint64_t size, uint64_t txg)
      960 +metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
 805  961  {
 806  962          vdev_t *vd = mg->mg_vd;
      963 +        objset_t *mos = vd->vdev_spa->spa_meta_objset;
 807  964          metaslab_t *msp;
 808  965  
 809  966          msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
 810  967          mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
      968 +        cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
      969 +        msp->ms_id = id;
      970 +        msp->ms_start = id << vd->vdev_ms_shift;
      971 +        msp->ms_size = 1ULL << vd->vdev_ms_shift;
 811  972  
 812      -        msp->ms_smo_syncing = *smo;
      973 +        /*
      974 +         * We only open space map objects that already exist. All others
      975 +         * will be opened when we finally allocate an object for it.
      976 +         */
      977 +        if (object != 0) {
      978 +                VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
      979 +                    msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
      980 +                ASSERT(msp->ms_sm != NULL);
      981 +        }
 813  982  
 814  983          /*
 815      -         * We create the main space map here, but we don't create the
 816      -         * allocmaps and freemaps until metaslab_sync_done().  This serves
      984 +         * We create the main range tree here, but we don't create the
      985 +         * alloctree and freetree until metaslab_sync_done().  This serves
 817  986           * two purposes: it allows metaslab_sync_done() to detect the
 818  987           * addition of new space; and for debugging, it ensures that we'd
 819  988           * data fault on any attempt to use this metaslab before it's ready.
 820  989           */
 821      -        msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
 822      -        space_map_create(msp->ms_map, start, size,
 823      -            vd->vdev_ashift, &msp->ms_lock);
 824      -
      990 +        msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
 825  991          metaslab_group_add(mg, msp);
 826  992  
 827      -        if (metaslab_debug && smo->smo_object != 0) {
 828      -                mutex_enter(&msp->ms_lock);
 829      -                VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops,
 830      -                    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
 831      -                mutex_exit(&msp->ms_lock);
 832      -        }
      993 +        msp->ms_ops = mg->mg_class->mc_ops;
 833  994  
 834  995          /*
 835  996           * If we're opening an existing pool (txg == 0) or creating
 836  997           * a new one (txg == TXG_INITIAL), all space is available now.
 837  998           * If we're adding space to an existing pool, the new space
 838  999           * does not become available until after this txg has synced.
 839 1000           */
 840 1001          if (txg <= TXG_INITIAL)
 841 1002                  metaslab_sync_done(msp, 0);
 842 1003  
     1004 +        /*
     1005 +         * If metaslab_debug_load is set and we're initializing a metaslab
      1006 +         * that has an allocated space_map object then load its space
      1007 +         * map so that we can verify frees.
     1008 +         */
     1009 +        if (metaslab_debug_load && msp->ms_sm != NULL) {
     1010 +                mutex_enter(&msp->ms_lock);
     1011 +                VERIFY0(metaslab_load(msp));
     1012 +                mutex_exit(&msp->ms_lock);
     1013 +        }
     1014 +
 843 1015          if (txg != 0) {
 844 1016                  vdev_dirty(vd, 0, NULL, txg);
 845 1017                  vdev_dirty(vd, VDD_METASLAB, msp, txg);
 846 1018          }
 847 1019  
 848 1020          return (msp);
 849 1021  }
 850 1022  
 851 1023  void
 852 1024  metaslab_fini(metaslab_t *msp)
 853 1025  {
 854 1026          metaslab_group_t *mg = msp->ms_group;
 855 1027  
 856      -        vdev_space_update(mg->mg_vd,
 857      -            -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size);
 858      -
 859 1028          metaslab_group_remove(mg, msp);
 860 1029  
 861 1030          mutex_enter(&msp->ms_lock);
 862 1031  
 863      -        space_map_unload(msp->ms_map);
 864      -        space_map_destroy(msp->ms_map);
 865      -        kmem_free(msp->ms_map, sizeof (*msp->ms_map));
     1032 +        VERIFY(msp->ms_group == NULL);
     1033 +        vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
     1034 +            0, -msp->ms_size);
     1035 +        space_map_close(msp->ms_sm);
 866 1036  
     1037 +        metaslab_unload(msp);
     1038 +        range_tree_destroy(msp->ms_tree);
     1039 +
 867 1040          for (int t = 0; t < TXG_SIZE; t++) {
 868      -                space_map_destroy(msp->ms_allocmap[t]);
 869      -                space_map_destroy(msp->ms_freemap[t]);
 870      -                kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t]));
 871      -                kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t]));
     1041 +                range_tree_destroy(msp->ms_alloctree[t]);
     1042 +                range_tree_destroy(msp->ms_freetree[t]);
 872 1043          }
 873 1044  
 874 1045          for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 875      -                space_map_destroy(msp->ms_defermap[t]);
 876      -                kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t]));
     1046 +                range_tree_destroy(msp->ms_defertree[t]);
 877 1047          }
 878 1048  
 879 1049          ASSERT0(msp->ms_deferspace);
 880 1050  
 881 1051          mutex_exit(&msp->ms_lock);
     1052 +        cv_destroy(&msp->ms_load_cv);
 882 1053          mutex_destroy(&msp->ms_lock);
 883 1054  
 884 1055          kmem_free(msp, sizeof (metaslab_t));
 885 1056  }
 886 1057  
 887      -#define METASLAB_WEIGHT_PRIMARY         (1ULL << 63)
 888      -#define METASLAB_WEIGHT_SECONDARY       (1ULL << 62)
 889      -#define METASLAB_ACTIVE_MASK            \
 890      -        (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
     1058 +/*
     1059 + * Apply a weighting factor based on the histogram information for this
     1060 + * metaslab. The current weighting factor is somewhat arbitrary and requires
     1061 + * additional investigation. The implementation provides a measure of
     1062 + * "weighted" free space and gives a higher weighting for larger contiguous
     1063 + * regions. The weighting factor is determined by counting the number of
     1064 + * sm_shift sectors that exist in each region represented by the histogram.
     1065 + * That value is then multiplied by the power of 2 exponent and the sm_shift
     1066 + * value.
     1067 + *
     1068 + * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
     1069 + * metaslab has an sm_shift value of 9 (512B):
     1070 + *
     1071 + * 1) calculate the number of sm_shift sectors in the region:
     1072 + *      2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
     1073 + * 2) multiply by the power of 2 exponent and the sm_shift value:
     1074 + *      16384 * 21 * 9 = 3096576
     1075 + * This value will be added to the weighting of the metaslab.
     1076 + */
     1077 +static uint64_t
     1078 +metaslab_weight_factor(metaslab_t *msp)
     1079 +{
     1080 +        uint64_t factor = 0;
     1081 +        uint64_t sectors;
     1082 +        int i;
 891 1083  
     1084 +        /*
      1085 +         * A null space map means that the entire metaslab is free, so
     1086 +         * calculate a weight factor that spans the entire size of the
     1087 +         * metaslab.
     1088 +         */
     1089 +        if (msp->ms_sm == NULL) {
     1090 +                vdev_t *vd = msp->ms_group->mg_vd;
     1091 +
     1092 +                i = highbit(msp->ms_size) - 1;
     1093 +                sectors = msp->ms_size >> vd->vdev_ashift;
     1094 +                return (sectors * i * vd->vdev_ashift);
     1095 +        }
     1096 +
     1097 +        if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
     1098 +                return (0);
     1099 +
     1100 +        for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
     1101 +                if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
     1102 +                        continue;
     1103 +
     1104 +                /*
     1105 +                 * Determine the number of sm_shift sectors in the region
     1106 +                 * indicated by the histogram. For example, given an
     1107 +                 * sm_shift value of 9 (512 bytes) and i = 4 then we know
     1108 +                 * that we're looking at an 8K region in the histogram
     1109 +                 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
     1110 +                 * number of sm_shift sectors (512 bytes in this example),
     1111 +                 * we would take 8192 / 512 = 16. Since the histogram
      1112 +                 * is offset by sm_shift, we can simply use the value
      1113 +                 * of i to calculate this (i.e. 2^i = 16 where i = 4).
     1114 +                 */
     1115 +                sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
     1116 +                factor += (i + msp->ms_sm->sm_shift) * sectors;
     1117 +        }
     1118 +        return (factor * msp->ms_sm->sm_shift);
     1119 +}
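The arithmetic in the block comment above can be checked in isolation; the short program below reproduces the 2^21-bucket example (four 2MB regions, sm_shift of 9) and prints the expected 3096576. It is a standalone check, not code from this change.

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    int
    main(void)
    {
            uint64_t sm_shift = 9;          /* 512-byte sectors */
            uint64_t i = 21 - sm_shift;     /* histogram bucket holding 2MB regions */
            uint64_t nregions = 4;

            uint64_t sectors = nregions << i;               /* 4 * 4096 = 16384 */
            uint64_t factor = (i + sm_shift) * sectors;     /* 21 * 16384 */

            printf("%" PRIu64 "\n", factor * sm_shift);     /* prints 3096576 */
            return (0);
    }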
     1120 +
 892 1121  static uint64_t
 893 1122  metaslab_weight(metaslab_t *msp)
 894 1123  {
 895 1124          metaslab_group_t *mg = msp->ms_group;
 896      -        space_map_t *sm = msp->ms_map;
 897      -        space_map_obj_t *smo = &msp->ms_smo;
 898 1125          vdev_t *vd = mg->mg_vd;
 899 1126          uint64_t weight, space;
 900 1127  
 901 1128          ASSERT(MUTEX_HELD(&msp->ms_lock));
 902 1129  
 903 1130          /*
 904 1131           * This vdev is in the process of being removed so there is nothing
 905 1132           * for us to do here.
 906 1133           */
 907 1134          if (vd->vdev_removing) {
 908      -                ASSERT0(smo->smo_alloc);
     1135 +                ASSERT0(space_map_allocated(msp->ms_sm));
 909 1136                  ASSERT0(vd->vdev_ms_shift);
 910 1137                  return (0);
 911 1138          }
 912 1139  
 913 1140          /*
 914 1141           * The baseline weight is the metaslab's free space.
 915 1142           */
 916      -        space = sm->sm_size - smo->smo_alloc;
     1143 +        space = msp->ms_size - space_map_allocated(msp->ms_sm);
 917 1144          weight = space;
 918 1145  
 919 1146          /*
 920 1147           * Modern disks have uniform bit density and constant angular velocity.
 921 1148           * Therefore, the outer recording zones are faster (higher bandwidth)
 922 1149           * than the inner zones by the ratio of outer to inner track diameter,
 923 1150           * which is typically around 2:1.  We account for this by assigning
 924 1151           * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
 925 1152           * In effect, this means that we'll select the metaslab with the most
 926 1153           * free bandwidth rather than simply the one with the most free space.
 927 1154           */
 928      -        weight = 2 * weight -
 929      -            ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
     1155 +        weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
 930 1156          ASSERT(weight >= space && weight <= 2 * space);
 931 1157  
 932      -        /*
 933      -         * For locality, assign higher weight to metaslabs which have
 934      -         * a lower offset than what we've already activated.
 935      -         */
 936      -        if (sm->sm_start <= mg->mg_bonus_area)
 937      -                weight *= (metaslab_smo_bonus_pct / 100);
 938      -        ASSERT(weight >= space &&
 939      -            weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
     1158 +        msp->ms_factor = metaslab_weight_factor(msp);
     1159 +        if (metaslab_weight_factor_enable)
     1160 +                weight += msp->ms_factor;
 940 1161  
 941      -        if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
     1162 +        if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
 942 1163                  /*
 943 1164                   * If this metaslab is one we're actively using, adjust its
 944 1165                   * weight to make it preferable to any inactive metaslab so
 945 1166                   * we'll polish it off.
 946 1167                   */
 947 1168                  weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
 948 1169          }
     1170 +
 949 1171          return (weight);
 950 1172  }
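To see how much the 2x-to-1x bandwidth adjustment above matters, the sketch below evaluates the same expression, 2 * weight - (ms_id * weight) / ms_count, for the first and last metaslab of a hypothetical vdev with 200 metaslabs and 1GB free in each; the geometry is made up for illustration.

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    int
    main(void)
    {
            uint64_t space = 1ULL << 30;    /* free space per metaslab */
            uint64_t ms_count = 200;        /* metaslabs on this vdev */

            for (uint64_t id = 0; id < ms_count; id += ms_count - 1) {
                    uint64_t weight = 2 * space - (id * space) / ms_count;

                    printf("ms_id %3" PRIu64 " -> weight %" PRIu64 " (%.2fx)\n",
                        id, weight, (double)weight / space);
            }
            return (0);
    }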
 951 1173  
 952      -static void
 953      -metaslab_prefetch(metaslab_group_t *mg)
 954      -{
 955      -        spa_t *spa = mg->mg_vd->vdev_spa;
 956      -        metaslab_t *msp;
 957      -        avl_tree_t *t = &mg->mg_metaslab_tree;
 958      -        int m;
 959      -
 960      -        mutex_enter(&mg->mg_lock);
 961      -
 962      -        /*
 963      -         * Prefetch the next potential metaslabs
 964      -         */
 965      -        for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
 966      -                space_map_t *sm = msp->ms_map;
 967      -                space_map_obj_t *smo = &msp->ms_smo;
 968      -
 969      -                /* If we have reached our prefetch limit then we're done */
 970      -                if (m >= metaslab_prefetch_limit)
 971      -                        break;
 972      -
 973      -                if (!sm->sm_loaded && smo->smo_object != 0) {
 974      -                        mutex_exit(&mg->mg_lock);
 975      -                        dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
 976      -                            0ULL, smo->smo_objsize);
 977      -                        mutex_enter(&mg->mg_lock);
 978      -                }
 979      -        }
 980      -        mutex_exit(&mg->mg_lock);
 981      -}
 982      -
 983 1174  static int
 984 1175  metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 985 1176  {
 986      -        metaslab_group_t *mg = msp->ms_group;
 987      -        space_map_t *sm = msp->ms_map;
 988      -        space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
 989      -
 990 1177          ASSERT(MUTEX_HELD(&msp->ms_lock));
 991 1178  
 992 1179          if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
 993      -                space_map_load_wait(sm);
 994      -                if (!sm->sm_loaded) {
 995      -                        space_map_obj_t *smo = &msp->ms_smo;
 996      -
 997      -                        int error = space_map_load(sm, sm_ops, SM_FREE, smo,
 998      -                            spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
 999      -                        if (error)  {
     1180 +                metaslab_load_wait(msp);
     1181 +                if (!msp->ms_loaded) {
     1182 +                        int error = metaslab_load(msp);
     1183 +                        if (error) {
1000 1184                                  metaslab_group_sort(msp->ms_group, msp, 0);
1001 1185                                  return (error);
1002 1186                          }
1003      -                        for (int t = 0; t < TXG_DEFER_SIZE; t++)
1004      -                                space_map_walk(msp->ms_defermap[t],
1005      -                                    space_map_claim, sm);
1006      -
1007 1187                  }
1008 1188  
1009      -                /*
1010      -                 * Track the bonus area as we activate new metaslabs.
1011      -                 */
1012      -                if (sm->sm_start > mg->mg_bonus_area) {
1013      -                        mutex_enter(&mg->mg_lock);
1014      -                        mg->mg_bonus_area = sm->sm_start;
1015      -                        mutex_exit(&mg->mg_lock);
1016      -                }
1017      -
1018 1189                  metaslab_group_sort(msp->ms_group, msp,
1019 1190                      msp->ms_weight | activation_weight);
1020 1191          }
1021      -        ASSERT(sm->sm_loaded);
     1192 +        ASSERT(msp->ms_loaded);
1022 1193          ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1023 1194  
1024 1195          return (0);
1025 1196  }
1026 1197  
1027 1198  static void
1028 1199  metaslab_passivate(metaslab_t *msp, uint64_t size)
1029 1200  {
1030 1201          /*
1031 1202           * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1032 1203           * this metaslab again.  In that case, it had better be empty,
1033 1204           * or we would be leaving space on the table.
1034 1205           */
1035      -        ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0);
     1206 +        ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1036 1207          metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1037 1208          ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1038 1209  }
1039 1210  
     1211 +static void
     1212 +metaslab_preload(void *arg)
     1213 +{
     1214 +        metaslab_t *msp = arg;
     1215 +        spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
     1216 +
     1217 +        mutex_enter(&msp->ms_lock);
     1218 +        metaslab_load_wait(msp);
     1219 +        if (!msp->ms_loaded)
     1220 +                (void) metaslab_load(msp);
     1221 +
     1222 +        /*
     1223 +         * Set the ms_access_txg value so that we don't unload it right away.
     1224 +         */
     1225 +        msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
     1226 +        mutex_exit(&msp->ms_lock);
     1227 +}
     1228 +
     1229 +static void
     1230 +metaslab_group_preload(metaslab_group_t *mg)
     1231 +{
     1232 +        spa_t *spa = mg->mg_vd->vdev_spa;
     1233 +        metaslab_t *msp;
     1234 +        avl_tree_t *t = &mg->mg_metaslab_tree;
     1235 +        int m = 0;
     1236 +
     1237 +        if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
     1238 +                taskq_wait(mg->mg_taskq);
     1239 +                return;
     1240 +        }
     1241 +        mutex_enter(&mg->mg_lock);
     1242 +
     1243 +        /*
     1244 +         * Prefetch the next potential metaslabs
     1245 +         */
     1246 +        for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
     1247 +
     1248 +                /* If we have reached our preload limit then we're done */
     1249 +                if (++m > metaslab_preload_limit)
     1250 +                        break;
     1251 +
     1252 +                VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
     1253 +                    msp, TQ_SLEEP) != NULL);
     1254 +        }
     1255 +        mutex_exit(&mg->mg_lock);
     1256 +}
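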
     1257 +
1040 1258  /*
1041      - * Determine if the in-core space map representation can be condensed on-disk.
1042      - * We would like to use the following criteria to make our decision:
     1259 + * Determine if the space map's on-disk footprint is past our tolerance
     1260 + * for inefficiency. We would like to use the following criteria to make
     1261 + * our decision:
1043 1262   *
1044 1263   * 1. The size of the space map object should not dramatically increase as a
1045      - * result of writing out our in-core free map.
     1264 + * result of writing out the free space range tree.
1046 1265   *
1047 1266   * 2. The minimal on-disk space map representation is zfs_condense_pct/100
1048      - * times the size than the in-core representation (i.e. zfs_condense_pct = 110
1049      - * and in-core = 1MB, minimal = 1.1.MB).
      1267 + * times the size of the free space range tree representation
      1268 + * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
1050 1269   *
1051 1270   * Checking the first condition is tricky since we don't want to walk
1052 1271   * the entire AVL tree calculating the estimated on-disk size. Instead we
1053      - * use the size-ordered AVL tree in the space map and calculate the
1054      - * size required for the largest segment in our in-core free map. If the
     1272 + * use the size-ordered range tree in the metaslab and calculate the
     1273 + * size required to write out the largest segment in our free tree. If the
1055 1274   * size required to represent that segment on disk is larger than the space
1056 1275   * map object then we avoid condensing this map.
1057 1276   *
1058 1277   * To determine the second criterion we use a best-case estimate and assume
1059 1278   * each segment can be represented on-disk as a single 64-bit entry. We refer
1060 1279   * to this best-case estimate as the space map's minimal form.
1061 1280   */
1062 1281  static boolean_t
1063 1282  metaslab_should_condense(metaslab_t *msp)
1064 1283  {
1065      -        space_map_t *sm = msp->ms_map;
1066      -        space_map_obj_t *smo = &msp->ms_smo_syncing;
1067      -        space_seg_t *ss;
     1284 +        space_map_t *sm = msp->ms_sm;
     1285 +        range_seg_t *rs;
1068 1286          uint64_t size, entries, segsz;
1069 1287  
1070 1288          ASSERT(MUTEX_HELD(&msp->ms_lock));
1071      -        ASSERT(sm->sm_loaded);
     1289 +        ASSERT(msp->ms_loaded);
1072 1290  
1073 1291          /*
1074      -         * Use the sm_pp_root AVL tree, which is ordered by size, to obtain
1075      -         * the largest segment in the in-core free map. If the tree is
1076      -         * empty then we should condense the map.
     1292 +         * Use the ms_size_tree range tree, which is ordered by size, to
     1293 +         * obtain the largest segment in the free tree. If the tree is empty
     1294 +         * then we should condense the map.
1077 1295           */
1078      -        ss = avl_last(sm->sm_pp_root);
1079      -        if (ss == NULL)
     1296 +        rs = avl_last(&msp->ms_size_tree);
     1297 +        if (rs == NULL)
1080 1298                  return (B_TRUE);
1081 1299  
1082 1300          /*
1083 1301           * Calculate the number of 64-bit entries this segment would
1084 1302           * require when written to disk. If this single segment would be
1085 1303           * larger on-disk than the entire current on-disk structure, then
1086 1304           * clearly condensing will increase the on-disk structure size.
1087 1305           */
1088      -        size = (ss->ss_end - ss->ss_start) >> sm->sm_shift;
     1306 +        size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1089 1307          entries = size / (MIN(size, SM_RUN_MAX));
1090 1308          segsz = entries * sizeof (uint64_t);
1091 1309  
1092      -        return (segsz <= smo->smo_objsize &&
1093      -            smo->smo_objsize >= (zfs_condense_pct *
1094      -            sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100);
     1310 +        return (segsz <= space_map_length(msp->ms_sm) &&
     1311 +            space_map_length(msp->ms_sm) >= (zfs_condense_pct *
     1312 +            sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
1095 1313  }
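
Editor's note: to make the two criteria concrete, here is a standalone sketch of the same arithmetic with made-up numbers. SM_RUN_MAX_ASSUMED, the shift, and every input value are assumptions for illustration, not values taken from this change.

        #include <stdio.h>
        #include <stdint.h>

        #define SM_SHIFT_ASSUMED        9       /* assumed sm_shift: 512B units */
        #define SM_RUN_MAX_ASSUMED      32768   /* assumed max run per entry */

        int
        main(void)
        {
                uint64_t largest_seg = 16ULL << 20;     /* assumed largest free seg */
                uint64_t sm_length = 512ULL << 10;      /* assumed on-disk length */
                uint64_t nsegs = 10000;                 /* assumed segs in ms_tree */
                uint64_t zfs_condense_pct = 200;

                /* Criterion 1: the largest segment must not dominate the map. */
                uint64_t size = largest_seg >> SM_SHIFT_ASSUMED;
                uint64_t entries = size / (size < SM_RUN_MAX_ASSUMED ?
                    size : SM_RUN_MAX_ASSUMED);
                uint64_t segsz = entries * sizeof (uint64_t);

                /* Criterion 2: on-disk map is >= pct/100 of its minimal form. */
                uint64_t minimal = nsegs * sizeof (uint64_t);

                printf("largest segment -> %llu bytes of entries\n",
                    (unsigned long long)segsz);
                printf("condense: %s\n", (segsz <= sm_length &&
                    sm_length >= (zfs_condense_pct * minimal) / 100) ?
                    "yes" : "no");
                return (0);
        }
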
1096 1314  
1097 1315  /*
1098 1316   * Condense the on-disk space map representation to its minimized form.
1099 1317   * The minimized form consists of a small number of allocations followed by
1100      - * the in-core free map.
     1318 + * the entries of the free range tree.
1101 1319   */
1102 1320  static void
1103 1321  metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1104 1322  {
1105 1323          spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1106      -        space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK];
1107      -        space_map_t condense_map;
1108      -        space_map_t *sm = msp->ms_map;
1109      -        objset_t *mos = spa_meta_objset(spa);
1110      -        space_map_obj_t *smo = &msp->ms_smo_syncing;
     1324 +        range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
     1325 +        range_tree_t *condense_tree;
     1326 +        space_map_t *sm = msp->ms_sm;
1111 1327  
1112 1328          ASSERT(MUTEX_HELD(&msp->ms_lock));
1113 1329          ASSERT3U(spa_sync_pass(spa), ==, 1);
1114      -        ASSERT(sm->sm_loaded);
     1330 +        ASSERT(msp->ms_loaded);
1115 1331  
1116 1332          spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1117      -            "smo size %llu, segments %lu", txg,
1118      -            (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
1119      -            smo->smo_objsize, avl_numnodes(&sm->sm_root));
     1333 +            "smp size %llu, segments %lu", txg, msp->ms_id, msp,
     1334 +            space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));
1120 1335  
1121 1336          /*
1122      -         * Create an map that is a 100% allocated map. We remove segments
      1337 +         * Create a range tree that is 100% allocated. We remove segments
1123 1338           * that have been freed in this txg, any deferred frees that exist,
1124 1339           * and any allocation in the future. Removing segments should be
1125      -         * a relatively inexpensive operation since we expect these maps to
1126      -         * a small number of nodes.
     1340 +         * a relatively inexpensive operation since we expect these trees to
     1341 +         * have a small number of nodes.
1127 1342           */
1128      -        space_map_create(&condense_map, sm->sm_start, sm->sm_size,
1129      -            sm->sm_shift, sm->sm_lock);
1130      -        space_map_add(&condense_map, condense_map.sm_start,
1131      -            condense_map.sm_size);
     1343 +        condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
     1344 +        range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1132 1345  
1133 1346          /*
1134      -         * Remove what's been freed in this txg from the condense_map.
     1347 +         * Remove what's been freed in this txg from the condense_tree.
1135 1348           * Since we're in sync_pass 1, we know that all the frees from
1136      -         * this txg are in the freemap.
     1349 +         * this txg are in the freetree.
1137 1350           */
1138      -        space_map_walk(freemap, space_map_remove, &condense_map);
     1351 +        range_tree_walk(freetree, range_tree_remove, condense_tree);
1139 1352  
1140      -        for (int t = 0; t < TXG_DEFER_SIZE; t++)
1141      -                space_map_walk(msp->ms_defermap[t],
1142      -                    space_map_remove, &condense_map);
     1353 +        for (int t = 0; t < TXG_DEFER_SIZE; t++) {
     1354 +                range_tree_walk(msp->ms_defertree[t],
     1355 +                    range_tree_remove, condense_tree);
     1356 +        }
1143 1357  
1144      -        for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1145      -                space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK],
1146      -                    space_map_remove, &condense_map);
     1358 +        for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
     1359 +                range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
     1360 +                    range_tree_remove, condense_tree);
     1361 +        }
1147 1362  
1148 1363          /*
1149 1364           * We're about to drop the metaslab's lock thus allowing
 1150 1365           * other consumers to change its content. Set the
1151      -         * space_map's sm_condensing flag to ensure that
     1366 +         * metaslab's ms_condensing flag to ensure that
1152 1367           * allocations on this metaslab do not occur while we're
1153 1368           * in the middle of committing it to disk. This is only critical
1154      -         * for the ms_map as all other space_maps use per txg
     1369 +         * for the ms_tree as all other range trees use per txg
1155 1370           * views of their content.
1156 1371           */
1157      -        sm->sm_condensing = B_TRUE;
     1372 +        msp->ms_condensing = B_TRUE;
1158 1373  
1159 1374          mutex_exit(&msp->ms_lock);
1160      -        space_map_truncate(smo, mos, tx);
     1375 +        space_map_truncate(sm, tx);
1161 1376          mutex_enter(&msp->ms_lock);
1162 1377  
1163 1378          /*
1164 1379           * While we would ideally like to create a space_map representation
1165 1380           * that consists only of allocation records, doing so can be
1166      -         * prohibitively expensive because the in-core free map can be
     1381 +         * prohibitively expensive because the in-core free tree can be
1167 1382           * large, and therefore computationally expensive to subtract
1168      -         * from the condense_map. Instead we sync out two maps, a cheap
1169      -         * allocation only map followed by the in-core free map. While not
     1383 +         * from the condense_tree. Instead we sync out two trees, a cheap
     1384 +         * allocation only tree followed by the in-core free tree. While not
1170 1385           * optimal, this is typically close to optimal, and much cheaper to
1171 1386           * compute.
1172 1387           */
1173      -        space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx);
1174      -        space_map_vacate(&condense_map, NULL, NULL);
1175      -        space_map_destroy(&condense_map);
     1388 +        space_map_write(sm, condense_tree, SM_ALLOC, tx);
     1389 +        range_tree_vacate(condense_tree, NULL, NULL);
     1390 +        range_tree_destroy(condense_tree);
1176 1391  
1177      -        space_map_sync(sm, SM_FREE, smo, mos, tx);
1178      -        sm->sm_condensing = B_FALSE;
1179      -
1180      -        spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, "
1181      -            "smo size %llu", txg,
1182      -            (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
1183      -            smo->smo_objsize);
     1392 +        space_map_write(sm, msp->ms_tree, SM_FREE, tx);
     1393 +        msp->ms_condensing = B_FALSE;
1184 1394  }
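
Editor's note: for a rough sense of the savings, the counts below are invented for illustration. An append-only space map that has accumulated many ALLOC/FREE records can be far larger than the condensed form, which needs roughly one entry per live free segment plus a small allocation-only prefix.

        #include <stdio.h>
        #include <stdint.h>

        int
        main(void)
        {
                /* Assumed numbers, purely for illustration. */
                uint64_t appended_entries = 1000000;    /* records accumulated */
                uint64_t live_free_segs = 20000;        /* segments in ms_tree */
                uint64_t alloc_prefix = 100;            /* condense_tree records */

                uint64_t before = appended_entries * sizeof (uint64_t);
                uint64_t after = (alloc_prefix + live_free_segs) *
                    sizeof (uint64_t);

                printf("on-disk before condense: ~%llu KB\n",
                    (unsigned long long)(before >> 10));
                printf("on-disk after condense:  ~%llu KB\n",
                    (unsigned long long)(after >> 10));
                return (0);
        }
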
1185 1395  
1186 1396  /*
1187 1397   * Write a metaslab to disk in the context of the specified transaction group.
1188 1398   */
1189 1399  void
1190 1400  metaslab_sync(metaslab_t *msp, uint64_t txg)
1191 1401  {
1192      -        vdev_t *vd = msp->ms_group->mg_vd;
     1402 +        metaslab_group_t *mg = msp->ms_group;
     1403 +        vdev_t *vd = mg->mg_vd;
1193 1404          spa_t *spa = vd->vdev_spa;
1194 1405          objset_t *mos = spa_meta_objset(spa);
1195      -        space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK];
1196      -        space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK];
1197      -        space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1198      -        space_map_t *sm = msp->ms_map;
1199      -        space_map_obj_t *smo = &msp->ms_smo_syncing;
1200      -        dmu_buf_t *db;
     1406 +        range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
     1407 +        range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
     1408 +        range_tree_t **freed_tree =
     1409 +            &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1201 1410          dmu_tx_t *tx;
     1411 +        uint64_t object = space_map_object(msp->ms_sm);
1202 1412  
1203 1413          ASSERT(!vd->vdev_ishole);
1204 1414  
1205 1415          /*
1206 1416           * This metaslab has just been added so there's no work to do now.
1207 1417           */
1208      -        if (*freemap == NULL) {
1209      -                ASSERT3P(allocmap, ==, NULL);
     1418 +        if (*freetree == NULL) {
     1419 +                ASSERT3P(alloctree, ==, NULL);
1210 1420                  return;
1211 1421          }
1212 1422  
1213      -        ASSERT3P(allocmap, !=, NULL);
1214      -        ASSERT3P(*freemap, !=, NULL);
1215      -        ASSERT3P(*freed_map, !=, NULL);
     1423 +        ASSERT3P(alloctree, !=, NULL);
     1424 +        ASSERT3P(*freetree, !=, NULL);
     1425 +        ASSERT3P(*freed_tree, !=, NULL);
1216 1426  
1217      -        if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0)
     1427 +        if (range_tree_space(alloctree) == 0 &&
     1428 +            range_tree_space(*freetree) == 0)
1218 1429                  return;
1219 1430  
1220 1431          /*
1221 1432           * The only state that can actually be changing concurrently with
1222      -         * metaslab_sync() is the metaslab's ms_map.  No other thread can
1223      -         * be modifying this txg's allocmap, freemap, freed_map, or smo.
1224      -         * Therefore, we only hold ms_lock to satify space_map ASSERTs.
1225      -         * We drop it whenever we call into the DMU, because the DMU
1226      -         * can call down to us (e.g. via zio_free()) at any time.
     1433 +         * metaslab_sync() is the metaslab's ms_tree.  No other thread can
     1434 +         * be modifying this txg's alloctree, freetree, freed_tree, or
      1435 +         * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
     1436 +         * space_map ASSERTs. We drop it whenever we call into the DMU,
     1437 +         * because the DMU can call down to us (e.g. via zio_free()) at
     1438 +         * any time.
1227 1439           */
1228 1440  
1229 1441          tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1230 1442  
1231      -        if (smo->smo_object == 0) {
1232      -                ASSERT(smo->smo_objsize == 0);
1233      -                ASSERT(smo->smo_alloc == 0);
1234      -                smo->smo_object = dmu_object_alloc(mos,
1235      -                    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
1236      -                    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
1237      -                ASSERT(smo->smo_object != 0);
1238      -                dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1239      -                    (sm->sm_start >> vd->vdev_ms_shift),
1240      -                    sizeof (uint64_t), &smo->smo_object, tx);
     1443 +        if (msp->ms_sm == NULL) {
     1444 +                uint64_t new_object;
     1445 +
     1446 +                new_object = space_map_alloc(mos, tx);
     1447 +                VERIFY3U(new_object, !=, 0);
     1448 +
     1449 +                VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
     1450 +                    msp->ms_start, msp->ms_size, vd->vdev_ashift,
     1451 +                    &msp->ms_lock));
     1452 +                ASSERT(msp->ms_sm != NULL);
1241 1453          }
1242 1454  
1243 1455          mutex_enter(&msp->ms_lock);
1244 1456  
1245      -        if (sm->sm_loaded && spa_sync_pass(spa) == 1 &&
     1457 +        if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1246 1458              metaslab_should_condense(msp)) {
1247 1459                  metaslab_condense(msp, txg, tx);
1248 1460          } else {
1249      -                space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
1250      -                space_map_sync(*freemap, SM_FREE, smo, mos, tx);
     1461 +                space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
     1462 +                space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1251 1463          }
1252 1464  
1253      -        space_map_vacate(allocmap, NULL, NULL);
     1465 +        range_tree_vacate(alloctree, NULL, NULL);
1254 1466  
     1467 +        if (msp->ms_loaded) {
     1468 +                /*
      1469 +                 * When the space map is loaded, we have an accurate
     1470 +                 * histogram in the range tree. This gives us an opportunity
     1471 +                 * to bring the space map's histogram up-to-date so we clear
     1472 +                 * it first before updating it.
     1473 +                 */
     1474 +                space_map_histogram_clear(msp->ms_sm);
     1475 +                space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
     1476 +        } else {
     1477 +                /*
     1478 +                 * Since the space map is not loaded we simply update the
      1479 +                 * existing histogram with what was freed in this txg. This
     1480 +                 * means that the on-disk histogram may not have an accurate
     1481 +                 * view of the free space but it's close enough to allow
     1482 +                 * us to make allocation decisions.
     1483 +                 */
     1484 +                space_map_histogram_add(msp->ms_sm, *freetree, tx);
     1485 +        }
     1486 +
1255 1487          /*
1256      -         * For sync pass 1, we avoid walking the entire space map and
1257      -         * instead will just swap the pointers for freemap and
1258      -         * freed_map. We can safely do this since the freed_map is
     1488 +         * For sync pass 1, we avoid traversing this txg's free range tree
     1489 +         * and instead will just swap the pointers for freetree and
     1490 +         * freed_tree. We can safely do this since the freed_tree is
1259 1491           * guaranteed to be empty on the initial pass.
1260 1492           */
1261 1493          if (spa_sync_pass(spa) == 1) {
1262      -                ASSERT0((*freed_map)->sm_space);
1263      -                ASSERT0(avl_numnodes(&(*freed_map)->sm_root));
1264      -                space_map_swap(freemap, freed_map);
     1494 +                range_tree_swap(freetree, freed_tree);
1265 1495          } else {
1266      -                space_map_vacate(*freemap, space_map_add, *freed_map);
     1496 +                range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1267 1497          }
1268 1498  
1269      -        ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space);
1270      -        ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space);
     1499 +        ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
     1500 +        ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1271 1501  
1272 1502          mutex_exit(&msp->ms_lock);
1273 1503  
1274      -        VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
1275      -        dmu_buf_will_dirty(db, tx);
1276      -        ASSERT3U(db->db_size, >=, sizeof (*smo));
1277      -        bcopy(smo, db->db_data, sizeof (*smo));
1278      -        dmu_buf_rele(db, FTAG);
1279      -
     1504 +        if (object != space_map_object(msp->ms_sm)) {
     1505 +                object = space_map_object(msp->ms_sm);
     1506 +                dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
     1507 +                    msp->ms_id, sizeof (uint64_t), &object, tx);
     1508 +        }
1280 1509          dmu_tx_commit(tx);
1281 1510  }
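
Editor's note: the histogram maintenance above keeps a per-space-map count of free segments bucketed by size so the allocator can judge a metaslab without loading it. The sketch below is a toy log2-bucket histogram, purely illustrative; the real bucketing and on-disk layout belong to space_map_histogram_clear()/space_map_histogram_add() and are not reproduced here.

        #include <stdio.h>
        #include <stdint.h>

        #define HISTO_BUCKETS   32

        /* Bucket a segment by the highest set bit of its size (log2 bucketing). */
        static int
        bucket_of(uint64_t size)
        {
                int b = 0;

                while (size > 1) {
                        size >>= 1;
                        b++;
                }
                return (b < HISTO_BUCKETS ? b : HISTO_BUCKETS - 1);
        }

        int
        main(void)
        {
                uint64_t segs[] = { 4096, 8192, 8192, 131072, 1048576 }; /* assumed */
                uint64_t histo[HISTO_BUCKETS] = { 0 };

                for (int i = 0; i < 5; i++)
                        histo[bucket_of(segs[i])]++;

                for (int b = 0; b < HISTO_BUCKETS; b++)
                        if (histo[b] != 0)
                                printf("2^%d-byte bucket: %llu segment(s)\n",
                                    b, (unsigned long long)histo[b]);
                return (0);
        }

When the metaslab is loaded the range tree gives an exact picture, so the histogram is rebuilt from scratch; when it is not loaded, only this txg's frees are folded in, which is why the on-disk histogram is described above as approximate but good enough for allocation decisions.
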
1282 1511  
1283 1512  /*
1284 1513   * Called after a transaction group has completely synced to mark
1285 1514   * all of the metaslab's free space as usable.
1286 1515   */
1287 1516  void
1288 1517  metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1289 1518  {
1290      -        space_map_obj_t *smo = &msp->ms_smo;
1291      -        space_map_obj_t *smosync = &msp->ms_smo_syncing;
1292      -        space_map_t *sm = msp->ms_map;
1293      -        space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1294      -        space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
1295 1519          metaslab_group_t *mg = msp->ms_group;
1296 1520          vdev_t *vd = mg->mg_vd;
     1521 +        range_tree_t **freed_tree;
     1522 +        range_tree_t **defer_tree;
1297 1523          int64_t alloc_delta, defer_delta;
1298 1524  
1299 1525          ASSERT(!vd->vdev_ishole);
1300 1526  
1301 1527          mutex_enter(&msp->ms_lock);
1302 1528  
1303 1529          /*
1304 1530           * If this metaslab is just becoming available, initialize its
1305      -         * allocmaps, freemaps, and defermap and add its capacity to the vdev.
     1531 +         * alloctrees, freetrees, and defertree and add its capacity to
     1532 +         * the vdev.
1306 1533           */
1307      -        if (*freed_map == NULL) {
1308      -                ASSERT(*defer_map == NULL);
     1534 +        if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
1309 1535                  for (int t = 0; t < TXG_SIZE; t++) {
1310      -                        msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t),
1311      -                            KM_SLEEP);
1312      -                        space_map_create(msp->ms_allocmap[t], sm->sm_start,
1313      -                            sm->sm_size, sm->sm_shift, sm->sm_lock);
1314      -                        msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t),
1315      -                            KM_SLEEP);
1316      -                        space_map_create(msp->ms_freemap[t], sm->sm_start,
1317      -                            sm->sm_size, sm->sm_shift, sm->sm_lock);
     1536 +                        ASSERT(msp->ms_alloctree[t] == NULL);
     1537 +                        ASSERT(msp->ms_freetree[t] == NULL);
     1538 +
     1539 +                        msp->ms_alloctree[t] = range_tree_create(NULL, msp,
     1540 +                            &msp->ms_lock);
     1541 +                        msp->ms_freetree[t] = range_tree_create(NULL, msp,
     1542 +                            &msp->ms_lock);
1318 1543                  }
1319 1544  
1320 1545                  for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1321      -                        msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t),
1322      -                            KM_SLEEP);
1323      -                        space_map_create(msp->ms_defermap[t], sm->sm_start,
1324      -                            sm->sm_size, sm->sm_shift, sm->sm_lock);
     1546 +                        ASSERT(msp->ms_defertree[t] == NULL);
     1547 +
     1548 +                        msp->ms_defertree[t] = range_tree_create(NULL, msp,
     1549 +                            &msp->ms_lock);
1325 1550                  }
1326 1551  
1327      -                freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1328      -                defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
1329      -
1330      -                vdev_space_update(vd, 0, 0, sm->sm_size);
     1552 +                vdev_space_update(vd, 0, 0, msp->ms_size);
1331 1553          }
1332 1554  
1333      -        alloc_delta = smosync->smo_alloc - smo->smo_alloc;
1334      -        defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space;
     1555 +        freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
     1556 +        defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
1335 1557  
     1558 +        alloc_delta = space_map_alloc_delta(msp->ms_sm);
     1559 +        defer_delta = range_tree_space(*freed_tree) -
     1560 +            range_tree_space(*defer_tree);
     1561 +
1336 1562          vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1337 1563  
1338      -        ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0);
1339      -        ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0);
     1564 +        ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
     1565 +        ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1340 1566  
1341 1567          /*
1342      -         * If there's a space_map_load() in progress, wait for it to complete
     1568 +         * If there's a metaslab_load() in progress, wait for it to complete
1343 1569           * so that we have a consistent view of the in-core space map.
1344 1570           */
1345      -        space_map_load_wait(sm);
     1571 +        metaslab_load_wait(msp);
1346 1572  
1347 1573          /*
1348      -         * Move the frees from the defer_map to this map (if it's loaded).
1349      -         * Swap the freed_map and the defer_map -- this is safe to do
1350      -         * because we've just emptied out the defer_map.
     1574 +         * Move the frees from the defer_tree back to the free
     1575 +         * range tree (if it's loaded). Swap the freed_tree and the
     1576 +         * defer_tree -- this is safe to do because we've just emptied out
     1577 +         * the defer_tree.
1351 1578           */
1352      -        space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
1353      -        ASSERT0((*defer_map)->sm_space);
1354      -        ASSERT0(avl_numnodes(&(*defer_map)->sm_root));
1355      -        space_map_swap(freed_map, defer_map);
     1579 +        range_tree_vacate(*defer_tree,
     1580 +            msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
     1581 +        range_tree_swap(freed_tree, defer_tree);
1356 1582  
1357      -        *smo = *smosync;
     1583 +        space_map_update(msp->ms_sm);
1358 1584  
1359 1585          msp->ms_deferspace += defer_delta;
1360 1586          ASSERT3S(msp->ms_deferspace, >=, 0);
1361      -        ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
     1587 +        ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
1362 1588          if (msp->ms_deferspace != 0) {
1363 1589                  /*
1364 1590                   * Keep syncing this metaslab until all deferred frees
1365 1591                   * are back in circulation.
1366 1592                   */
1367 1593                  vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1368 1594          }
1369 1595  
1370      -        /*
1371      -         * If the map is loaded but no longer active, evict it as soon as all
1372      -         * future allocations have synced.  (If we unloaded it now and then
1373      -         * loaded a moment later, the map wouldn't reflect those allocations.)
1374      -         */
1375      -        if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1376      -                int evictable = 1;
     1596 +        if (msp->ms_loaded && msp->ms_access_txg < txg) {
     1597 +                for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
     1598 +                        VERIFY0(range_tree_space(
     1599 +                            msp->ms_alloctree[(txg + t) & TXG_MASK]));
     1600 +                }
1377 1601  
1378      -                for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1379      -                        if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
1380      -                                evictable = 0;
1381      -
1382      -                if (evictable && !metaslab_debug)
1383      -                        space_map_unload(sm);
     1602 +                if (!metaslab_debug_unload)
     1603 +                        metaslab_unload(msp);
1384 1604          }
1385 1605  
1386 1606          metaslab_group_sort(mg, msp, metaslab_weight(msp));
1387      -
1388 1607          mutex_exit(&msp->ms_lock);
     1608 +
1389 1609  }
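
Editor's note: a small worked example of the defer rotation above. TXG_DEFER_SIZE is assumed to be 2 here (matching the stock txg.h, but an assumption as far as this change is concerned): frees recorded while txg N syncs land in defertree[N % 2], and that slot is not vacated back into the allocatable tree until txg N + 2 finishes syncing.

        #include <stdio.h>
        #include <stdint.h>

        #define TXG_DEFER_SIZE  2       /* assumed, as in the stock txg.h */

        int
        main(void)
        {
                for (uint64_t txg = 100; txg < 106; txg++) {
                        /* Slot that metaslab_sync_done() vacates, then refills. */
                        int idx = txg % TXG_DEFER_SIZE;

                        printf("txg %llu: vacate defertree[%d] (filled at txg "
                            "%llu), refill with txg %llu frees\n",
                            (unsigned long long)txg, idx,
                            (unsigned long long)(txg - TXG_DEFER_SIZE),
                            (unsigned long long)txg);
                }
                return (0);
        }
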
1390 1610  
1391 1611  void
1392 1612  metaslab_sync_reassess(metaslab_group_t *mg)
1393 1613  {
1394      -        vdev_t *vd = mg->mg_vd;
1395 1614          int64_t failures = mg->mg_alloc_failures;
1396 1615  
1397 1616          metaslab_group_alloc_update(mg);
1398      -
1399      -        /*
1400      -         * Re-evaluate all metaslabs which have lower offsets than the
1401      -         * bonus area.
1402      -         */
1403      -        for (int m = 0; m < vd->vdev_ms_count; m++) {
1404      -                metaslab_t *msp = vd->vdev_ms[m];
1405      -
1406      -                if (msp->ms_map->sm_start > mg->mg_bonus_area)
1407      -                        break;
1408      -
1409      -                mutex_enter(&msp->ms_lock);
1410      -                metaslab_group_sort(mg, msp, metaslab_weight(msp));
1411      -                mutex_exit(&msp->ms_lock);
1412      -        }
1413      -
1414 1617          atomic_add_64(&mg->mg_alloc_failures, -failures);
1415 1618  
1416 1619          /*
1417      -         * Prefetch the next potential metaslabs
     1620 +         * Preload the next potential metaslabs
1418 1621           */
1419      -        metaslab_prefetch(mg);
     1622 +        metaslab_group_preload(mg);
1420 1623  }
1421 1624  
1422 1625  static uint64_t
1423 1626  metaslab_distance(metaslab_t *msp, dva_t *dva)
1424 1627  {
1425 1628          uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1426 1629          uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1427      -        uint64_t start = msp->ms_map->sm_start >> ms_shift;
     1630 +        uint64_t start = msp->ms_id;
1428 1631  
1429 1632          if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1430 1633                  return (1ULL << 63);
1431 1634  
1432 1635          if (offset < start)
1433 1636                  return ((start - offset) << ms_shift);
1434 1637          if (offset > start)
1435 1638                  return ((offset - start) << ms_shift);
1436 1639          return (0);
1437 1640  }
↓ open down ↓ 31 lines elided ↑ open up ↑
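
Editor's note: a worked example of metaslab_distance() above, with made-up values (ms_shift, the metaslab ID, and the DVA offset are all assumptions). Both the offset and the metaslab are reduced to metaslab-sized units, and the gap is scaled back up to bytes.

        #include <stdio.h>
        #include <stdint.h>

        int
        main(void)
        {
                uint64_t ms_shift = 30;                 /* assumed: 1 GB metaslabs */
                uint64_t ms_id = 5;                     /* metaslab index on vdev */
                uint64_t dva_offset = 9ULL << 30;       /* assumed DVA offset: 9 GB */

                uint64_t offset = dva_offset >> ms_shift;       /* = 9 */
                uint64_t distance;

                if (offset < ms_id)
                        distance = (ms_id - offset) << ms_shift;
                else if (offset > ms_id)
                        distance = (offset - ms_id) << ms_shift;
                else
                        distance = 0;

                printf("distance = %llu bytes (%llu metaslabs away)\n",
                    (unsigned long long)distance,
                    (unsigned long long)(distance >> ms_shift));
                return (0);
        }
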
1469 1672                                      spa_name(spa), mg->mg_vd->vdev_id, txg,
1470 1673                                      mg, msp, psize, asize,
1471 1674                                      mg->mg_alloc_failures, msp->ms_weight);
1472 1675                                  mutex_exit(&mg->mg_lock);
1473 1676                                  return (-1ULL);
1474 1677                          }
1475 1678  
1476 1679                          /*
1477 1680                           * If the selected metaslab is condensing, skip it.
1478 1681                           */
1479      -                        if (msp->ms_map->sm_condensing)
     1682 +                        if (msp->ms_condensing)
1480 1683                                  continue;
1481 1684  
1482 1685                          was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1483 1686                          if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1484 1687                                  break;
1485 1688  
1486 1689                          target_distance = min_distance +
1487      -                            (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
     1690 +                            (space_map_allocated(msp->ms_sm) != 0 ? 0 :
     1691 +                            min_distance >> 1);
1488 1692  
1489 1693                          for (i = 0; i < d; i++)
1490 1694                                  if (metaslab_distance(msp, &dva[i]) <
1491 1695                                      target_distance)
1492 1696                                          break;
1493 1697                          if (i == d)
1494 1698                                  break;
1495 1699                  }
1496 1700                  mutex_exit(&mg->mg_lock);
1497 1701                  if (msp == NULL)
↓ open down ↓ 6 lines elided ↑ open up ↑
1504 1708                   * allocation attempts on this metaslab group then we
1505 1709                   * consider skipping it. We skip it only if we're allowed
1506 1710                   * to "fast" gang, the physical size is larger than
1507 1711                   * a gang block, and we're attempting to allocate from
1508 1712                   * the primary metaslab.
1509 1713                   */
1510 1714                  if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1511 1715                      CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1512 1716                      activation_weight == METASLAB_WEIGHT_PRIMARY) {
1513 1717                          spa_dbgmsg(spa, "%s: skipping metaslab group: "
1514      -                            "vdev %llu, txg %llu, mg %p, psize %llu, "
1515      -                            "asize %llu, failures %llu", spa_name(spa),
1516      -                            mg->mg_vd->vdev_id, txg, mg, psize, asize,
     1718 +                            "vdev %llu, txg %llu, mg %p, msp[%llu] %p, "
     1719 +                            "psize %llu, asize %llu, failures %llu",
     1720 +                            spa_name(spa), mg->mg_vd->vdev_id, txg, mg,
     1721 +                            msp->ms_id, msp, psize, asize,
1517 1722                              mg->mg_alloc_failures);
1518 1723                          mutex_exit(&msp->ms_lock);
1519 1724                          return (-1ULL);
1520 1725                  }
1521 1726  
1522 1727                  /*
1523 1728                   * Ensure that the metaslab we have selected is still
1524 1729                   * capable of handling our request. It's possible that
1525 1730                   * another thread may have changed the weight while we
1526 1731                   * were blocked on the metaslab lock.
↓ open down ↓ 16 lines elided ↑ open up ↑
1543 1748                  if (metaslab_activate(msp, activation_weight) != 0) {
1544 1749                          mutex_exit(&msp->ms_lock);
1545 1750                          continue;
1546 1751                  }
1547 1752  
1548 1753                  /*
1549 1754                   * If this metaslab is currently condensing then pick again as
1550 1755                   * we can't manipulate this metaslab until it's committed
1551 1756                   * to disk.
1552 1757                   */
1553      -                if (msp->ms_map->sm_condensing) {
     1758 +                if (msp->ms_condensing) {
1554 1759                          mutex_exit(&msp->ms_lock);
1555 1760                          continue;
1556 1761                  }
1557 1762  
1558      -                if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL)
     1763 +                if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
1559 1764                          break;
1560 1765  
1561 1766                  atomic_inc_64(&mg->mg_alloc_failures);
1562 1767  
1563      -                metaslab_passivate(msp, space_map_maxsize(msp->ms_map));
1564      -
     1768 +                metaslab_passivate(msp, metaslab_block_maxsize(msp));
1565 1769                  mutex_exit(&msp->ms_lock);
1566 1770          }
1567 1771  
1568      -        if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
     1772 +        if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
1569 1773                  vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1570 1774  
1571      -        space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize);
     1775 +        range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
     1776 +        msp->ms_access_txg = txg + metaslab_unload_delay;
1572 1777  
1573 1778          mutex_exit(&msp->ms_lock);
1574 1779  
1575 1780          return (offset);
1576 1781  }
1577 1782  
1578 1783  /*
1579 1784   * Allocate a block for the specified i/o.
1580 1785   */
1581 1786  static int
↓ open down ↓ 226 lines elided ↑ open up ↑
1808 2013          }
1809 2014  
1810 2015          msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1811 2016  
1812 2017          if (DVA_GET_GANG(dva))
1813 2018                  size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1814 2019  
1815 2020          mutex_enter(&msp->ms_lock);
1816 2021  
1817 2022          if (now) {
1818      -                space_map_remove(msp->ms_allocmap[txg & TXG_MASK],
     2023 +                range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
1819 2024                      offset, size);
1820      -                space_map_free(msp->ms_map, offset, size);
     2025 +
     2026 +                VERIFY(!msp->ms_condensing);
     2027 +                VERIFY3U(offset, >=, msp->ms_start);
     2028 +                VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
     2029 +                VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
     2030 +                    msp->ms_size);
     2031 +                VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
     2032 +                VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
     2033 +                range_tree_add(msp->ms_tree, offset, size);
1821 2034          } else {
1822      -                if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0)
     2035 +                if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
1823 2036                          vdev_dirty(vd, VDD_METASLAB, msp, txg);
1824      -                space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size);
     2037 +                range_tree_add(msp->ms_freetree[txg & TXG_MASK],
     2038 +                    offset, size);
1825 2039          }
1826 2040  
1827 2041          mutex_exit(&msp->ms_lock);
1828 2042  }
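
Editor's note: the new VERIFY0(P2PHASE(...)) checks above assert that the freed offset and size are aligned to the vdev's allocation unit. The sketch below assumes the usual sysmacros.h-style definition P2PHASE(x, align) == (x & (align - 1)) and an ashift of 9; both are assumptions for illustration.

        #include <stdio.h>
        #include <stdint.h>

        /* Assumed to match the usual sysmacros.h definition. */
        #define P2PHASE(x, align)       ((x) & ((align) - 1))

        int
        main(void)
        {
                uint64_t ashift = 9;            /* assumed vdev_ashift: 512B */
                uint64_t offset = 0x200000;     /* assumed, 512B aligned */
                uint64_t size = 0x2200;         /* assumed, 512B aligned */

                printf("offset aligned: %s\n",
                    P2PHASE(offset, 1ULL << ashift) == 0 ? "yes" : "no");
                printf("size aligned:   %s\n",
                    P2PHASE(size, 1ULL << ashift) == 0 ? "yes" : "no");
                return (0);
        }
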
1829 2043  
1830 2044  /*
1831 2045   * Intent log support: upon opening the pool after a crash, notify the SPA
1832 2046   * of blocks that the intent log has allocated for immediate write, but
1833 2047   * which are still considered free by the SPA because the last transaction
1834 2048   * group didn't commit yet.
↓ open down ↓ 14 lines elided ↑ open up ↑
1849 2063              (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
1850 2064                  return (SET_ERROR(ENXIO));
1851 2065  
1852 2066          msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1853 2067  
1854 2068          if (DVA_GET_GANG(dva))
1855 2069                  size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1856 2070  
1857 2071          mutex_enter(&msp->ms_lock);
1858 2072  
1859      -        if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded)
     2073 +        if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
1860 2074                  error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
1861 2075  
1862      -        if (error == 0 && !space_map_contains(msp->ms_map, offset, size))
     2076 +        if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
1863 2077                  error = SET_ERROR(ENOENT);
1864 2078  
1865 2079          if (error || txg == 0) {        /* txg == 0 indicates dry run */
1866 2080                  mutex_exit(&msp->ms_lock);
1867 2081                  return (error);
1868 2082          }
1869 2083  
1870      -        space_map_claim(msp->ms_map, offset, size);
     2084 +        VERIFY(!msp->ms_condensing);
     2085 +        VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
     2086 +        VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
     2087 +        VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
     2088 +        range_tree_remove(msp->ms_tree, offset, size);
1871 2089  
1872 2090          if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
1873      -                if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
     2091 +                if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
1874 2092                          vdev_dirty(vd, VDD_METASLAB, msp, txg);
1875      -                space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size);
     2093 +                range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
1876 2094          }
1877 2095  
1878 2096          mutex_exit(&msp->ms_lock);
1879 2097  
1880 2098          return (0);
1881 2099  }
1882 2100  
1883 2101  int
1884 2102  metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
1885 2103      int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
↓ open down ↓ 12 lines elided ↑ open up ↑
1898 2116                  return (SET_ERROR(ENOSPC));
1899 2117          }
1900 2118  
1901 2119          ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
1902 2120          ASSERT(BP_GET_NDVAS(bp) == 0);
1903 2121          ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
1904 2122  
1905 2123          for (int d = 0; d < ndvas; d++) {
1906 2124                  error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
1907 2125                      txg, flags);
1908      -                if (error) {
     2126 +                if (error != 0) {
1909 2127                          for (d--; d >= 0; d--) {
1910 2128                                  metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
1911 2129                                  bzero(&dva[d], sizeof (dva_t));
1912 2130                          }
1913 2131                          spa_config_exit(spa, SCL_ALLOC, FTAG);
1914 2132                          return (error);
1915 2133                  }
1916 2134          }
1917 2135          ASSERT(error == 0);
1918 2136          ASSERT(BP_GET_NDVAS(bp) == ndvas);
↓ open down ↓ 46 lines elided ↑ open up ↑
1965 2183                  if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
1966 2184                          break;
1967 2185  
1968 2186          spa_config_exit(spa, SCL_ALLOC, FTAG);
1969 2187  
1970 2188          ASSERT(error == 0 || txg == 0);
1971 2189  
1972 2190          return (error);
1973 2191  }
1974 2192  
1975      -static void
1976      -checkmap(space_map_t *sm, uint64_t off, uint64_t size)
1977      -{
1978      -        space_seg_t *ss;
1979      -        avl_index_t where;
1980      -
1981      -        mutex_enter(sm->sm_lock);
1982      -        ss = space_map_find(sm, off, size, &where);
1983      -        if (ss != NULL)
1984      -                panic("freeing free block; ss=%p", (void *)ss);
1985      -        mutex_exit(sm->sm_lock);
1986      -}
1987      -
1988 2193  void
1989 2194  metaslab_check_free(spa_t *spa, const blkptr_t *bp)
1990 2195  {
1991 2196          if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
1992 2197                  return;
1993 2198  
1994 2199          spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1995 2200          for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
1996      -                uint64_t vdid = DVA_GET_VDEV(&bp->blk_dva[i]);
1997      -                vdev_t *vd = vdev_lookup_top(spa, vdid);
1998      -                uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[i]);
     2201 +                uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
     2202 +                vdev_t *vd = vdev_lookup_top(spa, vdev);
     2203 +                uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
1999 2204                  uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2000      -                metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift];
     2205 +                metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2001 2206  
2002      -                if (ms->ms_map->sm_loaded)
2003      -                        checkmap(ms->ms_map, off, size);
     2207 +                if (msp->ms_loaded)
     2208 +                        range_tree_verify(msp->ms_tree, offset, size);
2004 2209  
2005 2210                  for (int j = 0; j < TXG_SIZE; j++)
2006      -                        checkmap(ms->ms_freemap[j], off, size);
     2211 +                        range_tree_verify(msp->ms_freetree[j], offset, size);
2007 2212                  for (int j = 0; j < TXG_DEFER_SIZE; j++)
2008      -                        checkmap(ms->ms_defermap[j], off, size);
     2213 +                        range_tree_verify(msp->ms_defertree[j], offset, size);
2009 2214          }
2010 2215          spa_config_exit(spa, SCL_VDEV, FTAG);
2011 2216  }
    