1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  * Copyright (c) 2014 Integros [integros.com]
  26  */
  27 
  28 #include <sys/zfs_context.h>
  29 #include <sys/dmu.h>
  30 #include <sys/dmu_tx.h>
  31 #include <sys/space_map.h>
  32 #include <sys/metaslab_impl.h>
  33 #include <sys/vdev_impl.h>
  34 #include <sys/zio.h>
  35 #include <sys/spa_impl.h>
  36 #include <sys/zfeature.h>
  37 
  38 #define GANG_ALLOCATION(flags) \
  39         ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
  40 
  41 uint64_t metaslab_aliquot = 512ULL << 10;
  42 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;     /* force gang blocks */
  43 
  44 /*
  45  * The in-core space map representation is more compact than its on-disk form.
  46  * The zfs_condense_pct determines how much more compact the in-core
  47  * space map representation must be before we compact it on-disk.
  48  * Values should be greater than or equal to 100.
  49  */
  50 int zfs_condense_pct = 200;
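
/*
 * Illustrative example (numbers are hypothetical): with the default value of
 * 200, a space map whose minimal (condensed) representation would occupy
 * roughly 10KB only becomes a candidate for condensing once its current
 * on-disk form has grown to at least 20KB, i.e. 200% of the minimal size.
 */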
  51 
  52 /*
  53  * Condensing a metaslab is not guaranteed to actually reduce the amount of
  54  * space used on disk. In particular, a space map uses data in increments of
  55  * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
  56  * same number of blocks after condensing. Since the goal of condensing is to
  57  * reduce the number of IOPs required to read the space map, we only want to
  58  * condense when we can be sure we will reduce the number of blocks used by the
  59  * space map. Unfortunately, we cannot precisely compute whether or not this is
  60  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
  61  * we apply the following heuristic: do not condense a spacemap unless the
  62  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
  63  * blocks.
  64  */
  65 int zfs_metaslab_condense_block_threshold = 4;
  66 
  67 /*
  68  * The zfs_mg_noalloc_threshold defines which metaslab groups should
  69  * be eligible for allocation. The value is defined as a percentage of
  70  * free space. Metaslab groups that have more free space than
  71  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
  72  * a metaslab group's free space is less than or equal to the
  73  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  74  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  75  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  76  * groups are allowed to accept allocations. Gang blocks are always
  77  * eligible to allocate from any metaslab group. The default value of 0 means
  78  * no metaslab group will be excluded based on this criterion.
  79  */
  80 int zfs_mg_noalloc_threshold = 0;
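
/*
 * Illustrative example (hypothetical tuning): with zfs_mg_noalloc_threshold
 * set to 30, a metaslab group at 25% free capacity is skipped as long as
 * some other group in the pool is still above 30% free. Once every group
 * has dropped to 30% or below, all groups accept allocations again.
 */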
  81 
  82 /*
  83  * Metaslab groups are considered eligible for allocations if their
  84  * fragmentation metric (measured as a percentage) is less than or equal to
  85  * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
  86  * then it will be skipped unless all metaslab groups within the metaslab
  87  * class have also crossed this threshold.
  88  */
  89 int zfs_mg_fragmentation_threshold = 85;
  90 
  91 /*
  92  * Allow metaslabs to keep their active state as long as their fragmentation
  93  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
  94  * active metaslab that exceeds this threshold will no longer keep its active
  95  * status allowing better metaslabs to be selected.
  96  */
  97 int zfs_metaslab_fragmentation_threshold = 70;
  98 
  99 /*
 100  * When set, we load all metaslabs when the pool is first opened.
 101  */
 102 int metaslab_debug_load = 0;
 103 
 104 /*
 105  * When set, metaslabs are prevented from being unloaded.
 106  */
 107 int metaslab_debug_unload = 0;
 108 
 109 /*
 110  * Minimum size which forces the dynamic allocator to change
 111  * its allocation strategy.  Once the space map cannot satisfy
 112  * an allocation of this size then it switches to using a more
 113  * aggressive strategy (i.e. search by size rather than offset).
 114  */
 115 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 116 
 117 /*
 118  * The minimum free space, in percent, which must be available
 119  * in a space map to continue allocations in a first-fit fashion.
 120  * Once the space map's free space drops below this level we dynamically
 121  * switch to using best-fit allocations.
 122  */
 123 int metaslab_df_free_pct = 4;
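
/*
 * Sketch of the switch condition applied by the dynamic allocator (see
 * metaslab_df_alloc() below for the actual code):
 *
 *	if (max_size < metaslab_df_alloc_threshold ||
 *	    free_pct < metaslab_df_free_pct)
 *		search the size-sorted tree (best-fit);
 *	else
 *		continue first-fit from the per-alignment cursor;
 */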
 124 
 125 /*
 126  * A metaslab is considered "free" if it contains a contiguous
 127  * segment which is greater than metaslab_min_alloc_size.
 128  */
 129 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
 130 
 131 /*
 132  * Percentage of all cpus that can be used by the metaslab taskq.
 133  */
 134 int metaslab_load_pct = 50;
 135 
 136 /*
 137  * Determines how many txgs a metaslab may remain loaded without having any
 138  * allocations from it. As long as a metaslab continues to be used we will
 139  * keep it loaded.
 140  */
 141 int metaslab_unload_delay = TXG_SIZE * 2;
 142 
 143 /*
 144  * Max number of metaslabs per group to preload.
 145  */
 146 int metaslab_preload_limit = SPA_DVAS_PER_BP;
 147 
 148 /*
 149  * Enable/disable preloading of metaslabs.
 150  */
 151 boolean_t metaslab_preload_enabled = B_TRUE;
 152 
 153 /*
 154  * Enable/disable fragmentation weighting on metaslabs.
 155  */
 156 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
 157 
 158 /*
 159  * Enable/disable lba weighting (i.e. outer tracks are given preference).
 160  */
 161 boolean_t metaslab_lba_weighting_enabled = B_TRUE;
 162 
 163 /*
 164  * Enable/disable metaslab group biasing.
 165  */
 166 boolean_t metaslab_bias_enabled = B_TRUE;
 167 
 168 /*
 169  * Enable/disable segment-based metaslab selection.
 170  */
 171 boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
 172 
 173 /*
 174  * When using segment-based metaslab selection, we will continue
 175  * allocating from the active metaslab until we have exhausted
 176  * zfs_metaslab_switch_threshold of its buckets.
 177  */
 178 int zfs_metaslab_switch_threshold = 2;
 179 
 180 /*
 181  * Internal switch to enable/disable the metaslab allocation tracing
 182  * facility.
 183  */
 184 boolean_t metaslab_trace_enabled = B_TRUE;
 185 
 186 /*
 187  * Maximum entries that the metaslab allocation tracing facility will keep
 188  * in a given list when running in non-debug mode. We limit the number
 189  * of entries in non-debug mode to prevent us from using up too much memory.
 190  * The limit should be sufficiently large that we don't expect any allocation
 191  * to ever exceed this value. In debug mode, the system will panic if this
 192  * limit is ever reached, allowing for further investigation.
 193  */
 194 uint64_t metaslab_trace_max_entries = 5000;
 195 
 196 static uint64_t metaslab_weight(metaslab_t *);
 197 static void metaslab_set_fragmentation(metaslab_t *);
 198 
 199 kmem_cache_t *metaslab_alloc_trace_cache;
 200 
 201 /*
 202  * ==========================================================================
 203  * Metaslab classes
 204  * ==========================================================================
 205  */
 206 metaslab_class_t *
 207 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 208 {
 209         metaslab_class_t *mc;
 210 
 211         mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
 212 
 213         mc->mc_spa = spa;
 214         mc->mc_rotor = NULL;
 215         mc->mc_ops = ops;
 216         mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
 217         refcount_create_tracked(&mc->mc_alloc_slots);
 218 
 219         return (mc);
 220 }
 221 
 222 void
 223 metaslab_class_destroy(metaslab_class_t *mc)
 224 {
 225         ASSERT(mc->mc_rotor == NULL);
 226         ASSERT(mc->mc_alloc == 0);
 227         ASSERT(mc->mc_deferred == 0);
 228         ASSERT(mc->mc_space == 0);
 229         ASSERT(mc->mc_dspace == 0);
 230 
 231         refcount_destroy(&mc->mc_alloc_slots);
 232         mutex_destroy(&mc->mc_lock);
 233         kmem_free(mc, sizeof (metaslab_class_t));
 234 }
 235 
 236 int
 237 metaslab_class_validate(metaslab_class_t *mc)
 238 {
 239         metaslab_group_t *mg;
 240         vdev_t *vd;
 241 
 242         /*
 243          * Must hold one of the spa_config locks.
 244          */
 245         ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 246             spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 247 
 248         if ((mg = mc->mc_rotor) == NULL)
 249                 return (0);
 250 
 251         do {
 252                 vd = mg->mg_vd;
 253                 ASSERT(vd->vdev_mg != NULL);
 254                 ASSERT3P(vd->vdev_top, ==, vd);
 255                 ASSERT3P(mg->mg_class, ==, mc);
 256                 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 257         } while ((mg = mg->mg_next) != mc->mc_rotor);
 258 
 259         return (0);
 260 }
 261 
 262 void
 263 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
 264     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 265 {
 266         atomic_add_64(&mc->mc_alloc, alloc_delta);
 267         atomic_add_64(&mc->mc_deferred, defer_delta);
 268         atomic_add_64(&mc->mc_space, space_delta);
 269         atomic_add_64(&mc->mc_dspace, dspace_delta);
 270 }
 271 
 272 uint64_t
 273 metaslab_class_get_alloc(metaslab_class_t *mc)
 274 {
 275         return (mc->mc_alloc);
 276 }
 277 
 278 uint64_t
 279 metaslab_class_get_deferred(metaslab_class_t *mc)
 280 {
 281         return (mc->mc_deferred);
 282 }
 283 
 284 uint64_t
 285 metaslab_class_get_space(metaslab_class_t *mc)
 286 {
 287         return (mc->mc_space);
 288 }
 289 
 290 uint64_t
 291 metaslab_class_get_dspace(metaslab_class_t *mc)
 292 {
 293         return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 294 }
 295 
 296 void
 297 metaslab_class_histogram_verify(metaslab_class_t *mc)
 298 {
 299         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 300         uint64_t *mc_hist;
 301         int i;
 302 
 303         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 304                 return;
 305 
 306         mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 307             KM_SLEEP);
 308 
 309         for (int c = 0; c < rvd->vdev_children; c++) {
 310                 vdev_t *tvd = rvd->vdev_child[c];
 311                 metaslab_group_t *mg = tvd->vdev_mg;
 312 
 313                 /*
 314                  * Skip any holes, uninitialized top-levels, or
  315  * vdevs that are not in this metaslab class.
 316                  */
 317                 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
 318                     mg->mg_class != mc) {
 319                         continue;
 320                 }
 321 
 322                 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 323                         mc_hist[i] += mg->mg_histogram[i];
 324         }
 325 
 326         for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 327                 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
 328 
 329         kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 330 }
 331 
 332 /*
 333  * Calculate the metaslab class's fragmentation metric. The metric
 334  * is weighted based on the space contribution of each metaslab group.
 335  * The return value will be a number between 0 and 100 (inclusive), or
 336  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 337  * zfs_frag_table for more information about the metric.
 338  */
 339 uint64_t
 340 metaslab_class_fragmentation(metaslab_class_t *mc)
 341 {
 342         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 343         uint64_t fragmentation = 0;
 344 
 345         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 346 
 347         for (int c = 0; c < rvd->vdev_children; c++) {
 348                 vdev_t *tvd = rvd->vdev_child[c];
 349                 metaslab_group_t *mg = tvd->vdev_mg;
 350 
 351                 /*
 352                  * Skip any holes, uninitialized top-levels, or
  353  * vdevs that are not in this metaslab class.
 354                  */
 355                 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
 356                     mg->mg_class != mc) {
 357                         continue;
 358                 }
 359 
 360                 /*
 361                  * If a metaslab group does not contain a fragmentation
 362                  * metric then just bail out.
 363                  */
 364                 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 365                         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 366                         return (ZFS_FRAG_INVALID);
 367                 }
 368 
 369                 /*
 370                  * Determine how much this metaslab_group is contributing
 371                  * to the overall pool fragmentation metric.
 372                  */
 373                 fragmentation += mg->mg_fragmentation *
 374                     metaslab_group_get_space(mg);
 375         }
 376         fragmentation /= metaslab_class_get_space(mc);
 377 
 378         ASSERT3U(fragmentation, <=, 100);
 379         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 380         return (fragmentation);
 381 }
 382 
 383 /*
 384  * Calculate the amount of expandable space that is available in
 385  * this metaslab class. If a device is expanded then its expandable
 386  * space will be the amount of allocatable space that is currently not
 387  * part of this metaslab class.
 388  */
 389 uint64_t
 390 metaslab_class_expandable_space(metaslab_class_t *mc)
 391 {
 392         vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 393         uint64_t space = 0;
 394 
 395         spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 396         for (int c = 0; c < rvd->vdev_children; c++) {
 397                 vdev_t *tvd = rvd->vdev_child[c];
 398                 metaslab_group_t *mg = tvd->vdev_mg;
 399 
 400                 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
 401                     mg->mg_class != mc) {
 402                         continue;
 403                 }
 404 
 405                 /*
 406                  * Calculate if we have enough space to add additional
 407                  * metaslabs. We report the expandable space in terms
 408                  * of the metaslab size since that's the unit of expansion.
 409                  */
 410                 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
 411                     1ULL << tvd->vdev_ms_shift);
 412         }
 413         spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 414         return (space);
 415 }
 416 
 417 static int
 418 metaslab_compare(const void *x1, const void *x2)
 419 {
 420         const metaslab_t *m1 = x1;
 421         const metaslab_t *m2 = x2;
 422 
 423         if (m1->ms_weight < m2->ms_weight)
 424                 return (1);
 425         if (m1->ms_weight > m2->ms_weight)
 426                 return (-1);
 427 
 428         /*
 429          * If the weights are identical, use the offset to force uniqueness.
 430          */
 431         if (m1->ms_start < m2->ms_start)
 432                 return (-1);
 433         if (m1->ms_start > m2->ms_start)
 434                 return (1);
 435 
 436         ASSERT3P(m1, ==, m2);
 437 
 438         return (0);
 439 }
 440 
 441 /*
 442  * Verify that the space accounting on disk matches the in-core range_trees.
 443  */
 444 void
 445 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
 446 {
 447         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 448         uint64_t allocated = 0;
 449         uint64_t sm_free_space, msp_free_space;
 450 
 451         ASSERT(MUTEX_HELD(&msp->ms_lock));
 452 
 453         if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 454                 return;
 455 
 456         /*
 457          * We can only verify the metaslab space when we're called
 458          * from syncing context with a loaded metaslab that has an allocated
 459          * space map. Calling this in non-syncing context does not
 460          * provide a consistent view of the metaslab since we're performing
 461          * allocations in the future.
 462          */
 463         if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
 464             !msp->ms_loaded)
 465                 return;
 466 
 467         sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
 468             space_map_alloc_delta(msp->ms_sm);
 469 
 470         /*
 471          * Account for future allocations since we would have already
 472          * deducted that space from the ms_freetree.
 473          */
 474         for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 475                 allocated +=
 476                     range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
 477         }
 478 
 479         msp_free_space = range_tree_space(msp->ms_tree) + allocated +
 480             msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
 481 
 482         VERIFY3U(sm_free_space, ==, msp_free_space);
 483 }
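
/*
 * In other words, the identity verified above is:
 *
 *	ms_size - space_map_allocated(ms_sm) - space_map_alloc_delta(ms_sm)
 *	    == range_tree_space(ms_tree)
 *	    + range_tree_space(ms_alloctree[t]) summed over the open txgs
 *	    + ms_deferspace
 *	    + range_tree_space(ms_freedtree)
 */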
 484 
 485 /*
 486  * ==========================================================================
 487  * Metaslab groups
 488  * ==========================================================================
 489  */
 490 /*
 491  * Update the allocatable flag and the metaslab group's capacity.
  492  * The allocatable flag is set to true if the group's free capacity is
  493  * greater than zfs_mg_noalloc_threshold and its fragmentation metric, if
  494  * valid, does not exceed zfs_mg_fragmentation_threshold. If a metaslab group
 495  * transitions from allocatable to non-allocatable or vice versa then the
 496  * metaslab group's class is updated to reflect the transition.
 497  */
 498 static void
 499 metaslab_group_alloc_update(metaslab_group_t *mg)
 500 {
 501         vdev_t *vd = mg->mg_vd;
 502         metaslab_class_t *mc = mg->mg_class;
 503         vdev_stat_t *vs = &vd->vdev_stat;
 504         boolean_t was_allocatable;
 505         boolean_t was_initialized;
 506 
 507         ASSERT(vd == vd->vdev_top);
 508 
 509         mutex_enter(&mg->mg_lock);
 510         was_allocatable = mg->mg_allocatable;
 511         was_initialized = mg->mg_initialized;
 512 
 513         mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 514             (vs->vs_space + 1);
 515 
 516         mutex_enter(&mc->mc_lock);
 517 
 518         /*
 519          * If the metaslab group was just added then it won't
 520          * have any space until we finish syncing out this txg.
 521          * At that point we will consider it initialized and available
 522          * for allocations.  We also don't consider non-activated
 523          * metaslab groups (e.g. vdevs that are in the middle of being removed)
 524          * to be initialized, because they can't be used for allocation.
 525          */
 526         mg->mg_initialized = metaslab_group_initialized(mg);
 527         if (!was_initialized && mg->mg_initialized) {
 528                 mc->mc_groups++;
 529         } else if (was_initialized && !mg->mg_initialized) {
 530                 ASSERT3U(mc->mc_groups, >, 0);
 531                 mc->mc_groups--;
 532         }
 533         if (mg->mg_initialized)
 534                 mg->mg_no_free_space = B_FALSE;
 535 
 536         /*
 537          * A metaslab group is considered allocatable if it has plenty
  538  * of free space and is not heavily fragmented. We only take
 539          * fragmentation into account if the metaslab group has a valid
 540          * fragmentation metric (i.e. a value between 0 and 100).
 541          */
 542         mg->mg_allocatable = (mg->mg_activation_count > 0 &&
 543             mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
 544             (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 545             mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
 546 
 547         /*
 548          * The mc_alloc_groups maintains a count of the number of
 549          * groups in this metaslab class that are still above the
 550          * zfs_mg_noalloc_threshold. This is used by the allocating
 551          * threads to determine if they should avoid allocations to
 552          * a given group. The allocator will avoid allocations to a group
 553          * if that group has reached or is below the zfs_mg_noalloc_threshold
 554          * and there are still other groups that are above the threshold.
 555          * When a group transitions from allocatable to non-allocatable or
 556          * vice versa we update the metaslab class to reflect that change.
 557          * When the mc_alloc_groups value drops to 0 that means that all
 558          * groups have reached the zfs_mg_noalloc_threshold making all groups
 559          * eligible for allocations. This effectively means that all devices
 560          * are balanced again.
 561          */
 562         if (was_allocatable && !mg->mg_allocatable)
 563                 mc->mc_alloc_groups--;
 564         else if (!was_allocatable && mg->mg_allocatable)
 565                 mc->mc_alloc_groups++;
 566         mutex_exit(&mc->mc_lock);
 567 
 568         mutex_exit(&mg->mg_lock);
 569 }
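
/*
 * Worked example (illustrative numbers): a vdev with vs_space = 1000GB and
 * vs_alloc = 700GB yields an mg_free_capacity of roughly 30 (percent free).
 * With the default zfs_mg_noalloc_threshold of 0, such a group remains
 * allocatable as long as its fragmentation metric is either invalid or no
 * greater than zfs_mg_fragmentation_threshold.
 */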
 570 
 571 metaslab_group_t *
 572 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 573 {
 574         metaslab_group_t *mg;
 575 
 576         mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 577         mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 578         avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 579             sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
 580         mg->mg_vd = vd;
 581         mg->mg_class = mc;
 582         mg->mg_activation_count = 0;
 583         mg->mg_initialized = B_FALSE;
 584         mg->mg_no_free_space = B_TRUE;
 585         refcount_create_tracked(&mg->mg_alloc_queue_depth);
 586 
 587         mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
 588             minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
 589 
 590         return (mg);
 591 }
 592 
 593 void
 594 metaslab_group_destroy(metaslab_group_t *mg)
 595 {
 596         ASSERT(mg->mg_prev == NULL);
 597         ASSERT(mg->mg_next == NULL);
 598         /*
 599          * We may have gone below zero with the activation count
 600          * either because we never activated in the first place or
 601          * because we're done, and possibly removing the vdev.
 602          */
 603         ASSERT(mg->mg_activation_count <= 0);
 604 
 605         taskq_destroy(mg->mg_taskq);
 606         avl_destroy(&mg->mg_metaslab_tree);
 607         mutex_destroy(&mg->mg_lock);
 608         refcount_destroy(&mg->mg_alloc_queue_depth);
 609         kmem_free(mg, sizeof (metaslab_group_t));
 610 }
 611 
 612 void
 613 metaslab_group_activate(metaslab_group_t *mg)
 614 {
 615         metaslab_class_t *mc = mg->mg_class;
 616         metaslab_group_t *mgprev, *mgnext;
 617 
 618         ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
 619 
 620         ASSERT(mc->mc_rotor != mg);
 621         ASSERT(mg->mg_prev == NULL);
 622         ASSERT(mg->mg_next == NULL);
 623         ASSERT(mg->mg_activation_count <= 0);
 624 
 625         if (++mg->mg_activation_count <= 0)
 626                 return;
 627 
 628         mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
 629         metaslab_group_alloc_update(mg);
 630 
 631         if ((mgprev = mc->mc_rotor) == NULL) {
 632                 mg->mg_prev = mg;
 633                 mg->mg_next = mg;
 634         } else {
 635                 mgnext = mgprev->mg_next;
 636                 mg->mg_prev = mgprev;
 637                 mg->mg_next = mgnext;
 638                 mgprev->mg_next = mg;
 639                 mgnext->mg_prev = mg;
 640         }
 641         mc->mc_rotor = mg;
 642 }
 643 
 644 void
 645 metaslab_group_passivate(metaslab_group_t *mg)
 646 {
 647         metaslab_class_t *mc = mg->mg_class;
 648         metaslab_group_t *mgprev, *mgnext;
 649 
 650         ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
 651 
 652         if (--mg->mg_activation_count != 0) {
 653                 ASSERT(mc->mc_rotor != mg);
 654                 ASSERT(mg->mg_prev == NULL);
 655                 ASSERT(mg->mg_next == NULL);
 656                 ASSERT(mg->mg_activation_count < 0);
 657                 return;
 658         }
 659 
 660         taskq_wait(mg->mg_taskq);
 661         metaslab_group_alloc_update(mg);
 662 
 663         mgprev = mg->mg_prev;
 664         mgnext = mg->mg_next;
 665 
 666         if (mg == mgnext) {
 667                 mc->mc_rotor = NULL;
 668         } else {
 669                 mc->mc_rotor = mgnext;
 670                 mgprev->mg_next = mgnext;
 671                 mgnext->mg_prev = mgprev;
 672         }
 673 
 674         mg->mg_prev = NULL;
 675         mg->mg_next = NULL;
 676 }
 677 
 678 boolean_t
 679 metaslab_group_initialized(metaslab_group_t *mg)
 680 {
 681         vdev_t *vd = mg->mg_vd;
 682         vdev_stat_t *vs = &vd->vdev_stat;
 683 
 684         return (vs->vs_space != 0 && mg->mg_activation_count > 0);
 685 }
 686 
 687 uint64_t
 688 metaslab_group_get_space(metaslab_group_t *mg)
 689 {
 690         return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
 691 }
 692 
 693 void
 694 metaslab_group_histogram_verify(metaslab_group_t *mg)
 695 {
 696         uint64_t *mg_hist;
 697         vdev_t *vd = mg->mg_vd;
 698         uint64_t ashift = vd->vdev_ashift;
 699         int i;
 700 
 701         if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 702                 return;
 703 
 704         mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 705             KM_SLEEP);
 706 
 707         ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
 708             SPACE_MAP_HISTOGRAM_SIZE + ashift);
 709 
 710         for (int m = 0; m < vd->vdev_ms_count; m++) {
 711                 metaslab_t *msp = vd->vdev_ms[m];
 712 
 713                 if (msp->ms_sm == NULL)
 714                         continue;
 715 
 716                 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
 717                         mg_hist[i + ashift] +=
 718                             msp->ms_sm->sm_phys->smp_histogram[i];
 719         }
 720 
 721         for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
 722                 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
 723 
 724         kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 725 }
 726 
 727 static void
 728 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 729 {
 730         metaslab_class_t *mc = mg->mg_class;
 731         uint64_t ashift = mg->mg_vd->vdev_ashift;
 732 
 733         ASSERT(MUTEX_HELD(&msp->ms_lock));
 734         if (msp->ms_sm == NULL)
 735                 return;
 736 
 737         mutex_enter(&mg->mg_lock);
 738         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 739                 mg->mg_histogram[i + ashift] +=
 740                     msp->ms_sm->sm_phys->smp_histogram[i];
 741                 mc->mc_histogram[i + ashift] +=
 742                     msp->ms_sm->sm_phys->smp_histogram[i];
 743         }
 744         mutex_exit(&mg->mg_lock);
 745 }
 746 
 747 void
 748 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 749 {
 750         metaslab_class_t *mc = mg->mg_class;
 751         uint64_t ashift = mg->mg_vd->vdev_ashift;
 752 
 753         ASSERT(MUTEX_HELD(&msp->ms_lock));
 754         if (msp->ms_sm == NULL)
 755                 return;
 756 
 757         mutex_enter(&mg->mg_lock);
 758         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 759                 ASSERT3U(mg->mg_histogram[i + ashift], >=,
 760                     msp->ms_sm->sm_phys->smp_histogram[i]);
 761                 ASSERT3U(mc->mc_histogram[i + ashift], >=,
 762                     msp->ms_sm->sm_phys->smp_histogram[i]);
 763 
 764                 mg->mg_histogram[i + ashift] -=
 765                     msp->ms_sm->sm_phys->smp_histogram[i];
 766                 mc->mc_histogram[i + ashift] -=
 767                     msp->ms_sm->sm_phys->smp_histogram[i];
 768         }
 769         mutex_exit(&mg->mg_lock);
 770 }
 771 
 772 static void
 773 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 774 {
 775         ASSERT(msp->ms_group == NULL);
 776         mutex_enter(&mg->mg_lock);
 777         msp->ms_group = mg;
 778         msp->ms_weight = 0;
 779         avl_add(&mg->mg_metaslab_tree, msp);
 780         mutex_exit(&mg->mg_lock);
 781 
 782         mutex_enter(&msp->ms_lock);
 783         metaslab_group_histogram_add(mg, msp);
 784         mutex_exit(&msp->ms_lock);
 785 }
 786 
 787 static void
 788 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 789 {
 790         mutex_enter(&msp->ms_lock);
 791         metaslab_group_histogram_remove(mg, msp);
 792         mutex_exit(&msp->ms_lock);
 793 
 794         mutex_enter(&mg->mg_lock);
 795         ASSERT(msp->ms_group == mg);
 796         avl_remove(&mg->mg_metaslab_tree, msp);
 797         msp->ms_group = NULL;
 798         mutex_exit(&mg->mg_lock);
 799 }
 800 
 801 static void
 802 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 803 {
 804         /*
 805          * Although in principle the weight can be any value, in
 806          * practice we do not use values in the range [1, 511].
 807          */
 808         ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
 809         ASSERT(MUTEX_HELD(&msp->ms_lock));
 810 
 811         mutex_enter(&mg->mg_lock);
 812         ASSERT(msp->ms_group == mg);
 813         avl_remove(&mg->mg_metaslab_tree, msp);
 814         msp->ms_weight = weight;
 815         avl_add(&mg->mg_metaslab_tree, msp);
 816         mutex_exit(&mg->mg_lock);
 817 }
 818 
 819 /*
 820  * Calculate the fragmentation for a given metaslab group. We can use
 821  * a simple average here since all metaslabs within the group must have
 822  * the same size. The return value will be a value between 0 and 100
  823  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
 824  * group have a fragmentation metric.
 825  */
 826 uint64_t
 827 metaslab_group_fragmentation(metaslab_group_t *mg)
 828 {
 829         vdev_t *vd = mg->mg_vd;
 830         uint64_t fragmentation = 0;
 831         uint64_t valid_ms = 0;
 832 
 833         for (int m = 0; m < vd->vdev_ms_count; m++) {
 834                 metaslab_t *msp = vd->vdev_ms[m];
 835 
 836                 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
 837                         continue;
 838 
 839                 valid_ms++;
 840                 fragmentation += msp->ms_fragmentation;
 841         }
 842 
 843         if (valid_ms <= vd->vdev_ms_count / 2)
 844                 return (ZFS_FRAG_INVALID);
 845 
 846         fragmentation /= valid_ms;
 847         ASSERT3U(fragmentation, <=, 100);
 848         return (fragmentation);
 849 }
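
/*
 * Worked example (illustrative numbers): a group with four metaslabs whose
 * fragmentation values are 10, 20, 30 and ZFS_FRAG_INVALID has three valid
 * metaslabs, which is more than half of four, so the group reports
 * (10 + 20 + 30) / 3 = 20. Had two or more metaslabs been invalid, the
 * group would report ZFS_FRAG_INVALID instead.
 */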
 850 
 851 /*
 852  * Determine if a given metaslab group should skip allocations. A metaslab
 853  * group should avoid allocations if its free capacity is less than the
 854  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 855  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 856  * that can still handle allocations. If the allocation throttle is enabled
 857  * then we skip allocations to devices that have reached their maximum
 858  * allocation queue depth unless the selected metaslab group is the only
 859  * eligible group remaining.
 860  */
 861 static boolean_t
 862 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
 863     uint64_t psize)
 864 {
 865         spa_t *spa = mg->mg_vd->vdev_spa;
 866         metaslab_class_t *mc = mg->mg_class;
 867 
 868         /*
 869          * We can only consider skipping this metaslab group if it's
 870          * in the normal metaslab class and there are other metaslab
 871          * groups to select from. Otherwise, we always consider it eligible
 872          * for allocations.
 873          */
 874         if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
 875                 return (B_TRUE);
 876 
 877         /*
 878          * If the metaslab group's mg_allocatable flag is set (see comments
 879          * in metaslab_group_alloc_update() for more information) and
 880          * the allocation throttle is disabled then allow allocations to this
 881          * device. However, if the allocation throttle is enabled then
 882          * check if we have reached our allocation limit (mg_alloc_queue_depth)
 883          * to determine if we should allow allocations to this metaslab group.
 884          * If all metaslab groups are no longer considered allocatable
 885          * (mc_alloc_groups == 0) or we're trying to allocate the smallest
 886          * gang block size then we allow allocations on this metaslab group
 887          * regardless of the mg_allocatable or throttle settings.
 888          */
 889         if (mg->mg_allocatable) {
 890                 metaslab_group_t *mgp;
 891                 int64_t qdepth;
 892                 uint64_t qmax = mg->mg_max_alloc_queue_depth;
 893 
 894                 if (!mc->mc_alloc_throttle_enabled)
 895                         return (B_TRUE);
 896 
 897                 /*
 898                  * If this metaslab group does not have any free space, then
 899                  * there is no point in looking further.
 900                  */
 901                 if (mg->mg_no_free_space)
 902                         return (B_FALSE);
 903 
 904                 qdepth = refcount_count(&mg->mg_alloc_queue_depth);
 905 
 906                 /*
 907                  * If this metaslab group is below its qmax or it's
 908                  * the only allocatable metaslab group, then attempt
 909                  * to allocate from it.
 910                  */
 911                 if (qdepth < qmax || mc->mc_alloc_groups == 1)
 912                         return (B_TRUE);
 913                 ASSERT3U(mc->mc_alloc_groups, >, 1);
 914 
 915                 /*
 916                  * Since this metaslab group is at or over its qmax, we
 917                  * need to determine if there are metaslab groups after this
 918                  * one that might be able to handle this allocation. This is
 919                  * racy since we can't hold the locks for all metaslab
 920                  * groups at the same time when we make this check.
 921                  */
 922                 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
 923                         qmax = mgp->mg_max_alloc_queue_depth;
 924 
 925                         qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
 926 
 927                         /*
 928                          * If there is another metaslab group that
 929                          * might be able to handle the allocation, then
 930                          * we return false so that we skip this group.
 931                          */
 932                         if (qdepth < qmax && !mgp->mg_no_free_space)
 933                                 return (B_FALSE);
 934                 }
 935 
 936                 /*
 937                  * We didn't find another group to handle the allocation
 938                  * so we can't skip this metaslab group even though
 939                  * we are at or over our qmax.
 940                  */
 941                 return (B_TRUE);
 942 
 943         } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
 944                 return (B_TRUE);
 945         }
 946         return (B_FALSE);
 947 }
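
/*
 * Illustrative scenario (the qmax value is hypothetical): with the
 * allocation throttle enabled, mg_max_alloc_queue_depth = 64 and 64
 * allocations already queued against this group, we skip the group only if
 * some later group in the rotor is both below its own qmax and not flagged
 * mg_no_free_space; otherwise we allocate here despite being at the limit.
 */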
 948 
 949 /*
 950  * ==========================================================================
 951  * Range tree callbacks
 952  * ==========================================================================
 953  */
 954 
 955 /*
 956  * Comparison function for the private size-ordered tree. Tree is sorted
 957  * by size, larger sizes at the end of the tree.
 958  */
 959 static int
 960 metaslab_rangesize_compare(const void *x1, const void *x2)
 961 {
 962         const range_seg_t *r1 = x1;
 963         const range_seg_t *r2 = x2;
 964         uint64_t rs_size1 = r1->rs_end - r1->rs_start;
 965         uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 966 
 967         if (rs_size1 < rs_size2)
 968                 return (-1);
 969         if (rs_size1 > rs_size2)
 970                 return (1);
 971 
 972         if (r1->rs_start < r2->rs_start)
 973                 return (-1);
 974 
 975         if (r1->rs_start > r2->rs_start)
 976                 return (1);
 977 
 978         return (0);
 979 }
 980 
 981 /*
 982  * Create any block allocator specific components. The current allocators
 983  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
 984  */
 985 static void
 986 metaslab_rt_create(range_tree_t *rt, void *arg)
 987 {
 988         metaslab_t *msp = arg;
 989 
 990         ASSERT3P(rt->rt_arg, ==, msp);
 991         ASSERT(msp->ms_tree == NULL);
 992 
 993         avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
 994             sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 995 }
 996 
 997 /*
 998  * Destroy the block allocator specific components.
 999  */
1000 static void
1001 metaslab_rt_destroy(range_tree_t *rt, void *arg)
1002 {
1003         metaslab_t *msp = arg;
1004 
1005         ASSERT3P(rt->rt_arg, ==, msp);
1006         ASSERT3P(msp->ms_tree, ==, rt);
1007         ASSERT0(avl_numnodes(&msp->ms_size_tree));
1008 
1009         avl_destroy(&msp->ms_size_tree);
1010 }
1011 
1012 static void
1013 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
1014 {
1015         metaslab_t *msp = arg;
1016 
1017         ASSERT3P(rt->rt_arg, ==, msp);
1018         ASSERT3P(msp->ms_tree, ==, rt);
1019         VERIFY(!msp->ms_condensing);
1020         avl_add(&msp->ms_size_tree, rs);
1021 }
1022 
1023 static void
1024 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
1025 {
1026         metaslab_t *msp = arg;
1027 
1028         ASSERT3P(rt->rt_arg, ==, msp);
1029         ASSERT3P(msp->ms_tree, ==, rt);
1030         VERIFY(!msp->ms_condensing);
1031         avl_remove(&msp->ms_size_tree, rs);
1032 }
1033 
1034 static void
1035 metaslab_rt_vacate(range_tree_t *rt, void *arg)
1036 {
1037         metaslab_t *msp = arg;
1038 
1039         ASSERT3P(rt->rt_arg, ==, msp);
1040         ASSERT3P(msp->ms_tree, ==, rt);
1041 
1042         /*
1043          * Normally one would walk the tree freeing nodes along the way.
1044          * Since the nodes are shared with the range trees we can avoid
1045          * walking all nodes and just reinitialize the avl tree. The nodes
1046          * will be freed by the range tree, so we don't want to free them here.
1047          */
1048         avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
1049             sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1050 }
1051 
1052 static range_tree_ops_t metaslab_rt_ops = {
1053         metaslab_rt_create,
1054         metaslab_rt_destroy,
1055         metaslab_rt_add,
1056         metaslab_rt_remove,
1057         metaslab_rt_vacate
1058 };
1059 
1060 /*
1061  * ==========================================================================
1062  * Common allocator routines
1063  * ==========================================================================
1064  */
1065 
1066 /*
1067  * Return the maximum contiguous segment within the metaslab.
1068  */
1069 uint64_t
1070 metaslab_block_maxsize(metaslab_t *msp)
1071 {
1072         avl_tree_t *t = &msp->ms_size_tree;
1073         range_seg_t *rs;
1074 
1075         if (t == NULL || (rs = avl_last(t)) == NULL)
1076                 return (0ULL);
1077 
1078         return (rs->rs_end - rs->rs_start);
1079 }
1080 
1081 static range_seg_t *
1082 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1083 {
1084         range_seg_t *rs, rsearch;
1085         avl_index_t where;
1086 
1087         rsearch.rs_start = start;
1088         rsearch.rs_end = start + size;
1089 
1090         rs = avl_find(t, &rsearch, &where);
1091         if (rs == NULL) {
1092                 rs = avl_nearest(t, where, AVL_AFTER);
1093         }
1094 
1095         return (rs);
1096 }
1097 
1098 /*
1099  * This is a helper function that can be used by the allocator to find
1100  * a suitable block to allocate. This will search the specified AVL
1101  * tree looking for a block that matches the specified criteria.
1102  */
1103 static uint64_t
1104 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1105     uint64_t align)
1106 {
1107         range_seg_t *rs = metaslab_block_find(t, *cursor, size);
1108 
1109         while (rs != NULL) {
1110                 uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1111 
1112                 if (offset + size <= rs->rs_end) {
1113                         *cursor = offset + size;
1114                         return (offset);
1115                 }
1116                 rs = AVL_NEXT(t, rs);
1117         }
1118 
1119         /*
1120          * If we know we've searched the whole map (*cursor == 0), give up.
1121          * Otherwise, reset the cursor to the beginning and try again.
1122          */
1123         if (*cursor == 0)
1124                 return (-1ULL);
1125 
1126         *cursor = 0;
1127         return (metaslab_block_picker(t, cursor, size, align));
1128 }
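
/*
 * Worked example (illustrative offsets): with size = 8K and align = 4K, a
 * candidate segment [0x13400, 0x20000) yields
 * offset = P2ROUNDUP(0x13400, 0x1000) = 0x14000; since 0x14000 + 0x2000 is
 * within the segment, the allocation succeeds and *cursor advances to
 * 0x16000.
 */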
1129 
1130 /*
1131  * ==========================================================================
1132  * The first-fit block allocator
1133  * ==========================================================================
1134  */
1135 static uint64_t
1136 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1137 {
1138         /*
1139          * Find the largest power of 2 block size that evenly divides the
1140          * requested size. This is used to try to allocate blocks with similar
1141          * alignment from the same area of the metaslab (i.e. same cursor
 1142          * bucket), but it does not guarantee that allocations of other sizes
 1143          * will not be made in the same region.
1144          */
1145         uint64_t align = size & -size;
1146         uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1147         avl_tree_t *t = &msp->ms_tree->rt_root;
1148 
1149         return (metaslab_block_picker(t, cursor, size, align));
1150 }
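
/*
 * Worked example of the cursor-bucket selection above (illustrative size):
 * a 24K (0x6000) request gives align = 0x6000 & -0x6000 = 0x2000 (8K), so
 * the search starts from the cursor stored in
 * ms_lbas[highbit64(0x2000) - 1], i.e. ms_lbas[13].
 */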
1151 
1152 static metaslab_ops_t metaslab_ff_ops = {
1153         metaslab_ff_alloc
1154 };
1155 
1156 /*
1157  * ==========================================================================
1158  * Dynamic block allocator -
 1159  * Uses the first-fit allocation scheme until space gets low and then
1160  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1161  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1162  * ==========================================================================
1163  */
1164 static uint64_t
1165 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1166 {
1167         /*
1168          * Find the largest power of 2 block size that evenly divides the
1169          * requested size. This is used to try to allocate blocks with similar
1170          * alignment from the same area of the metaslab (i.e. same cursor
 1171          * bucket), but it does not guarantee that allocations of other sizes
 1172          * will not be made in the same region.
1173          */
1174         uint64_t align = size & -size;
1175         uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1176         range_tree_t *rt = msp->ms_tree;
1177         avl_tree_t *t = &rt->rt_root;
1178         uint64_t max_size = metaslab_block_maxsize(msp);
1179         int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1180 
1181         ASSERT(MUTEX_HELD(&msp->ms_lock));
1182         ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1183 
1184         if (max_size < size)
1185                 return (-1ULL);
1186 
1187         /*
1188          * If we're running low on space switch to using the size
1189          * sorted AVL tree (best-fit).
1190          */
1191         if (max_size < metaslab_df_alloc_threshold ||
1192             free_pct < metaslab_df_free_pct) {
1193                 t = &msp->ms_size_tree;
1194                 *cursor = 0;
1195         }
1196 
1197         return (metaslab_block_picker(t, cursor, size, 1ULL));
1198 }
1199 
1200 static metaslab_ops_t metaslab_df_ops = {
1201         metaslab_df_alloc
1202 };
1203 
1204 /*
1205  * ==========================================================================
1206  * Cursor fit block allocator -
1207  * Select the largest region in the metaslab, set the cursor to the beginning
1208  * of the range and the cursor_end to the end of the range. As allocations
1209  * are made advance the cursor. Continue allocating from the cursor until
1210  * the range is exhausted and then find a new range.
1211  * ==========================================================================
1212  */
1213 static uint64_t
1214 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1215 {
1216         range_tree_t *rt = msp->ms_tree;
1217         avl_tree_t *t = &msp->ms_size_tree;
1218         uint64_t *cursor = &msp->ms_lbas[0];
1219         uint64_t *cursor_end = &msp->ms_lbas[1];
1220         uint64_t offset = 0;
1221 
1222         ASSERT(MUTEX_HELD(&msp->ms_lock));
1223         ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1224 
1225         ASSERT3U(*cursor_end, >=, *cursor);
1226 
1227         if ((*cursor + size) > *cursor_end) {
1228                 range_seg_t *rs;
1229 
1230                 rs = avl_last(&msp->ms_size_tree);
1231                 if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1232                         return (-1ULL);
1233 
1234                 *cursor = rs->rs_start;
1235                 *cursor_end = rs->rs_end;
1236         }
1237 
1238         offset = *cursor;
1239         *cursor += size;
1240 
1241         return (offset);
1242 }
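
/*
 * Worked example (illustrative offsets): if the largest free segment is
 * [0x100000, 0x140000), the first 8K request sets *cursor = 0x100000 and
 * *cursor_end = 0x140000, returns 0x100000, and leaves *cursor at 0x102000.
 * Subsequent requests keep carving from this range until *cursor + size
 * would pass *cursor_end, at which point the then-largest segment is
 * selected.
 */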
1243 
1244 static metaslab_ops_t metaslab_cf_ops = {
1245         metaslab_cf_alloc
1246 };
1247 
1248 /*
1249  * ==========================================================================
1250  * New dynamic fit allocator -
1251  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1252  * contiguous blocks. If no region is found then just use the largest segment
1253  * that remains.
1254  * ==========================================================================
1255  */
1256 
1257 /*
1258  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1259  * to request from the allocator.
1260  */
1261 uint64_t metaslab_ndf_clump_shift = 4;
1262 
1263 static uint64_t
1264 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1265 {
1266         avl_tree_t *t = &msp->ms_tree->rt_root;
1267         avl_index_t where;
1268         range_seg_t *rs, rsearch;
1269         uint64_t hbit = highbit64(size);
1270         uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1271         uint64_t max_size = metaslab_block_maxsize(msp);
1272 
1273         ASSERT(MUTEX_HELD(&msp->ms_lock));
1274         ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1275 
1276         if (max_size < size)
1277                 return (-1ULL);
1278 
1279         rsearch.rs_start = *cursor;
1280         rsearch.rs_end = *cursor + size;
1281 
1282         rs = avl_find(t, &rsearch, &where);
1283         if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1284                 t = &msp->ms_size_tree;
1285 
1286                 rsearch.rs_start = 0;
1287                 rsearch.rs_end = MIN(max_size,
1288                     1ULL << (hbit + metaslab_ndf_clump_shift));
1289                 rs = avl_find(t, &rsearch, &where);
1290                 if (rs == NULL)
1291                         rs = avl_nearest(t, where, AVL_AFTER);
1292                 ASSERT(rs != NULL);
1293         }
1294 
1295         if ((rs->rs_end - rs->rs_start) >= size) {
1296                 *cursor = rs->rs_start + size;
1297                 return (rs->rs_start);
1298         }
1299         return (-1ULL);
1300 }
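
/*
 * Worked example (illustrative size): an 8K (0x2000) request has
 * hbit = highbit64(0x2000) = 14. If no suitable segment is found near
 * *cursor, the fallback searches the size-sorted tree for a region of up to
 * MIN(max_size, 1ULL << (14 + metaslab_ndf_clump_shift)), i.e. 256K with
 * the default clump shift of 4, and carves the 8K from its start.
 */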
1301 
1302 static metaslab_ops_t metaslab_ndf_ops = {
1303         metaslab_ndf_alloc
1304 };
1305 
1306 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1307 
1308 /*
1309  * ==========================================================================
1310  * Metaslabs
1311  * ==========================================================================
1312  */
1313 
1314 /*
1315  * Wait for any in-progress metaslab loads to complete.
1316  */
1317 void
1318 metaslab_load_wait(metaslab_t *msp)
1319 {
1320         ASSERT(MUTEX_HELD(&msp->ms_lock));
1321 
1322         while (msp->ms_loading) {
1323                 ASSERT(!msp->ms_loaded);
1324                 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1325         }
1326 }
1327 
1328 int
1329 metaslab_load(metaslab_t *msp)
1330 {
1331         int error = 0;
1332         boolean_t success = B_FALSE;
1333 
1334         ASSERT(MUTEX_HELD(&msp->ms_lock));
1335         ASSERT(!msp->ms_loaded);
1336         ASSERT(!msp->ms_loading);
1337 
1338         msp->ms_loading = B_TRUE;
1339 
1340         /*
1341          * If the space map has not been allocated yet, then treat
1342          * all the space in the metaslab as free and add it to the
1343          * ms_tree.
1344          */
1345         if (msp->ms_sm != NULL)
1346                 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1347         else
1348                 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1349 
1350         success = (error == 0);
1351         msp->ms_loading = B_FALSE;
1352 
1353         if (success) {
1354                 ASSERT3P(msp->ms_group, !=, NULL);
1355                 msp->ms_loaded = B_TRUE;
1356 
1357                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1358                         range_tree_walk(msp->ms_defertree[t],
1359                             range_tree_remove, msp->ms_tree);
1360                 }
1361                 msp->ms_max_size = metaslab_block_maxsize(msp);
1362         }
1363         cv_broadcast(&msp->ms_load_cv);
1364         return (error);
1365 }
1366 
1367 void
1368 metaslab_unload(metaslab_t *msp)
1369 {
1370         ASSERT(MUTEX_HELD(&msp->ms_lock));
1371         range_tree_vacate(msp->ms_tree, NULL, NULL);
1372         msp->ms_loaded = B_FALSE;
1373         msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1374         msp->ms_max_size = 0;
1375 }
1376 
1377 int
1378 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1379     metaslab_t **msp)
1380 {
1381         vdev_t *vd = mg->mg_vd;
1382         objset_t *mos = vd->vdev_spa->spa_meta_objset;
1383         metaslab_t *ms;
1384         int error;
1385 
1386         ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1387         mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1388         cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1389         ms->ms_id = id;
1390         ms->ms_start = id << vd->vdev_ms_shift;
1391         ms->ms_size = 1ULL << vd->vdev_ms_shift;
1392 
1393         /*
1394          * We only open space map objects that already exist. All others
 1395          * will be opened when we finally allocate an object for them.
1396          */
1397         if (object != 0) {
1398                 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1399                     ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
1400 
1401                 if (error != 0) {
1402                         kmem_free(ms, sizeof (metaslab_t));
1403                         return (error);
1404                 }
1405 
1406                 ASSERT(ms->ms_sm != NULL);
1407         }
1408 
1409         /*
1410          * We create the main range tree here, but we don't create the
1411          * other range trees until metaslab_sync_done().  This serves
1412          * two purposes: it allows metaslab_sync_done() to detect the
1413          * addition of new space; and for debugging, it ensures that we'd
1414          * data fault on any attempt to use this metaslab before it's ready.
1415          */
1416         ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
1417         metaslab_group_add(mg, ms);
1418 
1419         metaslab_set_fragmentation(ms);
1420 
1421         /*
1422          * If we're opening an existing pool (txg == 0) or creating
1423          * a new one (txg == TXG_INITIAL), all space is available now.
1424          * If we're adding space to an existing pool, the new space
1425          * does not become available until after this txg has synced.
1426          * The metaslab's weight will also be initialized when we sync
1427          * out this txg. This ensures that we don't attempt to allocate
1428          * from it before we have initialized it completely.
1429          */
1430         if (txg <= TXG_INITIAL)
1431                 metaslab_sync_done(ms, 0);
1432 
1433         /*
1434          * If metaslab_debug_load is set and we're initializing a metaslab
 1435          * that has an allocated space map object, then load its space
 1436          * map so that we can verify frees.
1437          */
1438         if (metaslab_debug_load && ms->ms_sm != NULL) {
1439                 mutex_enter(&ms->ms_lock);
1440                 VERIFY0(metaslab_load(ms));
1441                 mutex_exit(&ms->ms_lock);
1442         }
1443 
1444         if (txg != 0) {
1445                 vdev_dirty(vd, 0, NULL, txg);
1446                 vdev_dirty(vd, VDD_METASLAB, ms, txg);
1447         }
1448 
1449         *msp = ms;
1450 
1451         return (0);
1452 }
1453 
1454 void
1455 metaslab_fini(metaslab_t *msp)
1456 {
1457         metaslab_group_t *mg = msp->ms_group;
1458 
1459         metaslab_group_remove(mg, msp);
1460 
1461         mutex_enter(&msp->ms_lock);
1462         VERIFY(msp->ms_group == NULL);
1463         vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1464             0, -msp->ms_size);
1465         space_map_close(msp->ms_sm);
1466 
1467         metaslab_unload(msp);
1468         range_tree_destroy(msp->ms_tree);
1469         range_tree_destroy(msp->ms_freeingtree);
1470         range_tree_destroy(msp->ms_freedtree);
1471 
1472         for (int t = 0; t < TXG_SIZE; t++) {
1473                 range_tree_destroy(msp->ms_alloctree[t]);
1474         }
1475 
1476         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1477                 range_tree_destroy(msp->ms_defertree[t]);
1478         }
1479 
1480         ASSERT0(msp->ms_deferspace);
1481 
1482         mutex_exit(&msp->ms_lock);
1483         cv_destroy(&msp->ms_load_cv);
1484         mutex_destroy(&msp->ms_lock);
1485 
1486         kmem_free(msp, sizeof (metaslab_t));
1487 }
1488 
1489 #define FRAGMENTATION_TABLE_SIZE        17
1490 
1491 /*
1492  * This table defines a segment size based fragmentation metric that will
1493  * allow each metaslab to derive its own fragmentation value. This is done
1494  * by calculating the space in each bucket of the spacemap histogram and
 1495  * multiplying that by the fragmentation metric in this table. Doing
1496  * this for all buckets and dividing it by the total amount of free
1497  * space in this metaslab (i.e. the total free space in all buckets) gives
1498  * us the fragmentation metric. This means that a high fragmentation metric
1499  * equates to most of the free space being comprised of small segments.
1500  * Conversely, if the metric is low, then most of the free space is in
1501  * large segments. A 10% change in fragmentation equates to approximately
1502  * double the number of segments.
1503  *
1504  * This table defines 0% fragmented space using 16MB segments. Testing has
1505  * shown that segments that are greater than or equal to 16MB do not suffer
1506  * from drastic performance problems. Using this value, we derive the rest
1507  * of the table. Since the fragmentation value is never stored on disk, it
1508  * is possible to change these calculations in the future.
1509  */
1510 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1511         100,    /* 512B */
1512         100,    /* 1K   */
1513         98,     /* 2K   */
1514         95,     /* 4K   */
1515         90,     /* 8K   */
1516         80,     /* 16K  */
1517         70,     /* 32K  */
1518         60,     /* 64K  */
1519         50,     /* 128K */
1520         40,     /* 256K */
1521         30,     /* 512K */
1522         20,     /* 1M   */
1523         15,     /* 2M   */
1524         10,     /* 4M   */
1525         5,      /* 8M   */
1526         0       /* 16M  */
1527 };
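     /*
      * Illustrative example (hypothetical numbers): assuming a space map
      * whose sm_shift equals SPA_MINBLOCKSHIFT, so that histogram bucket i
      * maps directly to table index i, a metaslab whose free space consists
      * of 1GB in 64K segments (bucket 7, factor 60) and 1GB in 4M segments
      * (bucket 13, factor 10) works out to:
      *
      *	fragmentation = (1G * 60 + 1G * 10) / 2G = 35
      *
      * per the loop in metaslab_set_fragmentation() below.
      */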
1528 
1529 /*
1530  * Calculate the metaslab's fragmentation metric and store it in
1531  * ms_fragmentation. A value of ZFS_FRAG_INVALID means that the metaslab has
1532  * not been upgraded and does not support this metric. Otherwise, the value
1533  * should be in the range [0, 100].
1534  */
1535 static void
1536 metaslab_set_fragmentation(metaslab_t *msp)
1537 {
1538         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1539         uint64_t fragmentation = 0;
1540         uint64_t total = 0;
1541         boolean_t feature_enabled = spa_feature_is_enabled(spa,
1542             SPA_FEATURE_SPACEMAP_HISTOGRAM);
1543 
1544         if (!feature_enabled) {
1545                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1546                 return;
1547         }
1548 
1549         /*
1550          * A null space map means that the entire metaslab is free
1551          * and thus is not fragmented.
1552          */
1553         if (msp->ms_sm == NULL) {
1554                 msp->ms_fragmentation = 0;
1555                 return;
1556         }
1557 
1558         /*
1559          * If this metaslab's space map has not been upgraded, flag it
1560          * so that we upgrade next time we encounter it.
1561          */
1562         if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1563                 uint64_t txg = spa_syncing_txg(spa);
1564                 vdev_t *vd = msp->ms_group->mg_vd;
1565 
1566                 if (spa_writeable(spa)) {
1567                         msp->ms_condense_wanted = B_TRUE;
1568                         vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1569                         spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1570                             "msp %p, vd %p", txg, msp, vd);
1571                 }
1572                 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1573                 return;
1574         }
1575 
1576         for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1577                 uint64_t space = 0;
1578                 uint8_t shift = msp->ms_sm->sm_shift;
1579 
1580                 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1581                     FRAGMENTATION_TABLE_SIZE - 1);
1582 
1583                 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1584                         continue;
1585 
1586                 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1587                 total += space;
1588 
1589                 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1590                 fragmentation += space * zfs_frag_table[idx];
1591         }
1592 
1593         if (total > 0)
1594                 fragmentation /= total;
1595         ASSERT3U(fragmentation, <=, 100);
1596 
1597         msp->ms_fragmentation = fragmentation;
1598 }
1599 
1600 /*
1601  * Compute a weight -- a selection preference value -- for the given metaslab.
1602  * This is based on the amount of free space, the level of fragmentation,
1603  * the LBA range, and whether the metaslab is loaded.
1604  */
1605 static uint64_t
1606 metaslab_space_weight(metaslab_t *msp)
1607 {
1608         metaslab_group_t *mg = msp->ms_group;
1609         vdev_t *vd = mg->mg_vd;
1610         uint64_t weight, space;
1611 
1612         ASSERT(MUTEX_HELD(&msp->ms_lock));
1613         ASSERT(!vd->vdev_removing);
1614 
1615         /*
1616          * The baseline weight is the metaslab's free space.
1617          */
1618         space = msp->ms_size - space_map_allocated(msp->ms_sm);
1619 
1620         if (metaslab_fragmentation_factor_enabled &&
1621             msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1622                 /*
1623                  * Use the fragmentation information to inversely scale
1624                  * down the baseline weight. We need to ensure that we
1625                  * don't exclude this metaslab completely when it's 100%
1626                  * fragmented. To avoid this we reduce the fragmented value
1627                  * by 1.
1628                  */
1629                 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1630 
1631                 /*
1632                  * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1633                  * this metaslab again. The fragmentation metric may have
1634                  * decreased the space to something smaller than
1635                  * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1636                  * so that we can consume any remaining space.
1637                  */
1638                 if (space > 0 && space < SPA_MINBLOCKSIZE)
1639                         space = SPA_MINBLOCKSIZE;
1640         }
1641         weight = space;
1642 
1643         /*
1644          * Modern disks have uniform bit density and constant angular velocity.
1645          * Therefore, the outer recording zones are faster (higher bandwidth)
1646          * than the inner zones by the ratio of outer to inner track diameter,
1647          * which is typically around 2:1.  We account for this by assigning
1648          * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1649          * In effect, this means that we'll select the metaslab with the most
1650          * free bandwidth rather than simply the one with the most free space.
1651          */
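             /*
              * For example, on a hypothetical vdev with vdev_ms_count == 200,
              * the outermost metaslab (ms_id == 0) keeps weight = 2 * space,
              * the middle one (ms_id == 100) gets 1.5 * space, and the
              * innermost (ms_id == 199) gets just over 1 * space.
              */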
1652         if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
1653                 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1654                 ASSERT(weight >= space && weight <= 2 * space);
1655         }
1656 
1657         /*
1658          * If this metaslab is one we're actively using, adjust its
1659          * weight to make it preferable to any inactive metaslab so
1660          * we'll polish it off. If the fragmentation on this metaslab
1661          * has exceeded our threshold, then don't mark it active.
1662          */
1663         if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1664             msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1665                 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1666         }
1667 
1668         WEIGHT_SET_SPACEBASED(weight);
1669         return (weight);
1670 }
1671 
1672 /*
1673  * Return the weight of the specified metaslab, according to the segment-based
1674  * weighting algorithm. The metaslab must be loaded. This function can
1675  * be called within a sync pass since it relies only on the metaslab's
1676  * range tree which is always accurate when the metaslab is loaded.
1677  */
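     /*
      * For example, on a hypothetical vdev with ashift 12 (and assuming a
      * SPACE_MAP_HISTOGRAM_SIZE of 32), the largest index the weight can
      * report is 32 + 12 - 1 = 43. A single free segment in range tree
      * bucket 45, with buckets 44 and 43 empty, is folded down by the loop
      * below: the running count doubles at each step down, so the weight
      * encodes count 4 at index 43, which keeps loaded and unloaded
      * metaslabs comparable.
      */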
1678 static uint64_t
1679 metaslab_weight_from_range_tree(metaslab_t *msp)
1680 {
1681         uint64_t weight = 0;
1682         uint32_t segments = 0;
1683 
1684         ASSERT(msp->ms_loaded);
1685 
1686         for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
1687             i--) {
1688                 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
1689                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1690 
1691                 segments <<= 1;
1692                 segments += msp->ms_tree->rt_histogram[i];
1693 
1694                 /*
1695                  * The range tree provides more precision than the space map
1696                  * and must be downgraded so that all values fit within the
1697                  * space map's histogram. This allows us to compare loaded
1698                  * vs. unloaded metaslabs to determine which metaslab is
1699                  * considered "best".
1700                  */
1701                 if (i > max_idx)
1702                         continue;
1703 
1704                 if (segments != 0) {
1705                         WEIGHT_SET_COUNT(weight, segments);
1706                         WEIGHT_SET_INDEX(weight, i);
1707                         WEIGHT_SET_ACTIVE(weight, 0);
1708                         break;
1709                 }
1710         }
1711         return (weight);
1712 }
1713 
1714 /*
1715  * Calculate the weight based on the on-disk histogram. This should only
1716  * be called after a sync pass has completely finished since the on-disk
1717  * information is updated in metaslab_sync().
1718  */
1719 static uint64_t
1720 metaslab_weight_from_spacemap(metaslab_t *msp)
1721 {
1722         uint64_t weight = 0;
1723 
1724         for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1725                 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1726                         WEIGHT_SET_COUNT(weight,
1727                             msp->ms_sm->sm_phys->smp_histogram[i]);
1728                         WEIGHT_SET_INDEX(weight, i +
1729                             msp->ms_sm->sm_shift);
1730                         WEIGHT_SET_ACTIVE(weight, 0);
1731                         break;
1732                 }
1733         }
1734         return (weight);
1735 }
1736 
1737 /*
1738  * Compute a segment-based weight for the specified metaslab. The weight
1739  * is determined by the highest bucket in the histogram. The information
1740  * for the highest bucket is encoded into the weight value.
1741  */
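     /*
      * For example, a completely free 8GB metaslab (ms_size == 2^33) on an
      * ashift-9 vdev is encoded as count 1 at index 33 -- a single 8GB
      * segment -- since that index still fits below the largest index the
      * space map histogram can express (assuming SPACE_MAP_HISTOGRAM_SIZE of
      * 32, that limit is 32 + 9 - 1 = 40). A hypothetical 2^41-byte metaslab
      * would instead be capped at index 40 with count 1 << (41 - 40) = 2.
      */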
1742 static uint64_t
1743 metaslab_segment_weight(metaslab_t *msp)
1744 {
1745         metaslab_group_t *mg = msp->ms_group;
1746         uint64_t weight = 0;
1747         uint8_t shift = mg->mg_vd->vdev_ashift;
1748 
1749         ASSERT(MUTEX_HELD(&msp->ms_lock));
1750 
1751         /*
1752          * The metaslab is completely free.
1753          */
1754         if (space_map_allocated(msp->ms_sm) == 0) {
1755                 int idx = highbit64(msp->ms_size) - 1;
1756                 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1757 
1758                 if (idx < max_idx) {
1759                         WEIGHT_SET_COUNT(weight, 1ULL);
1760                         WEIGHT_SET_INDEX(weight, idx);
1761                 } else {
1762                         WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
1763                         WEIGHT_SET_INDEX(weight, max_idx);
1764                 }
1765                 WEIGHT_SET_ACTIVE(weight, 0);
1766                 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
1767 
1768                 return (weight);
1769         }
1770 
1771         ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1772 
1773         /*
1774          * If the metaslab is fully allocated then just make the weight 0.
1775          */
1776         if (space_map_allocated(msp->ms_sm) == msp->ms_size)
1777                 return (0);
1778         /*
1779          * If the metaslab is already loaded, then use the range tree to
1780          * determine the weight. Otherwise, we rely on the space map information
1781          * to generate the weight.
1782          */
1783         if (msp->ms_loaded) {
1784                 weight = metaslab_weight_from_range_tree(msp);
1785         } else {
1786                 weight = metaslab_weight_from_spacemap(msp);
1787         }
1788 
1789         /*
1790          * If the metaslab was active the last time we calculated its weight
1791          * then keep it active. We want to consume the entire region that
1792          * is associated with this weight.
1793          */
1794         if (msp->ms_activation_weight != 0 && weight != 0)
1795                 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
1796         return (weight);
1797 }
1798 
1799 /*
1800  * Determine if we should attempt to allocate from this metaslab. If the
1801  * metaslab has a maximum size then we can quickly determine if the desired
1802  * allocation size can be satisfied. Otherwise, if we're using segment-based
1803  * weighting then we can determine the maximum allocation that this metaslab
1804  * can accommodate based on the index encoded in the weight. If we're using
1805  * space-based weights then rely on the entire weight (excluding the weight
1806  * type bit).
1807  */
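     /*
      * For example, a segment-based weight with index 20 advertises free
      * segments somewhere in [1MB, 2MB), so any asize strictly less than 2MB
      * is worth attempting; a space-based weight of, say, 768K (ignoring the
      * weight type bit) permits allocations of at most 768K.
      */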
1808 boolean_t
1809 metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
1810 {
1811         boolean_t should_allocate;
1812 
1813         if (msp->ms_max_size != 0)
1814                 return (msp->ms_max_size >= asize);
1815 
1816         if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
1817                 /*
1818                  * The metaslab segment weight indicates segments in the
1819                  * range [2^i, 2^(i+1)), where i is the index in the weight.
1820                  * Since the asize might be in the middle of the range, we
1821                  * should attempt the allocation if asize < 2^(i+1).
1822                  */
1823                 should_allocate = (asize <
1824                     1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
1825         } else {
1826                 should_allocate = (asize <=
1827                     (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
1828         }
1829         return (should_allocate);
1830 }
1831 
1832 static uint64_t
1833 metaslab_weight(metaslab_t *msp)
1834 {
1835         vdev_t *vd = msp->ms_group->mg_vd;
1836         spa_t *spa = vd->vdev_spa;
1837         uint64_t weight;
1838 
1839         ASSERT(MUTEX_HELD(&msp->ms_lock));
1840 
1841         /*
1842          * This vdev is in the process of being removed so there is nothing
1843          * for us to do here.
1844          */
1845         if (vd->vdev_removing) {
1846                 ASSERT0(space_map_allocated(msp->ms_sm));
1847                 ASSERT0(vd->vdev_ms_shift);
1848                 return (0);
1849         }
1850 
1851         metaslab_set_fragmentation(msp);
1852 
1853         /*
1854          * Update the maximum size if the metaslab is loaded. This will
1855          * ensure that we get an accurate maximum size if newly freed space
1856          * has been added back into the free tree.
1857          */
1858         if (msp->ms_loaded)
1859                 msp->ms_max_size = metaslab_block_maxsize(msp);
1860 
1861         /*
1862          * Segment-based weighting requires space map histogram support.
1863          */
1864         if (zfs_metaslab_segment_weight_enabled &&
1865             spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
1866             (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
1867             sizeof (space_map_phys_t))) {
1868                 weight = metaslab_segment_weight(msp);
1869         } else {
1870                 weight = metaslab_space_weight(msp);
1871         }
1872         return (weight);
1873 }
1874 
1875 static int
1876 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1877 {
1878         ASSERT(MUTEX_HELD(&msp->ms_lock));
1879 
1880         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1881                 metaslab_load_wait(msp);
1882                 if (!msp->ms_loaded) {
1883                         int error = metaslab_load(msp);
1884                         if (error) {
1885                                 metaslab_group_sort(msp->ms_group, msp, 0);
1886                                 return (error);
1887                         }
1888                 }
1889 
1890                 msp->ms_activation_weight = msp->ms_weight;
1891                 metaslab_group_sort(msp->ms_group, msp,
1892                     msp->ms_weight | activation_weight);
1893         }
1894         ASSERT(msp->ms_loaded);
1895         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1896 
1897         return (0);
1898 }
1899 
1900 static void
1901 metaslab_passivate(metaslab_t *msp, uint64_t weight)
1902 {
1903         uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
1904 
1905         /*
1906          * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1907          * this metaslab again.  In that case, it had better be empty,
1908          * or we would be leaving space on the table.
1909          */
1910         ASSERT(size >= SPA_MINBLOCKSIZE ||
1911             range_tree_space(msp->ms_tree) == 0);
1912         ASSERT0(weight & METASLAB_ACTIVE_MASK);
1913 
1914         msp->ms_activation_weight = 0;
1915         metaslab_group_sort(msp->ms_group, msp, weight);
1916         ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1917 }
1918 
1919 /*
1920  * Segment-based metaslabs are activated once and remain active until
1921  * we either fail an allocation attempt (similar to space-based metaslabs)
1922  * or have exhausted the free space in zfs_metaslab_switch_threshold
1923  * buckets since the metaslab was activated. This function checks to see
1924  * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
1925  * metaslab and passivates it proactively. This will allow us to select a
1926  * metaslab with a larger contiguous region, if any remain within this
1927  * metaslab group. If we're in sync pass > 1, then we continue using this
1928  * metaslab so that we don't dirty more blocks and cause more sync passes.
1929  */
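     /*
      * For example, if a metaslab was activated at index 23 (free segments in
      * the 8MB-16MB bucket) and zfs_metaslab_switch_threshold is 2, it is
      * passivated once its recomputed index drops to 21 or below, i.e. once
      * nothing larger than the 2MB-4MB bucket remains.
      */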
1930 void
1931 metaslab_segment_may_passivate(metaslab_t *msp)
1932 {
1933         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1934 
1935         if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
1936                 return;
1937 
1938         /*
1939          * Since we are in the middle of a sync pass, the most accurate
1940          * information that is accessible to us is the in-core range tree
1941          * histogram; calculate the new weight based on that information.
1942          */
1943         uint64_t weight = metaslab_weight_from_range_tree(msp);
1944         int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
1945         int current_idx = WEIGHT_GET_INDEX(weight);
1946 
1947         if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
1948                 metaslab_passivate(msp, weight);
1949 }
1950 
1951 static void
1952 metaslab_preload(void *arg)
1953 {
1954         metaslab_t *msp = arg;
1955         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1956 
1957         ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1958 
1959         mutex_enter(&msp->ms_lock);
1960         metaslab_load_wait(msp);
1961         if (!msp->ms_loaded)
1962                 (void) metaslab_load(msp);
1963         msp->ms_selected_txg = spa_syncing_txg(spa);
1964         mutex_exit(&msp->ms_lock);
1965 }
1966 
1967 static void
1968 metaslab_group_preload(metaslab_group_t *mg)
1969 {
1970         spa_t *spa = mg->mg_vd->vdev_spa;
1971         metaslab_t *msp;
1972         avl_tree_t *t = &mg->mg_metaslab_tree;
1973         int m = 0;
1974 
1975         if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1976                 taskq_wait(mg->mg_taskq);
1977                 return;
1978         }
1979 
1980         mutex_enter(&mg->mg_lock);
1981         /*
1982          * Load the next potential metaslabs
1983          */
1984         for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
1985                 /*
1986                  * We preload only the maximum number of metaslabs specified
1987                  * by metaslab_preload_limit. If a metaslab is being forced
1988                  * to condense then we preload it too. This will ensure
1989                  * that force condensing happens in the next txg.
1990                  */
1991                 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
1992                         continue;
1993                 }
1994 
1995                 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1996                     msp, TQ_SLEEP) != NULL);
1997         }
1998         mutex_exit(&mg->mg_lock);
1999 }
2000 
2001 /*
2002  * Determine if the space map's on-disk footprint is past our tolerance
2003  * for inefficiency. We would like to use the following criteria to make
2004  * our decision:
2005  *
2006  * 1. The size of the space map object should not dramatically increase as a
2007  * result of writing out the free space range tree.
2008  *
2009  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
2010  * times the size of the free space range tree representation
2011  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
2012  *
2013  * 3. The on-disk size of the space map should actually decrease.
2014  *
2015  * Checking the first condition is tricky since we don't want to walk
2016  * the entire AVL tree calculating the estimated on-disk size. Instead we
2017  * use the size-ordered range tree in the metaslab and calculate the
2018  * size required to write out the largest segment in our free tree. If the
2019  * size required to represent that segment on disk is larger than the space
2020  * map object then we avoid condensing this map.
2021  *
2022  * To determine the second criterion we use a best-case estimate and assume
2023  * each segment can be represented on-disk as a single 64-bit entry. We refer
2024  * to this best-case estimate as the space map's minimal form.
2025  *
2026  * Unfortunately, we cannot compute the on-disk size of the space map in this
2027  * context because we cannot accurately compute the effects of compression, etc.
2028  * Instead, we apply the heuristic described in the block comment for
2029  * zfs_metaslab_condense_block_threshold - we only condense if the space used
2030  * is greater than a threshold number of blocks.
2031  */
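     /*
      * As a hypothetical example with the default tunables (zfs_condense_pct
      * = 200, zfs_metaslab_condense_block_threshold = 4): a metaslab whose
      * on-disk space map is 64KB long, whose in-core free tree holds 1000
      * segments (a minimal form of ~8KB), and whose largest free segment
      * needs only a few 64-bit entries satisfies all three checks --
      * 64KB >= 2 * 8KB, 64KB exceeds 4 space map blocks (e.g. 4 * 4KB), and
      * the largest segment's on-disk size is well under 64KB -- so it would
      * be condensed.
      */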
2032 static boolean_t
2033 metaslab_should_condense(metaslab_t *msp)
2034 {
2035         space_map_t *sm = msp->ms_sm;
2036         range_seg_t *rs;
2037         uint64_t size, entries, segsz, object_size, optimal_size, record_size;
2038         dmu_object_info_t doi;
2039         uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
2040 
2041         ASSERT(MUTEX_HELD(&msp->ms_lock));
2042         ASSERT(msp->ms_loaded);
2043 
2044         /*
2045          * Use the ms_size_tree range tree, which is ordered by size, to
2046          * obtain the largest segment in the free tree. We always condense
2047          * metaslabs that are empty and metaslabs for which a condense
2048          * request has been made.
2049          */
2050         rs = avl_last(&msp->ms_size_tree);
2051         if (rs == NULL || msp->ms_condense_wanted)
2052                 return (B_TRUE);
2053 
2054         /*
2055          * Calculate the number of 64-bit entries this segment would
2056          * require when written to disk. If this single segment would be
2057          * larger on-disk than the entire current on-disk structure, then
2058          * clearly condensing will increase the on-disk structure size.
2059          */
2060         size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
2061         entries = size / (MIN(size, SM_RUN_MAX));
2062         segsz = entries * sizeof (uint64_t);
2063 
2064         optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
2065         object_size = space_map_length(msp->ms_sm);
2066 
2067         dmu_object_info_from_db(sm->sm_dbuf, &doi);
2068         record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
2069 
2070         return (segsz <= object_size &&
2071             object_size >= (optimal_size * zfs_condense_pct / 100) &&
2072             object_size > zfs_metaslab_condense_block_threshold * record_size);
2073 }
2074 
2075 /*
2076  * Condense the on-disk space map representation to its minimized form.
2077  * The minimized form consists of a small number of allocations followed by
2078  * the entries of the free range tree.
2079  */
2080 static void
2081 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
2082 {
2083         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2084         range_tree_t *condense_tree;
2085         space_map_t *sm = msp->ms_sm;
2086 
2087         ASSERT(MUTEX_HELD(&msp->ms_lock));
2088         ASSERT3U(spa_sync_pass(spa), ==, 1);
2089         ASSERT(msp->ms_loaded);
2090 
2091 
2092         spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
2093             "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
2094             msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
2095             msp->ms_group->mg_vd->vdev_spa->spa_name,
2096             space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
2097             msp->ms_condense_wanted ? "TRUE" : "FALSE");
2098 
2099         msp->ms_condense_wanted = B_FALSE;
2100 
2101         /*
2102          * Create a range tree that is 100% allocated. We remove segments
2103          * that have been freed in this txg, any deferred frees that exist,
2104          * and any allocation in the future. Removing segments should be
2105          * a relatively inexpensive operation since we expect these trees to
2106          * have a small number of nodes.
2107          */
2108         condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
2109         range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
2110 
2111         /*
2112          * Remove what's been freed in this txg from the condense_tree.
2113          * Since we're in sync_pass 1, we know that all the frees from
2114          * this txg are in the freeingtree.
2115          */
2116         range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
2117 
2118         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2119                 range_tree_walk(msp->ms_defertree[t],
2120                     range_tree_remove, condense_tree);
2121         }
2122 
2123         for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2124                 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
2125                     range_tree_remove, condense_tree);
2126         }
2127 
2128         /*
2129          * We're about to drop the metaslab's lock thus allowing
2130          * other consumers to change its content. Set the
2131          * metaslab's ms_condensing flag to ensure that
2132          * allocations on this metaslab do not occur while we're
2133          * in the middle of committing it to disk. This is only critical
2134          * for the ms_tree as all other range trees use per txg
2135          * views of their content.
2136          */
2137         msp->ms_condensing = B_TRUE;
2138 
2139         mutex_exit(&msp->ms_lock);
2140         space_map_truncate(sm, tx);
2141         mutex_enter(&msp->ms_lock);
2142 
2143         /*
2144          * While we would ideally like to create a space map representation
2145          * that consists only of allocation records, doing so can be
2146          * prohibitively expensive because the in-core free tree can be
2147          * large, and therefore computationally expensive to subtract
2148          * from the condense_tree. Instead we sync out two trees, a cheap
2149          * allocation only tree followed by the in-core free tree. While not
2150          * optimal, this is typically close to optimal, and much cheaper to
2151          * compute.
2152          */
2153         space_map_write(sm, condense_tree, SM_ALLOC, tx);
2154         range_tree_vacate(condense_tree, NULL, NULL);
2155         range_tree_destroy(condense_tree);
2156 
2157         space_map_write(sm, msp->ms_tree, SM_FREE, tx);
2158         msp->ms_condensing = B_FALSE;
2159 }
2160 
2161 /*
2162  * Write a metaslab to disk in the context of the specified transaction group.
2163  */
2164 void
2165 metaslab_sync(metaslab_t *msp, uint64_t txg)
2166 {
2167         metaslab_group_t *mg = msp->ms_group;
2168         vdev_t *vd = mg->mg_vd;
2169         spa_t *spa = vd->vdev_spa;
2170         objset_t *mos = spa_meta_objset(spa);
2171         range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
2172         dmu_tx_t *tx;
2173         uint64_t object = space_map_object(msp->ms_sm);
2174 
2175         ASSERT(!vd->vdev_ishole);
2176 
2177         /*
2178          * This metaslab has just been added so there's no work to do now.
2179          */
2180         if (msp->ms_freeingtree == NULL) {
2181                 ASSERT3P(alloctree, ==, NULL);
2182                 return;
2183         }
2184 
2185         ASSERT3P(alloctree, !=, NULL);
2186         ASSERT3P(msp->ms_freeingtree, !=, NULL);
2187         ASSERT3P(msp->ms_freedtree, !=, NULL);
2188 
2189         /*
2190          * Normally, we don't want to process a metaslab if there
2191          * are no allocations or frees to perform. However, if the metaslab
2192          * is being forced to condense we need to let it through.
2193          */
2194         if (range_tree_space(alloctree) == 0 &&
2195             range_tree_space(msp->ms_freeingtree) == 0 &&
2196             !msp->ms_condense_wanted)
2197                 return;
2198 
2199         /*
2200          * The only state that can actually be changing concurrently with
2201          * metaslab_sync() is the metaslab's ms_tree.  No other thread can
2202          * be modifying this txg's alloctree, freeingtree, freedtree, or
2203          * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
2204          * space map ASSERTs. We drop it whenever we call into the DMU,
2205          * because the DMU can call down to us (e.g. via zio_free()) at
2206          * any time.
2207          */
2208 
2209         tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2210 
2211         if (msp->ms_sm == NULL) {
2212                 uint64_t new_object;
2213 
2214                 new_object = space_map_alloc(mos, tx);
2215                 VERIFY3U(new_object, !=, 0);
2216 
2217                 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2218                     msp->ms_start, msp->ms_size, vd->vdev_ashift,
2219                     &msp->ms_lock));
2220                 ASSERT(msp->ms_sm != NULL);
2221         }
2222 
2223         mutex_enter(&msp->ms_lock);
2224 
2225         /*
2226          * Note: metaslab_condense() clears the space map's histogram.
2227          * Therefore we must verify and remove this histogram before
2228          * condensing.
2229          */
2230         metaslab_group_histogram_verify(mg);
2231         metaslab_class_histogram_verify(mg->mg_class);
2232         metaslab_group_histogram_remove(mg, msp);
2233 
2234         if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
2235             metaslab_should_condense(msp)) {
2236                 metaslab_condense(msp, txg, tx);
2237         } else {
2238                 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
2239                 space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
2240         }
2241 
2242         if (msp->ms_loaded) {
2243                 /*
2244                  * When the space map is loaded, we have an accurate
2245                  * histogram in the range tree. This gives us an opportunity
2246                  * to bring the space map's histogram up-to-date so we clear
2247                  * it first before updating it.
2248                  */
2249                 space_map_histogram_clear(msp->ms_sm);
2250                 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
2251 
2252                 /*
2253                  * Since we've cleared the histogram we need to add back
2254                  * any free space that has already been processed, plus
2255                  * any deferred space. This allows the on-disk histogram
2256                  * to accurately reflect all free space even if some space
2257                  * is not yet available for allocation (i.e. deferred).
2258                  */
2259                 space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
2260 
2261                 /*
2262                  * Add back any deferred free space that has not been
2263                  * added back into the in-core free tree yet. This will
2264                  * ensure that we don't end up with a space map histogram
2265                  * that is completely empty unless the metaslab is fully
2266                  * allocated.
2267                  */
2268                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2269                         space_map_histogram_add(msp->ms_sm,
2270                             msp->ms_defertree[t], tx);
2271                 }
2272         }
2273 
2274         /*
2275          * Always add the free space from this sync pass to the space
2276          * map histogram. We want to make sure that the on-disk histogram
2277          * accounts for all free space. If the space map is not loaded,
2278          * then we will lose some accuracy but will correct it the next
2279          * time we load the space map.
2280          */
2281         space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
2282 
2283         metaslab_group_histogram_add(mg, msp);
2284         metaslab_group_histogram_verify(mg);
2285         metaslab_class_histogram_verify(mg->mg_class);
2286 
2287         /*
2288          * For sync pass 1, we avoid traversing this txg's free range tree
2289          * and instead will just swap the pointers for freeingtree and
2290          * freedtree. We can safely do this since the freed_tree is
2291          * guaranteed to be empty on the initial pass.
2292          */
2293         if (spa_sync_pass(spa) == 1) {
2294                 range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
2295         } else {
2296                 range_tree_vacate(msp->ms_freeingtree,
2297                     range_tree_add, msp->ms_freedtree);
2298         }
2299         range_tree_vacate(alloctree, NULL, NULL);
2300 
2301         ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2302         ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
2303         ASSERT0(range_tree_space(msp->ms_freeingtree));
2304 
2305         mutex_exit(&msp->ms_lock);
2306 
2307         if (object != space_map_object(msp->ms_sm)) {
2308                 object = space_map_object(msp->ms_sm);
2309                 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2310                     msp->ms_id, sizeof (uint64_t), &object, tx);
2311         }
2312         dmu_tx_commit(tx);
2313 }
2314 
2315 /*
2316  * Called after a transaction group has completely synced to mark
2317  * all of the metaslab's free space as usable.
2318  */
2319 void
2320 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2321 {
2322         metaslab_group_t *mg = msp->ms_group;
2323         vdev_t *vd = mg->mg_vd;
2324         spa_t *spa = vd->vdev_spa;
2325         range_tree_t **defer_tree;
2326         int64_t alloc_delta, defer_delta;
2327         boolean_t defer_allowed = B_TRUE;
2328 
2329         ASSERT(!vd->vdev_ishole);
2330 
2331         mutex_enter(&msp->ms_lock);
2332 
2333         /*
2334          * If this metaslab is just becoming available, initialize its
2335          * range trees and add its capacity to the vdev.
2336          */
2337         if (msp->ms_freedtree == NULL) {
2338                 for (int t = 0; t < TXG_SIZE; t++) {
2339                         ASSERT(msp->ms_alloctree[t] == NULL);
2340 
2341                         msp->ms_alloctree[t] = range_tree_create(NULL, msp,
2342                             &msp->ms_lock);
2343                 }
2344 
2345                 ASSERT3P(msp->ms_freeingtree, ==, NULL);
2346                 msp->ms_freeingtree = range_tree_create(NULL, msp,
2347                     &msp->ms_lock);
2348 
2349                 ASSERT3P(msp->ms_freedtree, ==, NULL);
2350                 msp->ms_freedtree = range_tree_create(NULL, msp,
2351                     &msp->ms_lock);
2352 
2353                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2354                         ASSERT(msp->ms_defertree[t] == NULL);
2355 
2356                         msp->ms_defertree[t] = range_tree_create(NULL, msp,
2357                             &msp->ms_lock);
2358                 }
2359 
2360                 vdev_space_update(vd, 0, 0, msp->ms_size);
2361         }
2362 
2363         defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
2364 
2365         uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
2366             metaslab_class_get_alloc(spa_normal_class(spa));
2367         if (free_space <= spa_get_slop_space(spa)) {
2368                 defer_allowed = B_FALSE;
2369         }
2370 
2371         defer_delta = 0;
2372         alloc_delta = space_map_alloc_delta(msp->ms_sm);
2373         if (defer_allowed) {
2374                 defer_delta = range_tree_space(msp->ms_freedtree) -
2375                     range_tree_space(*defer_tree);
2376         } else {
2377                 defer_delta -= range_tree_space(*defer_tree);
2378         }
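             /*
              * For example (hypothetical numbers): if 10MB was freed during
              * this txg (freedtree) and the 4MB deferred TXG_DEFER_SIZE txgs
              * ago is now being returned (*defer_tree), then with deferral
              * allowed defer_delta = 10MB - 4MB = +6MB; the vdev's allocated
              * space is adjusted by alloc_delta + defer_delta and its
              * deferred space by defer_delta.
              */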
2379 
2380         vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2381 
2382         /*
2383          * If there's a metaslab_load() in progress, wait for it to complete
2384          * so that we have a consistent view of the in-core space map.
2385          */
2386         metaslab_load_wait(msp);
2387 
2388         /*
2389          * Move the frees from the defer_tree back to the free
2390          * range tree (if it's loaded). Swap the freed_tree and the
2391          * defer_tree -- this is safe to do because we've just emptied out
2392          * the defer_tree.
2393          */
2394         range_tree_vacate(*defer_tree,
2395             msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2396         if (defer_allowed) {
2397                 range_tree_swap(&msp->ms_freedtree, defer_tree);
2398         } else {
2399                 range_tree_vacate(msp->ms_freedtree,
2400                     msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2401         }
2402 
2403         space_map_update(msp->ms_sm);
2404 
2405         msp->ms_deferspace += defer_delta;
2406         ASSERT3S(msp->ms_deferspace, >=, 0);
2407         ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2408         if (msp->ms_deferspace != 0) {
2409                 /*
2410                  * Keep syncing this metaslab until all deferred frees
2411                  * are back in circulation.
2412                  */
2413                 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2414         }
2415 
2416         /*
2417          * Calculate the new weights before unloading any metaslabs.
2418          * This will give us the most accurate weighting.
2419          */
2420         metaslab_group_sort(mg, msp, metaslab_weight(msp));
2421 
2422         /*
2423          * If the metaslab is loaded and we've not tried to load or allocate
2424          * from it in 'metaslab_unload_delay' txgs, then unload it.
2425          */
2426         if (msp->ms_loaded &&
2427             msp->ms_selected_txg + metaslab_unload_delay < txg) {
2428                 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2429                         VERIFY0(range_tree_space(
2430                             msp->ms_alloctree[(txg + t) & TXG_MASK]));
2431                 }
2432 
2433                 if (!metaslab_debug_unload)
2434                         metaslab_unload(msp);
2435         }
2436 
2437         mutex_exit(&msp->ms_lock);
2438 }
2439 
2440 void
2441 metaslab_sync_reassess(metaslab_group_t *mg)
2442 {
2443         metaslab_group_alloc_update(mg);
2444         mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2445 
2446         /*
2447          * Preload the next potential metaslabs
2448          */
2449         metaslab_group_preload(mg);
2450 }
2451 
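     /*
      * Return the distance, in bytes, between the given metaslab and an
      * existing DVA; used to keep multiple DVAs of the same block apart.
      * For example, on a vdev with a hypothetical 16GB metaslab size
      * (vdev_ms_shift == 34), a DVA that falls in metaslab 10 is 3 << 34
      * bytes (48GB) away from metaslab 13, while a DVA on a different vdev
      * is treated as maximally distant (1ULL << 63).
      */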
2452 static uint64_t
2453 metaslab_distance(metaslab_t *msp, dva_t *dva)
2454 {
2455         uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2456         uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2457         uint64_t start = msp->ms_id;
2458 
2459         if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2460                 return (1ULL << 63);
2461 
2462         if (offset < start)
2463                 return ((start - offset) << ms_shift);
2464         if (offset > start)
2465                 return ((offset - start) << ms_shift);
2466         return (0);
2467 }
2468 
2469 /*
2470  * ==========================================================================
2471  * Metaslab allocation tracing facility
2472  * ==========================================================================
2473  */
2474 kstat_t *metaslab_trace_ksp;
2475 kstat_named_t metaslab_trace_over_limit;
2476 
2477 void
2478 metaslab_alloc_trace_init(void)
2479 {
2480         ASSERT(metaslab_alloc_trace_cache == NULL);
2481         metaslab_alloc_trace_cache = kmem_cache_create(
2482             "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
2483             0, NULL, NULL, NULL, NULL, NULL, 0);
2484         metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
2485             "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
2486         if (metaslab_trace_ksp != NULL) {
2487                 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
2488                 kstat_named_init(&metaslab_trace_over_limit,
2489                     "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
2490                 kstat_install(metaslab_trace_ksp);
2491         }
2492 }
2493 
2494 void
2495 metaslab_alloc_trace_fini(void)
2496 {
2497         if (metaslab_trace_ksp != NULL) {
2498                 kstat_delete(metaslab_trace_ksp);
2499                 metaslab_trace_ksp = NULL;
2500         }
2501         kmem_cache_destroy(metaslab_alloc_trace_cache);
2502         metaslab_alloc_trace_cache = NULL;
2503 }
2504 
2505 /*
2506  * Add an allocation trace element to the allocation tracing list.
2507  */
2508 static void
2509 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2510     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
2511 {
2512         if (!metaslab_trace_enabled)
2513                 return;
2514 
2515         /*
2516          * When the tracing list reaches its maximum we remove
2517          * the second element in the list before adding a new one.
2518          * By removing the second element we preserve the original
2519          * entry as a clue to what allocation steps have already been
2520          * performed.
2521          */
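             /*
              * For example, with a hypothetical limit of four entries, a
              * full list [A, B, C, D] becomes [A, C, D, E] after E is added:
              * the original entry A is preserved and B is dropped.
              */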
2522         if (zal->zal_size == metaslab_trace_max_entries) {
2523                 metaslab_alloc_trace_t *mat_next;
2524 #ifdef DEBUG
2525                 panic("too many entries in allocation list");
2526 #endif
2527                 atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
2528                 zal->zal_size--;
2529                 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
2530                 list_remove(&zal->zal_list, mat_next);
2531                 kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
2532         }
2533 
2534         metaslab_alloc_trace_t *mat =
2535             kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2536         list_link_init(&mat->mat_list_node);
2537         mat->mat_mg = mg;
2538         mat->mat_msp = msp;
2539         mat->mat_size = psize;
2540         mat->mat_dva_id = dva_id;
2541         mat->mat_offset = offset;
2542         mat->mat_weight = 0;
2543 
2544         if (msp != NULL)
2545                 mat->mat_weight = msp->ms_weight;
2546 
2547         /*
2548          * The list is part of the zio so locking is not required. Only
2549          * a single thread will perform allocations for a given zio.
2550          */
2551         list_insert_tail(&zal->zal_list, mat);
2552         zal->zal_size++;
2553 
2554         ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
2555 }
2556 
2557 void
2558 metaslab_trace_init(zio_alloc_list_t *zal)
2559 {
2560         list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
2561             offsetof(metaslab_alloc_trace_t, mat_list_node));
2562         zal->zal_size = 0;
2563 }
2564 
2565 void
2566 metaslab_trace_fini(zio_alloc_list_t *zal)
2567 {
2568         metaslab_alloc_trace_t *mat;
2569 
2570         while ((mat = list_remove_head(&zal->zal_list)) != NULL)
2571                 kmem_cache_free(metaslab_alloc_trace_cache, mat);
2572         list_destroy(&zal->zal_list);
2573         zal->zal_size = 0;
2574 }
2575 
2576 /*
2577  * ==========================================================================
2578  * Metaslab block operations
2579  * ==========================================================================
2580  */
2581 
2582 static void
2583 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2584 {
2585         if (!(flags & METASLAB_ASYNC_ALLOC) ||
2586             flags & METASLAB_DONT_THROTTLE)
2587                 return;
2588 
2589         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2590         if (!mg->mg_class->mc_alloc_throttle_enabled)
2591                 return;
2592 
2593         (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2594 }
2595 
2596 void
2597 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2598 {
2599         if (!(flags & METASLAB_ASYNC_ALLOC) ||
2600             flags & METASLAB_DONT_THROTTLE)
2601                 return;
2602 
2603         metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2604         if (!mg->mg_class->mc_alloc_throttle_enabled)
2605                 return;
2606 
2607         (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2608 }
2609 
2610 void
2611 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2612 {
2613 #ifdef ZFS_DEBUG
2614         const dva_t *dva = bp->blk_dva;
2615         int ndvas = BP_GET_NDVAS(bp);
2616 
2617         for (int d = 0; d < ndvas; d++) {
2618                 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2619                 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2620                 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2621         }
2622 #endif
2623 }
2624 
2625 static uint64_t
2626 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
2627 {
2628         uint64_t start;
2629         range_tree_t *rt = msp->ms_tree;
2630         metaslab_class_t *mc = msp->ms_group->mg_class;
2631 
2632         VERIFY(!msp->ms_condensing);
2633 
2634         start = mc->mc_ops->msop_alloc(msp, size);
2635         if (start != -1ULL) {
2636                 metaslab_group_t *mg = msp->ms_group;
2637                 vdev_t *vd = mg->mg_vd;
2638 
2639                 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
2640                 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2641                 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
2642                 range_tree_remove(rt, start, size);
2643 
2644                 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2645                         vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2646 
2647                 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
2648 
2649                 /* Track the last successful allocation */
2650                 msp->ms_alloc_txg = txg;
2651                 metaslab_verify_space(msp, txg);
2652         }
2653 
2654         /*
2655          * Now that we've attempted the allocation we need to update the
2656          * metaslab's maximum block size since it may have changed.
2657          */
2658         msp->ms_max_size = metaslab_block_maxsize(msp);
2659         return (start);
2660 }
2661 
2662 static uint64_t
2663 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
2664     uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2665 {
2666         metaslab_t *msp = NULL;
2667         uint64_t offset = -1ULL;
2668         uint64_t activation_weight;
2669         uint64_t target_distance;
2670         int i;
2671 
2672         activation_weight = METASLAB_WEIGHT_PRIMARY;
2673         for (i = 0; i < d; i++) {
2674                 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2675                         activation_weight = METASLAB_WEIGHT_SECONDARY;
2676                         break;
2677                 }
2678         }
2679 
2680         metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
2681         search->ms_weight = UINT64_MAX;
2682         search->ms_start = 0;
2683         for (;;) {
2684                 boolean_t was_active;
2685                 avl_tree_t *t = &mg->mg_metaslab_tree;
2686                 avl_index_t idx;
2687 
2688                 mutex_enter(&mg->mg_lock);
2689 
2690                 /*
2691                  * Find the metaslab with the highest weight that is less
2692                  * than what we've already tried.  In the common case, this
2693                  * means that we will examine each metaslab at most once.
2694                  * Note that concurrent callers could reorder metaslabs
2695                  * by activation/passivation once we have dropped the mg_lock.
2696                  * If a metaslab is activated by another thread, and we fail
2697                  * to allocate from the metaslab we have selected, we may
2698                  * not try the newly-activated metaslab, and instead activate
2699                  * another metaslab.  This is not optimal, but generally
2700                  * does not cause any problems (a possible exception being
2701                  * if every metaslab is completely full except for the
2702                  * newly-activated metaslab which we fail to examine).
2703                  */
2704                 msp = avl_find(t, search, &idx);
2705                 if (msp == NULL)
2706                         msp = avl_nearest(t, idx, AVL_AFTER);
2707                 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
2708 
2709                         if (!metaslab_should_allocate(msp, asize)) {
2710                                 metaslab_trace_add(zal, mg, msp, asize, d,
2711                                     TRACE_TOO_SMALL);
2712                                 continue;
2713                         }
2714 
2715                         /*
2716                          * If the selected metaslab is condensing, skip it.
2717                          */
2718                         if (msp->ms_condensing)
2719                                 continue;
2720 
2721                         was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2722                         if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2723                                 break;
2724 
2725                         target_distance = min_distance +
2726                             (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2727                             min_distance >> 1);
2728 
2729                         for (i = 0; i < d; i++) {
2730                                 if (metaslab_distance(msp, &dva[i]) <
2731                                     target_distance)
2732                                         break;
2733                         }
2734                         if (i == d)
2735                                 break;
2736                 }
2737                 mutex_exit(&mg->mg_lock);
2738                 if (msp == NULL) {
2739                         kmem_free(search, sizeof (*search));
2740                         return (-1ULL);
2741                 }
2742                 search->ms_weight = msp->ms_weight;
2743                 search->ms_start = msp->ms_start + 1;
2744 
2745                 mutex_enter(&msp->ms_lock);
2746 
2747                 /*
2748                  * Ensure that the metaslab we have selected is still
2749                  * capable of handling our request. It's possible that
2750                  * another thread may have changed the weight while we
2751                  * were blocked on the metaslab lock. We check the
2752                  * active status first to see if we need to reselect
2753                  * a new metaslab.
2754                  */
2755                 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
2756                         mutex_exit(&msp->ms_lock);
2757                         continue;
2758                 }
2759 
2760                 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2761                     activation_weight == METASLAB_WEIGHT_PRIMARY) {
2762                         metaslab_passivate(msp,
2763                             msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2764                         mutex_exit(&msp->ms_lock);
2765                         continue;
2766                 }
2767 
2768                 if (metaslab_activate(msp, activation_weight) != 0) {
2769                         mutex_exit(&msp->ms_lock);
2770                         continue;
2771                 }
2772                 msp->ms_selected_txg = txg;
2773 
2774                 /*
2775                  * Now that we have the lock, recheck to see if we should
2776                  * continue to use this metaslab for this allocation. The
2777                  * metaslab is now loaded so metaslab_should_allocate() can
2778                  * accurately determine if the allocation attempt should
2779                  * proceed.
2780                  */
2781                 if (!metaslab_should_allocate(msp, asize)) {
2782                         /* Passivate this metaslab and select a new one. */
2783                         metaslab_trace_add(zal, mg, msp, asize, d,
2784                             TRACE_TOO_SMALL);
2785                         goto next;
2786                 }
2787 
2788                 /*
2789                  * If this metaslab is currently condensing then pick again as
2790                  * we can't manipulate this metaslab until it's committed
2791                  * to disk.
2792                  */
2793                 if (msp->ms_condensing) {
2794                         metaslab_trace_add(zal, mg, msp, asize, d,
2795                             TRACE_CONDENSING);
2796                         mutex_exit(&msp->ms_lock);
2797                         continue;
2798                 }
2799 
2800                 offset = metaslab_block_alloc(msp, asize, txg);
2801                 metaslab_trace_add(zal, mg, msp, asize, d, offset);
2802 
2803                 if (offset != -1ULL) {
2804                         /* Proactively passivate the metaslab, if needed */
2805                         metaslab_segment_may_passivate(msp);
2806                         break;
2807                 }
2808 next:
2809                 ASSERT(msp->ms_loaded);
2810 
2811                 /*
2812                  * We were unable to allocate from this metaslab so determine
2813                  * a new weight for this metaslab. Now that we have loaded
2814                  * the metaslab we can provide a better hint to the metaslab
2815                  * selector.
2816                  *
2817                  * For space-based metaslabs, we use the maximum block size.
2818                  * This information is only available when the metaslab
2819                  * is loaded and is more accurate than the generic free
2820                  * space weight that was calculated by metaslab_weight().
2821                  * This information allows us to quickly compare the maximum
2822                  * available allocation in the metaslab to the allocation
2823                  * size being requested.
2824                  *
2825                  * For segment-based metaslabs, determine the new weight
2826                  * based on the highest bucket in the range tree. We
2827                  * explicitly use the loaded segment weight (i.e. the range
2828                  * tree histogram) since it contains the space that is
2829                  * currently available for allocation and is accurate
2830                  * even within a sync pass.
2831                  */
2832                 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
2833                         uint64_t weight = metaslab_block_maxsize(msp);
2834                         WEIGHT_SET_SPACEBASED(weight);
2835                         metaslab_passivate(msp, weight);
2836                 } else {
2837                         metaslab_passivate(msp,
2838                             metaslab_weight_from_range_tree(msp));
2839                 }
2840 
2841                 /*
2842                  * We have just failed an allocation attempt, so check
2843                  * that metaslab_should_allocate() agrees. Otherwise,
2844                  * we may end up in an infinite loop retrying the same
2845                  * metaslab.
2846                  */
2847                 ASSERT(!metaslab_should_allocate(msp, asize));
2848                 mutex_exit(&msp->ms_lock);
2849         }
2850         mutex_exit(&msp->ms_lock);
2851         kmem_free(search, sizeof (*search));
2852         return (offset);
2853 }
2854 
2855 static uint64_t
2856 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
2857     uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2858 {
2859         uint64_t offset;
2860         ASSERT(mg->mg_initialized);
2861 
2862         offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
2863             min_distance, dva, d);
2864 
2865         mutex_enter(&mg->mg_lock);
2866         if (offset == -1ULL) {
2867                 mg->mg_failed_allocations++;
2868                 metaslab_trace_add(zal, mg, NULL, asize, d,
2869                     TRACE_GROUP_FAILURE);
2870                 if (asize == SPA_GANGBLOCKSIZE) {
2871                         /*
2872                          * This metaslab group was unable to allocate
2873                          * the minimum gang block size, so it must be out of
2874                          * space. We must notify the allocation throttle
2875                          * to start skipping allocation attempts to this
2876                          * metaslab group until more space becomes available.
2877                          * Note: this failure cannot be caused by the
2878                          * allocation throttle since the allocation throttle
2879                          * is only responsible for skipping devices and
2880                          * not failing block allocations.
2881                          */
2882                         mg->mg_no_free_space = B_TRUE;
2883                 }
2884         }
2885         mg->mg_allocations++;
2886         mutex_exit(&mg->mg_lock);
2887         return (offset);
2888 }
2889 
2890 /*
2891  * If we have to write a ditto block (i.e. more than one DVA for a given BP)
2892  * on the same vdev as an existing DVA of this BP, then try to allocate it
2893  * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
2894  * existing DVAs.
2895  */
2896 int ditto_same_vdev_distance_shift = 3;
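     /*
      * For example, with the default shift of 3 a second DVA placed on the
      * same vdev is kept at least vdev_asize / 8 away from the first (e.g.
      * roughly 128GB apart on a 1TB vdev).  If that distance works out to
      * no more than a single metaslab, the requirement is dropped; see the
      * distance calculation in metaslab_alloc_dva() below.
      */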
2897 
2898 /*
2899  * Allocate a block for the specified i/o.
2900  */
2901 static int
2902 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2903     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
2904     zio_alloc_list_t *zal)
2905 {
2906         metaslab_group_t *mg, *rotor;
2907         vdev_t *vd;
2908         boolean_t try_hard = B_FALSE;
2909 
2910         ASSERT(!DVA_IS_VALID(&dva[d]));
2911 
2912         /*
2913          * For testing, make some blocks above a certain size be gang blocks.
2914          */
2915         if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
2916                 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
2917                 return (SET_ERROR(ENOSPC));
2918         }
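             /*
              * Returning ENOSPC here is what forces the gang: the zio layer
              * turns the failed allocation into a gang-block write (hence
              * TRACE_FORCE_GANG).  The lbolt mask means only about one in
              * four qualifying oversized allocations takes this path.
              */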
2919 
2920         /*
2921          * Start at the rotor and loop through all mgs until we find something.
2922          * Note that there's no locking on mc_rotor or mc_aliquot because
2923          * nothing actually breaks if we miss a few updates -- we just won't
2924          * allocate quite as evenly.  It all balances out over time.
2925          *
2926          * If we are doing ditto or log blocks, try to spread them across
2927          * consecutive vdevs.  If we're forced to reuse a vdev before we've
2928          * allocated all of our ditto blocks, then try to spread them out on
2929          * that vdev as much as possible.  If it turns out not to be possible,
2930          * gradually lower our standards until anything becomes acceptable.
2931          * Also, allocating on consecutive vdevs (as opposed to random vdevs)
2932          * gives us hope of containing our fault domains to something we're
2933          * able to reason about.  Otherwise, any two top-level vdev failures
2934          * will guarantee the loss of data.  With consecutive allocation,
2935          * only two adjacent top-level vdev failures will result in data loss.
2936          *
2937          * If we are doing gang blocks (hintdva is non-NULL), try to keep
2938          * ourselves on the same vdev as our gang block header.  That
2939          * way, we can hope for locality in vdev_cache, plus it makes our
2940          * fault domains something tractable.
2941          */
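             /*
              * To make the fault-domain point concrete: with two DVAs spread
              * over N top-level vdevs, consecutive placement means roughly
              * only the N adjacent vdev pairs can hold both copies of a
              * block, versus N * (N - 1) / 2 arbitrary pairs with random
              * placement (10 vs. 45 pairs on a 10-vdev pool).
              */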
2942         if (hintdva) {
2943                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
2944 
2945                 /*
2946                  * It's possible the vdev we're using as the hint no
2947                  * longer exists (i.e. it was removed). In that case,
2948                  * fall back to the rotor.
2949                  */
2950                 if (vd != NULL) {
2951                         mg = vd->vdev_mg;
2952 
2953                         if (flags & METASLAB_HINTBP_AVOID &&
2954                             mg->mg_next != NULL)
2955                                 mg = mg->mg_next;
2956                 } else {
2957                         mg = mc->mc_rotor;
2958                 }
2959         } else if (d != 0) {
2960                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
2961                 mg = vd->vdev_mg->mg_next;
2962         } else {
2963                 mg = mc->mc_rotor;
2964         }
2965 
2966         /*
2967          * If the hint put us into the wrong metaslab class, or into a
2968          * metaslab group that has been passivated, just follow the rotor.
2969          */
2970         if (mg->mg_class != mc || mg->mg_activation_count <= 0)
2971                 mg = mc->mc_rotor;
2972 
2973         rotor = mg;
2974 top:
2975         do {
2976                 boolean_t allocatable;
2977 
2978                 ASSERT(mg->mg_activation_count == 1);
2979                 vd = mg->mg_vd;
2980 
2981                 /*
2982                  * Don't allocate from faulted devices.
2983                  */
2984                 if (try_hard) {
2985                         spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2986                         allocatable = vdev_allocatable(vd);
2987                         spa_config_exit(spa, SCL_ZIO, FTAG);
2988                 } else {
2989                         allocatable = vdev_allocatable(vd);
2990                 }
2991 
2992                 /*
2993                  * Determine if the selected metaslab group is eligible
2994                  * for allocations. If we're ganging then don't allow
2995                  * this metaslab group to skip allocations since that would
2996                  * inadvertently return ENOSPC and suspend the pool
2997                  * even though space is still available.
2998                  */
2999                 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3000                         allocatable = metaslab_group_allocatable(mg, rotor,
3001                             psize);
3002                 }
3003 
3004                 if (!allocatable) {
3005                         metaslab_trace_add(zal, mg, NULL, psize, d,
3006                             TRACE_NOT_ALLOCATABLE);
3007                         goto next;
3008                 }
3009 
3010                 ASSERT(mg->mg_initialized);
3011 
3012                 /*
3013                  * Avoid writing single-copy data to a failing,
3014                  * non-redundant vdev, unless we've already tried all
3015                  * other vdevs.
3016                  */
3017                 if ((vd->vdev_stat.vs_write_errors > 0 ||
3018                     vd->vdev_state < VDEV_STATE_HEALTHY) &&
3019                     d == 0 && !try_hard && vd->vdev_children == 0) {
3020                         metaslab_trace_add(zal, mg, NULL, psize, d,
3021                             TRACE_VDEV_ERROR);
3022                         goto next;
3023                 }
3024 
3025                 ASSERT(mg->mg_class == mc);
3026 
3027                 /*
3028                  * If we don't need to try hard, then require that the
3029                  * block be 1/8th of the device away from any other DVAs
3030                  * in this BP.  If we are trying hard, allow any offset
3031                  * to be used (distance=0).
3032                  */
3033                 uint64_t distance = 0;
3034                 if (!try_hard) {
3035                         distance = vd->vdev_asize >>
3036                             ditto_same_vdev_distance_shift;
3037                         if (distance <= (1ULL << vd->vdev_ms_shift))
3038                                 distance = 0;
3039                 }
3040 
3041                 uint64_t asize = vdev_psize_to_asize(vd, psize);
3042                 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3043 
3044                 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3045                     distance, dva, d);
3046 
3047                 if (offset != -1ULL) {
3048                         /*
3049                          * If we've just selected this metaslab group,
3050                          * figure out whether the corresponding vdev is
3051                          * over- or under-used relative to the pool,
3052                          * and set an allocation bias to even it out.
3053                          */
3054                         if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3055                                 vdev_stat_t *vs = &vd->vdev_stat;
3056                                 int64_t vu, cu;
3057 
3058                                 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
3059                                 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
3060 
3061                                 /*
3062                                  * Calculate how much more or less we should
3063                                  * try to allocate from this device during
3064                                  * this iteration around the rotor.
3065                                  * For example, if a device is 80% full
3066                                  * and the pool is 20% full then we should
3067                                  * reduce allocations by 60% on this device.
3068                                  *
3069                                  * mg_bias = (20 - 80) * 512K / 100 = -307K
3070                                  *
3071                                  * This reduces allocations by 307K for this
3072                                  * iteration.
3073                                  */
3074                                 mg->mg_bias = ((cu - vu) *
3075                                     (int64_t)mg->mg_aliquot) / 100;
3076                         } else if (!metaslab_bias_enabled) {
3077                                 mg->mg_bias = 0;
3078                         }
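                             /*
                              * Continuing the example above: with an aliquot
                              * of 512K and a bias of -307K, the check below
                              * advances the rotor to the next metaslab group
                              * once about 205K has been allocated from this
                              * one during this pass around the rotor.
                              */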
3079 
3080                         if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3081                             mg->mg_aliquot + mg->mg_bias) {
3082                                 mc->mc_rotor = mg->mg_next;
3083                                 mc->mc_aliquot = 0;
3084                         }
3085 
3086                         DVA_SET_VDEV(&dva[d], vd->vdev_id);
3087                         DVA_SET_OFFSET(&dva[d], offset);
3088                         DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
3089                         DVA_SET_ASIZE(&dva[d], asize);
3090 
3091                         return (0);
3092                 }
3093 next:
3094                 mc->mc_rotor = mg->mg_next;
3095                 mc->mc_aliquot = 0;
3096         } while ((mg = mg->mg_next) != rotor);
3097 
3098         /*
3099          * If we haven't tried hard, do so now.
3100          */
3101         if (!try_hard) {
3102                 try_hard = B_TRUE;
3103                 goto top;
3104         }
3105 
3106         bzero(&dva[d], sizeof (dva_t));
3107 
3108         metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
3109         return (SET_ERROR(ENOSPC));
3110 }
3111 
3112 /*
3113  * Free the block represented by DVA in the context of the specified
3114  * transaction group.
3115  */
3116 static void
3117 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
3118 {
3119         uint64_t vdev = DVA_GET_VDEV(dva);
3120         uint64_t offset = DVA_GET_OFFSET(dva);
3121         uint64_t size = DVA_GET_ASIZE(dva);
3122         vdev_t *vd;
3123         metaslab_t *msp;
3124 
3125         ASSERT(DVA_IS_VALID(dva));
3126 
3127         if (txg > spa_freeze_txg(spa))
3128                 return;
3129 
3130         if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3131             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
3132                 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
3133                     (u_longlong_t)vdev, (u_longlong_t)offset);
3134                 ASSERT(0);
3135                 return;
3136         }
3137 
3138         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3139 
3140         if (DVA_GET_GANG(dva))
3141                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3142 
3143         mutex_enter(&msp->ms_lock);
3144 
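             /*
              * The "now" path hands the space straight back to ms_tree
              * (e.g. when metaslab_alloc() unwinds a partially successful
              * allocation); otherwise the free is recorded in
              * ms_freeingtree and processed in syncing context.
              */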
3145         if (now) {
3146                 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
3147                     offset, size);
3148 
3149                 VERIFY(!msp->ms_condensing);
3150                 VERIFY3U(offset, >=, msp->ms_start);
3151                 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
3152                 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
3153                     msp->ms_size);
3154                 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3155                 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3156                 range_tree_add(msp->ms_tree, offset, size);
3157                 msp->ms_max_size = metaslab_block_maxsize(msp);
3158         } else {
3159                 VERIFY3U(txg, ==, spa->spa_syncing_txg);
3160                 if (range_tree_space(msp->ms_freeingtree) == 0)
3161                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
3162                 range_tree_add(msp->ms_freeingtree, offset, size);
3163         }
3164 
3165         mutex_exit(&msp->ms_lock);
3166 }
3167 
3168 /*
3169  * Intent log support: upon opening the pool after a crash, notify the SPA
3170  * of blocks that the intent log has allocated for immediate write, but
3171  * which are still considered free by the SPA because the last transaction
3172  * group didn't commit yet.
3173  */
3174 static int
3175 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3176 {
3177         uint64_t vdev = DVA_GET_VDEV(dva);
3178         uint64_t offset = DVA_GET_OFFSET(dva);
3179         uint64_t size = DVA_GET_ASIZE(dva);
3180         vdev_t *vd;
3181         metaslab_t *msp;
3182         int error = 0;
3183 
3184         ASSERT(DVA_IS_VALID(dva));
3185 
3186         if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3187             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
3188                 return (SET_ERROR(ENXIO));
3189 
3190         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3191 
3192         if (DVA_GET_GANG(dva))
3193                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3194 
3195         mutex_enter(&msp->ms_lock);
3196 
3197         if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3198                 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
3199 
3200         if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
3201                 error = SET_ERROR(ENOENT);
3202 
3203         if (error || txg == 0) {        /* txg == 0 indicates dry run */
3204                 mutex_exit(&msp->ms_lock);
3205                 return (error);
3206         }
3207 
3208         VERIFY(!msp->ms_condensing);
3209         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3210         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3211         VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
3212         range_tree_remove(msp->ms_tree, offset, size);
3213 
3214         if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
3215                 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
3216                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
3217                 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
3218         }
3219 
3220         mutex_exit(&msp->ms_lock);
3221 
3222         return (0);
3223 }
3224 
3225 /*
3226  * Reserve some allocation slots. The reservation system must be called
3227  * before we call into the allocator. If there aren't any available slots
3228  * then the I/O will be throttled until an I/O completes and its slots are
3229  * freed up. The function returns true if it was successful in placing
3230  * the reservation.
3231  */
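     /*
      * For illustration (hypothetical numbers): if mc_alloc_max_slots is 64
      * and 60 slots are already reserved, a 3-DVA write (slots == 3) fits
      * within the 4 available slots and takes one refcount per slot, while
      * a 5-slot request is refused.  Gang allocations are the exception and
      * are always allowed to reserve.
      */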
3232 boolean_t
3233 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
3234     int flags)
3235 {
3236         uint64_t available_slots = 0;
3237         boolean_t slot_reserved = B_FALSE;
3238 
3239         ASSERT(mc->mc_alloc_throttle_enabled);
3240         mutex_enter(&mc->mc_lock);
3241 
3242         uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
3243         if (reserved_slots < mc->mc_alloc_max_slots)
3244                 available_slots = mc->mc_alloc_max_slots - reserved_slots;
3245 
3246         if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3247                 /*
3248                  * We reserve the slots individually so that we can unreserve
3249                  * them individually when an I/O completes.
3250                  */
3251                 for (int d = 0; d < slots; d++) {
3252                         reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
3253                 }
3254                 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3255                 slot_reserved = B_TRUE;
3256         }
3257 
3258         mutex_exit(&mc->mc_lock);
3259         return (slot_reserved);
3260 }
3261 
3262 void
3263 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
3264 {
3265         ASSERT(mc->mc_alloc_throttle_enabled);
3266         mutex_enter(&mc->mc_lock);
3267         for (int d = 0; d < slots; d++) {
3268                 (void) refcount_remove(&mc->mc_alloc_slots, zio);
3269         }
3270         mutex_exit(&mc->mc_lock);
3271 }
3272 
3273 int
3274 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
3275     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
3276     zio_alloc_list_t *zal, zio_t *zio)
3277 {
3278         dva_t *dva = bp->blk_dva;
3279         dva_t *hintdva = hintbp->blk_dva;
3280         int error = 0;
3281 
3282         ASSERT(bp->blk_birth == 0);
3283         ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
3284 
3285         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3286 
3287         if (mc->mc_rotor == NULL) {  /* no vdevs in this class */
3288                 spa_config_exit(spa, SCL_ALLOC, FTAG);
3289                 return (SET_ERROR(ENOSPC));
3290         }
3291 
3292         ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
3293         ASSERT(BP_GET_NDVAS(bp) == 0);
3294         ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
3295         ASSERT3P(zal, !=, NULL);
3296 
3297         for (int d = 0; d < ndvas; d++) {
3298                 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
3299                     txg, flags, zal);
3300                 if (error != 0) {
3301                         for (d--; d >= 0; d--) {
3302                                 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
3303                                 metaslab_group_alloc_decrement(spa,
3304                                     DVA_GET_VDEV(&dva[d]), zio, flags);
3305                                 bzero(&dva[d], sizeof (dva_t));
3306                         }
3307                         spa_config_exit(spa, SCL_ALLOC, FTAG);
3308                         return (error);
3309                 } else {
3310                         /*
3311                          * Update the metaslab group's queue depth
3312                          * based on the newly allocated dva.
3313                          */
3314                         metaslab_group_alloc_increment(spa,
3315                             DVA_GET_VDEV(&dva[d]), zio, flags);
3316                 }
3317 
3318         }
3319         ASSERT(error == 0);
3320         ASSERT(BP_GET_NDVAS(bp) == ndvas);
3321 
3322         spa_config_exit(spa, SCL_ALLOC, FTAG);
3323 
3324         BP_SET_BIRTH(bp, txg, txg);
3325 
3326         return (0);
3327 }
3328 
3329 void
3330 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
3331 {
3332         const dva_t *dva = bp->blk_dva;
3333         int ndvas = BP_GET_NDVAS(bp);
3334 
3335         ASSERT(!BP_IS_HOLE(bp));
3336         ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
3337 
3338         spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
3339 
3340         for (int d = 0; d < ndvas; d++)
3341                 metaslab_free_dva(spa, &dva[d], txg, now);
3342 
3343         spa_config_exit(spa, SCL_FREE, FTAG);
3344 }
3345 
3346 int
3347 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
3348 {
3349         const dva_t *dva = bp->blk_dva;
3350         int ndvas = BP_GET_NDVAS(bp);
3351         int error = 0;
3352 
3353         ASSERT(!BP_IS_HOLE(bp));
3354 
3355         if (txg != 0) {
3356                 /*
3357                  * First do a dry run to make sure all DVAs are claimable,
3358                  * so we don't have to unwind from partial failures below.
3359                  */
3360                 if ((error = metaslab_claim(spa, bp, 0)) != 0)
3361                         return (error);
3362         }
3363 
3364         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3365 
3366         for (int d = 0; d < ndvas; d++)
3367                 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
3368                         break;
3369 
3370         spa_config_exit(spa, SCL_ALLOC, FTAG);
3371 
3372         ASSERT(error == 0 || txg == 0);
3373 
3374         return (error);
3375 }
3376 
3377 void
3378 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
3379 {
3380         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
3381                 return;
3382 
3383         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3384         for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
3385                 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
3386                 vdev_t *vd = vdev_lookup_top(spa, vdev);
3387                 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
3388                 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
3389                 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3390 
3391                 if (msp->ms_loaded)
3392                         range_tree_verify(msp->ms_tree, offset, size);
3393 
3394                 range_tree_verify(msp->ms_freeingtree, offset, size);
3395                 range_tree_verify(msp->ms_freedtree, offset, size);
3396                 for (int j = 0; j < TXG_DEFER_SIZE; j++)
3397                         range_tree_verify(msp->ms_defertree[j], offset, size);
3398         }
3399         spa_config_exit(spa, SCL_VDEV, FTAG);
3400 }