1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/dmu.h>
  29 #include <sys/dmu_tx.h>
  30 #include <sys/space_map.h>
  31 #include <sys/metaslab_impl.h>
  32 #include <sys/vdev_impl.h>
  33 #include <sys/zio.h>
  34 #include <sys/spa_impl.h>
  35 
  36 /*
  37  * Allow allocations to switch to gang blocks quickly. We do this to
  38  * avoid having to load lots of space_maps in a given txg. There are,
  39  * however, some cases where we want to avoid "fast" ganging and instead
  40  * we want to do an exhaustive search of all metaslabs on this device.
  41  * Currently we don't allow any gang, zil, or dump device related allocations
  42  * to "fast" gang.
  43  */
  44 #define CAN_FASTGANG(flags) \
  45         (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
  46         METASLAB_GANG_AVOID)))
  47 
  48 #define METASLAB_WEIGHT_PRIMARY         (1ULL << 63)
  49 #define METASLAB_WEIGHT_SECONDARY       (1ULL << 62)
  50 #define METASLAB_ACTIVE_MASK            \
  51         (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
  52 
  53 uint64_t metaslab_aliquot = 512ULL << 10;
  54 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;     /* force gang blocks */
  55 
  56 /*
  57  * The in-core space map representation is more compact than its on-disk form.
  58  * The zfs_condense_pct determines how much more compact the in-core
  59  * space_map representation must be before we compact it on-disk.
  60  * Values should be greater than or equal to 100.
  61  */
  62 int zfs_condense_pct = 200;
  63 
  64 /*
  65  * This value defines the number of allowed allocation failures per vdev.
  66  * If a device reaches this threshold in a given txg then we consider skipping
  67  * allocations on that device. The value of zfs_mg_alloc_failures is computed
  68  * in zio_init() unless it has been overridden in /etc/system.
  69  */
  70 int zfs_mg_alloc_failures = 0;
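
/*
 * A hedged illustration (hypothetical values, not recommendations): on
 * illumos-derived systems these tunables can be overridden at boot from
 * /etc/system using the usual "set module:variable = value" syntax, e.g.
 *
 *        set zfs:zfs_mg_alloc_failures = 10
 *        set zfs:zfs_condense_pct = 150
 */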
  71 
  72 /*
  73  * The zfs_mg_noalloc_threshold defines which metaslab groups should
  74  * be eligible for allocation. The value is defined as a percentage of
  75  * free space. Metaslab groups that have more free space than
  76  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
  77  * a metaslab group's free space is less than or equal to the
  78  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  79  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  80  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  81  * groups are allowed to accept allocations. Gang blocks are always
  82  * eligible to allocate on any metaslab group. The default value of 0 means
  83  * no metaslab group will be excluded based on this criterion.
  84  */
  85 int zfs_mg_noalloc_threshold = 0;
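
/*
 * Worked example (hypothetical tuning, not the default): with
 * zfs_mg_noalloc_threshold = 30, a metaslab group whose free capacity drops
 * to 30% or below is skipped by the allocator as long as some other group in
 * its class still has more than 30% free space; once every group falls to or
 * below 30%, all groups accept allocations again.
 */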
  86 
  87 /*
  88  * When set, we load all metaslabs when the pool is first opened.
  89  */
  90 int metaslab_debug_load = 0;
  91 
  92 /*
  93  * When set, we prevent metaslabs from being unloaded.
  94  */
  95 int metaslab_debug_unload = 0;
  96 
  97 /*
  98  * Minimum size which forces the dynamic allocator to change
  99  * its allocation strategy.  Once the space map cannot satisfy
 100  * an allocation of this size then it switches to a more
 101  * aggressive strategy (i.e. search by size rather than offset).
 102  */
 103 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
 104 
 105 /*
 106  * The minimum free space, in percent, which must be available
 107  * in a space map to continue allocations in a first-fit fashion.
 108  * Once the space_map's free space drops below this level we dynamically
 109  * switch to using best-fit allocations.
 110  */
 111 int metaslab_df_free_pct = 4;
 112 
 113 /*
 114  * A metaslab is considered "free" if it contains a contiguous
 115  * segment which is greater than metaslab_min_alloc_size.
 116  */
 117 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
 118 
 119 /*
 120  * Percentage of all cpus that can be used by the metaslab taskq.
 121  */
 122 int metaslab_load_pct = 50;
 123 
 124 /*
 125  * Determines how many txgs a metaslab may remain loaded without having any
 126  * allocations from it. As long as a metaslab continues to be used we will
 127  * keep it loaded.
 128  */
 129 int metaslab_unload_delay = TXG_SIZE * 2;
 130 
 131 /*
 132  * Should we be willing to write data to degraded vdevs?
 133  */
 134 boolean_t zfs_write_to_degraded = B_FALSE;
 135 
 136 /*
 137  * Max number of metaslabs per group to preload.
 138  */
 139 int metaslab_preload_limit = SPA_DVAS_PER_BP;
 140 
 141 /*
 142  * Enable/disable preloading of metaslabs.
 143  */
 144 boolean_t metaslab_preload_enabled = B_TRUE;
 145 
 146 /*
 147  * Enable/disable additional weight factor for each metaslab.
 148  */
 149 boolean_t metaslab_weight_factor_enable = B_FALSE;
 150 
 151 
 152 /*
 153  * ==========================================================================
 154  * Metaslab classes
 155  * ==========================================================================
 156  */
 157 metaslab_class_t *
 158 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 159 {
 160         metaslab_class_t *mc;
 161 
 162         mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
 163 
 164         mc->mc_spa = spa;
 165         mc->mc_rotor = NULL;
 166         mc->mc_ops = ops;
 167 
 168         return (mc);
 169 }
 170 
 171 void
 172 metaslab_class_destroy(metaslab_class_t *mc)
 173 {
 174         ASSERT(mc->mc_rotor == NULL);
 175         ASSERT(mc->mc_alloc == 0);
 176         ASSERT(mc->mc_deferred == 0);
 177         ASSERT(mc->mc_space == 0);
 178         ASSERT(mc->mc_dspace == 0);
 179 
 180         kmem_free(mc, sizeof (metaslab_class_t));
 181 }
 182 
 183 int
 184 metaslab_class_validate(metaslab_class_t *mc)
 185 {
 186         metaslab_group_t *mg;
 187         vdev_t *vd;
 188 
 189         /*
 190          * Must hold one of the spa_config locks.
 191          */
 192         ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 193             spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 194 
 195         if ((mg = mc->mc_rotor) == NULL)
 196                 return (0);
 197 
 198         do {
 199                 vd = mg->mg_vd;
 200                 ASSERT(vd->vdev_mg != NULL);
 201                 ASSERT3P(vd->vdev_top, ==, vd);
 202                 ASSERT3P(mg->mg_class, ==, mc);
 203                 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 204         } while ((mg = mg->mg_next) != mc->mc_rotor);
 205 
 206         return (0);
 207 }
 208 
 209 void
 210 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
 211     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 212 {
 213         atomic_add_64(&mc->mc_alloc, alloc_delta);
 214         atomic_add_64(&mc->mc_deferred, defer_delta);
 215         atomic_add_64(&mc->mc_space, space_delta);
 216         atomic_add_64(&mc->mc_dspace, dspace_delta);
 217 }
 218 
 219 uint64_t
 220 metaslab_class_get_alloc(metaslab_class_t *mc)
 221 {
 222         return (mc->mc_alloc);
 223 }
 224 
 225 uint64_t
 226 metaslab_class_get_deferred(metaslab_class_t *mc)
 227 {
 228         return (mc->mc_deferred);
 229 }
 230 
 231 uint64_t
 232 metaslab_class_get_space(metaslab_class_t *mc)
 233 {
 234         return (mc->mc_space);
 235 }
 236 
 237 uint64_t
 238 metaslab_class_get_dspace(metaslab_class_t *mc)
 239 {
 240         return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 241 }
 242 
 243 /*
 244  * ==========================================================================
 245  * Metaslab groups
 246  * ==========================================================================
 247  */
 248 static int
 249 metaslab_compare(const void *x1, const void *x2)
 250 {
 251         const metaslab_t *m1 = x1;
 252         const metaslab_t *m2 = x2;
 253 
 254         if (m1->ms_weight < m2->ms_weight)
 255                 return (1);
 256         if (m1->ms_weight > m2->ms_weight)
 257                 return (-1);
 258 
 259         /*
 260          * If the weights are identical, use the offset to force uniqueness.
 261          */
 262         if (m1->ms_start < m2->ms_start)
 263                 return (-1);
 264         if (m1->ms_start > m2->ms_start)
 265                 return (1);
 266 
 267         ASSERT3P(m1, ==, m2);
 268 
 269         return (0);
 270 }
 271 
 272 /*
 273  * Update the allocatable flag and the metaslab group's capacity.
 274  * The allocatable flag is set to true if the group's free capacity
 275  * is greater than the zfs_mg_noalloc_threshold. If a metaslab group
 276  * transitions from allocatable to non-allocatable or vice versa then
 277  * the metaslab group's class is updated to reflect the transition.
 278  */
 279 static void
 280 metaslab_group_alloc_update(metaslab_group_t *mg)
 281 {
 282         vdev_t *vd = mg->mg_vd;
 283         metaslab_class_t *mc = mg->mg_class;
 284         vdev_stat_t *vs = &vd->vdev_stat;
 285         boolean_t was_allocatable;
 286 
 287         ASSERT(vd == vd->vdev_top);
 288 
 289         mutex_enter(&mg->mg_lock);
 290         was_allocatable = mg->mg_allocatable;
 291 
 292         mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 293             (vs->vs_space + 1);
 294 
 295         mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
 296 
 297         /*
 298          * The mc_alloc_groups maintains a count of the number of
 299          * groups in this metaslab class that are still above the
 300          * zfs_mg_noalloc_threshold. This is used by the allocating
 301          * threads to determine if they should avoid allocations to
 302          * a given group. The allocator will avoid allocations to a group
 303          * if that group has reached or is below the zfs_mg_noalloc_threshold
 304          * and there are still other groups that are above the threshold.
 305          * When a group transitions from allocatable to non-allocatable or
 306          * vice versa we update the metaslab class to reflect that change.
 307          * When the mc_alloc_groups value drops to 0 that means that all
 308          * groups have reached the zfs_mg_noalloc_threshold making all groups
 309          * eligible for allocations. This effectively means that all devices
 310          * are balanced again.
 311          */
 312         if (was_allocatable && !mg->mg_allocatable)
 313                 mc->mc_alloc_groups--;
 314         else if (!was_allocatable && mg->mg_allocatable)
 315                 mc->mc_alloc_groups++;
 316         mutex_exit(&mg->mg_lock);
 317 }
 318 
 319 metaslab_group_t *
 320 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 321 {
 322         metaslab_group_t *mg;
 323 
 324         mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 325         mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 326         avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 327             sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
 328         mg->mg_vd = vd;
 329         mg->mg_class = mc;
 330         mg->mg_activation_count = 0;
 331 
 332         mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
 333             minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
 334 
 335         return (mg);
 336 }
 337 
 338 void
 339 metaslab_group_destroy(metaslab_group_t *mg)
 340 {
 341         ASSERT(mg->mg_prev == NULL);
 342         ASSERT(mg->mg_next == NULL);
 343         /*
 344          * We may have gone below zero with the activation count
 345          * either because we never activated in the first place or
 346          * because we're done, and possibly removing the vdev.
 347          */
 348         ASSERT(mg->mg_activation_count <= 0);
 349 
 350         if (mg->mg_taskq) taskq_destroy(mg->mg_taskq);
 351 
 352         avl_destroy(&mg->mg_metaslab_tree);
 353         mutex_destroy(&mg->mg_lock);
 354         kmem_free(mg, sizeof (metaslab_group_t));
 355 }
 356 
 357 void
 358 metaslab_group_activate(metaslab_group_t *mg)
 359 {
 360         metaslab_class_t *mc = mg->mg_class;
 361         metaslab_group_t *mgprev, *mgnext;
 362 
 363         ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
 364 
 365         ASSERT(mc->mc_rotor != mg);
 366         ASSERT(mg->mg_prev == NULL);
 367         ASSERT(mg->mg_next == NULL);
 368         ASSERT(mg->mg_activation_count <= 0);
 369 
 370         if (++mg->mg_activation_count <= 0)
 371                 return;
 372 
 373         mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
 374         metaslab_group_alloc_update(mg);
 375 
 376         if ((mgprev = mc->mc_rotor) == NULL) {
 377                 mg->mg_prev = mg;
 378                 mg->mg_next = mg;
 379         } else {
 380                 mgnext = mgprev->mg_next;
 381                 mg->mg_prev = mgprev;
 382                 mg->mg_next = mgnext;
 383                 mgprev->mg_next = mg;
 384                 mgnext->mg_prev = mg;
 385         }
 386         mc->mc_rotor = mg;
 387 }
 388 
 389 void
 390 metaslab_group_passivate(metaslab_group_t *mg)
 391 {
 392         metaslab_class_t *mc = mg->mg_class;
 393         metaslab_group_t *mgprev, *mgnext;
 394 
 395         ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
 396 
 397         if (--mg->mg_activation_count != 0) {
 398                 ASSERT(mc->mc_rotor != mg);
 399                 ASSERT(mg->mg_prev == NULL);
 400                 ASSERT(mg->mg_next == NULL);
 401                 ASSERT(mg->mg_activation_count < 0);
 402                 return;
 403         }
 404 
 405         taskq_wait(mg->mg_taskq);
 406 
 407         mgprev = mg->mg_prev;
 408         mgnext = mg->mg_next;
 409 
 410         if (mg == mgnext) {
 411                 mc->mc_rotor = NULL;
 412         } else {
 413                 mc->mc_rotor = mgnext;
 414                 mgprev->mg_next = mgnext;
 415                 mgnext->mg_prev = mgprev;
 416         }
 417 
 418         mg->mg_prev = NULL;
 419         mg->mg_next = NULL;
 420 }
 421 
 422 static void
 423 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 424 {
 425         mutex_enter(&mg->mg_lock);
 426         ASSERT(msp->ms_group == NULL);
 427         msp->ms_group = mg;
 428         msp->ms_weight = 0;
 429         avl_add(&mg->mg_metaslab_tree, msp);
 430         mutex_exit(&mg->mg_lock);
 431 }
 432 
 433 static void
 434 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 435 {
 436         mutex_enter(&mg->mg_lock);
 437         ASSERT(msp->ms_group == mg);
 438         avl_remove(&mg->mg_metaslab_tree, msp);
 439         msp->ms_group = NULL;
 440         mutex_exit(&mg->mg_lock);
 441 }
 442 
 443 static void
 444 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 445 {
 446         /*
 447          * Although in principle the weight can be any value, in
 448          * practice we do not use values in the range [1, 510].
 449          */
 450         ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
 451         ASSERT(MUTEX_HELD(&msp->ms_lock));
 452 
 453         mutex_enter(&mg->mg_lock);
 454         ASSERT(msp->ms_group == mg);
 455         avl_remove(&mg->mg_metaslab_tree, msp);
 456         msp->ms_weight = weight;
 457         avl_add(&mg->mg_metaslab_tree, msp);
 458         mutex_exit(&mg->mg_lock);
 459 }
 460 
 461 /*
 462  * Determine if a given metaslab group should skip allocations. A metaslab
 463  * group should avoid allocations if its free capacity has dropped to or
 464  * below the zfs_mg_noalloc_threshold and there is at least one metaslab
 465  * group that can still handle allocations.
 466  */
 467 static boolean_t
 468 metaslab_group_allocatable(metaslab_group_t *mg)
 469 {
 470         vdev_t *vd = mg->mg_vd;
 471         spa_t *spa = vd->vdev_spa;
 472         metaslab_class_t *mc = mg->mg_class;
 473 
 474         /*
 475          * A metaslab group is considered allocatable if its free capacity
 476          * is greater than the set value of zfs_mg_noalloc_threshold, it's
 477          * associated with a slog, or there are no other metaslab groups
 478          * with free capacity greater than zfs_mg_noalloc_threshold.
 479          */
 480         return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
 481             mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
 482 }
 483 
 484 /*
 485  * ==========================================================================
 486  * Range tree callbacks
 487  * ==========================================================================
 488  */
 489 
 490 /*
 491  * Comparison function for the private size-ordered tree. Tree is sorted
 492  * by size, larger sizes at the end of the tree.
 493  */
 494 static int
 495 metaslab_rangesize_compare(const void *x1, const void *x2)
 496 {
 497         const range_seg_t *r1 = x1;
 498         const range_seg_t *r2 = x2;
 499         uint64_t rs_size1 = r1->rs_end - r1->rs_start;
 500         uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 501 
 502         if (rs_size1 < rs_size2)
 503                 return (-1);
 504         if (rs_size1 > rs_size2)
 505                 return (1);
 506 
 507         if (r1->rs_start < r2->rs_start)
 508                 return (-1);
 509 
 510         if (r1->rs_start > r2->rs_start)
 511                 return (1);
 512 
 513         return (0);
 514 }
 515 
 516 /*
 517  * Create any block allocator specific components. The current allocators
 518  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
 519  */
 520 static void
 521 metaslab_rt_create(range_tree_t *rt, void *arg)
 522 {
 523         metaslab_t *msp = arg;
 524 
 525         ASSERT3P(rt->rt_arg, ==, msp);
 526         ASSERT(msp->ms_tree == NULL);
 527 
 528         avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
 529             sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 530 }
 531 
 532 /*
 533  * Destroy the block allocator specific components.
 534  */
 535 static void
 536 metaslab_rt_destroy(range_tree_t *rt, void *arg)
 537 {
 538         metaslab_t *msp = arg;
 539 
 540         ASSERT3P(rt->rt_arg, ==, msp);
 541         ASSERT3P(msp->ms_tree, ==, rt);
 542         ASSERT0(avl_numnodes(&msp->ms_size_tree));
 543 
 544         avl_destroy(&msp->ms_size_tree);
 545 }
 546 
 547 static void
 548 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
 549 {
 550         metaslab_t *msp = arg;
 551 
 552         ASSERT3P(rt->rt_arg, ==, msp);
 553         ASSERT3P(msp->ms_tree, ==, rt);
 554         VERIFY(!msp->ms_condensing);
 555         avl_add(&msp->ms_size_tree, rs);
 556 }
 557 
 558 static void
 559 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
 560 {
 561         metaslab_t *msp = arg;
 562 
 563         ASSERT3P(rt->rt_arg, ==, msp);
 564         ASSERT3P(msp->ms_tree, ==, rt);
 565         VERIFY(!msp->ms_condensing);
 566         avl_remove(&msp->ms_size_tree, rs);
 567 }
 568 
 569 static void
 570 metaslab_rt_vacate(range_tree_t *rt, void *arg)
 571 {
 572         metaslab_t *msp = arg;
 573 
 574         ASSERT3P(rt->rt_arg, ==, msp);
 575         ASSERT3P(msp->ms_tree, ==, rt);
 576 
 577         /*
 578          * Normally one would walk the tree freeing nodes along the way.
 579          * Since the nodes are shared with the range trees we can avoid
 580          * walking all nodes and just reinitialize the avl tree. The nodes
 581          * will be freed by the range tree, so we don't want to free them here.
 582          */
 583         avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
 584             sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 585 }
 586 
 587 static range_tree_ops_t metaslab_rt_ops = {
 588         metaslab_rt_create,
 589         metaslab_rt_destroy,
 590         metaslab_rt_add,
 591         metaslab_rt_remove,
 592         metaslab_rt_vacate
 593 };
 594 
 595 /*
 596  * ==========================================================================
 597  * Metaslab block operations
 598  * ==========================================================================
 599  */
 600 
 601 /*
 602  * Return the maximum contiguous segment within the metaslab.
 603  */
 604 uint64_t
 605 metaslab_block_maxsize(metaslab_t *msp)
 606 {
 607         avl_tree_t *t = &msp->ms_size_tree;
 608         range_seg_t *rs;
 609 
 610         if (t == NULL || (rs = avl_last(t)) == NULL)
 611                 return (0ULL);
 612 
 613         return (rs->rs_end - rs->rs_start);
 614 }
 615 
 616 uint64_t
 617 metaslab_block_alloc(metaslab_t *msp, uint64_t size)
 618 {
 619         uint64_t start;
 620         range_tree_t *rt = msp->ms_tree;
 621 
 622         VERIFY(!msp->ms_condensing);
 623 
 624         start = msp->ms_ops->msop_alloc(msp, size);
 625         if (start != -1ULL) {
 626                 vdev_t *vd = msp->ms_group->mg_vd;
 627 
 628                 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
 629                 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 630                 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
 631                 range_tree_remove(rt, start, size);
 632         }
 633         return (start);
 634 }
 635 
 636 /*
 637  * ==========================================================================
 638  * Common allocator routines
 639  * ==========================================================================
 640  */
 641 
 642 /*
 643  * This is a helper function that can be used by the allocator to find
 644  * a suitable block to allocate. This will search the specified AVL
 645  * tree looking for a block that matches the specified criteria.
 646  */
 647 static uint64_t
 648 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
 649     uint64_t align)
 650 {
 651         range_seg_t *rs, rsearch;
 652         avl_index_t where;
 653 
 654         rsearch.rs_start = *cursor;
 655         rsearch.rs_end = *cursor + size;
 656 
 657         rs = avl_find(t, &rsearch, &where);
 658         if (rs == NULL)
 659                 rs = avl_nearest(t, where, AVL_AFTER);
 660 
 661         while (rs != NULL) {
 662                 uint64_t offset = P2ROUNDUP(rs->rs_start, align);
 663 
 664                 if (offset + size <= rs->rs_end) {
 665                         *cursor = offset + size;
 666                         return (offset);
 667                 }
 668                 rs = AVL_NEXT(t, rs);
 669         }
 670 
 671         /*
 672          * If we know we've searched the whole map (*cursor == 0), give up.
 673          * Otherwise, reset the cursor to the beginning and try again.
 674          */
 675         if (*cursor == 0)
 676                 return (-1ULL);
 677 
 678         *cursor = 0;
 679         return (metaslab_block_picker(t, cursor, size, align));
 680 }
 681 
 682 /*
 683  * ==========================================================================
 684  * The first-fit block allocator
 685  * ==========================================================================
 686  */
 687 static uint64_t
 688 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
 689 {
 690         /*
 691          * Find the largest power of 2 block size that evenly divides the
 692          * requested size. This is used to try to allocate blocks with similar
 693          * alignment from the same area of the metaslab (i.e. same cursor
 694          * bucket), but it does not guarantee that other allocation sizes
 695          * will not also be placed in the same region.
 696          */
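        /*
         * Illustrative example (hypothetical request size): a 24K request
         * yields align = size & -size = 8K, so the allocation is taken from
         * cursor bucket ms_lbas[highbit(8K) - 1], i.e. ms_lbas[13], which
         * keeps 8K-aligned allocations clustered together.
         */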
 697         uint64_t align = size & -size;
 698         uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
 699         avl_tree_t *t = &msp->ms_tree->rt_root;
 700 
 701         return (metaslab_block_picker(t, cursor, size, align));
 702 }
 703 
 704 /* ARGSUSED */
 705 static boolean_t
 706 metaslab_ff_fragmented(metaslab_t *msp)
 707 {
 708         return (B_TRUE);
 709 }
 710 
 711 static metaslab_ops_t metaslab_ff_ops = {
 712         metaslab_ff_alloc,
 713         metaslab_ff_fragmented
 714 };
 715 
 716 /*
 717  * ==========================================================================
 718  * Dynamic block allocator -
 719  * Uses the first-fit allocation scheme until space gets low and then
 720  * adjusts to a best-fit allocation method. Uses metaslab_df_alloc_threshold
 721  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 722  * ==========================================================================
 723  */
 724 static uint64_t
 725 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 726 {
 727         /*
 728          * Find the largest power of 2 block size that evenly divides the
 729          * requested size. This is used to try to allocate blocks with similar
 730          * alignment from the same area of the metaslab (i.e. same cursor
 731          * bucket), but it does not guarantee that other allocation sizes
 732          * will not also be placed in the same region.
 733          */
 734         uint64_t align = size & -size;
 735         uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
 736         range_tree_t *rt = msp->ms_tree;
 737         avl_tree_t *t = &rt->rt_root;
 738         uint64_t max_size = metaslab_block_maxsize(msp);
 739         int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 740 
 741         ASSERT(MUTEX_HELD(&msp->ms_lock));
 742         ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 743 
 744         if (max_size < size)
 745                 return (-1ULL);
 746 
 747         /*
 748          * If we're running low on space switch to using the size
 749          * sorted AVL tree (best-fit).
 750          */
 751         if (max_size < metaslab_df_alloc_threshold ||
 752             free_pct < metaslab_df_free_pct) {
 753                 t = &msp->ms_size_tree;
 754                 *cursor = 0;
 755         }
 756 
 757         return (metaslab_block_picker(t, cursor, size, 1ULL));
 758 }
 759 
 760 static boolean_t
 761 metaslab_df_fragmented(metaslab_t *msp)
 762 {
 763         range_tree_t *rt = msp->ms_tree;
 764         uint64_t max_size = metaslab_block_maxsize(msp);
 765         int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 766 
 767         if (max_size >= metaslab_df_alloc_threshold &&
 768             free_pct >= metaslab_df_free_pct)
 769                 return (B_FALSE);
 770 
 771         return (B_TRUE);
 772 }
 773 
 774 static metaslab_ops_t metaslab_df_ops = {
 775         metaslab_df_alloc,
 776         metaslab_df_fragmented
 777 };
 778 
 779 /*
 780  * ==========================================================================
 781  * Cursor fit block allocator -
 782  * Select the largest region in the metaslab, set the cursor to the beginning
 783  * of the range and the cursor_end to the end of the range. As allocations
 784  * are made, advance the cursor. Continue allocating from the cursor until
 785  * the range is exhausted and then find a new range.
 786  * ==========================================================================
 787  */
 788 static uint64_t
 789 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 790 {
 791         range_tree_t *rt = msp->ms_tree;
 792         avl_tree_t *t = &msp->ms_size_tree;
 793         uint64_t *cursor = &msp->ms_lbas[0];
 794         uint64_t *cursor_end = &msp->ms_lbas[1];
 795         uint64_t offset = 0;
 796 
 797         ASSERT(MUTEX_HELD(&msp->ms_lock));
 798         ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
 799 
 800         ASSERT3U(*cursor_end, >=, *cursor);
 801 
 802         if ((*cursor + size) > *cursor_end) {
 803                 range_seg_t *rs;
 804 
 805                 rs = avl_last(&msp->ms_size_tree);
 806                 if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
 807                         return (-1ULL);
 808 
 809                 *cursor = rs->rs_start;
 810                 *cursor_end = rs->rs_end;
 811         }
 812 
 813         offset = *cursor;
 814         *cursor += size;
 815 
 816         return (offset);
 817 }
 818 
 819 static boolean_t
 820 metaslab_cf_fragmented(metaslab_t *msp)
 821 {
 822         return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
 823 }
 824 
 825 static metaslab_ops_t metaslab_cf_ops = {
 826         metaslab_cf_alloc,
 827         metaslab_cf_fragmented
 828 };
 829 
 830 /*
 831  * ==========================================================================
 832  * New dynamic fit allocator -
 833  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 834  * contiguous blocks. If no region is found then just use the largest segment
 835  * that remains.
 836  * ==========================================================================
 837  */
 838 
 839 /*
 840  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 841  * to request from the allocator.
 842  */
 843 uint64_t metaslab_ndf_clump_shift = 4;
 844 
 845 static uint64_t
 846 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 847 {
 848         avl_tree_t *t = &msp->ms_tree->rt_root;
 849         avl_index_t where;
 850         range_seg_t *rs, rsearch;
 851         uint64_t hbit = highbit(size);
 852         uint64_t *cursor = &msp->ms_lbas[hbit - 1];
 853         uint64_t max_size = metaslab_block_maxsize(msp);
 854 
 855         ASSERT(MUTEX_HELD(&msp->ms_lock));
 856         ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 857 
 858         if (max_size < size)
 859                 return (-1ULL);
 860 
 861         rsearch.rs_start = *cursor;
 862         rsearch.rs_end = *cursor + size;
 863 
 864         rs = avl_find(t, &rsearch, &where);
 865         if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
 866                 t = &msp->ms_size_tree;
 867 
 868                 rsearch.rs_start = 0;
 869                 rsearch.rs_end = MIN(max_size,
 870                     1ULL << (hbit + metaslab_ndf_clump_shift));
 871                 rs = avl_find(t, &rsearch, &where);
 872                 if (rs == NULL)
 873                         rs = avl_nearest(t, where, AVL_AFTER);
 874                 ASSERT(rs != NULL);
 875         }
 876 
 877         if ((rs->rs_end - rs->rs_start) >= size) {
 878                 *cursor = rs->rs_start + size;
 879                 return (rs->rs_start);
 880         }
 881         return (-1ULL);
 882 }
 883 
 884 static boolean_t
 885 metaslab_ndf_fragmented(metaslab_t *msp)
 886 {
 887         return (metaslab_block_maxsize(msp) <=
 888             (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
 889 }
 890 
 891 static metaslab_ops_t metaslab_ndf_ops = {
 892         metaslab_ndf_alloc,
 893         metaslab_ndf_fragmented
 894 };
 895 
 896 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 897 
 898 /*
 899  * ==========================================================================
 900  * Metaslabs
 901  * ==========================================================================
 902  */
 903 
 904 /*
 905  * Wait for any in-progress metaslab loads to complete.
 906  */
 907 void
 908 metaslab_load_wait(metaslab_t *msp)
 909 {
 910         ASSERT(MUTEX_HELD(&msp->ms_lock));
 911 
 912         while (msp->ms_loading) {
 913                 ASSERT(!msp->ms_loaded);
 914                 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
 915         }
 916 }
 917 
 918 int
 919 metaslab_load(metaslab_t *msp)
 920 {
 921         int error = 0;
 922 
 923         ASSERT(MUTEX_HELD(&msp->ms_lock));
 924         ASSERT(!msp->ms_loaded);
 925         ASSERT(!msp->ms_loading);
 926 
 927         msp->ms_loading = B_TRUE;
 928 
 929         /*
 930          * If the space map has not been allocated yet, then treat
 931          * all the space in the metaslab as free and add it to the
 932          * ms_tree.
 933          */
 934         if (msp->ms_sm != NULL)
 935                 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
 936         else
 937                 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
 938 
 939         msp->ms_loaded = (error == 0);
 940         msp->ms_loading = B_FALSE;
 941 
 942         if (msp->ms_loaded) {
 943                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 944                         range_tree_walk(msp->ms_defertree[t],
 945                             range_tree_remove, msp->ms_tree);
 946                 }
 947         }
 948         cv_broadcast(&msp->ms_load_cv);
 949         return (error);
 950 }
 951 
 952 void
 953 metaslab_unload(metaslab_t *msp)
 954 {
 955         ASSERT(MUTEX_HELD(&msp->ms_lock));
 956         range_tree_vacate(msp->ms_tree, NULL, NULL);
 957         msp->ms_loaded = B_FALSE;
 958         msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
 959 }
 960 
 961 metaslab_t *
 962 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
 963 {
 964         vdev_t *vd = mg->mg_vd;
 965         objset_t *mos = vd->vdev_spa->spa_meta_objset;
 966         metaslab_t *msp;
 967 
 968         msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
 969         mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
 970         cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
 971         msp->ms_id = id;
 972         msp->ms_start = id << vd->vdev_ms_shift;
 973         msp->ms_size = 1ULL << vd->vdev_ms_shift;
 974 
 975         /*
 976          * We only open space map objects that already exist. All others
 977          * will be opened when we finally allocate an object for them.
 978          */
 979         if (object != 0) {
 980                 VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
 981                     msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
 982                 ASSERT(msp->ms_sm != NULL);
 983         }
 984 
 985         /*
 986          * We create the main range tree here, but we don't create the
 987          * alloctree and freetree until metaslab_sync_done().  This serves
 988          * two purposes: it allows metaslab_sync_done() to detect the
 989          * addition of new space; and for debugging, it ensures that we'd
 990          * data fault on any attempt to use this metaslab before it's ready.
 991          */
 992         msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
 993         metaslab_group_add(mg, msp);
 994 
 995         msp->ms_ops = mg->mg_class->mc_ops;
 996 
 997         /*
 998          * If we're opening an existing pool (txg == 0) or creating
 999          * a new one (txg == TXG_INITIAL), all space is available now.
1000          * If we're adding space to an existing pool, the new space
1001          * does not become available until after this txg has synced.
1002          */
1003         if (txg <= TXG_INITIAL)
1004                 metaslab_sync_done(msp, 0);
1005 
1006         /*
1007          * If metaslab_debug_load is set and we're initializing a metaslab
1008          * that has an allocated space_map object then load its space
1009          * map so that we can verify frees.
1010          */
1011         if (metaslab_debug_load && msp->ms_sm != NULL) {
1012                 mutex_enter(&msp->ms_lock);
1013                 VERIFY0(metaslab_load(msp));
1014                 mutex_exit(&msp->ms_lock);
1015         }
1016 
1017         if (txg != 0) {
1018                 vdev_dirty(vd, 0, NULL, txg);
1019                 vdev_dirty(vd, VDD_METASLAB, msp, txg);
1020         }
1021 
1022         return (msp);
1023 }
1024 
1025 void
1026 metaslab_fini(metaslab_t *msp)
1027 {
1028         metaslab_group_t *mg = msp->ms_group;
1029 
1030         metaslab_group_remove(mg, msp);
1031 
1032         mutex_enter(&msp->ms_lock);
1033 
1034         VERIFY(msp->ms_group == NULL);
1035         vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1036             0, -msp->ms_size);
1037         space_map_close(msp->ms_sm);
1038 
1039         metaslab_unload(msp);
1040         range_tree_destroy(msp->ms_tree);
1041 
1042         for (int t = 0; t < TXG_SIZE; t++) {
1043                 range_tree_destroy(msp->ms_alloctree[t]);
1044                 range_tree_destroy(msp->ms_freetree[t]);
1045         }
1046 
1047         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1048                 range_tree_destroy(msp->ms_defertree[t]);
1049         }
1050 
1051         ASSERT0(msp->ms_deferspace);
1052 
1053         mutex_exit(&msp->ms_lock);
1054         cv_destroy(&msp->ms_load_cv);
1055         mutex_destroy(&msp->ms_lock);
1056 
1057         kmem_free(msp, sizeof (metaslab_t));
1058 }
1059 
1060 /*
1061  * Apply a weighting factor based on the histogram information for this
1062  * metaslab. The current weighting factor is somewhat arbitrary and requires
1063  * additional investigation. The implementation provides a measure of
1064  * "weighted" free space and gives a higher weighting for larger contiguous
1065  * regions. The weighting factor is determined by counting the number of
1066  * sm_shift sectors that exist in each region represented by the histogram.
1067  * That value is then multiplied by the power of 2 exponent and the sm_shift
1068  * value.
1069  *
1070  * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
1071  * metaslab has an sm_shift value of 9 (512B):
1072  *
1073  * 1) calculate the number of sm_shift sectors in the region:
1074  *      2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1075  * 2) multiply by the power of 2 exponent and the sm_shift value:
1076  *      16384 * 21 * 9 = 3096576
1077  * This value will be added to the weighting of the metaslab.
1078  */
1079 static uint64_t
1080 metaslab_weight_factor(metaslab_t *msp)
1081 {
1082         uint64_t factor = 0;
1083         uint64_t sectors;
1084         int i;
1085 
1086         /*
1087          * A null space map means that the entire metaslab is free, so
1088          * calculate a weight factor that spans the entire size of the
1089          * metaslab.
1090          */
1091         if (msp->ms_sm == NULL) {
1092                 vdev_t *vd = msp->ms_group->mg_vd;
1093 
1094                 i = highbit(msp->ms_size) - 1;
1095                 sectors = msp->ms_size >> vd->vdev_ashift;
1096                 return (sectors * i * vd->vdev_ashift);
1097         }
1098 
1099         if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1100                 return (0);
1101 
1102         for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1103                 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1104                         continue;
1105 
1106                 /*
1107                  * Determine the number of sm_shift sectors in the region
1108                  * indicated by the histogram. For example, given an
1109                  * sm_shift value of 9 (512 bytes) and i = 4 then we know
1110                  * that we're looking at an 8K region in the histogram
1111                  * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1112                  * number of sm_shift sectors (512 bytes in this example),
1113                  * we would take 8192 / 512 = 16. Since the histogram
1114                  * is offset by sm_shift we can simply use the value
1115                  * of i to calculate this (i.e. 2^i = 16 where i = 4).
1116                  */
1117                 sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
1118                 factor += (i + msp->ms_sm->sm_shift) * sectors;
1119         }
1120         return (factor * msp->ms_sm->sm_shift);
1121 }
1122 
1123 static uint64_t
1124 metaslab_weight(metaslab_t *msp)
1125 {
1126         metaslab_group_t *mg = msp->ms_group;
1127         vdev_t *vd = mg->mg_vd;
1128         uint64_t weight, space;
1129 
1130         ASSERT(MUTEX_HELD(&msp->ms_lock));
1131 
1132         /*
1133          * This vdev is in the process of being removed so there is nothing
1134          * for us to do here.
1135          */
1136         if (vd->vdev_removing) {
1137                 ASSERT0(space_map_allocated(msp->ms_sm));
1138                 ASSERT0(vd->vdev_ms_shift);
1139                 return (0);
1140         }
1141 
1142         /*
1143          * The baseline weight is the metaslab's free space.
1144          */
1145         space = msp->ms_size - space_map_allocated(msp->ms_sm);
1146         weight = space;
1147 
1148         /*
1149          * Modern disks have uniform bit density and constant angular velocity.
1150          * Therefore, the outer recording zones are faster (higher bandwidth)
1151          * than the inner zones by the ratio of outer to inner track diameter,
1152          * which is typically around 2:1.  We account for this by assigning
1153          * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1154          * In effect, this means that we'll select the metaslab with the most
1155          * free bandwidth rather than simply the one with the most free space.
1156          */
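        /*
         * Worked example (illustrative numbers): with vdev_ms_count == 200,
         * metaslab 0 keeps weight 2 * space, while metaslab 199 gets
         * 2 * space - (199 * space) / 200, i.e. roughly 1.005 * space.
         */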
1157         weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1158         ASSERT(weight >= space && weight <= 2 * space);
1159 
1160         msp->ms_factor = metaslab_weight_factor(msp);
1161         if (metaslab_weight_factor_enable)
1162                 weight += msp->ms_factor;
1163 
1164         if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
1165                 /*
1166                  * If this metaslab is one we're actively using, adjust its
1167                  * weight to make it preferable to any inactive metaslab so
1168                  * we'll polish it off.
1169                  */
1170                 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1171         }
1172 
1173         return (weight);
1174 }
1175 
1176 static int
1177 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1178 {
1179         ASSERT(MUTEX_HELD(&msp->ms_lock));
1180 
1181         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1182                 metaslab_load_wait(msp);
1183                 if (!msp->ms_loaded) {
1184                         int error = metaslab_load(msp);
1185                         if (error) {
1186                                 metaslab_group_sort(msp->ms_group, msp, 0);
1187                                 return (error);
1188                         }
1189                 }
1190 
1191                 metaslab_group_sort(msp->ms_group, msp,
1192                     msp->ms_weight | activation_weight);
1193         }
1194         ASSERT(msp->ms_loaded);
1195         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1196 
1197         return (0);
1198 }
1199 
1200 static void
1201 metaslab_passivate(metaslab_t *msp, uint64_t size)
1202 {
1203         /*
1204          * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1205          * this metaslab again.  In that case, it had better be empty,
1206          * or we would be leaving space on the table.
1207          */
1208         ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1209         metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1210         ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1211 }
1212 
1213 static void
1214 metaslab_preload(void *arg)
1215 {
1216         metaslab_t *msp = arg;
1217         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1218 
1219         mutex_enter(&msp->ms_lock);
1220         metaslab_load_wait(msp);
1221         if (!msp->ms_loaded)
1222                 (void) metaslab_load(msp);
1223 
1224         /*
1225          * Set the ms_access_txg value so that we don't unload it right away.
1226          */
1227         msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1228         mutex_exit(&msp->ms_lock);
1229 }
1230 
1231 static void
1232 metaslab_group_preload(metaslab_group_t *mg)
1233 {
1234         spa_t *spa = mg->mg_vd->vdev_spa;
1235         metaslab_t *msp;
1236         avl_tree_t *t = &mg->mg_metaslab_tree;
1237         int m = 0;
1238 
1239         if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1240                 taskq_wait(mg->mg_taskq);
1241                 return;
1242         }
1243         mutex_enter(&mg->mg_lock);
1244 
1245         /*
1246          * Prefetch the next potential metaslabs
1247          */
1248         for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
1249 
1250                 /* If we have reached our preload limit then we're done */
1251                 if (++m > metaslab_preload_limit)
1252                         break;
1253 
1254                 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1255                     msp, TQ_SLEEP) != NULL);
1256         }
1257         mutex_exit(&mg->mg_lock);
1258 }
1259 
1260 /*
1261  * Determine if the space map's on-disk footprint is past our tolerance
1262  * for inefficiency. We would like to use the following criteria to make
1263  * our decision:
1264  *
1265  * 1. The size of the space map object should not dramatically increase as a
1266  * result of writing out the free space range tree.
1267  *
1268  * 2. The on-disk space map representation is at least zfs_condense_pct/100
1269  * times the size of the free space range tree's minimal on-disk form
1270  * (i.e. with zfs_condense_pct = 110 and a 1MB minimal form, condense at 1.1MB).
1271  *
1272  * Checking the first condition is tricky since we don't want to walk
1273  * the entire AVL tree calculating the estimated on-disk size. Instead we
1274  * use the size-ordered range tree in the metaslab and calculate the
1275  * size required to write out the largest segment in our free tree. If the
1276  * size required to represent that segment on disk is larger than the space
1277  * map object then we avoid condensing this map.
1278  *
1279  * To determine the second criterion we use a best-case estimate and assume
1280  * each segment can be represented on-disk as a single 64-bit entry. We refer
1281  * to this best-case estimate as the space map's minimal form.
1282  */
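/*
 * Worked example (illustrative numbers) using the default zfs_condense_pct
 * of 200: if the in-core free range tree holds 1000 segments, its minimal
 * form is 1000 * 8 = 8000 bytes, so condensing is only considered once the
 * on-disk space map is at least 16000 bytes long (and the largest free
 * segment alone would not require more than the current on-disk length).
 */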
1283 static boolean_t
1284 metaslab_should_condense(metaslab_t *msp)
1285 {
1286         space_map_t *sm = msp->ms_sm;
1287         range_seg_t *rs;
1288         uint64_t size, entries, segsz;
1289 
1290         ASSERT(MUTEX_HELD(&msp->ms_lock));
1291         ASSERT(msp->ms_loaded);
1292 
1293         /*
1294          * Use the ms_size_tree, which is ordered by size, to
1295          * obtain the largest segment in the free tree. If the tree is empty
1296          * then we should condense the map.
1297          */
1298         rs = avl_last(&msp->ms_size_tree);
1299         if (rs == NULL)
1300                 return (B_TRUE);
1301 
1302         /*
1303          * Calculate the number of 64-bit entries this segment would
1304          * require when written to disk. If this single segment would be
1305          * larger on-disk than the entire current on-disk structure, then
1306          * clearly condensing will increase the on-disk structure size.
1307          */
1308         size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1309         entries = size / (MIN(size, SM_RUN_MAX));
1310         segsz = entries * sizeof (uint64_t);
1311 
1312         return (segsz <= space_map_length(msp->ms_sm) &&
1313             space_map_length(msp->ms_sm) >= (zfs_condense_pct *
1314             sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
1315 }
1316 
1317 /*
1318  * Condense the on-disk space map representation to its minimized form.
1319  * The minimized form consists of a small number of allocations followed by
1320  * the entries of the free range tree.
1321  */
1322 static void
1323 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1324 {
1325         spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1326         range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1327         range_tree_t *condense_tree;
1328         space_map_t *sm = msp->ms_sm;
1329 
1330         ASSERT(MUTEX_HELD(&msp->ms_lock));
1331         ASSERT3U(spa_sync_pass(spa), ==, 1);
1332         ASSERT(msp->ms_loaded);
1333 
1334         spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1335             "smp size %llu, segments %lu", txg, msp->ms_id, msp,
1336             space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));
1337 
1338         /*
1339          * Create a range tree that is 100% allocated. We remove segments
1340          * that have been freed in this txg, any deferred frees that exist,
1341          * and any allocation in the future. Removing segments should be
1342          * a relatively inexpensive operation since we expect these trees to
1343          * have a small number of nodes.
1344          */
1345         condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1346         range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1347 
1348         /*
1349          * Remove what's been freed in this txg from the condense_tree.
1350          * Since we're in sync_pass 1, we know that all the frees from
1351          * this txg are in the freetree.
1352          */
1353         range_tree_walk(freetree, range_tree_remove, condense_tree);
1354 
1355         for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1356                 range_tree_walk(msp->ms_defertree[t],
1357                     range_tree_remove, condense_tree);
1358         }
1359 
1360         for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1361                 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1362                     range_tree_remove, condense_tree);
1363         }
1364 
1365         /*
1366          * We're about to drop the metaslab's lock thus allowing
1367          * other consumers to change its content. Set the
1368          * metaslab's ms_condensing flag to ensure that
1369          * allocations on this metaslab do not occur while we're
1370          * in the middle of committing it to disk. This is only critical
1371          * for the ms_tree as all other range trees use per txg
1372          * views of their content.
1373          */
1374         msp->ms_condensing = B_TRUE;
1375 
1376         mutex_exit(&msp->ms_lock);
1377         space_map_truncate(sm, tx);
1378         mutex_enter(&msp->ms_lock);
1379 
1380         /*
1381          * While we would ideally like to create a space_map representation
1382          * that consists only of allocation records, doing so can be
1383          * prohibitively expensive because the in-core free tree can be
1384          * large, and therefore computationally expensive to subtract
1385          * from the condense_tree. Instead we sync out two trees, a cheap
1386          * allocation only tree followed by the in-core free tree. While not
1387          * optimal, this is typically close to optimal, and much cheaper to
1388          * compute.
1389          */
1390         space_map_write(sm, condense_tree, SM_ALLOC, tx);
1391         range_tree_vacate(condense_tree, NULL, NULL);
1392         range_tree_destroy(condense_tree);
1393 
1394         space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1395         msp->ms_condensing = B_FALSE;
1396 }
1397 
1398 /*
1399  * Write a metaslab to disk in the context of the specified transaction group.
1400  */
1401 void
1402 metaslab_sync(metaslab_t *msp, uint64_t txg)
1403 {
1404         metaslab_group_t *mg = msp->ms_group;
1405         vdev_t *vd = mg->mg_vd;
1406         spa_t *spa = vd->vdev_spa;
1407         objset_t *mos = spa_meta_objset(spa);
1408         range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
1409         range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
1410         range_tree_t **freed_tree =
1411             &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1412         dmu_tx_t *tx;
1413         uint64_t object = space_map_object(msp->ms_sm);
1414 
1415         ASSERT(!vd->vdev_ishole);
1416 
1417         /*
1418          * This metaslab has just been added so there's no work to do now.
1419          */
1420         if (*freetree == NULL) {
1421                 ASSERT3P(alloctree, ==, NULL);
1422                 return;
1423         }
1424 
1425         ASSERT3P(alloctree, !=, NULL);
1426         ASSERT3P(*freetree, !=, NULL);
1427         ASSERT3P(*freed_tree, !=, NULL);
1428 
1429         if (range_tree_space(alloctree) == 0 &&
1430             range_tree_space(*freetree) == 0)
1431                 return;
1432 
1433         /*
1434          * The only state that can actually be changing concurrently with
1435          * metaslab_sync() is the metaslab's ms_tree.  No other thread can
1436          * be modifying this txg's alloctree, freetree, freed_tree, or
1437          * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
1438          * space_map ASSERTs. We drop it whenever we call into the DMU,
1439          * because the DMU can call down to us (e.g. via zio_free()) at
1440          * any time.
1441          */
1442 
1443         tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1444 
1445         if (msp->ms_sm == NULL) {
1446                 uint64_t new_object;
1447 
1448                 new_object = space_map_alloc(mos, tx);
1449                 VERIFY3U(new_object, !=, 0);
1450 
1451                 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
1452                     msp->ms_start, msp->ms_size, vd->vdev_ashift,
1453                     &msp->ms_lock));
1454                 ASSERT(msp->ms_sm != NULL);
1455         }
1456 
1457         mutex_enter(&msp->ms_lock);
1458 
1459         if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1460             metaslab_should_condense(msp)) {
1461                 metaslab_condense(msp, txg, tx);
1462         } else {
1463                 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1464                 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1465         }
1466 
1467         range_tree_vacate(alloctree, NULL, NULL);
1468 
1469         if (msp->ms_loaded) {
1470                 /*
1471                  * When the space map is loaded, we have an accurate
1472                  * histogram in the range tree. This gives us an opportunity
1473                  * to bring the space map's histogram up-to-date so we clear
1474                  * it first before updating it.
1475                  */
1476                 space_map_histogram_clear(msp->ms_sm);
1477                 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1478         } else {
1479                 /*
1480                  * Since the space map is not loaded we simply update the
1481                  * existing histogram with what was freed in this txg. This
1482                  * means that the on-disk histogram may not have an accurate
1483                  * view of the free space but it's close enough to allow
1484                  * us to make allocation decisions.
1485                  */
1486                 space_map_histogram_add(msp->ms_sm, *freetree, tx);
1487         }
1488 
1489         /*
1490          * For sync pass 1, we avoid traversing this txg's free range tree
1491          * and instead will just swap the pointers for freetree and
1492          * freed_tree. We can safely do this since the freed_tree is
1493          * guaranteed to be empty on the initial pass.
1494          */
1495         if (spa_sync_pass(spa) == 1) {
1496                 range_tree_swap(freetree, freed_tree);
1497         } else {
1498                 range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1499         }
1500 
1501         ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1502         ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1503 
1504         mutex_exit(&msp->ms_lock);
1505 
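             /*
              * If the space map object changed (e.g. it was just created
              * above), record the new object number in this vdev's metaslab
              * array so the metaslab can locate its space map the next time
              * the pool is opened.
              */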
1506         if (object != space_map_object(msp->ms_sm)) {
1507                 object = space_map_object(msp->ms_sm);
1508                 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1509                     msp->ms_id, sizeof (uint64_t), &object, tx);
1510         }
1511         dmu_tx_commit(tx);
1512 }
1513 
1514 /*
1515  * Called after a transaction group has completely synced to mark
1516  * all of the metaslab's free space as usable.
1517  */
1518 void
1519 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1520 {
1521         metaslab_group_t *mg = msp->ms_group;
1522         vdev_t *vd = mg->mg_vd;
1523         range_tree_t **freed_tree;
1524         range_tree_t **defer_tree;
1525         int64_t alloc_delta, defer_delta;
1526 
1527         ASSERT(!vd->vdev_ishole);
1528 
1529         mutex_enter(&msp->ms_lock);
1530 
1531         /*
1532          * If this metaslab is just becoming available, initialize its
1533          * alloctrees, freetrees, and defertrees and add its capacity to
1534          * the vdev.
1535          */
1536         if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
1537                 for (int t = 0; t < TXG_SIZE; t++) {
1538                         ASSERT(msp->ms_alloctree[t] == NULL);
1539                         ASSERT(msp->ms_freetree[t] == NULL);
1540 
1541                         msp->ms_alloctree[t] = range_tree_create(NULL, msp,
1542                             &msp->ms_lock);
1543                         msp->ms_freetree[t] = range_tree_create(NULL, msp,
1544                             &msp->ms_lock);
1545                 }
1546 
1547                 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1548                         ASSERT(msp->ms_defertree[t] == NULL);
1549 
1550                         msp->ms_defertree[t] = range_tree_create(NULL, msp,
1551                             &msp->ms_lock);
1552                 }
1553 
1554                 vdev_space_update(vd, 0, 0, msp->ms_size);
1555         }
1556 
1557         freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1558         defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
1559 
1560         alloc_delta = space_map_alloc_delta(msp->ms_sm);
1561         defer_delta = range_tree_space(*freed_tree) -
1562             range_tree_space(*defer_tree);
1563 
1564         vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1565 
1566         ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1567         ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1568 
1569         /*
1570          * If there's a metaslab_load() in progress, wait for it to complete
1571          * so that we have a consistent view of the in-core space map.
1572          */
1573         metaslab_load_wait(msp);
1574 
1575         /*
1576          * Move the frees from the defer_tree back to the free
1577          * range tree (if the metaslab is loaded). Swap the freed_tree and the
1578          * defer_tree -- this is safe to do because we've just emptied out
1579          * the defer_tree.
1580          */
1581         range_tree_vacate(*defer_tree,
1582             msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
1583         range_tree_swap(freed_tree, defer_tree);
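             /*
              * Illustrative sketch (assuming TXG_DEFER_SIZE == 2, its value
              * in txg.h): space freed during txg N has landed in freed_tree
              * by the time this runs for txg N and is swapped into
              * ms_defertree[N % 2] here.  It sits out the sync of txg N + 1
              * and is only returned to ms_tree by the range_tree_vacate()
              * above when txg N + 2 reaches metaslab_sync_done(), so freed
              * space is not reused until a couple of txgs have committed.
              */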
1584 
1585         space_map_update(msp->ms_sm);
1586 
1587         msp->ms_deferspace += defer_delta;
1588         ASSERT3S(msp->ms_deferspace, >=, 0);
1589         ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
1590         if (msp->ms_deferspace != 0) {
1591                 /*
1592                  * Keep syncing this metaslab until all deferred frees
1593                  * are back in circulation.
1594                  */
1595                 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1596         }
1597 
1598         if (msp->ms_loaded && msp->ms_access_txg < txg) {
1599                 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1600                         VERIFY0(range_tree_space(
1601                             msp->ms_alloctree[(txg + t) & TXG_MASK]));
1602                 }
1603 
1604                 if (!metaslab_debug_unload)
1605                         metaslab_unload(msp);
1606         }
1607 
1608         metaslab_group_sort(mg, msp, metaslab_weight(msp));
1609         mutex_exit(&msp->ms_lock);
1610 
1611 }
1612 
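     /*
      * Re-evaluate this group's eligibility for new allocations after a
      * txg has synced: refresh mg_allocatable based on the group's current
      * free space, clear the accumulated allocation-failure count, and
      * kick off preloading of the metaslabs most likely to be used next.
      */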
1613 void
1614 metaslab_sync_reassess(metaslab_group_t *mg)
1615 {
1616         int64_t failures = mg->mg_alloc_failures;
1617 
1618         metaslab_group_alloc_update(mg);
1619         atomic_add_64(&mg->mg_alloc_failures, -failures);
1620 
1621         /*
1622          * Preload the next potential metaslabs.
1623          */
1624         metaslab_group_preload(mg);
1625 }
1626 
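     /*
      * Return the distance, in bytes, between this metaslab and the
      * metaslab that holds the given DVA on the same top-level vdev; DVAs
      * on other vdevs are treated as maximally distant (1ULL << 63).
      * A worked example with hypothetical values: if vdev_ms_shift is 30
      * (1 GB metaslabs), a DVA at offset 5 GB falls in metaslab 5, so its
      * distance from metaslab 2 is (5 - 2) << 30, i.e. 3 GB.
      */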
1627 static uint64_t
1628 metaslab_distance(metaslab_t *msp, dva_t *dva)
1629 {
1630         uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1631         uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1632         uint64_t start = msp->ms_id;
1633 
1634         if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1635                 return (1ULL << 63);
1636 
1637         if (offset < start)
1638                 return ((start - offset) << ms_shift);
1639         if (offset > start)
1640                 return ((offset - start) << ms_shift);
1641         return (0);
1642 }
1643 
1644 static uint64_t
1645 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1646     uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
1647 {
1648         spa_t *spa = mg->mg_vd->vdev_spa;
1649         metaslab_t *msp = NULL;
1650         uint64_t offset = -1ULL;
1651         avl_tree_t *t = &mg->mg_metaslab_tree;
1652         uint64_t activation_weight;
1653         uint64_t target_distance;
1654         int i;
1655 
1656         activation_weight = METASLAB_WEIGHT_PRIMARY;
1657         for (i = 0; i < d; i++) {
1658                 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1659                         activation_weight = METASLAB_WEIGHT_SECONDARY;
1660                         break;
1661                 }
1662         }
1663 
1664         for (;;) {
1665                 boolean_t was_active;
1666 
1667                 mutex_enter(&mg->mg_lock);
1668                 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1669                         if (msp->ms_weight < asize) {
1670                                 spa_dbgmsg(spa, "%s: failed to meet weight "
1671                                     "requirement: vdev %llu, txg %llu, mg %p, "
1672                                     "msp %p, psize %llu, asize %llu, "
1673                                     "failures %llu, weight %llu",
1674                                     spa_name(spa), mg->mg_vd->vdev_id, txg,
1675                                     mg, msp, psize, asize,
1676                                     mg->mg_alloc_failures, msp->ms_weight);
1677                                 mutex_exit(&mg->mg_lock);
1678                                 return (-1ULL);
1679                         }
1680 
1681                         /*
1682                          * If the selected metaslab is condensing, skip it.
1683                          */
1684                         if (msp->ms_condensing)
1685                                 continue;
1686 
1687                         was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1688                         if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1689                                 break;
1690 
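                             /*
                              * A secondary allocation must land at least
                              * min_distance bytes from every DVA already
                              * chosen for this block; a metaslab with
                              * nothing allocated yet must be half again as
                              * far.  Only when every prior DVA clears
                              * target_distance do we settle on this
                              * metaslab.
                              */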
1691                         target_distance = min_distance +
1692                             (space_map_allocated(msp->ms_sm) != 0 ? 0 :
1693                             min_distance >> 1);
1694 
1695                         for (i = 0; i < d; i++)
1696                                 if (metaslab_distance(msp, &dva[i]) <
1697                                     target_distance)
1698                                         break;
1699                         if (i == d)
1700                                 break;
1701                 }
1702                 mutex_exit(&mg->mg_lock);
1703                 if (msp == NULL)
1704                         return (-1ULL);
1705 
1706                 mutex_enter(&msp->ms_lock);
1707 
1708                 /*
1709                  * If we've already reached the allowable number of failed
1710                  * allocation attempts on this metaslab group then we
1711                  * consider skipping it. We skip it only if we're allowed
1712                  * to "fast" gang, the physical size is larger than
1713                  * a gang block, and we're attempting to allocate from
1714                  * the primary metaslab.
1715                  */
1716                 if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1717                     CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1718                     activation_weight == METASLAB_WEIGHT_PRIMARY) {
1719                         spa_dbgmsg(spa, "%s: skipping metaslab group: "
1720                             "vdev %llu, txg %llu, mg %p, msp[%llu] %p, "
1721                             "psize %llu, asize %llu, failures %llu",
1722                             spa_name(spa), mg->mg_vd->vdev_id, txg, mg,
1723                             msp->ms_id, msp, psize, asize,
1724                             mg->mg_alloc_failures);
1725                         mutex_exit(&msp->ms_lock);
1726                         return (-1ULL);
1727                 }
1728 
1729                 /*
1730                  * Ensure that the metaslab we have selected is still
1731                  * capable of handling our request. It's possible that
1732                  * another thread may have changed the weight while we
1733                  * were blocked on the metaslab lock.
1734                  */
1735                 if (msp->ms_weight < asize || (was_active &&
1736                     !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1737                     activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1738                         mutex_exit(&msp->ms_lock);
1739                         continue;
1740                 }
1741 
1742                 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1743                     activation_weight == METASLAB_WEIGHT_PRIMARY) {
1744                         metaslab_passivate(msp,
1745                             msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1746                         mutex_exit(&msp->ms_lock);
1747                         continue;
1748                 }
1749 
1750                 if (metaslab_activate(msp, activation_weight) != 0) {
1751                         mutex_exit(&msp->ms_lock);
1752                         continue;
1753                 }
1754 
1755                 /*
1756                  * If this metaslab is currently condensing then pick again as
1757                  * we can't manipulate this metaslab until it's committed
1758                  * to disk.
1759                  */
1760                 if (msp->ms_condensing) {
1761                         mutex_exit(&msp->ms_lock);
1762                         continue;
1763                 }
1764 
1765                 if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
1766                         break;
1767 
1768                 atomic_inc_64(&mg->mg_alloc_failures);
1769 
1770                 metaslab_passivate(msp, metaslab_block_maxsize(msp));
1771                 mutex_exit(&msp->ms_lock);
1772         }
1773 
1774         if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
1775                 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1776 
1777         range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
1778         msp->ms_access_txg = txg + metaslab_unload_delay;
1779 
1780         mutex_exit(&msp->ms_lock);
1781 
1782         return (offset);
1783 }
1784 
1785 /*
1786  * Allocate a block for the specified i/o.
1787  */
1788 static int
1789 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1790     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1791 {
1792         metaslab_group_t *mg, *rotor;
1793         vdev_t *vd;
1794         int dshift = 3;
1795         int all_zero;
1796         int zio_lock = B_FALSE;
1797         boolean_t allocatable;
1798         uint64_t offset = -1ULL;
1799         uint64_t asize;
1800         uint64_t distance;
1801 
1802         ASSERT(!DVA_IS_VALID(&dva[d]));
1803 
1804         /*
1805          * For testing, gang roughly one in four blocks above a certain size.
1806          */
1807         if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1808                 return (SET_ERROR(ENOSPC));
1809 
1810         /*
1811          * Start at the rotor and loop through all mgs until we find something.
1812          * Note that there's no locking on mc_rotor or mc_aliquot because
1813          * nothing actually breaks if we miss a few updates -- we just won't
1814          * allocate quite as evenly.  It all balances out over time.
1815          *
1816          * If we are doing ditto or log blocks, try to spread them across
1817          * consecutive vdevs.  If we're forced to reuse a vdev before we've
1818          * allocated all of our ditto blocks, then try to spread them out on
1819          * that vdev as much as possible.  If it turns out to not be possible,
1820          * gradually lower our standards until anything becomes acceptable.
1821          * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1822          * gives us hope of containing our fault domains to something we're
1823          * able to reason about.  Otherwise, any two top-level vdev failures
1824          * will guarantee the loss of data.  With consecutive allocation,
1825          * only two adjacent top-level vdev failures will result in data loss.
1826          *
1827          * If we are doing gang blocks (hintdva is non-NULL), try to keep
1828          * ourselves on the same vdev as our gang block header.  That
1829          * way, we can hope for locality in vdev_cache, plus it makes our
1830          * fault domains something tractable.
1831          */
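             /*
              * Starting point for the search below: a gang-block hint keeps
              * us on the hinted vdev (or its neighbor if
              * METASLAB_HINTBP_AVOID is set), a ditto copy (d != 0) starts
              * on the group after the previous DVA's vdev, and everything
              * else starts at the class rotor.
              */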
1832         if (hintdva) {
1833                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1834 
1835                 /*
1836                  * It's possible the vdev we're using as the hint no
1837                  * longer exists (i.e. removed). Consult the rotor when
1838                  * all else fails.
1839                  */
1840                 if (vd != NULL) {
1841                         mg = vd->vdev_mg;
1842 
1843                         if (flags & METASLAB_HINTBP_AVOID &&
1844                             mg->mg_next != NULL)
1845                                 mg = mg->mg_next;
1846                 } else {
1847                         mg = mc->mc_rotor;
1848                 }
1849         } else if (d != 0) {
1850                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1851                 mg = vd->vdev_mg->mg_next;
1852         } else {
1853                 mg = mc->mc_rotor;
1854         }
1855 
1856         /*
1857          * If the hint put us into the wrong metaslab class, or into a
1858          * metaslab group that has been passivated, just follow the rotor.
1859          */
1860         if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1861                 mg = mc->mc_rotor;
1862 
1863         rotor = mg;
1864 top:
1865         all_zero = B_TRUE;
1866         do {
1867                 ASSERT(mg->mg_activation_count == 1);
1868 
1869                 vd = mg->mg_vd;
1870 
1871                 /*
1872                  * Don't allocate from faulted devices.
1873                  */
1874                 if (zio_lock) {
1875                         spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1876                         allocatable = vdev_allocatable(vd);
1877                         spa_config_exit(spa, SCL_ZIO, FTAG);
1878                 } else {
1879                         allocatable = vdev_allocatable(vd);
1880                 }
1881 
1882                 /*
1883                  * Determine if the selected metaslab group is eligible
1884                  * for allocations. If we're ganging or have requested
1885                  * an allocation for the smallest gang block size
1886                  * then we don't want to avoid allocating to this
1887                  * metaslab group. If we're in this condition we should
1888                  * try to allocate from any device possible so that we
1889                  * don't inadvertently return ENOSPC and suspend the pool
1890                  * even though space is still available.
1891                  */
1892                 if (allocatable && CAN_FASTGANG(flags) &&
1893                     psize > SPA_GANGBLOCKSIZE)
1894                         allocatable = metaslab_group_allocatable(mg);
1895 
1896                 if (!allocatable)
1897                         goto next;
1898 
1899                 /*
1900                  * Avoid writing single-copy data to a failing vdev
1901                  * unless the user instructs us that it is okay.
1902                  */
1903                 if ((vd->vdev_stat.vs_write_errors > 0 ||
1904                     vd->vdev_state < VDEV_STATE_HEALTHY) &&
1905                     d == 0 && dshift == 3 &&
1906                     !(zfs_write_to_degraded && vd->vdev_state ==
1907                     VDEV_STATE_DEGRADED)) {
1908                         all_zero = B_FALSE;
1909                         goto next;
1910                 }
1911 
1912                 ASSERT(mg->mg_class == mc);
1913 
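                     /*
                      * The minimum ditto distance starts at 1/8 of the vdev
                      * (dshift == 3) and is halved after each unsuccessful
                      * pass around the rotor; once it shrinks to a single
                      * metaslab or less it is treated as zero.
                      */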
1914                 distance = vd->vdev_asize >> dshift;
1915                 if (distance <= (1ULL << vd->vdev_ms_shift))
1916                         distance = 0;
1917                 else
1918                         all_zero = B_FALSE;
1919 
1920                 asize = vdev_psize_to_asize(vd, psize);
1921                 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1922 
1923                 offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1924                     dva, d, flags);
1925                 if (offset != -1ULL) {
1926                         /*
1927                          * If we've just selected this metaslab group,
1928                          * figure out whether the corresponding vdev is
1929                          * over- or under-used relative to the pool,
1930                          * and set an allocation bias to even it out.
1931                          */
1932                         if (mc->mc_aliquot == 0) {
1933                                 vdev_stat_t *vs = &vd->vdev_stat;
1934                                 int64_t vu, cu;
1935 
1936                                 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1937                                 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1938 
1939                                 /*
1940                                  * Calculate how much more or less we should
1941                                  * try to allocate from this device during
1942                                  * this iteration around the rotor.
1943                                  * For example, if a device is 80% full
1944                                  * and the pool is 20% full then we should
1945                                  * reduce allocations by 60% on this device.
1946                                  *
1947                                  * mg_bias = (20 - 80) * 512K / 100 = -307K
1948                                  *
1949                                  * This reduces allocations by 307K for this
1950                                  * iteration.
1951                                  */
1952                                 mg->mg_bias = ((cu - vu) *
1953                                     (int64_t)mg->mg_aliquot) / 100;
1954                         }
1955 
1956                         if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1957                             mg->mg_aliquot + mg->mg_bias) {
1958                                 mc->mc_rotor = mg->mg_next;
1959                                 mc->mc_aliquot = 0;
1960                         }
1961 
1962                         DVA_SET_VDEV(&dva[d], vd->vdev_id);
1963                         DVA_SET_OFFSET(&dva[d], offset);
1964                         DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1965                         DVA_SET_ASIZE(&dva[d], asize);
1966 
1967                         return (0);
1968                 }
1969 next:
1970                 mc->mc_rotor = mg->mg_next;
1971                 mc->mc_aliquot = 0;
1972         } while ((mg = mg->mg_next) != rotor);
1973 
1974         if (!all_zero) {
1975                 dshift++;
1976                 ASSERT(dshift < 64);
1977                 goto top;
1978         }
1979 
1980         if (!allocatable && !zio_lock) {
1981                 dshift = 3;
1982                 zio_lock = B_TRUE;
1983                 goto top;
1984         }
1985 
1986         bzero(&dva[d], sizeof (dva_t));
1987 
1988         return (SET_ERROR(ENOSPC));
1989 }
1990 
1991 /*
1992  * Free the block represented by DVA in the context of the specified
1993  * transaction group.
1994  */
1995 static void
1996 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1997 {
1998         uint64_t vdev = DVA_GET_VDEV(dva);
1999         uint64_t offset = DVA_GET_OFFSET(dva);
2000         uint64_t size = DVA_GET_ASIZE(dva);
2001         vdev_t *vd;
2002         metaslab_t *msp;
2003 
2004         ASSERT(DVA_IS_VALID(dva));
2005 
2006         if (txg > spa_freeze_txg(spa))
2007                 return;
2008 
2009         if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2010             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
2011                 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
2012                     (u_longlong_t)vdev, (u_longlong_t)offset);
2013                 ASSERT(0);
2014                 return;
2015         }
2016 
2017         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2018 
2019         if (DVA_GET_GANG(dva))
2020                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2021 
2022         mutex_enter(&msp->ms_lock);
2023 
2024         if (now) {
2025                 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
2026                     offset, size);
2027 
2028                 VERIFY(!msp->ms_condensing);
2029                 VERIFY3U(offset, >=, msp->ms_start);
2030                 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
2031                 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
2032                     msp->ms_size);
2033                 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2034                 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2035                 range_tree_add(msp->ms_tree, offset, size);
2036         } else {
2037                 if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
2038                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
2039                 range_tree_add(msp->ms_freetree[txg & TXG_MASK],
2040                     offset, size);
2041         }
2042 
2043         mutex_exit(&msp->ms_lock);
2044 }
2045 
2046 /*
2047  * Intent log support: upon opening the pool after a crash, notify the SPA
2048  * of blocks that the intent log has allocated for immediate write, but
2049  * which are still considered free by the SPA because the last transaction
2050  * group didn't commit yet.
2051  */
2052 static int
2053 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
2054 {
2055         uint64_t vdev = DVA_GET_VDEV(dva);
2056         uint64_t offset = DVA_GET_OFFSET(dva);
2057         uint64_t size = DVA_GET_ASIZE(dva);
2058         vdev_t *vd;
2059         metaslab_t *msp;
2060         int error = 0;
2061 
2062         ASSERT(DVA_IS_VALID(dva));
2063 
2064         if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2065             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
2066                 return (SET_ERROR(ENXIO));
2067 
2068         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2069 
2070         if (DVA_GET_GANG(dva))
2071                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2072 
2073         mutex_enter(&msp->ms_lock);
2074 
2075         if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
2076                 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
2077 
2078         if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
2079                 error = SET_ERROR(ENOENT);
2080 
2081         if (error || txg == 0) {        /* txg == 0 indicates dry run */
2082                 mutex_exit(&msp->ms_lock);
2083                 return (error);
2084         }
2085 
2086         VERIFY(!msp->ms_condensing);
2087         VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2088         VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2089         VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
2090         range_tree_remove(msp->ms_tree, offset, size);
2091 
2092         if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
2093                 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2094                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
2095                 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2096         }
2097 
2098         mutex_exit(&msp->ms_lock);
2099 
2100         return (0);
2101 }
2102 
2103 int
2104 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2105     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
2106 {
2107         dva_t *dva = bp->blk_dva;
2108         dva_t *hintdva = hintbp->blk_dva;
2109         int error = 0;
2110 
2111         ASSERT(bp->blk_birth == 0);
2112         ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2113 
2114         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2115 
2116         if (mc->mc_rotor == NULL) {  /* no vdevs in this class */
2117                 spa_config_exit(spa, SCL_ALLOC, FTAG);
2118                 return (SET_ERROR(ENOSPC));
2119         }
2120 
2121         ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
2122         ASSERT(BP_GET_NDVAS(bp) == 0);
2123         ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2124 
2125         for (int d = 0; d < ndvas; d++) {
2126                 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2127                     txg, flags);
2128                 if (error != 0) {
2129                         for (d--; d >= 0; d--) {
2130                                 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2131                                 bzero(&dva[d], sizeof (dva_t));
2132                         }
2133                         spa_config_exit(spa, SCL_ALLOC, FTAG);
2134                         return (error);
2135                 }
2136         }
2137         ASSERT(error == 0);
2138         ASSERT(BP_GET_NDVAS(bp) == ndvas);
2139 
2140         spa_config_exit(spa, SCL_ALLOC, FTAG);
2141 
2142         BP_SET_BIRTH(bp, txg, txg);
2143 
2144         return (0);
2145 }
2146 
2147 void
2148 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
2149 {
2150         const dva_t *dva = bp->blk_dva;
2151         int ndvas = BP_GET_NDVAS(bp);
2152 
2153         ASSERT(!BP_IS_HOLE(bp));
2154         ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
2155 
2156         spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
2157 
2158         for (int d = 0; d < ndvas; d++)
2159                 metaslab_free_dva(spa, &dva[d], txg, now);
2160 
2161         spa_config_exit(spa, SCL_FREE, FTAG);
2162 }
2163 
2164 int
2165 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
2166 {
2167         const dva_t *dva = bp->blk_dva;
2168         int ndvas = BP_GET_NDVAS(bp);
2169         int error = 0;
2170 
2171         ASSERT(!BP_IS_HOLE(bp));
2172 
2173         if (txg != 0) {
2174                 /*
2175                  * First do a dry run to make sure all DVAs are claimable,
2176                  * so we don't have to unwind from partial failures below.
2177                  */
2178                 if ((error = metaslab_claim(spa, bp, 0)) != 0)
2179                         return (error);
2180         }
2181 
2182         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2183 
2184         for (int d = 0; d < ndvas; d++)
2185                 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
2186                         break;
2187 
2188         spa_config_exit(spa, SCL_ALLOC, FTAG);
2189 
2190         ASSERT(error == 0 || txg == 0);
2191 
2192         return (error);
2193 }
2194 
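     /*
      * Debug-only sanity check (enabled via ZFS_DEBUG_ZIO_FREE): verify
      * that none of the block's DVAs overlap space that is already free or
      * pending free, which would indicate a double free.
      */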
2195 void
2196 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
2197 {
2198         if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
2199                 return;
2200 
2201         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2202         for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
2203                 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
2204                 vdev_t *vd = vdev_lookup_top(spa, vdev);
2205                 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
2206                 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2207                 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2208 
2209                 if (msp->ms_loaded)
2210                         range_tree_verify(msp->ms_tree, offset, size);
2211 
2212                 for (int j = 0; j < TXG_SIZE; j++)
2213                         range_tree_verify(msp->ms_freetree[j], offset, size);
2214                 for (int j = 0; j < TXG_DEFER_SIZE; j++)
2215                         range_tree_verify(msp->ms_defertree[j], offset, size);
2216         }
2217         spa_config_exit(spa, SCL_VDEV, FTAG);
2218 }