illumos-gate Old usr/src/uts/common/fs/zfs/metaslab.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/zfs_context.h>
  27 #include <sys/dmu.h>
  28 #include <sys/dmu_tx.h>
  29 #include <sys/space_map.h>
  30 #include <sys/metaslab_impl.h>
  31 #include <sys/vdev_impl.h>
  32 #include <sys/zio.h>
  33 
/*
 * Allow allocations to switch to gang blocks quickly. We do this to
 * avoid having to load lots of space_maps in a given txg. There are,
 * however, some cases where we want to avoid "fast" ganging and instead
 * we want to do an exhaustive search of all metaslabs on this device.
 * Currently we don't allow any gang, zil, or dump device related allocations
 * to "fast" gang.
 *
 * CAN_FASTGANG(flags) evaluates to true only when none of
 * METASLAB_GANG_CHILD, METASLAB_GANG_HEADER or METASLAB_GANG_AVOID
 * is set in flags.
 */
#define CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))

/*
 * Base allocation aliquot (512K).  metaslab_group_activate() multiplies
 * this by the number of children of the group's vdev (at least 1) to
 * obtain the group's mg_aliquot.
 */
uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * This value defines the number of allowed allocation failures per vdev.
 * If a device reaches this threshold in a given txg then we consider skipping
 * allocations on that device.
 */
int zfs_mg_alloc_failures;

/*
 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
 * metaslab_init() consults this flag to load the space map eagerly.
 */
static int metaslab_debug = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy.  Once the space map cannot satisfy
 * an allocation of this size, it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Max number of space_maps to prefetch.
 */
int metaslab_prefetch_limit = SPA_DVAS_PER_BP;

/*
 * Percentage bonus multiplier for metaslabs that are in the bonus area.
 */
int metaslab_smo_bonus_pct = 150;
  92 
  93 /*
  94  * ==========================================================================
  95  * Metaslab classes
  96  * ==========================================================================
  97  */
  98 metaslab_class_t *
  99 metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
 100 {
 101         metaslab_class_t *mc;
 102 
 103         mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
 104 
 105         mc->mc_spa = spa;
 106         mc->mc_rotor = NULL;
 107         mc->mc_ops = ops;
 108 
 109         return (mc);
 110 }
 111 
 112 void
 113 metaslab_class_destroy(metaslab_class_t *mc)
 114 {
 115         ASSERT(mc->mc_rotor == NULL);
 116         ASSERT(mc->mc_alloc == 0);
 117         ASSERT(mc->mc_deferred == 0);
 118         ASSERT(mc->mc_space == 0);
 119         ASSERT(mc->mc_dspace == 0);
 120 
 121         kmem_free(mc, sizeof (metaslab_class_t));
 122 }
 123 
 124 int
 125 metaslab_class_validate(metaslab_class_t *mc)
 126 {
 127         metaslab_group_t *mg;
 128         vdev_t *vd;
 129 
 130         /*
 131          * Must hold one of the spa_config locks.
 132          */
 133         ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 134             spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 135 
 136         if ((mg = mc->mc_rotor) == NULL)
 137                 return (0);
 138 
 139         do {
 140                 vd = mg->mg_vd;
 141                 ASSERT(vd->vdev_mg != NULL);
 142                 ASSERT3P(vd->vdev_top, ==, vd);
 143                 ASSERT3P(mg->mg_class, ==, mc);
 144                 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 145         } while ((mg = mg->mg_next) != mc->mc_rotor);
 146 
 147         return (0);
 148 }
 149 
 150 void
 151 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
 152     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 153 {
 154         atomic_add_64(&mc->mc_alloc, alloc_delta);
 155         atomic_add_64(&mc->mc_deferred, defer_delta);
 156         atomic_add_64(&mc->mc_space, space_delta);
 157         atomic_add_64(&mc->mc_dspace, dspace_delta);
 158 }
 159 
 160 uint64_t
 161 metaslab_class_get_alloc(metaslab_class_t *mc)
 162 {
 163         return (mc->mc_alloc);
 164 }
 165 
 166 uint64_t
 167 metaslab_class_get_deferred(metaslab_class_t *mc)
 168 {
 169         return (mc->mc_deferred);
 170 }
 171 
 172 uint64_t
 173 metaslab_class_get_space(metaslab_class_t *mc)
 174 {
 175         return (mc->mc_space);
 176 }
 177 
 178 uint64_t
 179 metaslab_class_get_dspace(metaslab_class_t *mc)
 180 {
 181         return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 182 }
 183 
 184 /*
 185  * ==========================================================================
 186  * Metaslab groups
 187  * ==========================================================================
 188  */
 189 static int
 190 metaslab_compare(const void *x1, const void *x2)
 191 {
 192         const metaslab_t *m1 = x1;
 193         const metaslab_t *m2 = x2;
 194 
 195         if (m1->ms_weight < m2->ms_weight)
 196                 return (1);
 197         if (m1->ms_weight > m2->ms_weight)
 198                 return (-1);
 199 
 200         /*
 201          * If the weights are identical, use the offset to force uniqueness.
 202          */
 203         if (m1->ms_map.sm_start < m2->ms_map.sm_start)
 204                 return (-1);
 205         if (m1->ms_map.sm_start > m2->ms_map.sm_start)
 206                 return (1);
 207 
 208         ASSERT3P(m1, ==, m2);
 209 
 210         return (0);
 211 }
 212 
 213 metaslab_group_t *
 214 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 215 {
 216         metaslab_group_t *mg;
 217 
 218         mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 219         mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 220         avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 221             sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
 222         mg->mg_vd = vd;
 223         mg->mg_class = mc;
 224         mg->mg_activation_count = 0;
 225 
 226         return (mg);
 227 }
 228 
 229 void
 230 metaslab_group_destroy(metaslab_group_t *mg)
 231 {
 232         ASSERT(mg->mg_prev == NULL);
 233         ASSERT(mg->mg_next == NULL);
 234         /*
 235          * We may have gone below zero with the activation count
 236          * either because we never activated in the first place or
 237          * because we're done, and possibly removing the vdev.
 238          */
 239         ASSERT(mg->mg_activation_count <= 0);
 240 
 241         avl_destroy(&mg->mg_metaslab_tree);
 242         mutex_destroy(&mg->mg_lock);
 243         kmem_free(mg, sizeof (metaslab_group_t));
 244 }
 245 
 246 void
 247 metaslab_group_activate(metaslab_group_t *mg)
 248 {
 249         metaslab_class_t *mc = mg->mg_class;
 250         metaslab_group_t *mgprev, *mgnext;
 251 
 252         ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
 253 
 254         ASSERT(mc->mc_rotor != mg);
 255         ASSERT(mg->mg_prev == NULL);
 256         ASSERT(mg->mg_next == NULL);
 257         ASSERT(mg->mg_activation_count <= 0);
 258 
 259         if (++mg->mg_activation_count <= 0)
 260                 return;
 261 
 262         mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
 263 
 264         if ((mgprev = mc->mc_rotor) == NULL) {
 265                 mg->mg_prev = mg;
 266                 mg->mg_next = mg;
 267         } else {
 268                 mgnext = mgprev->mg_next;
 269                 mg->mg_prev = mgprev;
 270                 mg->mg_next = mgnext;
 271                 mgprev->mg_next = mg;
 272                 mgnext->mg_prev = mg;
 273         }
 274         mc->mc_rotor = mg;
 275 }
 276 
 277 void
 278 metaslab_group_passivate(metaslab_group_t *mg)
 279 {
 280         metaslab_class_t *mc = mg->mg_class;
 281         metaslab_group_t *mgprev, *mgnext;
 282 
 283         ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
 284 
 285         if (--mg->mg_activation_count != 0) {
 286                 ASSERT(mc->mc_rotor != mg);
 287                 ASSERT(mg->mg_prev == NULL);
 288                 ASSERT(mg->mg_next == NULL);
 289                 ASSERT(mg->mg_activation_count < 0);
 290                 return;
 291         }
 292 
 293         mgprev = mg->mg_prev;
 294         mgnext = mg->mg_next;
 295 
 296         if (mg == mgnext) {
 297                 mc->mc_rotor = NULL;
 298         } else {
 299                 mc->mc_rotor = mgnext;
 300                 mgprev->mg_next = mgnext;
 301                 mgnext->mg_prev = mgprev;
 302         }
 303 
 304         mg->mg_prev = NULL;
 305         mg->mg_next = NULL;
 306 }
 307 
 308 static void
 309 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 310 {
 311         mutex_enter(&mg->mg_lock);
 312         ASSERT(msp->ms_group == NULL);
 313         msp->ms_group = mg;
 314         msp->ms_weight = 0;
 315         avl_add(&mg->mg_metaslab_tree, msp);
 316         mutex_exit(&mg->mg_lock);
 317 }
 318 
 319 static void
 320 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 321 {
 322         mutex_enter(&mg->mg_lock);
 323         ASSERT(msp->ms_group == mg);
 324         avl_remove(&mg->mg_metaslab_tree, msp);
 325         msp->ms_group = NULL;
 326         mutex_exit(&mg->mg_lock);
 327 }
 328 
 329 static void
 330 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 331 {
 332         /*
 333          * Although in principle the weight can be any value, in
 334          * practice we do not use values in the range [1, 510].
 335          */
 336         ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
 337         ASSERT(MUTEX_HELD(&msp->ms_lock));
 338 
 339         mutex_enter(&mg->mg_lock);
 340         ASSERT(msp->ms_group == mg);
 341         avl_remove(&mg->mg_metaslab_tree, msp);
 342         msp->ms_weight = weight;
 343         avl_add(&mg->mg_metaslab_tree, msp);
 344         mutex_exit(&mg->mg_lock);
 345 }
 346 
 347 /*
 348  * ==========================================================================
 349  * Common allocator routines
 350  * ==========================================================================
 351  */
 352 static int
 353 metaslab_segsize_compare(const void *x1, const void *x2)
 354 {
 355         const space_seg_t *s1 = x1;
 356         const space_seg_t *s2 = x2;
 357         uint64_t ss_size1 = s1->ss_end - s1->ss_start;
 358         uint64_t ss_size2 = s2->ss_end - s2->ss_start;
 359 
 360         if (ss_size1 < ss_size2)
 361                 return (-1);
 362         if (ss_size1 > ss_size2)
 363                 return (1);
 364 
 365         if (s1->ss_start < s2->ss_start)
 366                 return (-1);
 367         if (s1->ss_start > s2->ss_start)
 368                 return (1);
 369 
 370         return (0);
 371 }
 372 
 373 /*
 374  * This is a helper function that can be used by the allocator to find
 375  * a suitable block to allocate. This will search the specified AVL
 376  * tree looking for a block that matches the specified criteria.
 377  */
 378 static uint64_t
 379 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
 380     uint64_t align)
 381 {
 382         space_seg_t *ss, ssearch;
 383         avl_index_t where;
 384 
 385         ssearch.ss_start = *cursor;
 386         ssearch.ss_end = *cursor + size;
 387 
 388         ss = avl_find(t, &ssearch, &where);
 389         if (ss == NULL)
 390                 ss = avl_nearest(t, where, AVL_AFTER);
 391 
 392         while (ss != NULL) {
 393                 uint64_t offset = P2ROUNDUP(ss->ss_start, align);
 394 
 395                 if (offset + size <= ss->ss_end) {
 396                         *cursor = offset + size;
 397                         return (offset);
 398                 }
 399                 ss = AVL_NEXT(t, ss);
 400         }
 401 
 402         /*
 403          * If we know we've searched the whole map (*cursor == 0), give up.
 404          * Otherwise, reset the cursor to the beginning and try again.
 405          */
 406         if (*cursor == 0)
 407                 return (-1ULL);
 408 
 409         *cursor = 0;
 410         return (metaslab_block_picker(t, cursor, size, align));
 411 }
 412 
 413 static void
 414 metaslab_pp_load(space_map_t *sm)
 415 {
 416         space_seg_t *ss;
 417 
 418         ASSERT(sm->sm_ppd == NULL);
 419         sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
 420 
 421         sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
 422         avl_create(sm->sm_pp_root, metaslab_segsize_compare,
 423             sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
 424 
 425         for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
 426                 avl_add(sm->sm_pp_root, ss);
 427 }
 428 
 429 static void
 430 metaslab_pp_unload(space_map_t *sm)
 431 {
 432         void *cookie = NULL;
 433 
 434         kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
 435         sm->sm_ppd = NULL;
 436 
 437         while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
 438                 /* tear down the tree */
 439         }
 440 
 441         avl_destroy(sm->sm_pp_root);
 442         kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
 443         sm->sm_pp_root = NULL;
 444 }
 445 
 446 /* ARGSUSED */
 447 static void
 448 metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
 449 {
 450         /* No need to update cursor */
 451 }
 452 
 453 /* ARGSUSED */
 454 static void
 455 metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
 456 {
 457         /* No need to update cursor */
 458 }
 459 
 460 /*
 461  * Return the maximum contiguous segment within the metaslab.
 462  */
 463 uint64_t
 464 metaslab_pp_maxsize(space_map_t *sm)
 465 {
 466         avl_tree_t *t = sm->sm_pp_root;
 467         space_seg_t *ss;
 468 
 469         if (t == NULL || (ss = avl_last(t)) == NULL)
 470                 return (0ULL);
 471 
 472         return (ss->ss_end - ss->ss_start);
 473 }
 474 
 475 /*
 476  * ==========================================================================
 477  * The first-fit block allocator
 478  * ==========================================================================
 479  */
 480 static uint64_t
 481 metaslab_ff_alloc(space_map_t *sm, uint64_t size)
 482 {
 483         avl_tree_t *t = &sm->sm_root;
 484         uint64_t align = size & -size;
 485         uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
 486 
 487         return (metaslab_block_picker(t, cursor, size, align));
 488 }
 489 
 490 /* ARGSUSED */
 491 boolean_t
 492 metaslab_ff_fragmented(space_map_t *sm)
 493 {
 494         return (B_TRUE);
 495 }
 496 
/*
 * Operations vector for the first-fit allocator.  The initializer is
 * positional, so the entries must stay in the member order of
 * space_map_ops_t (declared outside this file).  Only the alloc and
 * fragmented entries are specific to first-fit; the rest are the shared
 * metaslab_pp_* routines.
 */
static space_map_ops_t metaslab_ff_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ff_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ff_fragmented
};
 506 
 507 /*
 508  * ==========================================================================
 509  * Dynamic block allocator -
 510  * Uses the first fit allocation scheme until space get low and then
 511  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 512  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 513  * ==========================================================================
 514  */
 515 static uint64_t
 516 metaslab_df_alloc(space_map_t *sm, uint64_t size)
 517 {
 518         avl_tree_t *t = &sm->sm_root;
 519         uint64_t align = size & -size;
 520         uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
 521         uint64_t max_size = metaslab_pp_maxsize(sm);
 522         int free_pct = sm->sm_space * 100 / sm->sm_size;
 523 
 524         ASSERT(MUTEX_HELD(sm->sm_lock));
 525         ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
 526 
 527         if (max_size < size)
 528                 return (-1ULL);
 529 
 530         /*
 531          * If we're running low on space switch to using the size
 532          * sorted AVL tree (best-fit).
 533          */
 534         if (max_size < metaslab_df_alloc_threshold ||
 535             free_pct < metaslab_df_free_pct) {
 536                 t = sm->sm_pp_root;
 537                 *cursor = 0;
 538         }
 539 
 540         return (metaslab_block_picker(t, cursor, size, 1ULL));
 541 }
 542 
 543 static boolean_t
 544 metaslab_df_fragmented(space_map_t *sm)
 545 {
 546         uint64_t max_size = metaslab_pp_maxsize(sm);
 547         int free_pct = sm->sm_space * 100 / sm->sm_size;
 548 
 549         if (max_size >= metaslab_df_alloc_threshold &&
 550             free_pct >= metaslab_df_free_pct)
 551                 return (B_FALSE);
 552 
 553         return (B_TRUE);
 554 }
 555 
/*
 * Operations vector for the dynamic-fit allocator.  The initializer is
 * positional, so the entries must stay in the member order of
 * space_map_ops_t (declared outside this file).  Only the alloc and
 * fragmented entries are specific to dynamic-fit; the rest are the
 * shared metaslab_pp_* routines.
 */
static space_map_ops_t metaslab_df_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_df_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_df_fragmented
};
 565 
 566 /*
 567  * ==========================================================================
 568  * Other experimental allocators
 569  * ==========================================================================
 570  */
 571 static uint64_t
 572 metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
 573 {
 574         avl_tree_t *t = &sm->sm_root;
 575         uint64_t *cursor = (uint64_t *)sm->sm_ppd;
 576         uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
 577         uint64_t max_size = metaslab_pp_maxsize(sm);
 578         uint64_t rsize = size;
 579         uint64_t offset = 0;
 580 
 581         ASSERT(MUTEX_HELD(sm->sm_lock));
 582         ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
 583 
 584         if (max_size < size)
 585                 return (-1ULL);
 586 
 587         ASSERT3U(*extent_end, >=, *cursor);
 588 
 589         /*
 590          * If we're running low on space switch to using the size
 591          * sorted AVL tree (best-fit).
 592          */
 593         if ((*cursor + size) > *extent_end) {
 594 
 595                 t = sm->sm_pp_root;
 596                 *cursor = *extent_end = 0;
 597 
 598                 if (max_size > 2 * SPA_MAXBLOCKSIZE)
 599                         rsize = MIN(metaslab_min_alloc_size, max_size);
 600                 offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
 601                 if (offset != -1)
 602                         *cursor = offset + size;
 603         } else {
 604                 offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
 605         }
 606         ASSERT3U(*cursor, <=, *extent_end);
 607         return (offset);
 608 }
 609 
 610 static boolean_t
 611 metaslab_cdf_fragmented(space_map_t *sm)
 612 {
 613         uint64_t max_size = metaslab_pp_maxsize(sm);
 614 
 615         if (max_size > (metaslab_min_alloc_size * 10))
 616                 return (B_FALSE);
 617         return (B_TRUE);
 618 }
 619 
/*
 * Operations vector for the experimental cdf allocator.  The initializer
 * is positional, so the entries must stay in the member order of
 * space_map_ops_t (declared outside this file).  Only the alloc and
 * fragmented entries are specific to cdf; the rest are the shared
 * metaslab_pp_* routines.
 */
static space_map_ops_t metaslab_cdf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_cdf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_cdf_fragmented
};
 629 
 630 uint64_t metaslab_ndf_clump_shift = 4;
 631 
 632 static uint64_t
 633 metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
 634 {
 635         avl_tree_t *t = &sm->sm_root;
 636         avl_index_t where;
 637         space_seg_t *ss, ssearch;
 638         uint64_t hbit = highbit(size);
 639         uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
 640         uint64_t max_size = metaslab_pp_maxsize(sm);
 641 
 642         ASSERT(MUTEX_HELD(sm->sm_lock));
 643         ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
 644 
 645         if (max_size < size)
 646                 return (-1ULL);
 647 
 648         ssearch.ss_start = *cursor;
 649         ssearch.ss_end = *cursor + size;
 650 
 651         ss = avl_find(t, &ssearch, &where);
 652         if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
 653                 t = sm->sm_pp_root;
 654 
 655                 ssearch.ss_start = 0;
 656                 ssearch.ss_end = MIN(max_size,
 657                     1ULL << (hbit + metaslab_ndf_clump_shift));
 658                 ss = avl_find(t, &ssearch, &where);
 659                 if (ss == NULL)
 660                         ss = avl_nearest(t, where, AVL_AFTER);
 661                 ASSERT(ss != NULL);
 662         }
 663 
 664         if (ss != NULL) {
 665                 if (ss->ss_start + size <= ss->ss_end) {
 666                         *cursor = ss->ss_start + size;
 667                         return (ss->ss_start);
 668                 }
 669         }
 670         return (-1ULL);
 671 }
 672 
 673 static boolean_t
 674 metaslab_ndf_fragmented(space_map_t *sm)
 675 {
 676         uint64_t max_size = metaslab_pp_maxsize(sm);
 677 
 678         if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
 679                 return (B_FALSE);
 680         return (B_TRUE);
 681 }
 682 
 683 
/*
 * Operations vector for the experimental ndf allocator.  The initializer
 * is positional, so the entries must stay in the member order of
 * space_map_ops_t (declared outside this file).  Only the alloc and
 * fragmented entries are specific to ndf; the rest are the shared
 * metaslab_pp_* routines.
 */
static space_map_ops_t metaslab_ndf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ndf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ndf_fragmented
};

/*
 * The allocator ops vector exported by this file; it points at the
 * dynamic-fit (df) allocator.
 * NOTE(review): presumably passed to metaslab_class_create() by callers
 * outside this file -- confirm.
 */
space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 695 
 696 /*
 697  * ==========================================================================
 698  * Metaslabs
 699  * ==========================================================================
 700  */
 701 metaslab_t *
 702 metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
 703         uint64_t start, uint64_t size, uint64_t txg)
 704 {
 705         vdev_t *vd = mg->mg_vd;
 706         metaslab_t *msp;
 707 
 708         msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
 709         mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
 710 
 711         msp->ms_smo_syncing = *smo;
 712 
 713         /*
 714          * We create the main space map here, but we don't create the
 715          * allocmaps and freemaps until metaslab_sync_done().  This serves
 716          * two purposes: it allows metaslab_sync_done() to detect the
 717          * addition of new space; and for debugging, it ensures that we'd
 718          * data fault on any attempt to use this metaslab before it's ready.
 719          */
 720         space_map_create(&msp->ms_map, start, size,
 721             vd->vdev_ashift, &msp->ms_lock);
 722 
 723         metaslab_group_add(mg, msp);
 724 
 725         if (metaslab_debug && smo->smo_object != 0) {
 726                 mutex_enter(&msp->ms_lock);
 727                 VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
 728                     SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
 729                 mutex_exit(&msp->ms_lock);
 730         }
 731 
 732         /*
 733          * If we're opening an existing pool (txg == 0) or creating
 734          * a new one (txg == TXG_INITIAL), all space is available now.
 735          * If we're adding space to an existing pool, the new space
 736          * does not become available until after this txg has synced.
 737          */
 738         if (txg <= TXG_INITIAL)
 739                 metaslab_sync_done(msp, 0);
 740 
 741         if (txg != 0) {
 742                 vdev_dirty(vd, 0, NULL, txg);
 743                 vdev_dirty(vd, VDD_METASLAB, msp, txg);
 744         }
 745 
 746         return (msp);
 747 }
 748 
 749 void
 750 metaslab_fini(metaslab_t *msp)
 751 {
 752         metaslab_group_t *mg = msp->ms_group;
 753 
 754         vdev_space_update(mg->mg_vd,
 755             -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
 756 
 757         metaslab_group_remove(mg, msp);
 758 
 759         mutex_enter(&msp->ms_lock);
 760 
 761         space_map_unload(&msp->ms_map);
 762         space_map_destroy(&msp->ms_map);
 763 
 764         for (int t = 0; t < TXG_SIZE; t++) {
 765                 space_map_destroy(&msp->ms_allocmap[t]);
 766                 space_map_destroy(&msp->ms_freemap[t]);
 767         }
 768 
 769         for (int t = 0; t < TXG_DEFER_SIZE; t++)
 770                 space_map_destroy(&msp->ms_defermap[t]);
 771 
 772         ASSERT3S(msp->ms_deferspace, ==, 0);
 773 
 774         mutex_exit(&msp->ms_lock);
 775         mutex_destroy(&msp->ms_lock);
 776 
 777         kmem_free(msp, sizeof (metaslab_t));
 778 }
 779 
 780 #define METASLAB_WEIGHT_PRIMARY         (1ULL << 63)
 781 #define METASLAB_WEIGHT_SECONDARY       (1ULL << 62)
 782 #define METASLAB_ACTIVE_MASK            \
 783         (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
 784 
 785 static uint64_t
 786 metaslab_weight(metaslab_t *msp)
 787 {
 788         metaslab_group_t *mg = msp->ms_group;
 789         space_map_t *sm = &msp->ms_map;
 790         space_map_obj_t *smo = &msp->ms_smo;
 791         vdev_t *vd = mg->mg_vd;
 792         uint64_t weight, space;
 793 
 794         ASSERT(MUTEX_HELD(&msp->ms_lock));
 795 
 796         /*
 797          * The baseline weight is the metaslab's free space.
 798          */
 799         space = sm->sm_size - smo->smo_alloc;
 800         weight = space;
 801 
 802         /*
 803          * Modern disks have uniform bit density and constant angular velocity.
 804          * Therefore, the outer recording zones are faster (higher bandwidth)
 805          * than the inner zones by the ratio of outer to inner track diameter,
 806          * which is typically around 2:1.  We account for this by assigning
 807          * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
 808          * In effect, this means that we'll select the metaslab with the most
 809          * free bandwidth rather than simply the one with the most free space.
 810          */
 811         weight = 2 * weight -
 812             ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
 813         ASSERT(weight >= space && weight <= 2 * space);
 814 
 815         /*
 816          * For locality, assign higher weight to metaslabs which have
 817          * a lower offset than what we've already activated.
 818          */
 819         if (sm->sm_start <= mg->mg_bonus_area)
 820                 weight *= (metaslab_smo_bonus_pct / 100);
 821         ASSERT(weight >= space &&
 822             weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
 823 
 824         if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
 825                 /*
 826                  * If this metaslab is one we're actively using, adjust its
 827                  * weight to make it preferable to any inactive metaslab so
 828                  * we'll polish it off.
 829                  */
 830                 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
 831         }
 832         return (weight);
 833 }
 834 
 835 static void
 836 metaslab_prefetch(metaslab_group_t *mg)
 837 {
 838         spa_t *spa = mg->mg_vd->vdev_spa;
 839         metaslab_t *msp;
 840         avl_tree_t *t = &mg->mg_metaslab_tree;
 841         int m;
 842 
 843         mutex_enter(&mg->mg_lock);
 844 
 845         /*
 846          * Prefetch the next potential metaslabs
 847          */
 848         for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
 849                 space_map_t *sm = &msp->ms_map;
 850                 space_map_obj_t *smo = &msp->ms_smo;
 851 
 852                 /* If we have reached our prefetch limit then we're done */
 853                 if (m >= metaslab_prefetch_limit)
 854                         break;
 855 
 856                 if (!sm->sm_loaded && smo->smo_object != 0) {
 857                         mutex_exit(&mg->mg_lock);
 858                         dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
 859                             0ULL, smo->smo_objsize);
 860                         mutex_enter(&mg->mg_lock);
 861                 }
 862         }
 863         mutex_exit(&mg->mg_lock);
 864 }
 865 
 866 static int
 867 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 868 {
 869         metaslab_group_t *mg = msp->ms_group;
 870         space_map_t *sm = &msp->ms_map;
 871         space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
 872 
 873         ASSERT(MUTEX_HELD(&msp->ms_lock));
 874 
 875         if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
 876                 space_map_load_wait(sm);
 877                 if (!sm->sm_loaded) {
 878                         int error = space_map_load(sm, sm_ops, SM_FREE,
 879                             &msp->ms_smo,
 880                             spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
 881                         if (error)  {
 882                                 metaslab_group_sort(msp->ms_group, msp, 0);
 883                                 return (error);
 884                         }
 885                         for (int t = 0; t < TXG_DEFER_SIZE; t++)
 886                                 space_map_walk(&msp->ms_defermap[t],
 887                                     space_map_claim, sm);
 888 
 889                 }
 890 
 891                 /*
 892                  * Track the bonus area as we activate new metaslabs.
 893                  */
 894                 if (sm->sm_start > mg->mg_bonus_area) {
 895                         mutex_enter(&mg->mg_lock);
 896                         mg->mg_bonus_area = sm->sm_start;
 897                         mutex_exit(&mg->mg_lock);
 898                 }
 899 
 900                 metaslab_group_sort(msp->ms_group, msp,
 901                     msp->ms_weight | activation_weight);
 902         }
 903         ASSERT(sm->sm_loaded);
 904         ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 905 
 906         return (0);
 907 }
 908 
 909 static void
 910 metaslab_passivate(metaslab_t *msp, uint64_t size)
 911 {
 912         /*
 913          * If size < SPA_MINBLOCKSIZE, then we will not allocate from
 914          * this metaslab again.  In that case, it had better be empty,
 915          * or we would be leaving space on the table.
 916          */
 917         ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
 918         metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
 919         ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
 920 }
 921 
 922 /*
 923  * Write a metaslab to disk in the context of the specified transaction group.
 924  */
 925 void
 926 metaslab_sync(metaslab_t *msp, uint64_t txg)
 927 {
 928         vdev_t *vd = msp->ms_group->mg_vd;
 929         spa_t *spa = vd->vdev_spa;
 930         objset_t *mos = spa_meta_objset(spa);
 931         space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
 932         space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
 933         space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
 934         space_map_t *sm = &msp->ms_map;
 935         space_map_obj_t *smo = &msp->ms_smo_syncing;
 936         dmu_buf_t *db;
 937         dmu_tx_t *tx;
 938 
 939         ASSERT(!vd->vdev_ishole);
 940 
 941         if (allocmap->sm_space == 0 && freemap->sm_space == 0)
 942                 return;
 943 
 944         /*
 945          * The only state that can actually be changing concurrently with
 946          * metaslab_sync() is the metaslab's ms_map.  No other thread can
 947          * be modifying this txg's allocmap, freemap, freed_map, or smo.
 948          * Therefore, we only hold ms_lock to satify space_map ASSERTs.
 949          * We drop it whenever we call into the DMU, because the DMU
 950          * can call down to us (e.g. via zio_free()) at any time.
 951          */
 952 
 953         tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 954 
 955         if (smo->smo_object == 0) {
 956                 ASSERT(smo->smo_objsize == 0);
 957                 ASSERT(smo->smo_alloc == 0);
 958                 smo->smo_object = dmu_object_alloc(mos,
 959                     DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
 960                     DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
 961                 ASSERT(smo->smo_object != 0);
 962                 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
 963                     (sm->sm_start >> vd->vdev_ms_shift),
 964                     sizeof (uint64_t), &smo->smo_object, tx);
 965         }
 966 
 967         mutex_enter(&msp->ms_lock);
 968 
 969         space_map_walk(freemap, space_map_add, freed_map);
 970 
 971         if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
 972             2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
 973                 /*
 974                  * The in-core space map representation is twice as compact
 975                  * as the on-disk one, so it's time to condense the latter
 976                  * by generating a pure allocmap from first principles.
 977                  *
 978                  * This metaslab is 100% allocated,
 979                  * minus the content of the in-core map (sm),
 980                  * minus what's been freed this txg (freed_map),
 981                  * minus deferred frees (ms_defermap[]),
 982                  * minus allocations from txgs in the future
 983                  * (because they haven't been committed yet).
 984                  */
 985                 space_map_vacate(allocmap, NULL, NULL);
 986                 space_map_vacate(freemap, NULL, NULL);
 987 
 988                 space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
 989 
 990                 space_map_walk(sm, space_map_remove, allocmap);
 991                 space_map_walk(freed_map, space_map_remove, allocmap);
 992 
 993                 for (int t = 0; t < TXG_DEFER_SIZE; t++)
 994                         space_map_walk(&msp->ms_defermap[t],
 995                             space_map_remove, allocmap);
 996 
 997                 for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
 998                         space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
 999                             space_map_remove, allocmap);
1000 
1001                 mutex_exit(&msp->ms_lock);
1002                 space_map_truncate(smo, mos, tx);
1003                 mutex_enter(&msp->ms_lock);
1004         }
1005 
1006         space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
1007         space_map_sync(freemap, SM_FREE, smo, mos, tx);
1008 
1009         mutex_exit(&msp->ms_lock);
1010 
1011         VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
1012         dmu_buf_will_dirty(db, tx);
1013         ASSERT3U(db->db_size, >=, sizeof (*smo));
1014         bcopy(smo, db->db_data, sizeof (*smo));
1015         dmu_buf_rele(db, FTAG);
1016 
1017         dmu_tx_commit(tx);
1018 }
1019 
1020 /*
1021  * Called after a transaction group has completely synced to mark
1022  * all of the metaslab's free space as usable.
1023  */
1024 void
1025 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1026 {
1027         space_map_obj_t *smo = &msp->ms_smo;
1028         space_map_obj_t *smosync = &msp->ms_smo_syncing;
1029         space_map_t *sm = &msp->ms_map;
1030         space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1031         space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
1032         metaslab_group_t *mg = msp->ms_group;
1033         vdev_t *vd = mg->mg_vd;
1034         int64_t alloc_delta, defer_delta;
1035 
1036         ASSERT(!vd->vdev_ishole);
1037 
1038         mutex_enter(&msp->ms_lock);
1039 
1040         /*
1041          * If this metaslab is just becoming available, initialize its
1042          * allocmaps and freemaps and add its capacity to the vdev.
1043          */
1044         if (freed_map->sm_size == 0) {
1045                 for (int t = 0; t < TXG_SIZE; t++) {
1046                         space_map_create(&msp->ms_allocmap[t], sm->sm_start,
1047                             sm->sm_size, sm->sm_shift, sm->sm_lock);
1048                         space_map_create(&msp->ms_freemap[t], sm->sm_start,
1049                             sm->sm_size, sm->sm_shift, sm->sm_lock);
1050                 }
1051 
1052                 for (int t = 0; t < TXG_DEFER_SIZE; t++)
1053                         space_map_create(&msp->ms_defermap[t], sm->sm_start,
1054                             sm->sm_size, sm->sm_shift, sm->sm_lock);
1055 
1056                 vdev_space_update(vd, 0, 0, sm->sm_size);
1057         }
1058 
1059         alloc_delta = smosync->smo_alloc - smo->smo_alloc;
1060         defer_delta = freed_map->sm_space - defer_map->sm_space;
1061 
1062         vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1063 
1064         ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
1065         ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
1066 
1067         /*
1068          * If there's a space_map_load() in progress, wait for it to complete
1069          * so that we have a consistent view of the in-core space map.
1070          * Then, add defer_map (oldest deferred frees) to this map and
1071          * transfer freed_map (this txg's frees) to defer_map.
1072          */
1073         space_map_load_wait(sm);
1074         space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
1075         space_map_vacate(freed_map, space_map_add, defer_map);
1076 
1077         *smo = *smosync;
1078 
1079         msp->ms_deferspace += defer_delta;
1080         ASSERT3S(msp->ms_deferspace, >=, 0);
1081         ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
1082         if (msp->ms_deferspace != 0) {
1083                 /*
1084                  * Keep syncing this metaslab until all deferred frees
1085                  * are back in circulation.
1086                  */
1087                 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1088         }
1089 
1090         /*
1091          * If the map is loaded but no longer active, evict it as soon as all
1092          * future allocations have synced.  (If we unloaded it now and then
1093          * loaded a moment later, the map wouldn't reflect those allocations.)
1094          */
1095         if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1096                 int evictable = 1;
1097 
1098                 for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1099                         if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
1100                                 evictable = 0;
1101 
1102                 if (evictable && !metaslab_debug)
1103                         space_map_unload(sm);
1104         }
1105 
1106         metaslab_group_sort(mg, msp, metaslab_weight(msp));
1107 
1108         mutex_exit(&msp->ms_lock);
1109 }
1110 
1111 void
1112 metaslab_sync_reassess(metaslab_group_t *mg)
1113 {
1114         vdev_t *vd = mg->mg_vd;
1115         int64_t failures = mg->mg_alloc_failures;
1116 
1117         /*
1118          * Re-evaluate all metaslabs which have lower offsets than the
1119          * bonus area.
1120          */
1121         for (int m = 0; m < vd->vdev_ms_count; m++) {
1122                 metaslab_t *msp = vd->vdev_ms[m];
1123 
1124                 if (msp->ms_map.sm_start > mg->mg_bonus_area)
1125                         break;
1126 
1127                 mutex_enter(&msp->ms_lock);
1128                 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1129                 mutex_exit(&msp->ms_lock);
1130         }
1131 
1132         atomic_add_64(&mg->mg_alloc_failures, -failures);
1133 
1134         /*
1135          * Prefetch the next potential metaslabs
1136          */
1137         metaslab_prefetch(mg);
1138 }
1139 
1140 static uint64_t
1141 metaslab_distance(metaslab_t *msp, dva_t *dva)
1142 {
1143         uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1144         uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1145         uint64_t start = msp->ms_map.sm_start >> ms_shift;
1146 
1147         if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1148                 return (1ULL << 63);
1149 
1150         if (offset < start)
1151                 return ((start - offset) << ms_shift);
1152         if (offset > start)
1153                 return ((offset - start) << ms_shift);
1154         return (0);
1155 }
1156 
1157 static uint64_t
1158 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1159     uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
1160 {
1161         spa_t *spa = mg->mg_vd->vdev_spa;
1162         metaslab_t *msp = NULL;
1163         uint64_t offset = -1ULL;
1164         avl_tree_t *t = &mg->mg_metaslab_tree;
1165         uint64_t activation_weight;
1166         uint64_t target_distance;
1167         int i;
1168 
1169         activation_weight = METASLAB_WEIGHT_PRIMARY;
1170         for (i = 0; i < d; i++) {
1171                 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1172                         activation_weight = METASLAB_WEIGHT_SECONDARY;
1173                         break;
1174                 }
1175         }
1176 
1177         for (;;) {
1178                 boolean_t was_active;
1179 
1180                 mutex_enter(&mg->mg_lock);
1181                 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1182                         if (msp->ms_weight < asize) {
1183                                 spa_dbgmsg(spa, "%s: failed to meet weight "
1184                                     "requirement: vdev %llu, txg %llu, mg %p, "
1185                                     "msp %p, psize %llu, asize %llu, "
1186                                     "failures %llu, weight %llu",
1187                                     spa_name(spa), mg->mg_vd->vdev_id, txg,
1188                                     mg, msp, psize, asize,
1189                                     mg->mg_alloc_failures, msp->ms_weight);
1190                                 mutex_exit(&mg->mg_lock);
1191                                 return (-1ULL);
1192                         }
1193                         was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1194                         if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1195                                 break;
1196 
1197                         target_distance = min_distance +
1198                             (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
1199 
1200                         for (i = 0; i < d; i++)
1201                                 if (metaslab_distance(msp, &dva[i]) <
1202                                     target_distance)
1203                                         break;
1204                         if (i == d)
1205                                 break;
1206                 }
1207                 mutex_exit(&mg->mg_lock);
1208                 if (msp == NULL)
1209                         return (-1ULL);
1210 
1211                 /*
1212                  * If we've already reached the allowable number of failed
1213                  * allocation attempts on this metaslab group then we
1214                  * consider skipping it. We skip it only if we're allowed
1215                  * to "fast" gang, the physical size is larger than
1216                  * a gang block, and we're attempting to allocate from
1217                  * the primary metaslab.
1218                  */
1219                 if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1220                     CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1221                     activation_weight == METASLAB_WEIGHT_PRIMARY) {
1222                         spa_dbgmsg(spa, "%s: skipping metaslab group: "
1223                             "vdev %llu, txg %llu, mg %p, psize %llu, "
1224                             "asize %llu, failures %llu", spa_name(spa),
1225                             mg->mg_vd->vdev_id, txg, mg, psize, asize,
1226                             mg->mg_alloc_failures);
1227                         return (-1ULL);
1228                 }
1229 
1230                 mutex_enter(&msp->ms_lock);
1231 
1232                 /*
1233                  * Ensure that the metaslab we have selected is still
1234                  * capable of handling our request. It's possible that
1235                  * another thread may have changed the weight while we
1236                  * were blocked on the metaslab lock.
1237                  */
1238                 if (msp->ms_weight < asize || (was_active &&
1239                     !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1240                     activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1241                         mutex_exit(&msp->ms_lock);
1242                         continue;
1243                 }
1244 
1245                 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1246                     activation_weight == METASLAB_WEIGHT_PRIMARY) {
1247                         metaslab_passivate(msp,
1248                             msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1249                         mutex_exit(&msp->ms_lock);
1250                         continue;
1251                 }
1252 
1253                 if (metaslab_activate(msp, activation_weight) != 0) {
1254                         mutex_exit(&msp->ms_lock);
1255                         continue;
1256                 }
1257 
1258                 if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
1259                         break;
1260 
1261                 atomic_inc_64(&mg->mg_alloc_failures);
1262 
1263                 metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
1264 
1265                 mutex_exit(&msp->ms_lock);
1266         }
1267 
1268         if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
1269                 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1270 
1271         space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
1272 
1273         mutex_exit(&msp->ms_lock);
1274 
1275         return (offset);
1276 }
1277 
1278 /*
1279  * Allocate a block for the specified i/o.
1280  */
1281 static int
1282 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1283     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1284 {
1285         metaslab_group_t *mg, *rotor;
1286         vdev_t *vd;
1287         int dshift = 3;
1288         int all_zero;
1289         int zio_lock = B_FALSE;
1290         boolean_t allocatable;
1291         uint64_t offset = -1ULL;
1292         uint64_t asize;
1293         uint64_t distance;
1294 
1295         ASSERT(!DVA_IS_VALID(&dva[d]));
1296 
1297         /*
1298          * For testing, make some blocks above a certain size be gang blocks.
1299          */
1300         if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1301                 return (ENOSPC);
1302 
1303         /*
1304          * Start at the rotor and loop through all mgs until we find something.
1305          * Note that there's no locking on mc_rotor or mc_aliquot because
1306          * nothing actually breaks if we miss a few updates -- we just won't
1307          * allocate quite as evenly.  It all balances out over time.
1308          *
1309          * If we are doing ditto or log blocks, try to spread them across
1310          * consecutive vdevs.  If we're forced to reuse a vdev before we've
1311          * allocated all of our ditto blocks, then try and spread them out on
1312          * that vdev as much as possible.  If it turns out to not be possible,
1313          * gradually lower our standards until anything becomes acceptable.
1314          * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1315          * gives us hope of containing our fault domains to something we're
1316          * able to reason about.  Otherwise, any two top-level vdev failures
1317          * will guarantee the loss of data.  With consecutive allocation,
1318          * only two adjacent top-level vdev failures will result in data loss.
1319          *
1320          * If we are doing gang blocks (hintdva is non-NULL), try to keep
1321          * ourselves on the same vdev as our gang block header.  That
1322          * way, we can hope for locality in vdev_cache, plus it makes our
1323          * fault domains something tractable.
1324          */
1325         if (hintdva) {
1326                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1327 
1328                 /*
1329                  * It's possible the vdev we're using as the hint no
1330                  * longer exists (i.e. removed). Consult the rotor when
1331                  * all else fails.
1332                  */
1333                 if (vd != NULL) {
1334                         mg = vd->vdev_mg;
1335 
1336                         if (flags & METASLAB_HINTBP_AVOID &&
1337                             mg->mg_next != NULL)
1338                                 mg = mg->mg_next;
1339                 } else {
1340                         mg = mc->mc_rotor;
1341                 }
1342         } else if (d != 0) {
1343                 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1344                 mg = vd->vdev_mg->mg_next;
1345         } else {
1346                 mg = mc->mc_rotor;
1347         }
1348 
1349         /*
1350          * If the hint put us into the wrong metaslab class, or into a
1351          * metaslab group that has been passivated, just follow the rotor.
1352          */
1353         if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1354                 mg = mc->mc_rotor;
1355 
1356         rotor = mg;
1357 top:
1358         all_zero = B_TRUE;
1359         do {
1360                 ASSERT(mg->mg_activation_count == 1);
1361 
1362                 vd = mg->mg_vd;
1363 
1364                 /*
1365                  * Don't allocate from faulted devices.
1366                  */
1367                 if (zio_lock) {
1368                         spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1369                         allocatable = vdev_allocatable(vd);
1370                         spa_config_exit(spa, SCL_ZIO, FTAG);
1371                 } else {
1372                         allocatable = vdev_allocatable(vd);
1373                 }
1374                 if (!allocatable)
1375                         goto next;
1376 
1377                 /*
1378                  * Avoid writing single-copy data to a failing vdev
1379                  */
1380                 if ((vd->vdev_stat.vs_write_errors > 0 ||
1381                     vd->vdev_state < VDEV_STATE_HEALTHY) &&
1382                     d == 0 && dshift == 3) {
1383                         all_zero = B_FALSE;
1384                         goto next;
1385                 }
1386 
1387                 ASSERT(mg->mg_class == mc);
1388 
1389                 distance = vd->vdev_asize >> dshift;
1390                 if (distance <= (1ULL << vd->vdev_ms_shift))
1391                         distance = 0;
1392                 else
1393                         all_zero = B_FALSE;
1394 
1395                 asize = vdev_psize_to_asize(vd, psize);
1396                 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1397 
1398                 offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1399                     dva, d, flags);
1400                 if (offset != -1ULL) {
1401                         /*
1402                          * If we've just selected this metaslab group,
1403                          * figure out whether the corresponding vdev is
1404                          * over- or under-used relative to the pool,
1405                          * and set an allocation bias to even it out.
1406                          */
1407                         if (mc->mc_aliquot == 0) {
1408                                 vdev_stat_t *vs = &vd->vdev_stat;
1409                                 int64_t vu, cu;
1410 
1411                                 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1412                                 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1413 
1414                                 /*
1415                                  * Calculate how much more or less we should
1416                                  * try to allocate from this device during
1417                                  * this iteration around the rotor.
1418                                  * For example, if a device is 80% full
1419                                  * and the pool is 20% full then we should
1420                                  * reduce allocations by 60% on this device.
1421                                  *
1422                                  * mg_bias = (20 - 80) * 512K / 100 = -307K
1423                                  *
1424                                  * This reduces allocations by 307K for this
1425                                  * iteration.
1426                                  */
1427                                 mg->mg_bias = ((cu - vu) *
1428                                     (int64_t)mg->mg_aliquot) / 100;
1429                         }
1430 
1431                         if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1432                             mg->mg_aliquot + mg->mg_bias) {
1433                                 mc->mc_rotor = mg->mg_next;
1434                                 mc->mc_aliquot = 0;
1435                         }
1436 
1437                         DVA_SET_VDEV(&dva[d], vd->vdev_id);
1438                         DVA_SET_OFFSET(&dva[d], offset);
1439                         DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1440                         DVA_SET_ASIZE(&dva[d], asize);
1441 
1442                         return (0);
1443                 }
1444 next:
1445                 mc->mc_rotor = mg->mg_next;
1446                 mc->mc_aliquot = 0;
1447         } while ((mg = mg->mg_next) != rotor);
1448 
1449         if (!all_zero) {
1450                 dshift++;
1451                 ASSERT(dshift < 64);
1452                 goto top;
1453         }
1454 
1455         if (!allocatable && !zio_lock) {
1456                 dshift = 3;
1457                 zio_lock = B_TRUE;
1458                 goto top;
1459         }
1460 
1461         bzero(&dva[d], sizeof (dva_t));
1462 
1463         return (ENOSPC);
1464 }
1465 
1466 /*
1467  * Free the block represented by DVA in the context of the specified
1468  * transaction group.
1469  */
1470 static void
1471 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1472 {
1473         uint64_t vdev = DVA_GET_VDEV(dva);
1474         uint64_t offset = DVA_GET_OFFSET(dva);
1475         uint64_t size = DVA_GET_ASIZE(dva);
1476         vdev_t *vd;
1477         metaslab_t *msp;
1478 
1479         ASSERT(DVA_IS_VALID(dva));
1480 
1481         if (txg > spa_freeze_txg(spa))
1482                 return;
1483 
1484         if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1485             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
1486                 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
1487                     (u_longlong_t)vdev, (u_longlong_t)offset);
1488                 ASSERT(0);
1489                 return;
1490         }
1491 
1492         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1493 
1494         if (DVA_GET_GANG(dva))
1495                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1496 
1497         mutex_enter(&msp->ms_lock);
1498 
1499         if (now) {
1500                 space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
1501                     offset, size);
1502                 space_map_free(&msp->ms_map, offset, size);
1503         } else {
1504                 if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
1505                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
1506                 space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
1507         }
1508 
1509         mutex_exit(&msp->ms_lock);
1510 }
1511 
1512 /*
1513  * Intent log support: upon opening the pool after a crash, notify the SPA
1514  * of blocks that the intent log has allocated for immediate write, but
1515  * which are still considered free by the SPA because the last transaction
1516  * group didn't commit yet.
1517  */
1518 static int
1519 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
1520 {
1521         uint64_t vdev = DVA_GET_VDEV(dva);
1522         uint64_t offset = DVA_GET_OFFSET(dva);
1523         uint64_t size = DVA_GET_ASIZE(dva);
1524         vdev_t *vd;
1525         metaslab_t *msp;
1526         int error = 0;
1527 
1528         ASSERT(DVA_IS_VALID(dva));
1529 
1530         if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1531             (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
1532                 return (ENXIO);
1533 
1534         msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1535 
1536         if (DVA_GET_GANG(dva))
1537                 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1538 
1539         mutex_enter(&msp->ms_lock);
1540 
1541         if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
1542                 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
1543 
1544         if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
1545                 error = ENOENT;
1546 
1547         if (error || txg == 0) {        /* txg == 0 indicates dry run */
1548                 mutex_exit(&msp->ms_lock);
1549                 return (error);
1550         }
1551 
1552         space_map_claim(&msp->ms_map, offset, size);
1553 
1554         if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
1555                 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
1556                         vdev_dirty(vd, VDD_METASLAB, msp, txg);
1557                 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
1558         }
1559 
1560         mutex_exit(&msp->ms_lock);
1561 
1562         return (0);
1563 }
1564 
1565 int
1566 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
1567     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
1568 {
1569         dva_t *dva = bp->blk_dva;
1570         dva_t *hintdva = hintbp->blk_dva;
1571         int error = 0;
1572 
1573         ASSERT(bp->blk_birth == 0);
1574         ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
1575 
1576         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1577 
1578         if (mc->mc_rotor == NULL) {  /* no vdevs in this class */
1579                 spa_config_exit(spa, SCL_ALLOC, FTAG);
1580                 return (ENOSPC);
1581         }
1582 
1583         ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
1584         ASSERT(BP_GET_NDVAS(bp) == 0);
1585         ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
1586 
1587         for (int d = 0; d < ndvas; d++) {
1588                 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
1589                     txg, flags);
1590                 if (error) {
1591                         for (d--; d >= 0; d--) {
1592                                 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
1593                                 bzero(&dva[d], sizeof (dva_t));
1594                         }
1595                         spa_config_exit(spa, SCL_ALLOC, FTAG);
1596                         return (error);
1597                 }
1598         }
1599         ASSERT(error == 0);
1600         ASSERT(BP_GET_NDVAS(bp) == ndvas);
1601 
1602         spa_config_exit(spa, SCL_ALLOC, FTAG);
1603 
1604         BP_SET_BIRTH(bp, txg, txg);
1605 
1606         return (0);
1607 }
1608 
1609 void
1610 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
1611 {
1612         const dva_t *dva = bp->blk_dva;
1613         int ndvas = BP_GET_NDVAS(bp);
1614 
1615         ASSERT(!BP_IS_HOLE(bp));
1616         ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
1617 
1618         spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
1619 
1620         for (int d = 0; d < ndvas; d++)
1621                 metaslab_free_dva(spa, &dva[d], txg, now);
1622 
1623         spa_config_exit(spa, SCL_FREE, FTAG);
1624 }
1625 
1626 int
1627 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
1628 {
1629         const dva_t *dva = bp->blk_dva;
1630         int ndvas = BP_GET_NDVAS(bp);
1631         int error = 0;
1632 
1633         ASSERT(!BP_IS_HOLE(bp));
1634 
1635         if (txg != 0) {
1636                 /*
1637                  * First do a dry run to make sure all DVAs are claimable,
1638                  * so we don't have to unwind from partial failures below.
1639                  */
1640                 if ((error = metaslab_claim(spa, bp, 0)) != 0)
1641                         return (error);
1642         }
1643 
1644         spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1645 
1646         for (int d = 0; d < ndvas; d++)
1647                 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
1648                         break;
1649 
1650         spa_config_exit(spa, SCL_ALLOC, FTAG);
1651 
1652         ASSERT(error == 0 || txg == 0);
1653 
1654         return (error);
1655 }