/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

/*
 * Allow allocations to switch to gang blocks quickly. We do this to
 * avoid having to load lots of space_maps in a given txg. There are,
 * however, some cases where we want to avoid "fast" ganging and instead
 * we want to do an exhaustive search of all metaslabs on this device.
 * Currently we don't allow any gang, zil, or dump device related allocations
 * to "fast" gang.
 */
#define	CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space_map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;

/*
 * This value defines the number of allowed allocation failures per vdev.
 * If a device reaches this threshold in a given txg then we consider skipping
 * allocations on that device.
 */
int zfs_mg_alloc_failures;

/*
 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
 */
static int metaslab_debug = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e., search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;
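
/*
 * Illustrative example of the two thresholds above (assuming a 10GB
 * metaslab): first-fit allocation stops once the largest free segment
 * falls below SPA_MAXBLOCKSIZE or free space falls below 4% (~410MB),
 * whichever condition trips first; see metaslab_df_alloc() below.
 */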

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Max number of space_maps to prefetch.
 */
int metaslab_prefetch_limit = SPA_DVAS_PER_BP;

/*
 * Percentage bonus multiplier for metaslabs that are in the bonus area.
 */
int metaslab_smo_bonus_pct = 150;

/*
 * Should we be willing to write data to degraded vdevs?
 */
boolean_t zfs_write_to_degraded = B_FALSE;

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
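/*
 * AVL comparison function for a group's metaslab tree: sort by descending
 * weight so that the most desirable metaslab comes first, and break ties
 * by starting offset so the ordering is total.
 */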
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_map->sm_start < m2->ms_map->sm_start)
		return (-1);
	if (m1->ms_map->sm_start > m2->ms_map->sm_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
}

void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}
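
/*
 * Re-key a metaslab within its group's AVL tree. Because the tree is
 * ordered by ms_weight, changing the weight requires removing the node
 * and re-inserting it under the group lock.
 */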
static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 510].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */
static int
metaslab_segsize_compare(const void *x1, const void *x2)
{
	const space_seg_t *s1 = x1;
	const space_seg_t *s2 = x2;
	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
	uint64_t ss_size2 = s2->ss_end - s2->ss_start;

	if (ss_size1 < ss_size2)
		return (-1);
	if (ss_size1 > ss_size2)
		return (1);

	if (s1->ss_start < s2->ss_start)
		return (-1);
	if (s1->ss_start > s2->ss_start)
		return (1);

	return (0);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	space_seg_t *ss, ssearch;
	avl_index_t where;

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}

static void
metaslab_pp_load(space_map_t *sm)
{
	space_seg_t *ss;

	ASSERT(sm->sm_ppd == NULL);
	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);

	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
	avl_create(sm->sm_pp_root, metaslab_segsize_compare,
	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));

	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
		avl_add(sm->sm_pp_root, ss);
}

static void
metaslab_pp_unload(space_map_t *sm)
{
	void *cookie = NULL;

	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
	sm->sm_ppd = NULL;

	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
		/* tear down the tree */
	}

	avl_destroy(sm->sm_pp_root);
	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
	sm->sm_pp_root = NULL;
}

/* ARGSUSED */
static void
metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/* ARGSUSED */
static void
metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_pp_maxsize(space_map_t *sm)
{
	avl_tree_t *t = sm->sm_pp_root;
	space_seg_t *ss;

	if (t == NULL || (ss = avl_last(t)) == NULL)
		return (0ULL);

	return (ss->ss_end - ss->ss_start);
}

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;

	return (metaslab_block_picker(t, cursor, size, align));
}
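
/*
 * Both the first-fit and dynamic allocators keep one cursor per power-of-two
 * alignment in the 64-slot sm_ppd array set up by metaslab_pp_load(). For
 * example, an 8K request (align = 8K) uses sm_ppd[highbit(8K) - 1], i.e.
 * slot 13, so allocations of the same size class tend to proceed from a
 * common cursor.
 */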

/* ARGSUSED */
boolean_t
metaslab_ff_fragmented(space_map_t *sm)
{
	return (B_TRUE);
}

static space_map_ops_t metaslab_ff_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ff_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ff_fragmented
};

/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first-fit allocation scheme until space gets low and then
 * adjusts to a best-fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);
	int free_pct = sm->sm_space * 100 / sm->sm_size;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = sm->sm_pp_root;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static boolean_t
metaslab_df_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);
	int free_pct = sm->sm_space * 100 / sm->sm_size;

	if (max_size >= metaslab_df_alloc_threshold &&
	    free_pct >= metaslab_df_free_pct)
		return (B_FALSE);

	return (B_TRUE);
}

static space_map_ops_t metaslab_df_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_df_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_df_fragmented
};

/*
 * ==========================================================================
 * Other experimental allocators
 * ==========================================================================
 */
static uint64_t
metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
	uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);
	uint64_t rsize = size;
	uint64_t offset = 0;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	ASSERT3U(*extent_end, >=, *cursor);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if ((*cursor + size) > *extent_end) {

		t = sm->sm_pp_root;
		*cursor = *extent_end = 0;

		if (max_size > 2 * SPA_MAXBLOCKSIZE)
			rsize = MIN(metaslab_min_alloc_size, max_size);
		offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
		if (offset != -1)
			*cursor = offset + size;
	} else {
		offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
	}
	ASSERT3U(*cursor, <=, *extent_end);
	return (offset);
}

static boolean_t
metaslab_cdf_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);

	if (max_size > (metaslab_min_alloc_size * 10))
		return (B_FALSE);
	return (B_TRUE);
}

static space_map_ops_t metaslab_cdf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_cdf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_cdf_fragmented
};

uint64_t metaslab_ndf_clump_shift = 4;
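
/*
 * The allocator below keeps a cursor per allocation size class (indexed by
 * highbit(size)). When a cursor misses, it appears to reseed from the
 * size-sorted tree using a clump on the order of 2^metaslab_ndf_clump_shift
 * times the request size, so that nearby requests of the same size class
 * land in the same region.
 */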

static uint64_t
metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	avl_index_t where;
	space_seg_t *ss, ssearch;
	uint64_t hbit = highbit(size);
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
		t = sm->sm_pp_root;

		ssearch.ss_start = 0;
		ssearch.ss_end = MIN(max_size,
		    1ULL << (hbit + metaslab_ndf_clump_shift));
		ss = avl_find(t, &ssearch, &where);
		if (ss == NULL)
			ss = avl_nearest(t, where, AVL_AFTER);
		ASSERT(ss != NULL);
	}

	if (ss != NULL) {
		if (ss->ss_start + size <= ss->ss_end) {
			*cursor = ss->ss_start + size;
			return (ss->ss_start);
		}
	}
	return (-1ULL);
}

static boolean_t
metaslab_ndf_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);

	if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
		return (B_FALSE);
	return (B_TRUE);
}

static space_map_ops_t metaslab_ndf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ndf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ndf_fragmented
};

space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
metaslab_t *
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
    uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);

	msp->ms_smo_syncing = *smo;

	/*
	 * We create the main space map here, but we don't create the
	 * allocmaps and freemaps until metaslab_sync_done(). This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
	space_map_create(msp->ms_map, start, size,
	    vd->vdev_ashift, &msp->ms_lock);

	metaslab_group_add(mg, msp);

	if (metaslab_debug && smo->smo_object != 0) {
		mutex_enter(&msp->ms_lock);
		VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops,
		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
		mutex_exit(&msp->ms_lock);
	}

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(msp, 0);

	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, msp, txg);
	}

	return (msp);
}

void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	vdev_space_update(mg->mg_vd,
	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size);

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);

	space_map_unload(msp->ms_map);
	space_map_destroy(msp->ms_map);
	kmem_free(msp->ms_map, sizeof (*msp->ms_map));

	for (int t = 0; t < TXG_SIZE; t++) {
		space_map_destroy(msp->ms_allocmap[t]);
		space_map_destroy(msp->ms_freemap[t]);
		kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t]));
		kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t]));
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		space_map_destroy(msp->ms_defermap[t]);
		kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t]));
	}

	ASSERT0(msp->ms_deferspace);

	mutex_exit(&msp->ms_lock);
	mutex_destroy(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
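
/*
 * For example, on a vdev with 200 metaslabs the formula below gives the
 * first metaslab twice the weight of its free space and the last one just
 * over 1x, with metaslab 100 landing at roughly 1.5x.
 */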
static uint64_t
metaslab_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * This vdev is in the process of being removed so there is nothing
	 * for us to do here.
	 */
	if (vd->vdev_removing) {
		ASSERT0(smo->smo_alloc);
		ASSERT0(vd->vdev_ms_shift);
		return (0);
	}

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = sm->sm_size - smo->smo_alloc;
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1. We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
	weight = 2 * weight -
	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
	ASSERT(weight >= space && weight <= 2 * space);

	/*
	 * For locality, assign higher weight to metaslabs which have
	 * a lower offset than what we've already activated.
	 */
	if (sm->sm_start <= mg->mg_bonus_area)
		weight *= (metaslab_smo_bonus_pct / 100);
	ASSERT(weight >= space &&
	    weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);

	if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
		/*
		 * If this metaslab is one we're actively using, adjust its
		 * weight to make it preferable to any inactive metaslab so
		 * we'll polish it off.
		 */
		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
	}
	return (weight);
}

static void
metaslab_prefetch(metaslab_group_t *mg)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	int m;

	mutex_enter(&mg->mg_lock);

	/*
	 * Prefetch the next potential metaslabs
	 */
	for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
		space_map_t *sm = msp->ms_map;
		space_map_obj_t *smo = &msp->ms_smo;

		/* If we have reached our prefetch limit then we're done */
		if (m >= metaslab_prefetch_limit)
			break;

		if (!sm->sm_loaded && smo->smo_object != 0) {
			mutex_exit(&mg->mg_lock);
			dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
			    0ULL, smo->smo_objsize);
			mutex_enter(&mg->mg_lock);
		}
	}
	mutex_exit(&mg->mg_lock);
}

static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = msp->ms_map;
	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		space_map_load_wait(sm);
		if (!sm->sm_loaded) {
			space_map_obj_t *smo = &msp->ms_smo;

			int error = space_map_load(sm, sm_ops, SM_FREE, smo,
			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
			if (error) {
				metaslab_group_sort(msp->ms_group, msp, 0);
				return (error);
			}
			for (int t = 0; t < TXG_DEFER_SIZE; t++)
				space_map_walk(msp->ms_defermap[t],
				    space_map_claim, sm);
		}

		/*
		 * Track the bonus area as we activate new metaslabs.
		 */
		if (sm->sm_start > mg->mg_bonus_area) {
			mutex_enter(&mg->mg_lock);
			mg->mg_bonus_area = sm->sm_start;
			mutex_exit(&mg->mg_lock);
		}

		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(sm->sm_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}

static void
metaslab_passivate(metaslab_t *msp, uint64_t size)
{
	/*
	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
	 * this metaslab again. In that case, it had better be empty,
	 * or we would be leaving space on the table.
	 */
	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0);
	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}

/*
 * Determine if the in-core space map representation can be condensed on-disk.
 * We would like to use the following criteria to make our decision:
 *
 * 1. The size of the space map object should not dramatically increase as a
 * result of writing out our in-core free map.
 *
 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
 * times the size of the in-core representation (i.e. zfs_condense_pct = 110
 * and in-core = 1MB, minimal = 1.1MB).
 *
 * Checking the first condition is tricky since we don't want to walk
 * the entire AVL tree calculating the estimated on-disk size. Instead we
 * use the size-ordered AVL tree in the space map and calculate the
 * size required for the largest segment in our in-core free map. If the
 * size required to represent that segment on disk is larger than the space
 * map object then we avoid condensing this map.
 *
 * To determine the second criterion we use a best-case estimate and assume
 * each segment can be represented on-disk as a single 64-bit entry. We refer
 * to this best-case estimate as the space map's minimal form.
 */
static boolean_t
metaslab_should_condense(metaslab_t *msp)
{
	space_map_t *sm = msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo_syncing;
	space_seg_t *ss;
	uint64_t size, entries, segsz;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(sm->sm_loaded);

	/*
	 * Use the sm_pp_root AVL tree, which is ordered by size, to obtain
	 * the largest segment in the in-core free map. If the tree is
	 * empty then we should condense the map.
	 */
	ss = avl_last(sm->sm_pp_root);
	if (ss == NULL)
		return (B_TRUE);

	/*
	 * Calculate the number of 64-bit entries this segment would
	 * require when written to disk. If this single segment would be
	 * larger on-disk than the entire current on-disk structure, then
	 * clearly condensing will increase the on-disk structure size.
	 */
	size = (ss->ss_end - ss->ss_start) >> sm->sm_shift;
	entries = size / (MIN(size, SM_RUN_MAX));
	segsz = entries * sizeof (uint64_t);

	return (segsz <= smo->smo_objsize &&
	    smo->smo_objsize >= (zfs_condense_pct *
	    sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100);
}
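
/*
 * For example, with the default zfs_condense_pct of 200 and an in-core free
 * map of 1000 segments (minimal form = 1000 * 8 bytes = 8000 bytes), the
 * check above only allows condensing once the on-disk space map object has
 * grown past 16000 bytes.
 */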

/*
 * Condense the on-disk space map representation to its minimized form.
 * The minimized form consists of a small number of allocations followed by
 * the in-core free map.
 */
static void
metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK];
	space_map_t condense_map;
	space_map_t *sm = msp->ms_map;
	objset_t *mos = spa_meta_objset(spa);
	space_map_obj_t *smo = &msp->ms_smo_syncing;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(spa_sync_pass(spa), ==, 1);
	ASSERT(sm->sm_loaded);

	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
	    "smo size %llu, segments %lu", txg,
	    (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
	    smo->smo_objsize, avl_numnodes(&sm->sm_root));

	/*
	 * Create a map that is a 100% allocated map. We remove segments
	 * that have been freed in this txg, any deferred frees that exist,
	 * and any allocation in the future. Removing segments should be
	 * a relatively inexpensive operation since we expect these maps to
	 * have a small number of nodes.
	 */
	space_map_create(&condense_map, sm->sm_start, sm->sm_size,
	    sm->sm_shift, sm->sm_lock);
	space_map_add(&condense_map, condense_map.sm_start,
	    condense_map.sm_size);

	/*
	 * Remove what's been freed in this txg from the condense_map.
	 * Since we're in sync_pass 1, we know that all the frees from
	 * this txg are in the freemap.
	 */
	space_map_walk(freemap, space_map_remove, &condense_map);

	for (int t = 0; t < TXG_DEFER_SIZE; t++)
		space_map_walk(msp->ms_defermap[t],
		    space_map_remove, &condense_map);

	for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
		space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK],
		    space_map_remove, &condense_map);

	/*
	 * We're about to drop the metaslab's lock thus allowing
	 * other consumers to change its content. Set the
	 * space_map's sm_condensing flag to ensure that
	 * allocations on this metaslab do not occur while we're
	 * in the middle of committing it to disk. This is only critical
	 * for the ms_map as all other space_maps use per txg
	 * views of their content.
	 */
	sm->sm_condensing = B_TRUE;

	mutex_exit(&msp->ms_lock);
	space_map_truncate(smo, mos, tx);
	mutex_enter(&msp->ms_lock);

	/*
	 * While we would ideally like to create a space_map representation
	 * that consists only of allocation records, doing so can be
	 * prohibitively expensive because the in-core free map can be
	 * large, and therefore computationally expensive to subtract
	 * from the condense_map. Instead we sync out two maps, a cheap
	 * allocation only map followed by the in-core free map. While not
	 * optimal, this is typically close to optimal, and much cheaper to
	 * compute.
	 */
	space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx);
	space_map_vacate(&condense_map, NULL, NULL);
	space_map_destroy(&condense_map);

	space_map_sync(sm, SM_FREE, smo, mos, tx);
	sm->sm_condensing = B_FALSE;

	spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, "
	    "smo size %llu", txg,
	    (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
	    smo->smo_objsize);
}

/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK];
	space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *sm = msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo_syncing;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	/*
	 * This metaslab has just been added so there's no work to do now.
	 */
	if (*freemap == NULL) {
		ASSERT3P(allocmap, ==, NULL);
		return;
	}

	ASSERT3P(allocmap, !=, NULL);
	ASSERT3P(*freemap, !=, NULL);
	ASSERT3P(*freed_map, !=, NULL);

	if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0)
		return;

	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_map. No other thread can
	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
	 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
	 * We drop it whenever we call into the DMU, because the DMU
	 * can call down to us (e.g. via zio_free()) at any time.
	 */

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    (sm->sm_start >> vd->vdev_ms_shift),
		    sizeof (uint64_t), &smo->smo_object, tx);
	}

	mutex_enter(&msp->ms_lock);

	if (sm->sm_loaded && spa_sync_pass(spa) == 1 &&
	    metaslab_should_condense(msp)) {
		metaslab_condense(msp, txg, tx);
	} else {
		space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
		space_map_sync(*freemap, SM_FREE, smo, mos, tx);
	}

	space_map_vacate(allocmap, NULL, NULL);

	/*
	 * For sync pass 1, we avoid walking the entire space map and
	 * instead will just swap the pointers for freemap and
	 * freed_map. We can safely do this since the freed_map is
	 * guaranteed to be empty on the initial pass.
	 */
	if (spa_sync_pass(spa) == 1) {
		ASSERT0((*freed_map)->sm_space);
		ASSERT0(avl_numnodes(&(*freed_map)->sm_root));
		space_map_swap(freemap, freed_map);
	} else {
		space_map_vacate(*freemap, space_map_add, *freed_map);
	}

	ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space);
	ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space);

	mutex_exit(&msp->ms_lock);

	VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}
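
/*
 * Life cycle of a free: a block freed in txg N is added to
 * ms_freemap[N & TXG_MASK], becomes the freed_map when txg N syncs, is then
 * swapped into the defer map by metaslab_sync_done() below, and only
 * re-enters the allocatable ms_map once txg N + TXG_DEFER_SIZE has fully
 * synced.
 */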
/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	space_map_obj_t *smo = &msp->ms_smo;
	space_map_obj_t *smosync = &msp->ms_smo_syncing;
	space_map_t *sm = msp->ms_map;
	space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	int64_t alloc_delta, defer_delta;

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * allocmaps, freemaps, and defermap and add its capacity to the vdev.
	 */
	if (*freed_map == NULL) {
		ASSERT(*defer_map == NULL);
		for (int t = 0; t < TXG_SIZE; t++) {
			msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t),
			    KM_SLEEP);
			space_map_create(msp->ms_allocmap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
			msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t),
			    KM_SLEEP);
			space_map_create(msp->ms_freemap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
		}

		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t),
			    KM_SLEEP);
			space_map_create(msp->ms_defermap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
		}

		freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
		defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];

		vdev_space_update(vd, 0, 0, sm->sm_size);
	}

	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
	defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space;

	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);

	ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0);
	ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0);

	/*
	 * If there's a space_map_load() in progress, wait for it to complete
	 * so that we have a consistent view of the in-core space map.
	 */
	space_map_load_wait(sm);

	/*
	 * Move the frees from the defer_map to this map (if it's loaded).
	 * Swap the freed_map and the defer_map -- this is safe to do
	 * because we've just emptied out the defer_map.
	 */
	space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
	ASSERT0((*defer_map)->sm_space);
	ASSERT0(avl_numnodes(&(*defer_map)->sm_root));
	space_map_swap(freed_map, defer_map);

	*smo = *smosync;

	msp->ms_deferspace += defer_delta;
	ASSERT3S(msp->ms_deferspace, >=, 0);
	ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
	if (msp->ms_deferspace != 0) {
		/*
		 * Keep syncing this metaslab until all deferred frees
		 * are back in circulation.
		 */
		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
	}

	/*
	 * If the map is loaded but no longer active, evict it as soon as all
	 * future allocations have synced. (If we unloaded it now and then
	 * loaded a moment later, the map wouldn't reflect those allocations.)
	 */
	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		int evictable = 1;

		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
			if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
				evictable = 0;

		if (evictable && !metaslab_debug)
			space_map_unload(sm);
	}

	metaslab_group_sort(mg, msp, metaslab_weight(msp));

	mutex_exit(&msp->ms_lock);
}
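
/*
 * Re-evaluate the weights of the metaslabs at or below the bonus area,
 * subtract the allocation failures recorded so far on this group, and
 * prefetch the space maps of the most promising metaslabs.
 */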
void
metaslab_sync_reassess(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	int64_t failures = mg->mg_alloc_failures;

	/*
	 * Re-evaluate all metaslabs which have lower offsets than the
	 * bonus area.
	 */
	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_map->sm_start > mg->mg_bonus_area)
			break;

		mutex_enter(&msp->ms_lock);
		metaslab_group_sort(mg, msp, metaslab_weight(msp));
		mutex_exit(&msp->ms_lock);
	}

	atomic_add_64(&mg->mg_alloc_failures, -failures);

	/*
	 * Prefetch the next potential metaslabs
	 */
	metaslab_prefetch(mg);
}

static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
	uint64_t start = msp->ms_map->sm_start >> ms_shift;

	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
		return (1ULL << 63);

	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}

static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (i = 0; i < d; i++) {
		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_SECONDARY;
			break;
		}
	}

	for (;;) {
		boolean_t was_active;

		mutex_enter(&mg->mg_lock);
		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
			if (msp->ms_weight < asize) {
				spa_dbgmsg(spa, "%s: failed to meet weight "
				    "requirement: vdev %llu, txg %llu, mg %p, "
				    "msp %p, psize %llu, asize %llu, "
				    "failures %llu, weight %llu",
				    spa_name(spa), mg->mg_vd->vdev_id, txg,
				    mg, msp, psize, asize,
				    mg->mg_alloc_failures, msp->ms_weight);
				mutex_exit(&mg->mg_lock);
				return (-1ULL);
			}

			/*
			 * If the selected metaslab is condensing, skip it.
			 */
			if (msp->ms_map->sm_condensing)
				continue;

			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
				break;

			target_distance = min_distance +
			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);

			for (i = 0; i < d; i++)
				if (metaslab_distance(msp, &dva[i]) <
				    target_distance)
					break;
			if (i == d)
				break;
		}
		mutex_exit(&mg->mg_lock);
		if (msp == NULL)
			return (-1ULL);

		/*
		 * If we've already reached the allowable number of failed
		 * allocation attempts on this metaslab group then we
		 * consider skipping it. We skip it only if we're allowed
		 * to "fast" gang, the physical size is larger than
		 * a gang block, and we're attempting to allocate from
		 * the primary metaslab.
		 */
		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			spa_dbgmsg(spa, "%s: skipping metaslab group: "
			    "vdev %llu, txg %llu, mg %p, psize %llu, "
			    "asize %llu, failures %llu", spa_name(spa),
			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
			    mg->mg_alloc_failures);
			return (-1ULL);
		}

		mutex_enter(&msp->ms_lock);

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request. It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock.
		 */
		if (msp->ms_weight < asize || (was_active &&
		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			metaslab_passivate(msp,
			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if (metaslab_activate(msp, activation_weight) != 0) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If this metaslab is currently condensing then pick again as
		 * we can't manipulate this metaslab until it's committed
		 * to disk.
		 */
		if (msp->ms_map->sm_condensing) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL)
			break;

		atomic_inc_64(&mg->mg_alloc_failures);

		metaslab_passivate(msp, space_map_maxsize(msp->ms_map));

		mutex_exit(&msp->ms_lock);
	}

	if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

	space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize);

	mutex_exit(&msp->ms_lock);

	return (offset);
}

/*
 * Allocate a block for the specified i/o.
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	int dshift = 3;
	int all_zero;
	int zio_lock = B_FALSE;
	boolean_t allocatable;
	uint64_t offset = -1ULL;
	uint64_t asize;
	uint64_t distance;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
		return (SET_ERROR(ENOSPC));

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists (i.e. removed). Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL) {
			mg = vd->vdev_mg;

			if (flags & METASLAB_HINTBP_AVOID &&
			    mg->mg_next != NULL)
				mg = mg->mg_next;
		} else {
			mg = mc->mc_rotor;
		}
	} else if (d != 0) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong metaslab class, or into a
	 * metaslab group that has been passivated, just follow the rotor.
	 */
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	all_zero = B_TRUE;
	do {
		ASSERT(mg->mg_activation_count == 1);

		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (zio_lock) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}
		if (!allocatable)
			goto next;

		/*
		 * Avoid writing single-copy data to a failing vdev
		 * unless the user instructs us that it is okay.
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && dshift == 3 &&
		    !(zfs_write_to_degraded && vd->vdev_state ==
		    VDEV_STATE_DEGRADED)) {
			all_zero = B_FALSE;
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		distance = vd->vdev_asize >> dshift;
		if (distance <= (1ULL << vd->vdev_ms_shift))
			distance = 0;
		else
			all_zero = B_FALSE;

		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
		    dva, d, flags);
		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_aliquot == 0) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vu, cu;

				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

				/*
				 * Calculate how much more or less we should
				 * try to allocate from this device during
				 * this iteration around the rotor.
				 * For example, if a device is 80% full
				 * and the pool is 20% full then we should
				 * reduce allocations by 60% on this device.
				 *
				 * mg_bias = (20 - 80) * 512K / 100 = -307K
				 *
				 * This reduces allocations by 307K for this
				 * iteration.
				 */
				mg->mg_bias = ((cu - vu) *
				    (int64_t)mg->mg_aliquot) / 100;
			}

			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_aliquot = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		mc->mc_rotor = mg->mg_next;
		mc->mc_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

	if (!all_zero) {
		dshift++;
		ASSERT(dshift < 64);
		goto top;
	}

	if (!allocatable && !zio_lock) {
		dshift = 3;
		zio_lock = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	return (SET_ERROR(ENOSPC));
}

/*
 * Free the block represented by DVA in the context of the specified
 * transaction group.
 */
static void
metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;

	ASSERT(DVA_IS_VALID(dva));

	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if (now) {
		space_map_remove(msp->ms_allocmap[txg & TXG_MASK],
		    offset, size);
		space_map_free(msp->ms_map, offset, size);
	} else {
		if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
 */
static int
metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd;
	metaslab_t *msp;
	int error = 0;

	ASSERT(DVA_IS_VALID(dva));

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
		return (SET_ERROR(ENXIO));

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	mutex_enter(&msp->ms_lock);

	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded)
		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);

	if (error == 0 && !space_map_contains(msp->ms_map, offset, size))
		error = SET_ERROR(ENOENT);

	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	space_map_claim(msp->ms_map, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
		if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size);
	}

	mutex_exit(&msp->ms_lock);

	return (0);
}

int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
{
	dva_t *dva = bp->blk_dva;
	dva_t *hintdva = hintbp->blk_dva;
	int error = 0;

	ASSERT(bp->blk_birth == 0);
	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
		spa_config_exit(spa, SCL_ALLOC, FTAG);
		return (SET_ERROR(ENOSPC));
	}

	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
	ASSERT(BP_GET_NDVAS(bp) == 0);
	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));

	for (int d = 0; d < ndvas; d++) {
		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
		    txg, flags);
		if (error) {
			for (d--; d >= 0; d--) {
				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
				bzero(&dva[d], sizeof (dva_t));
			}
			spa_config_exit(spa, SCL_ALLOC, FTAG);
			return (error);
		}
	}
	ASSERT(error == 0);
	ASSERT(BP_GET_NDVAS(bp) == ndvas);

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	BP_SET_BIRTH(bp, txg, txg);

	return (0);
}

void
metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));

	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		metaslab_free_dva(spa, &dva[d], txg, now);

	spa_config_exit(spa, SCL_FREE, FTAG);
}

int
metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);
	int error = 0;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg != 0) {
		/*
		 * First do a dry run to make sure all DVAs are claimable,
		 * so we don't have to unwind from partial failures below.
		 */
		if ((error = metaslab_claim(spa, bp, 0)) != 0)
			return (error);
	}

	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);

	for (int d = 0; d < ndvas; d++)
		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
			break;

	spa_config_exit(spa, SCL_ALLOC, FTAG);

	ASSERT(error == 0 || txg == 0);

	return (error);
}
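
/*
 * Debug helper for metaslab_check_free(): panic if the range being freed
 * already appears in the given space map, i.e. we are freeing a free block.
 */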
static void
checkmap(space_map_t *sm, uint64_t off, uint64_t size)
{
	space_seg_t *ss;
	avl_index_t where;

	mutex_enter(sm->sm_lock);
	ss = space_map_find(sm, off, size, &where);
	if (ss != NULL)
		panic("freeing free block; ss=%p", (void *)ss);
	mutex_exit(sm->sm_lock);
}

void
metaslab_check_free(spa_t *spa, const blkptr_t *bp)
{
	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
		return;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
		uint64_t vdid = DVA_GET_VDEV(&bp->blk_dva[i]);
		vdev_t *vd = vdev_lookup_top(spa, vdid);
		uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[i]);
		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
		metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift];

		if (ms->ms_map->sm_loaded)
			checkmap(ms->ms_map, off, size);

		for (int j = 0; j < TXG_SIZE; j++)
			checkmap(ms->ms_freemap[j], off, size);
		for (int j = 0; j < TXG_DEFER_SIZE; j++)
			checkmap(ms->ms_defermap[j], off, size);
	}
	spa_config_exit(spa, SCL_VDEV, FTAG);
}