4730 Destroy the metaslab group taskq in metaslab_group_destroy()
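This change pairs the taskq_create() in metaslab_group_create() with a
taskq_destroy() in metaslab_group_destroy(). Before the fix, the per-group
preload taskq (mg->mg_taskq) was allocated when the group was created but
never released when the group was torn down. A minimal sketch of the
resulting lifecycle, showing only the taskq-related calls that appear in the
diff below (all other initialization and teardown elided):

	metaslab_group_t *
	metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
	{
		metaslab_group_t *mg;

		mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
		/* ... mutex, AVL tree, vd/class back-pointers ... */
		mg->mg_taskq = taskq_create("metaslab_group_tasksq",
		    metaslab_load_pct, minclsyspri, 10, INT_MAX,
		    TASKQ_THREADS_CPU_PCT);
		return (mg);
	}

	void
	metaslab_group_destroy(metaslab_group_t *mg)
	{
		ASSERT(mg->mg_activation_count <= 0);

		/* New in this change: release the preload taskq. */
		if (mg->mg_taskq != NULL)
			taskq_destroy(mg->mg_taskq);

		/* ... avl_destroy(), mutex_destroy() ... */
		kmem_free(mg, sizeof (metaslab_group_t));
	}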
--- old/usr/src/uts/common/fs/zfs/metaslab.c
+++ new/usr/src/uts/common/fs/zfs/metaslab.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2013 by Delphix. All rights reserved.
24 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 25 */
26 26
27 27 #include <sys/zfs_context.h>
28 28 #include <sys/dmu.h>
29 29 #include <sys/dmu_tx.h>
30 30 #include <sys/space_map.h>
31 31 #include <sys/metaslab_impl.h>
32 32 #include <sys/vdev_impl.h>
33 33 #include <sys/zio.h>
34 34 #include <sys/spa_impl.h>
35 35
36 36 /*
37 37 * Allow allocations to switch to gang blocks quickly. We do this to
38 38 * avoid having to load lots of space_maps in a given txg. There are,
39 39 * however, some cases where we want to avoid "fast" ganging and instead
40 40 * we want to do an exhaustive search of all metaslabs on this device.
41 41 * Currently we don't allow any gang, zil, or dump device related allocations
42 42 * to "fast" gang.
43 43 */
44 44 #define CAN_FASTGANG(flags) \
45 45 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
46 46 METASLAB_GANG_AVOID)))
47 47
48 48 #define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
49 49 #define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
50 50 #define METASLAB_ACTIVE_MASK \
51 51 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
52 52
53 53 uint64_t metaslab_aliquot = 512ULL << 10;
54 54 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
55 55
56 56 /*
57 57 * The in-core space map representation is more compact than its on-disk form.
58 58 * The zfs_condense_pct determines how much more compact the in-core
59 59 * space_map representation must be before we compact it on-disk.
60 60 * Values should be greater than or equal to 100.
61 61 */
62 62 int zfs_condense_pct = 200;
63 63
64 64 /*
65 65 * This value defines the number of allowed allocation failures per vdev.
66 66 * If a device reaches this threshold in a given txg then we consider skipping
67 67 * allocations on that device. The value of zfs_mg_alloc_failures is computed
68 68 * in zio_init() unless it has been overridden in /etc/system.
69 69 */
70 70 int zfs_mg_alloc_failures = 0;
71 71
72 72 /*
73 73 * The zfs_mg_noalloc_threshold defines which metaslab groups should
74 74 * be eligible for allocation. The value is defined as a percentage of
75 75	 * free space. Metaslab groups that have more free space than
76 76 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
77 77 * a metaslab group's free space is less than or equal to the
78 78 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
79 79 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
80 80 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
81 81 * groups are allowed to accept allocations. Gang blocks are always
82 82 * eligible to allocate on any metaslab group. The default value of 0 means
83 83 * no metaslab group will be excluded based on this criterion.
84 84 */
85 85 int zfs_mg_noalloc_threshold = 0;
86 86
87 87 /*
88 88 * When set will load all metaslabs when pool is first opened.
89 89 */
90 90 int metaslab_debug_load = 0;
91 91
92 92 /*
93 93 * When set will prevent metaslabs from being unloaded.
94 94 */
95 95 int metaslab_debug_unload = 0;
96 96
97 97 /*
98 98 * Minimum size which forces the dynamic allocator to change
99 99	 * its allocation strategy. Once the space map cannot satisfy
100 100	 * an allocation of this size then it switches to using a more
101 101	 * aggressive strategy (i.e., search by size rather than offset).
102 102 */
103 103 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
104 104
105 105 /*
106 106 * The minimum free space, in percent, which must be available
107 107 * in a space map to continue allocations in a first-fit fashion.
108 108 * Once the space_map's free space drops below this level we dynamically
109 109 * switch to using best-fit allocations.
110 110 */
111 111 int metaslab_df_free_pct = 4;
112 112
113 113 /*
114 114 * A metaslab is considered "free" if it contains a contiguous
115 115 * segment which is greater than metaslab_min_alloc_size.
116 116 */
117 117 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
118 118
119 119 /*
120 120 * Percentage of all cpus that can be used by the metaslab taskq.
121 121 */
122 122 int metaslab_load_pct = 50;
123 123
124 124 /*
125 125 * Determines how many txgs a metaslab may remain loaded without having any
126 126 * allocations from it. As long as a metaslab continues to be used we will
127 127 * keep it loaded.
128 128 */
129 129 int metaslab_unload_delay = TXG_SIZE * 2;
130 130
131 131 /*
132 132 * Should we be willing to write data to degraded vdevs?
133 133 */
134 134 boolean_t zfs_write_to_degraded = B_FALSE;
135 135
136 136 /*
137 137 * Max number of metaslabs per group to preload.
138 138 */
139 139 int metaslab_preload_limit = SPA_DVAS_PER_BP;
140 140
141 141 /*
142 142	 * Enable/disable preloading of metaslabs.
143 143 */
144 144 boolean_t metaslab_preload_enabled = B_TRUE;
145 145
146 146 /*
147 147 * Enable/disable additional weight factor for each metaslab.
148 148 */
149 149 boolean_t metaslab_weight_factor_enable = B_FALSE;
150 150
151 151
152 152 /*
153 153 * ==========================================================================
154 154 * Metaslab classes
155 155 * ==========================================================================
156 156 */
157 157 metaslab_class_t *
158 158 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
159 159 {
160 160 metaslab_class_t *mc;
161 161
162 162 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
163 163
164 164 mc->mc_spa = spa;
165 165 mc->mc_rotor = NULL;
166 166 mc->mc_ops = ops;
167 167
168 168 return (mc);
169 169 }
170 170
171 171 void
172 172 metaslab_class_destroy(metaslab_class_t *mc)
173 173 {
174 174 ASSERT(mc->mc_rotor == NULL);
175 175 ASSERT(mc->mc_alloc == 0);
176 176 ASSERT(mc->mc_deferred == 0);
177 177 ASSERT(mc->mc_space == 0);
178 178 ASSERT(mc->mc_dspace == 0);
179 179
180 180 kmem_free(mc, sizeof (metaslab_class_t));
181 181 }
182 182
183 183 int
184 184 metaslab_class_validate(metaslab_class_t *mc)
185 185 {
186 186 metaslab_group_t *mg;
187 187 vdev_t *vd;
188 188
189 189 /*
190 190 * Must hold one of the spa_config locks.
191 191 */
192 192 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
193 193 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
194 194
195 195 if ((mg = mc->mc_rotor) == NULL)
196 196 return (0);
197 197
198 198 do {
199 199 vd = mg->mg_vd;
200 200 ASSERT(vd->vdev_mg != NULL);
201 201 ASSERT3P(vd->vdev_top, ==, vd);
202 202 ASSERT3P(mg->mg_class, ==, mc);
203 203 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
204 204 } while ((mg = mg->mg_next) != mc->mc_rotor);
205 205
206 206 return (0);
207 207 }
208 208
209 209 void
210 210 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
211 211 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
212 212 {
213 213 atomic_add_64(&mc->mc_alloc, alloc_delta);
214 214 atomic_add_64(&mc->mc_deferred, defer_delta);
215 215 atomic_add_64(&mc->mc_space, space_delta);
216 216 atomic_add_64(&mc->mc_dspace, dspace_delta);
217 217 }
218 218
219 219 uint64_t
220 220 metaslab_class_get_alloc(metaslab_class_t *mc)
221 221 {
222 222 return (mc->mc_alloc);
223 223 }
224 224
225 225 uint64_t
226 226 metaslab_class_get_deferred(metaslab_class_t *mc)
227 227 {
228 228 return (mc->mc_deferred);
229 229 }
230 230
231 231 uint64_t
232 232 metaslab_class_get_space(metaslab_class_t *mc)
233 233 {
234 234 return (mc->mc_space);
235 235 }
236 236
237 237 uint64_t
238 238 metaslab_class_get_dspace(metaslab_class_t *mc)
239 239 {
240 240 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
241 241 }
242 242
243 243 /*
244 244 * ==========================================================================
245 245 * Metaslab groups
246 246 * ==========================================================================
247 247 */
248 248 static int
249 249 metaslab_compare(const void *x1, const void *x2)
250 250 {
251 251 const metaslab_t *m1 = x1;
252 252 const metaslab_t *m2 = x2;
253 253
254 254 if (m1->ms_weight < m2->ms_weight)
255 255 return (1);
256 256 if (m1->ms_weight > m2->ms_weight)
257 257 return (-1);
258 258
259 259 /*
260 260 * If the weights are identical, use the offset to force uniqueness.
261 261 */
262 262 if (m1->ms_start < m2->ms_start)
263 263 return (-1);
264 264 if (m1->ms_start > m2->ms_start)
265 265 return (1);
266 266
267 267 ASSERT3P(m1, ==, m2);
268 268
269 269 return (0);
270 270 }
271 271
272 272 /*
273 273 * Update the allocatable flag and the metaslab group's capacity.
274 274 * The allocatable flag is set to true if the capacity is below
275 275 * the zfs_mg_noalloc_threshold. If a metaslab group transitions
276 276 * from allocatable to non-allocatable or vice versa then the metaslab
277 277 * group's class is updated to reflect the transition.
278 278 */
279 279 static void
280 280 metaslab_group_alloc_update(metaslab_group_t *mg)
281 281 {
282 282 vdev_t *vd = mg->mg_vd;
283 283 metaslab_class_t *mc = mg->mg_class;
284 284 vdev_stat_t *vs = &vd->vdev_stat;
285 285 boolean_t was_allocatable;
286 286
287 287 ASSERT(vd == vd->vdev_top);
288 288
289 289 mutex_enter(&mg->mg_lock);
290 290 was_allocatable = mg->mg_allocatable;
291 291
292 292 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
293 293 (vs->vs_space + 1);
294 294
295 295 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
296 296
297 297 /*
298 298 * The mc_alloc_groups maintains a count of the number of
299 299 * groups in this metaslab class that are still above the
300 300 * zfs_mg_noalloc_threshold. This is used by the allocating
301 301 * threads to determine if they should avoid allocations to
302 302 * a given group. The allocator will avoid allocations to a group
303 303 * if that group has reached or is below the zfs_mg_noalloc_threshold
304 304 * and there are still other groups that are above the threshold.
305 305 * When a group transitions from allocatable to non-allocatable or
306 306 * vice versa we update the metaslab class to reflect that change.
307 307 * When the mc_alloc_groups value drops to 0 that means that all
308 308 * groups have reached the zfs_mg_noalloc_threshold making all groups
309 309 * eligible for allocations. This effectively means that all devices
310 310 * are balanced again.
311 311 */
312 312 if (was_allocatable && !mg->mg_allocatable)
313 313 mc->mc_alloc_groups--;
314 314 else if (!was_allocatable && mg->mg_allocatable)
315 315 mc->mc_alloc_groups++;
316 316 mutex_exit(&mg->mg_lock);
317 317 }
318 318
319 319 metaslab_group_t *
320 320 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
321 321 {
322 322 metaslab_group_t *mg;
323 323
324 324 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
325 325 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
326 326 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
327 327 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
328 328 mg->mg_vd = vd;
329 329 mg->mg_class = mc;
330 330 mg->mg_activation_count = 0;
331 331
332 332 mg->mg_taskq = taskq_create("metaslab_group_tasksq", metaslab_load_pct,
333 333 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
334 334
335 335 return (mg);
336 336 }
337 337
338 338 void
339 339 metaslab_group_destroy(metaslab_group_t *mg)
340 340 {
341 341 ASSERT(mg->mg_prev == NULL);
342 342 ASSERT(mg->mg_next == NULL);
343 343 /*
344 344 * We may have gone below zero with the activation count
345 345 * either because we never activated in the first place or
346 346 * because we're done, and possibly removing the vdev.
347 347 */
348 348 ASSERT(mg->mg_activation_count <= 0);
349 349
350 +	if (mg->mg_taskq != NULL)
351 +		taskq_destroy(mg->mg_taskq);
350 352 avl_destroy(&mg->mg_metaslab_tree);
351 353 mutex_destroy(&mg->mg_lock);
352 354 kmem_free(mg, sizeof (metaslab_group_t));
353 355 }
354 356
355 357 void
356 358 metaslab_group_activate(metaslab_group_t *mg)
357 359 {
358 360 metaslab_class_t *mc = mg->mg_class;
359 361 metaslab_group_t *mgprev, *mgnext;
360 362
361 363 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
362 364
363 365 ASSERT(mc->mc_rotor != mg);
364 366 ASSERT(mg->mg_prev == NULL);
365 367 ASSERT(mg->mg_next == NULL);
366 368 ASSERT(mg->mg_activation_count <= 0);
367 369
368 370 if (++mg->mg_activation_count <= 0)
369 371 return;
370 372
371 373 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
372 374 metaslab_group_alloc_update(mg);
373 375
374 376 if ((mgprev = mc->mc_rotor) == NULL) {
375 377 mg->mg_prev = mg;
376 378 mg->mg_next = mg;
377 379 } else {
378 380 mgnext = mgprev->mg_next;
379 381 mg->mg_prev = mgprev;
380 382 mg->mg_next = mgnext;
381 383 mgprev->mg_next = mg;
382 384 mgnext->mg_prev = mg;
383 385 }
384 386 mc->mc_rotor = mg;
385 387 }
386 388
387 389 void
388 390 metaslab_group_passivate(metaslab_group_t *mg)
389 391 {
390 392 metaslab_class_t *mc = mg->mg_class;
391 393 metaslab_group_t *mgprev, *mgnext;
392 394
393 395 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
394 396
395 397 if (--mg->mg_activation_count != 0) {
396 398 ASSERT(mc->mc_rotor != mg);
397 399 ASSERT(mg->mg_prev == NULL);
398 400 ASSERT(mg->mg_next == NULL);
399 401 ASSERT(mg->mg_activation_count < 0);
400 402 return;
401 403 }
402 404
403 405 taskq_wait(mg->mg_taskq);
404 406
405 407 mgprev = mg->mg_prev;
406 408 mgnext = mg->mg_next;
407 409
408 410 if (mg == mgnext) {
409 411 mc->mc_rotor = NULL;
410 412 } else {
411 413 mc->mc_rotor = mgnext;
412 414 mgprev->mg_next = mgnext;
413 415 mgnext->mg_prev = mgprev;
414 416 }
415 417
416 418 mg->mg_prev = NULL;
417 419 mg->mg_next = NULL;
418 420 }
419 421
420 422 static void
421 423 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
422 424 {
423 425 mutex_enter(&mg->mg_lock);
424 426 ASSERT(msp->ms_group == NULL);
425 427 msp->ms_group = mg;
426 428 msp->ms_weight = 0;
427 429 avl_add(&mg->mg_metaslab_tree, msp);
428 430 mutex_exit(&mg->mg_lock);
429 431 }
430 432
431 433 static void
432 434 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
433 435 {
434 436 mutex_enter(&mg->mg_lock);
435 437 ASSERT(msp->ms_group == mg);
436 438 avl_remove(&mg->mg_metaslab_tree, msp);
437 439 msp->ms_group = NULL;
438 440 mutex_exit(&mg->mg_lock);
439 441 }
440 442
441 443 static void
442 444 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
443 445 {
444 446 /*
445 447 * Although in principle the weight can be any value, in
446 448 * practice we do not use values in the range [1, 510].
447 449 */
448 450 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
449 451 ASSERT(MUTEX_HELD(&msp->ms_lock));
450 452
451 453 mutex_enter(&mg->mg_lock);
452 454 ASSERT(msp->ms_group == mg);
453 455 avl_remove(&mg->mg_metaslab_tree, msp);
454 456 msp->ms_weight = weight;
455 457 avl_add(&mg->mg_metaslab_tree, msp);
456 458 mutex_exit(&mg->mg_lock);
457 459 }
458 460
459 461 /*
460 462 * Determine if a given metaslab group should skip allocations. A metaslab
461 463 * group should avoid allocations if its used capacity has crossed the
462 464 * zfs_mg_noalloc_threshold and there is at least one metaslab group
463 465 * that can still handle allocations.
464 466 */
465 467 static boolean_t
466 468 metaslab_group_allocatable(metaslab_group_t *mg)
467 469 {
468 470 vdev_t *vd = mg->mg_vd;
469 471 spa_t *spa = vd->vdev_spa;
470 472 metaslab_class_t *mc = mg->mg_class;
471 473
472 474 /*
473 475 * A metaslab group is considered allocatable if its free capacity
474 476 * is greater than the set value of zfs_mg_noalloc_threshold, it's
475 477 * associated with a slog, or there are no other metaslab groups
476 478 * with free capacity greater than zfs_mg_noalloc_threshold.
477 479 */
478 480 return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
479 481 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
480 482 }
481 483
482 484 /*
483 485 * ==========================================================================
484 486 * Range tree callbacks
485 487 * ==========================================================================
486 488 */
487 489
488 490 /*
489 491 * Comparison function for the private size-ordered tree. Tree is sorted
490 492 * by size, larger sizes at the end of the tree.
491 493 */
492 494 static int
493 495 metaslab_rangesize_compare(const void *x1, const void *x2)
494 496 {
495 497 const range_seg_t *r1 = x1;
496 498 const range_seg_t *r2 = x2;
497 499 uint64_t rs_size1 = r1->rs_end - r1->rs_start;
498 500 uint64_t rs_size2 = r2->rs_end - r2->rs_start;
499 501
500 502 if (rs_size1 < rs_size2)
501 503 return (-1);
502 504 if (rs_size1 > rs_size2)
503 505 return (1);
504 506
505 507 if (r1->rs_start < r2->rs_start)
506 508 return (-1);
507 509
508 510 if (r1->rs_start > r2->rs_start)
509 511 return (1);
510 512
511 513 return (0);
512 514 }
513 515
514 516 /*
515 517 * Create any block allocator specific components. The current allocators
516 518 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
517 519 */
518 520 static void
519 521 metaslab_rt_create(range_tree_t *rt, void *arg)
520 522 {
521 523 metaslab_t *msp = arg;
522 524
523 525 ASSERT3P(rt->rt_arg, ==, msp);
524 526 ASSERT(msp->ms_tree == NULL);
525 527
526 528 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
527 529 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
528 530 }
529 531
530 532 /*
531 533 * Destroy the block allocator specific components.
532 534 */
533 535 static void
534 536 metaslab_rt_destroy(range_tree_t *rt, void *arg)
535 537 {
536 538 metaslab_t *msp = arg;
537 539
538 540 ASSERT3P(rt->rt_arg, ==, msp);
539 541 ASSERT3P(msp->ms_tree, ==, rt);
540 542 ASSERT0(avl_numnodes(&msp->ms_size_tree));
541 543
542 544 avl_destroy(&msp->ms_size_tree);
543 545 }
544 546
545 547 static void
546 548 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
547 549 {
548 550 metaslab_t *msp = arg;
549 551
550 552 ASSERT3P(rt->rt_arg, ==, msp);
551 553 ASSERT3P(msp->ms_tree, ==, rt);
552 554 VERIFY(!msp->ms_condensing);
553 555 avl_add(&msp->ms_size_tree, rs);
554 556 }
555 557
556 558 static void
557 559 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
558 560 {
559 561 metaslab_t *msp = arg;
560 562
561 563 ASSERT3P(rt->rt_arg, ==, msp);
562 564 ASSERT3P(msp->ms_tree, ==, rt);
563 565 VERIFY(!msp->ms_condensing);
564 566 avl_remove(&msp->ms_size_tree, rs);
565 567 }
566 568
567 569 static void
568 570 metaslab_rt_vacate(range_tree_t *rt, void *arg)
569 571 {
570 572 metaslab_t *msp = arg;
571 573
572 574 ASSERT3P(rt->rt_arg, ==, msp);
573 575 ASSERT3P(msp->ms_tree, ==, rt);
574 576
575 577 /*
576 578 * Normally one would walk the tree freeing nodes along the way.
577 579 * Since the nodes are shared with the range trees we can avoid
578 580 * walking all nodes and just reinitialize the avl tree. The nodes
579 581 * will be freed by the range tree, so we don't want to free them here.
580 582 */
581 583 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
582 584 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
583 585 }
584 586
585 587 static range_tree_ops_t metaslab_rt_ops = {
586 588 metaslab_rt_create,
587 589 metaslab_rt_destroy,
588 590 metaslab_rt_add,
589 591 metaslab_rt_remove,
590 592 metaslab_rt_vacate
591 593 };
592 594
593 595 /*
594 596 * ==========================================================================
595 597 * Metaslab block operations
596 598 * ==========================================================================
597 599 */
598 600
599 601 /*
600 602 * Return the maximum contiguous segment within the metaslab.
601 603 */
602 604 uint64_t
603 605 metaslab_block_maxsize(metaslab_t *msp)
604 606 {
605 607 avl_tree_t *t = &msp->ms_size_tree;
606 608 range_seg_t *rs;
607 609
608 610 if (t == NULL || (rs = avl_last(t)) == NULL)
609 611 return (0ULL);
610 612
611 613 return (rs->rs_end - rs->rs_start);
612 614 }
613 615
614 616 uint64_t
615 617 metaslab_block_alloc(metaslab_t *msp, uint64_t size)
616 618 {
617 619 uint64_t start;
618 620 range_tree_t *rt = msp->ms_tree;
619 621
620 622 VERIFY(!msp->ms_condensing);
621 623
622 624 start = msp->ms_ops->msop_alloc(msp, size);
623 625 if (start != -1ULL) {
624 626 vdev_t *vd = msp->ms_group->mg_vd;
625 627
626 628 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
627 629 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
628 630 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
629 631 range_tree_remove(rt, start, size);
630 632 }
631 633 return (start);
632 634 }
633 635
634 636 /*
635 637 * ==========================================================================
636 638 * Common allocator routines
637 639 * ==========================================================================
638 640 */
639 641
640 642 /*
641 643 * This is a helper function that can be used by the allocator to find
642 644 * a suitable block to allocate. This will search the specified AVL
643 645 * tree looking for a block that matches the specified criteria.
644 646 */
645 647 static uint64_t
646 648 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
647 649 uint64_t align)
648 650 {
649 651 range_seg_t *rs, rsearch;
650 652 avl_index_t where;
651 653
652 654 rsearch.rs_start = *cursor;
653 655 rsearch.rs_end = *cursor + size;
654 656
655 657 rs = avl_find(t, &rsearch, &where);
656 658 if (rs == NULL)
657 659 rs = avl_nearest(t, where, AVL_AFTER);
658 660
659 661 while (rs != NULL) {
660 662 uint64_t offset = P2ROUNDUP(rs->rs_start, align);
661 663
662 664 if (offset + size <= rs->rs_end) {
663 665 *cursor = offset + size;
664 666 return (offset);
665 667 }
666 668 rs = AVL_NEXT(t, rs);
667 669 }
668 670
669 671 /*
670 672 * If we know we've searched the whole map (*cursor == 0), give up.
671 673 * Otherwise, reset the cursor to the beginning and try again.
672 674 */
673 675 if (*cursor == 0)
674 676 return (-1ULL);
675 677
676 678 *cursor = 0;
677 679 return (metaslab_block_picker(t, cursor, size, align));
678 680 }
679 681
680 682 /*
681 683 * ==========================================================================
682 684 * The first-fit block allocator
683 685 * ==========================================================================
684 686 */
685 687 static uint64_t
686 688 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
687 689 {
688 690 /*
689 691 * Find the largest power of 2 block size that evenly divides the
690 692 * requested size. This is used to try to allocate blocks with similar
691 693 * alignment from the same area of the metaslab (i.e. same cursor
692 694	 * bucket) but it does not guarantee that other allocation sizes
693 695 * may exist in the same region.
694 696 */
695 697 uint64_t align = size & -size;
696 698 uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
697 699 avl_tree_t *t = &msp->ms_tree->rt_root;
698 700
699 701 return (metaslab_block_picker(t, cursor, size, align));
700 702 }
701 703
702 704 /* ARGSUSED */
703 705 static boolean_t
704 706 metaslab_ff_fragmented(metaslab_t *msp)
705 707 {
706 708 return (B_TRUE);
707 709 }
708 710
709 711 static metaslab_ops_t metaslab_ff_ops = {
710 712 metaslab_ff_alloc,
711 713 metaslab_ff_fragmented
712 714 };
713 715
714 716 /*
715 717 * ==========================================================================
716 718 * Dynamic block allocator -
717 719 * Uses the first fit allocation scheme until space get low and then
718 720 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
719 721 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
720 722 * ==========================================================================
721 723 */
722 724 static uint64_t
723 725 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
724 726 {
725 727 /*
726 728 * Find the largest power of 2 block size that evenly divides the
727 729 * requested size. This is used to try to allocate blocks with similar
728 730 * alignment from the same area of the metaslab (i.e. same cursor
729 731	 * bucket) but it does not guarantee that other allocation sizes
730 732 * may exist in the same region.
731 733 */
732 734 uint64_t align = size & -size;
733 735 uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
734 736 range_tree_t *rt = msp->ms_tree;
735 737 avl_tree_t *t = &rt->rt_root;
736 738 uint64_t max_size = metaslab_block_maxsize(msp);
737 739 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
738 740
739 741 ASSERT(MUTEX_HELD(&msp->ms_lock));
740 742 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
741 743
742 744 if (max_size < size)
743 745 return (-1ULL);
744 746
745 747 /*
746 748 * If we're running low on space switch to using the size
747 749 * sorted AVL tree (best-fit).
748 750 */
749 751 if (max_size < metaslab_df_alloc_threshold ||
750 752 free_pct < metaslab_df_free_pct) {
751 753 t = &msp->ms_size_tree;
752 754 *cursor = 0;
753 755 }
754 756
755 757 return (metaslab_block_picker(t, cursor, size, 1ULL));
756 758 }
757 759
758 760 static boolean_t
759 761 metaslab_df_fragmented(metaslab_t *msp)
760 762 {
761 763 range_tree_t *rt = msp->ms_tree;
762 764 uint64_t max_size = metaslab_block_maxsize(msp);
763 765 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
764 766
765 767 if (max_size >= metaslab_df_alloc_threshold &&
766 768 free_pct >= metaslab_df_free_pct)
767 769 return (B_FALSE);
768 770
769 771 return (B_TRUE);
770 772 }
771 773
772 774 static metaslab_ops_t metaslab_df_ops = {
773 775 metaslab_df_alloc,
774 776 metaslab_df_fragmented
775 777 };
776 778
777 779 /*
778 780 * ==========================================================================
779 781 * Cursor fit block allocator -
780 782 * Select the largest region in the metaslab, set the cursor to the beginning
781 783 * of the range and the cursor_end to the end of the range. As allocations
782 784 * are made advance the cursor. Continue allocating from the cursor until
783 785 * the range is exhausted and then find a new range.
784 786 * ==========================================================================
785 787 */
786 788 static uint64_t
787 789 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
788 790 {
789 791 range_tree_t *rt = msp->ms_tree;
790 792 avl_tree_t *t = &msp->ms_size_tree;
791 793 uint64_t *cursor = &msp->ms_lbas[0];
792 794 uint64_t *cursor_end = &msp->ms_lbas[1];
793 795 uint64_t offset = 0;
794 796
795 797 ASSERT(MUTEX_HELD(&msp->ms_lock));
796 798 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
797 799
798 800 ASSERT3U(*cursor_end, >=, *cursor);
799 801
800 802 if ((*cursor + size) > *cursor_end) {
801 803 range_seg_t *rs;
802 804
803 805 rs = avl_last(&msp->ms_size_tree);
804 806 if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
805 807 return (-1ULL);
806 808
807 809 *cursor = rs->rs_start;
808 810 *cursor_end = rs->rs_end;
809 811 }
810 812
811 813 offset = *cursor;
812 814 *cursor += size;
813 815
814 816 return (offset);
815 817 }
816 818
817 819 static boolean_t
818 820 metaslab_cf_fragmented(metaslab_t *msp)
819 821 {
820 822 return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
821 823 }
822 824
823 825 static metaslab_ops_t metaslab_cf_ops = {
824 826 metaslab_cf_alloc,
825 827 metaslab_cf_fragmented
826 828 };
827 829
828 830 /*
829 831 * ==========================================================================
830 832 * New dynamic fit allocator -
831 833 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
832 834 * contiguous blocks. If no region is found then just use the largest segment
833 835 * that remains.
834 836 * ==========================================================================
835 837 */
836 838
837 839 /*
838 840 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
839 841 * to request from the allocator.
840 842 */
841 843 uint64_t metaslab_ndf_clump_shift = 4;
842 844
843 845 static uint64_t
844 846 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
845 847 {
846 848 avl_tree_t *t = &msp->ms_tree->rt_root;
847 849 avl_index_t where;
848 850 range_seg_t *rs, rsearch;
849 851 uint64_t hbit = highbit(size);
850 852 uint64_t *cursor = &msp->ms_lbas[hbit - 1];
851 853 uint64_t max_size = metaslab_block_maxsize(msp);
852 854
853 855 ASSERT(MUTEX_HELD(&msp->ms_lock));
854 856 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
855 857
856 858 if (max_size < size)
857 859 return (-1ULL);
858 860
859 861 rsearch.rs_start = *cursor;
860 862 rsearch.rs_end = *cursor + size;
861 863
862 864 rs = avl_find(t, &rsearch, &where);
863 865 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
864 866 t = &msp->ms_size_tree;
865 867
866 868 rsearch.rs_start = 0;
867 869 rsearch.rs_end = MIN(max_size,
868 870 1ULL << (hbit + metaslab_ndf_clump_shift));
869 871 rs = avl_find(t, &rsearch, &where);
870 872 if (rs == NULL)
871 873 rs = avl_nearest(t, where, AVL_AFTER);
872 874 ASSERT(rs != NULL);
873 875 }
874 876
875 877 if ((rs->rs_end - rs->rs_start) >= size) {
876 878 *cursor = rs->rs_start + size;
877 879 return (rs->rs_start);
878 880 }
879 881 return (-1ULL);
880 882 }
881 883
882 884 static boolean_t
883 885 metaslab_ndf_fragmented(metaslab_t *msp)
884 886 {
885 887 return (metaslab_block_maxsize(msp) <=
886 888 (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
887 889 }
888 890
889 891 static metaslab_ops_t metaslab_ndf_ops = {
890 892 metaslab_ndf_alloc,
891 893 metaslab_ndf_fragmented
892 894 };
893 895
894 896 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
895 897
896 898 /*
897 899 * ==========================================================================
898 900 * Metaslabs
899 901 * ==========================================================================
900 902 */
901 903
902 904 /*
903 905 * Wait for any in-progress metaslab loads to complete.
904 906 */
905 907 void
906 908 metaslab_load_wait(metaslab_t *msp)
907 909 {
908 910 ASSERT(MUTEX_HELD(&msp->ms_lock));
909 911
910 912 while (msp->ms_loading) {
911 913 ASSERT(!msp->ms_loaded);
912 914 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
913 915 }
914 916 }
915 917
916 918 int
917 919 metaslab_load(metaslab_t *msp)
918 920 {
919 921 int error = 0;
920 922
921 923 ASSERT(MUTEX_HELD(&msp->ms_lock));
922 924 ASSERT(!msp->ms_loaded);
923 925 ASSERT(!msp->ms_loading);
924 926
925 927 msp->ms_loading = B_TRUE;
926 928
927 929 /*
928 930 * If the space map has not been allocated yet, then treat
929 931 * all the space in the metaslab as free and add it to the
930 932 * ms_tree.
931 933 */
932 934 if (msp->ms_sm != NULL)
933 935 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
934 936 else
935 937 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
936 938
937 939 msp->ms_loaded = (error == 0);
938 940 msp->ms_loading = B_FALSE;
939 941
940 942 if (msp->ms_loaded) {
941 943 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
942 944 range_tree_walk(msp->ms_defertree[t],
943 945 range_tree_remove, msp->ms_tree);
944 946 }
945 947 }
946 948 cv_broadcast(&msp->ms_load_cv);
947 949 return (error);
948 950 }
949 951
950 952 void
951 953 metaslab_unload(metaslab_t *msp)
952 954 {
953 955 ASSERT(MUTEX_HELD(&msp->ms_lock));
954 956 range_tree_vacate(msp->ms_tree, NULL, NULL);
955 957 msp->ms_loaded = B_FALSE;
956 958 msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
957 959 }
958 960
959 961 metaslab_t *
960 962 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
961 963 {
962 964 vdev_t *vd = mg->mg_vd;
963 965 objset_t *mos = vd->vdev_spa->spa_meta_objset;
964 966 metaslab_t *msp;
965 967
966 968 msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
967 969 mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
968 970 cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
969 971 msp->ms_id = id;
970 972 msp->ms_start = id << vd->vdev_ms_shift;
971 973 msp->ms_size = 1ULL << vd->vdev_ms_shift;
972 974
973 975 /*
974 976 * We only open space map objects that already exist. All others
975 977 * will be opened when we finally allocate an object for it.
976 978 */
977 979 if (object != 0) {
978 980 VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
979 981 msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
980 982 ASSERT(msp->ms_sm != NULL);
981 983 }
982 984
983 985 /*
984 986 * We create the main range tree here, but we don't create the
985 987 * alloctree and freetree until metaslab_sync_done(). This serves
986 988 * two purposes: it allows metaslab_sync_done() to detect the
987 989 * addition of new space; and for debugging, it ensures that we'd
988 990 * data fault on any attempt to use this metaslab before it's ready.
989 991 */
990 992 msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
991 993 metaslab_group_add(mg, msp);
992 994
993 995 msp->ms_ops = mg->mg_class->mc_ops;
994 996
995 997 /*
996 998 * If we're opening an existing pool (txg == 0) or creating
997 999 * a new one (txg == TXG_INITIAL), all space is available now.
998 1000 * If we're adding space to an existing pool, the new space
999 1001 * does not become available until after this txg has synced.
1000 1002 */
1001 1003 if (txg <= TXG_INITIAL)
1002 1004 metaslab_sync_done(msp, 0);
1003 1005
1004 1006 /*
1005 1007 * If metaslab_debug_load is set and we're initializing a metaslab
1006 1008	 * that has an allocated space_map object then load its space
1007 1009	 * map so that we can verify frees.
1008 1010 */
1009 1011 if (metaslab_debug_load && msp->ms_sm != NULL) {
1010 1012 mutex_enter(&msp->ms_lock);
1011 1013 VERIFY0(metaslab_load(msp));
1012 1014 mutex_exit(&msp->ms_lock);
1013 1015 }
1014 1016
1015 1017 if (txg != 0) {
1016 1018 vdev_dirty(vd, 0, NULL, txg);
1017 1019 vdev_dirty(vd, VDD_METASLAB, msp, txg);
1018 1020 }
1019 1021
1020 1022 return (msp);
1021 1023 }
1022 1024
1023 1025 void
1024 1026 metaslab_fini(metaslab_t *msp)
1025 1027 {
1026 1028 metaslab_group_t *mg = msp->ms_group;
1027 1029
1028 1030 metaslab_group_remove(mg, msp);
1029 1031
1030 1032 mutex_enter(&msp->ms_lock);
1031 1033
1032 1034 VERIFY(msp->ms_group == NULL);
1033 1035 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1034 1036 0, -msp->ms_size);
1035 1037 space_map_close(msp->ms_sm);
1036 1038
1037 1039 metaslab_unload(msp);
1038 1040 range_tree_destroy(msp->ms_tree);
1039 1041
1040 1042 for (int t = 0; t < TXG_SIZE; t++) {
1041 1043 range_tree_destroy(msp->ms_alloctree[t]);
1042 1044 range_tree_destroy(msp->ms_freetree[t]);
1043 1045 }
1044 1046
1045 1047 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1046 1048 range_tree_destroy(msp->ms_defertree[t]);
1047 1049 }
1048 1050
1049 1051 ASSERT0(msp->ms_deferspace);
1050 1052
1051 1053 mutex_exit(&msp->ms_lock);
1052 1054 cv_destroy(&msp->ms_load_cv);
1053 1055 mutex_destroy(&msp->ms_lock);
1054 1056
1055 1057 kmem_free(msp, sizeof (metaslab_t));
1056 1058 }
1057 1059
1058 1060 /*
1059 1061 * Apply a weighting factor based on the histogram information for this
1060 1062 * metaslab. The current weighting factor is somewhat arbitrary and requires
1061 1063 * additional investigation. The implementation provides a measure of
1062 1064 * "weighted" free space and gives a higher weighting for larger contiguous
1063 1065 * regions. The weighting factor is determined by counting the number of
1064 1066 * sm_shift sectors that exist in each region represented by the histogram.
1065 1067 * That value is then multiplied by the power of 2 exponent and the sm_shift
1066 1068 * value.
1067 1069 *
1068 1070 * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
1069 1071 * metaslab has an sm_shift value of 9 (512B):
1070 1072 *
1071 1073 * 1) calculate the number of sm_shift sectors in the region:
1072 1074 * 2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1073 1075 * 2) multiply by the power of 2 exponent and the sm_shift value:
1074 1076 * 16384 * 21 * 9 = 3096576
1075 1077 * This value will be added to the weighting of the metaslab.
1076 1078 */
1077 1079 static uint64_t
1078 1080 metaslab_weight_factor(metaslab_t *msp)
1079 1081 {
1080 1082 uint64_t factor = 0;
1081 1083 uint64_t sectors;
1082 1084 int i;
1083 1085
1084 1086 /*
1085 1087 * A null space map means that the entire metaslab is free,
1086 1088 * calculate a weight factor that spans the entire size of the
1087 1089 * metaslab.
1088 1090 */
1089 1091 if (msp->ms_sm == NULL) {
1090 1092 vdev_t *vd = msp->ms_group->mg_vd;
1091 1093
1092 1094 i = highbit(msp->ms_size) - 1;
1093 1095 sectors = msp->ms_size >> vd->vdev_ashift;
1094 1096 return (sectors * i * vd->vdev_ashift);
1095 1097 }
1096 1098
1097 1099 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1098 1100 return (0);
1099 1101
1100 1102 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1101 1103 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1102 1104 continue;
1103 1105
1104 1106 /*
1105 1107 * Determine the number of sm_shift sectors in the region
1106 1108 * indicated by the histogram. For example, given an
1107 1109 * sm_shift value of 9 (512 bytes) and i = 4 then we know
1108 1110 * that we're looking at an 8K region in the histogram
1109 1111 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1110 1112 * number of sm_shift sectors (512 bytes in this example),
1111 1113 * we would take 8192 / 512 = 16. Since the histogram
1112 1114 * is offset by sm_shift we can simply use the value of
1113 1115	 * i to calculate this (i.e. 2^i = 16 where i = 4).
1114 1116 */
1115 1117 sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
1116 1118 factor += (i + msp->ms_sm->sm_shift) * sectors;
1117 1119 }
1118 1120 return (factor * msp->ms_sm->sm_shift);
1119 1121 }
1120 1122
1121 1123 static uint64_t
1122 1124 metaslab_weight(metaslab_t *msp)
1123 1125 {
1124 1126 metaslab_group_t *mg = msp->ms_group;
1125 1127 vdev_t *vd = mg->mg_vd;
1126 1128 uint64_t weight, space;
1127 1129
1128 1130 ASSERT(MUTEX_HELD(&msp->ms_lock));
1129 1131
1130 1132 /*
1131 1133 * This vdev is in the process of being removed so there is nothing
1132 1134 * for us to do here.
1133 1135 */
1134 1136 if (vd->vdev_removing) {
1135 1137 ASSERT0(space_map_allocated(msp->ms_sm));
1136 1138 ASSERT0(vd->vdev_ms_shift);
1137 1139 return (0);
1138 1140 }
1139 1141
1140 1142 /*
1141 1143 * The baseline weight is the metaslab's free space.
1142 1144 */
1143 1145 space = msp->ms_size - space_map_allocated(msp->ms_sm);
1144 1146 weight = space;
1145 1147
1146 1148 /*
1147 1149 * Modern disks have uniform bit density and constant angular velocity.
1148 1150 * Therefore, the outer recording zones are faster (higher bandwidth)
1149 1151 * than the inner zones by the ratio of outer to inner track diameter,
1150 1152 * which is typically around 2:1. We account for this by assigning
1151 1153 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1152 1154 * In effect, this means that we'll select the metaslab with the most
1153 1155 * free bandwidth rather than simply the one with the most free space.
1154 1156 */
1155 1157 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1156 1158 ASSERT(weight >= space && weight <= 2 * space);
1157 1159
1158 1160 msp->ms_factor = metaslab_weight_factor(msp);
1159 1161 if (metaslab_weight_factor_enable)
1160 1162 weight += msp->ms_factor;
1161 1163
1162 1164 if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
1163 1165 /*
1164 1166 * If this metaslab is one we're actively using, adjust its
1165 1167 * weight to make it preferable to any inactive metaslab so
1166 1168 * we'll polish it off.
1167 1169 */
1168 1170 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1169 1171 }
1170 1172
1171 1173 return (weight);
1172 1174 }
1173 1175
1174 1176 static int
1175 1177 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1176 1178 {
1177 1179 ASSERT(MUTEX_HELD(&msp->ms_lock));
1178 1180
1179 1181 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1180 1182 metaslab_load_wait(msp);
1181 1183 if (!msp->ms_loaded) {
1182 1184 int error = metaslab_load(msp);
1183 1185 if (error) {
1184 1186 metaslab_group_sort(msp->ms_group, msp, 0);
1185 1187 return (error);
1186 1188 }
1187 1189 }
1188 1190
1189 1191 metaslab_group_sort(msp->ms_group, msp,
1190 1192 msp->ms_weight | activation_weight);
1191 1193 }
1192 1194 ASSERT(msp->ms_loaded);
1193 1195 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1194 1196
1195 1197 return (0);
1196 1198 }
1197 1199
1198 1200 static void
1199 1201 metaslab_passivate(metaslab_t *msp, uint64_t size)
1200 1202 {
1201 1203 /*
1202 1204 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1203 1205 * this metaslab again. In that case, it had better be empty,
1204 1206 * or we would be leaving space on the table.
1205 1207 */
1206 1208 ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1207 1209 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1208 1210 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1209 1211 }
1210 1212
1211 1213 static void
1212 1214 metaslab_preload(void *arg)
1213 1215 {
1214 1216 metaslab_t *msp = arg;
1215 1217 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1216 1218
1217 1219 mutex_enter(&msp->ms_lock);
1218 1220 metaslab_load_wait(msp);
1219 1221 if (!msp->ms_loaded)
1220 1222 (void) metaslab_load(msp);
1221 1223
1222 1224 /*
1223 1225 * Set the ms_access_txg value so that we don't unload it right away.
1224 1226 */
1225 1227 msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1226 1228 mutex_exit(&msp->ms_lock);
1227 1229 }
1228 1230
1229 1231 static void
1230 1232 metaslab_group_preload(metaslab_group_t *mg)
1231 1233 {
1232 1234 spa_t *spa = mg->mg_vd->vdev_spa;
1233 1235 metaslab_t *msp;
1234 1236 avl_tree_t *t = &mg->mg_metaslab_tree;
1235 1237 int m = 0;
1236 1238
1237 1239 if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1238 1240 taskq_wait(mg->mg_taskq);
1239 1241 return;
1240 1242 }
1241 1243 mutex_enter(&mg->mg_lock);
1242 1244
1243 1245 /*
1244 1246 * Prefetch the next potential metaslabs
1245 1247 */
1246 1248 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
1247 1249
1248 1250 /* If we have reached our preload limit then we're done */
1249 1251 if (++m > metaslab_preload_limit)
1250 1252 break;
1251 1253
1252 1254 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1253 1255 msp, TQ_SLEEP) != NULL);
1254 1256 }
1255 1257 mutex_exit(&mg->mg_lock);
1256 1258 }
1257 1259
1258 1260 /*
1259 1261 * Determine if the space map's on-disk footprint is past our tolerance
1260 1262 * for inefficiency. We would like to use the following criteria to make
1261 1263 * our decision:
1262 1264 *
1263 1265 * 1. The size of the space map object should not dramatically increase as a
1264 1266 * result of writing out the free space range tree.
1265 1267 *
1266 1268 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
1267 1269 * times the size than the free space range tree representation
1268 1270	 * times the size of the free space range tree representation
1269 1271	 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
1270 1272 * Checking the first condition is tricky since we don't want to walk
1271 1273 * the entire AVL tree calculating the estimated on-disk size. Instead we
1272 1274 * use the size-ordered range tree in the metaslab and calculate the
1273 1275 * size required to write out the largest segment in our free tree. If the
1274 1276 * size required to represent that segment on disk is larger than the space
1275 1277 * map object then we avoid condensing this map.
1276 1278 *
1277 1279 * To determine the second criterion we use a best-case estimate and assume
1278 1280 * each segment can be represented on-disk as a single 64-bit entry. We refer
1279 1281 * to this best-case estimate as the space map's minimal form.
1280 1282 */
1281 1283 static boolean_t
1282 1284 metaslab_should_condense(metaslab_t *msp)
1283 1285 {
1284 1286 space_map_t *sm = msp->ms_sm;
1285 1287 range_seg_t *rs;
1286 1288 uint64_t size, entries, segsz;
1287 1289
1288 1290 ASSERT(MUTEX_HELD(&msp->ms_lock));
1289 1291 ASSERT(msp->ms_loaded);
1290 1292
1291 1293 /*
1292 1294 * Use the ms_size_tree range tree, which is ordered by size, to
1293 1295 * obtain the largest segment in the free tree. If the tree is empty
1294 1296 * then we should condense the map.
1295 1297 */
1296 1298 rs = avl_last(&msp->ms_size_tree);
1297 1299 if (rs == NULL)
1298 1300 return (B_TRUE);
1299 1301
1300 1302 /*
1301 1303 * Calculate the number of 64-bit entries this segment would
1302 1304 * require when written to disk. If this single segment would be
1303 1305 * larger on-disk than the entire current on-disk structure, then
1304 1306 * clearly condensing will increase the on-disk structure size.
1305 1307 */
1306 1308 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1307 1309 entries = size / (MIN(size, SM_RUN_MAX));
1308 1310 segsz = entries * sizeof (uint64_t);
1309 1311
1310 1312 return (segsz <= space_map_length(msp->ms_sm) &&
1311 1313 space_map_length(msp->ms_sm) >= (zfs_condense_pct *
1312 1314 sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
1313 1315 }
1314 1316
1315 1317 /*
1316 1318 * Condense the on-disk space map representation to its minimized form.
1317 1319 * The minimized form consists of a small number of allocations followed by
1318 1320 * the entries of the free range tree.
1319 1321 */
1320 1322 static void
1321 1323 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1322 1324 {
1323 1325 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1324 1326 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1325 1327 range_tree_t *condense_tree;
1326 1328 space_map_t *sm = msp->ms_sm;
1327 1329
1328 1330 ASSERT(MUTEX_HELD(&msp->ms_lock));
1329 1331 ASSERT3U(spa_sync_pass(spa), ==, 1);
1330 1332 ASSERT(msp->ms_loaded);
1331 1333
1332 1334 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1333 1335 "smp size %llu, segments %lu", txg, msp->ms_id, msp,
1334 1336 space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));
1335 1337
1336 1338 /*
1337 1339	 * Create a range tree that is 100% allocated. We remove segments
1338 1340 * that have been freed in this txg, any deferred frees that exist,
1339 1341 * and any allocation in the future. Removing segments should be
1340 1342 * a relatively inexpensive operation since we expect these trees to
1341 1343 * have a small number of nodes.
1342 1344 */
1343 1345 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1344 1346 range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1345 1347
1346 1348 /*
1347 1349 * Remove what's been freed in this txg from the condense_tree.
1348 1350 * Since we're in sync_pass 1, we know that all the frees from
1349 1351 * this txg are in the freetree.
1350 1352 */
1351 1353 range_tree_walk(freetree, range_tree_remove, condense_tree);
1352 1354
1353 1355 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1354 1356 range_tree_walk(msp->ms_defertree[t],
1355 1357 range_tree_remove, condense_tree);
1356 1358 }
1357 1359
1358 1360 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1359 1361 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1360 1362 range_tree_remove, condense_tree);
1361 1363 }
1362 1364
1363 1365 /*
1364 1366 * We're about to drop the metaslab's lock thus allowing
1365 1367	 * other consumers to change its content. Set the
1366 1368 * metaslab's ms_condensing flag to ensure that
1367 1369 * allocations on this metaslab do not occur while we're
1368 1370 * in the middle of committing it to disk. This is only critical
1369 1371 * for the ms_tree as all other range trees use per txg
1370 1372 * views of their content.
1371 1373 */
1372 1374 msp->ms_condensing = B_TRUE;
1373 1375
1374 1376 mutex_exit(&msp->ms_lock);
1375 1377 space_map_truncate(sm, tx);
1376 1378 mutex_enter(&msp->ms_lock);
1377 1379
1378 1380 /*
1379 1381 * While we would ideally like to create a space_map representation
1380 1382 * that consists only of allocation records, doing so can be
1381 1383 * prohibitively expensive because the in-core free tree can be
1382 1384 * large, and therefore computationally expensive to subtract
1383 1385 * from the condense_tree. Instead we sync out two trees, a cheap
1384 1386 * allocation only tree followed by the in-core free tree. While not
1385 1387 * optimal, this is typically close to optimal, and much cheaper to
1386 1388 * compute.
1387 1389 */
1388 1390 space_map_write(sm, condense_tree, SM_ALLOC, tx);
1389 1391 range_tree_vacate(condense_tree, NULL, NULL);
1390 1392 range_tree_destroy(condense_tree);
1391 1393
1392 1394 space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1393 1395 msp->ms_condensing = B_FALSE;
1394 1396 }
1395 1397
1396 1398 /*
1397 1399 * Write a metaslab to disk in the context of the specified transaction group.
1398 1400 */
1399 1401 void
1400 1402 metaslab_sync(metaslab_t *msp, uint64_t txg)
1401 1403 {
1402 1404 metaslab_group_t *mg = msp->ms_group;
1403 1405 vdev_t *vd = mg->mg_vd;
1404 1406 spa_t *spa = vd->vdev_spa;
1405 1407 objset_t *mos = spa_meta_objset(spa);
1406 1408 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
1407 1409 range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
1408 1410 range_tree_t **freed_tree =
1409 1411 &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1410 1412 dmu_tx_t *tx;
1411 1413 uint64_t object = space_map_object(msp->ms_sm);
1412 1414
1413 1415 ASSERT(!vd->vdev_ishole);
1414 1416
1415 1417 /*
1416 1418 * This metaslab has just been added so there's no work to do now.
1417 1419 */
1418 1420 if (*freetree == NULL) {
1419 1421 ASSERT3P(alloctree, ==, NULL);
1420 1422 return;
1421 1423 }
1422 1424
1423 1425 ASSERT3P(alloctree, !=, NULL);
1424 1426 ASSERT3P(*freetree, !=, NULL);
1425 1427 ASSERT3P(*freed_tree, !=, NULL);
1426 1428
1427 1429 if (range_tree_space(alloctree) == 0 &&
1428 1430 range_tree_space(*freetree) == 0)
1429 1431 return;
1430 1432
1431 1433 /*
1432 1434 * The only state that can actually be changing concurrently with
1433 1435 * metaslab_sync() is the metaslab's ms_tree. No other thread can
1434 1436 * be modifying this txg's alloctree, freetree, freed_tree, or
1435 1437	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
1436 1438 * space_map ASSERTs. We drop it whenever we call into the DMU,
1437 1439 * because the DMU can call down to us (e.g. via zio_free()) at
1438 1440 * any time.
1439 1441 */
1440 1442
1441 1443 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1442 1444
1443 1445 if (msp->ms_sm == NULL) {
1444 1446 uint64_t new_object;
1445 1447
1446 1448 new_object = space_map_alloc(mos, tx);
1447 1449 VERIFY3U(new_object, !=, 0);
1448 1450
1449 1451 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
1450 1452 msp->ms_start, msp->ms_size, vd->vdev_ashift,
1451 1453 &msp->ms_lock));
1452 1454 ASSERT(msp->ms_sm != NULL);
1453 1455 }
1454 1456
1455 1457 mutex_enter(&msp->ms_lock);
1456 1458
1457 1459 if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1458 1460 metaslab_should_condense(msp)) {
1459 1461 metaslab_condense(msp, txg, tx);
1460 1462 } else {
1461 1463 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1462 1464 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1463 1465 }
1464 1466
1465 1467 range_tree_vacate(alloctree, NULL, NULL);
1466 1468
1467 1469 if (msp->ms_loaded) {
1468 1470 /*
1469 1471	 * When the space map is loaded, we have an accurate
1470 1472 * histogram in the range tree. This gives us an opportunity
1471 1473 * to bring the space map's histogram up-to-date so we clear
1472 1474 * it first before updating it.
1473 1475 */
1474 1476 space_map_histogram_clear(msp->ms_sm);
1475 1477 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1476 1478 } else {
1477 1479 /*
1478 1480 * Since the space map is not loaded we simply update the
1479 1481	 * existing histogram with what was freed in this txg. This
1480 1482 * means that the on-disk histogram may not have an accurate
1481 1483 * view of the free space but it's close enough to allow
1482 1484 * us to make allocation decisions.
1483 1485 */
1484 1486 space_map_histogram_add(msp->ms_sm, *freetree, tx);
1485 1487 }
1486 1488
1487 1489 /*
1488 1490 * For sync pass 1, we avoid traversing this txg's free range tree
1489 1491 * and instead will just swap the pointers for freetree and
1490 1492 * freed_tree. We can safely do this since the freed_tree is
1491 1493 * guaranteed to be empty on the initial pass.
1492 1494 */
1493 1495 if (spa_sync_pass(spa) == 1) {
1494 1496 range_tree_swap(freetree, freed_tree);
1495 1497 } else {
1496 1498 range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1497 1499 }
1498 1500
1499 1501 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1500 1502 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1501 1503
1502 1504 mutex_exit(&msp->ms_lock);
1503 1505
1504 1506 if (object != space_map_object(msp->ms_sm)) {
1505 1507 object = space_map_object(msp->ms_sm);
1506 1508 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1507 1509 msp->ms_id, sizeof (uint64_t), &object, tx);
1508 1510 }
1509 1511 dmu_tx_commit(tx);
1510 1512 }
1511 1513
1512 1514 /*
1513 1515 * Called after a transaction group has completely synced to mark
1514 1516 * all of the metaslab's free space as usable.
1515 1517 */
1516 1518 void
1517 1519 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1518 1520 {
1519 1521 metaslab_group_t *mg = msp->ms_group;
1520 1522 vdev_t *vd = mg->mg_vd;
1521 1523 range_tree_t **freed_tree;
1522 1524 range_tree_t **defer_tree;
1523 1525 int64_t alloc_delta, defer_delta;
1524 1526
1525 1527 ASSERT(!vd->vdev_ishole);
1526 1528
1527 1529 mutex_enter(&msp->ms_lock);
1528 1530
1529 1531 /*
1530 1532 * If this metaslab is just becoming available, initialize its
1531 1533 * alloctrees, freetrees, and defertree and add its capacity to
1532 1534 * the vdev.
1533 1535 */
1534 1536 if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
1535 1537 for (int t = 0; t < TXG_SIZE; t++) {
1536 1538 ASSERT(msp->ms_alloctree[t] == NULL);
1537 1539 ASSERT(msp->ms_freetree[t] == NULL);
1538 1540
1539 1541 msp->ms_alloctree[t] = range_tree_create(NULL, msp,
1540 1542 &msp->ms_lock);
1541 1543 msp->ms_freetree[t] = range_tree_create(NULL, msp,
1542 1544 &msp->ms_lock);
1543 1545 }
1544 1546
1545 1547 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1546 1548 ASSERT(msp->ms_defertree[t] == NULL);
1547 1549
1548 1550 msp->ms_defertree[t] = range_tree_create(NULL, msp,
1549 1551 &msp->ms_lock);
1550 1552 }
1551 1553
1552 1554 vdev_space_update(vd, 0, 0, msp->ms_size);
1553 1555 }
1554 1556
1555 1557 freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1556 1558 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
1557 1559
1558 1560 alloc_delta = space_map_alloc_delta(msp->ms_sm);
1559 1561 defer_delta = range_tree_space(*freed_tree) -
1560 1562 range_tree_space(*defer_tree);
1561 1563
1562 1564 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1563 1565
1564 1566 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1565 1567 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1566 1568
1567 1569 /*
1568 1570 * If there's a metaslab_load() in progress, wait for it to complete
1569 1571 * so that we have a consistent view of the in-core space map.
1570 1572 */
1571 1573 metaslab_load_wait(msp);
1572 1574
1573 1575 /*
1574 1576 * Move the frees from the defer_tree back to the free
1575 1577 * range tree (if it's loaded). Swap the freed_tree and the
1576 1578 * defer_tree -- this is safe to do because we've just emptied out
1577 1579 * the defer_tree.
1578 1580 */
1579 1581 range_tree_vacate(*defer_tree,
1580 1582 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
1581 1583 range_tree_swap(freed_tree, defer_tree);
1582 1584
1583 1585 space_map_update(msp->ms_sm);
1584 1586
1585 1587 msp->ms_deferspace += defer_delta;
1586 1588 ASSERT3S(msp->ms_deferspace, >=, 0);
1587 1589 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
1588 1590 if (msp->ms_deferspace != 0) {
1589 1591 /*
1590 1592 * Keep syncing this metaslab until all deferred frees
1591 1593 * are back in circulation.
1592 1594 */
1593 1595 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1594 1596 }
1595 1597
1596 1598 if (msp->ms_loaded && msp->ms_access_txg < txg) {
1597 1599 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1598 1600 VERIFY0(range_tree_space(
1599 1601 msp->ms_alloctree[(txg + t) & TXG_MASK]));
1600 1602 }
1601 1603
1602 1604 if (!metaslab_debug_unload)
1603 1605 metaslab_unload(msp);
1604 1606 }
1605 1607
1606 1608 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1607 1609 mutex_exit(&msp->ms_lock);
1608 1610
1609 1611 }
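
The deferred-free cycle driven by the txg % TXG_DEFER_SIZE index above can be sketched with counters in place of range trees (TXG_DEFER_SIZE assumed to be 2 as in txg.h; the per-txg free amounts are invented):

#include <stdio.h>
#include <stdint.h>

#define TXG_DEFER_SIZE	2	/* assumed value from txg.h */

int
main(void)
{
	uint64_t defertree[TXG_DEFER_SIZE] = { 0 };
	uint64_t allocatable = 0;

	for (uint64_t txg = 100; txg < 105; txg++) {
		uint64_t freed_this_txg = 1000;		/* invented workload */
		uint64_t *slot = &defertree[txg % TXG_DEFER_SIZE];

		/* range_tree_vacate() analogue: drain the old slot */
		allocatable += *slot;
		/* range_tree_swap() analogue: park this txg's frees */
		*slot = freed_this_txg;

		printf("txg %llu: allocatable %llu, newly deferred %llu\n",
		    (unsigned long long)txg,
		    (unsigned long long)allocatable,
		    (unsigned long long)*slot);
	}
	return (0);
}

The first TXG_DEFER_SIZE iterations return nothing to the allocatable pool; that gap is the defer window that keeps just-freed space from being reallocated until it is safely out of the last few txgs.
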
1610 1612
1611 1613 void
1612 1614 metaslab_sync_reassess(metaslab_group_t *mg)
1613 1615 {
1614 1616 int64_t failures = mg->mg_alloc_failures;
1615 1617
1616 1618 metaslab_group_alloc_update(mg);
1617 1619 atomic_add_64(&mg->mg_alloc_failures, -failures);
1618 1620
1619 1621 /*
1620 1622 	 * Preload the next potential metaslabs.

1621 1623 */
1622 1624 metaslab_group_preload(mg);
1623 1625 }
1624 1626
1625 1627 static uint64_t
1626 1628 metaslab_distance(metaslab_t *msp, dva_t *dva)
1627 1629 {
1628 1630 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1629 1631 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1630 1632 uint64_t start = msp->ms_id;
1631 1633
1632 1634 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1633 1635 return (1ULL << 63);
1634 1636
1635 1637 if (offset < start)
1636 1638 return ((start - offset) << ms_shift);
1637 1639 if (offset > start)
1638 1640 return ((offset - start) << ms_shift);
1639 1641 return (0);
1640 1642 }
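
metaslab_distance() compares the DVA and the candidate metaslab in units of whole metaslabs (offset >> ms_shift vs. ms_id) and converts the gap back into bytes. A stand-alone restatement of that arithmetic with invented values (a 2^34-byte metaslab size is assumed for the example):

#include <stdio.h>
#include <stdint.h>

static uint64_t
toy_distance(uint64_t dva_vdev, uint64_t dva_offset,
    uint64_t ms_vdev, uint64_t ms_id, uint64_t ms_shift)
{
	uint64_t offset = dva_offset >> ms_shift;	/* DVA's metaslab index */

	if (ms_vdev != dva_vdev)
		return (1ULL << 63);	/* other vdev: effectively infinite */
	if (offset < ms_id)
		return ((ms_id - offset) << ms_shift);
	if (offset > ms_id)
		return ((offset - ms_id) << ms_shift);
	return (0);
}

int
main(void)
{
	uint64_t ms_shift = 34;		/* 16 GB metaslabs (2^34 bytes) */
	/* the DVA lives in metaslab 5, the candidate metaslab is 9 */
	uint64_t d = toy_distance(0, 5ULL << ms_shift, 0, 9, ms_shift);

	printf("distance = %llu bytes (4 metaslabs apart)\n",
	    (unsigned long long)d);
	return (0);
}
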
1641 1643
1642 1644 static uint64_t
1643 1645 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1644 1646 uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
1645 1647 {
1646 1648 spa_t *spa = mg->mg_vd->vdev_spa;
1647 1649 metaslab_t *msp = NULL;
1648 1650 uint64_t offset = -1ULL;
1649 1651 avl_tree_t *t = &mg->mg_metaslab_tree;
1650 1652 uint64_t activation_weight;
1651 1653 uint64_t target_distance;
1652 1654 int i;
1653 1655
1654 1656 activation_weight = METASLAB_WEIGHT_PRIMARY;
1655 1657 for (i = 0; i < d; i++) {
1656 1658 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1657 1659 activation_weight = METASLAB_WEIGHT_SECONDARY;
1658 1660 break;
1659 1661 }
1660 1662 }
1661 1663
1662 1664 for (;;) {
1663 1665 boolean_t was_active;
1664 1666
1665 1667 mutex_enter(&mg->mg_lock);
1666 1668 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1667 1669 if (msp->ms_weight < asize) {
1668 1670 spa_dbgmsg(spa, "%s: failed to meet weight "
1669 1671 "requirement: vdev %llu, txg %llu, mg %p, "
1670 1672 "msp %p, psize %llu, asize %llu, "
1671 1673 "failures %llu, weight %llu",
1672 1674 spa_name(spa), mg->mg_vd->vdev_id, txg,
1673 1675 mg, msp, psize, asize,
1674 1676 mg->mg_alloc_failures, msp->ms_weight);
1675 1677 mutex_exit(&mg->mg_lock);
1676 1678 return (-1ULL);
1677 1679 }
1678 1680
1679 1681 /*
1680 1682 * If the selected metaslab is condensing, skip it.
1681 1683 */
1682 1684 if (msp->ms_condensing)
1683 1685 continue;
1684 1686
1685 1687 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1686 1688 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1687 1689 break;
1688 1690
1689 1691 target_distance = min_distance +
1690 1692 (space_map_allocated(msp->ms_sm) != 0 ? 0 :
1691 1693 min_distance >> 1);
1692 1694
1693 1695 for (i = 0; i < d; i++)
1694 1696 if (metaslab_distance(msp, &dva[i]) <
1695 1697 target_distance)
1696 1698 break;
1697 1699 if (i == d)
1698 1700 break;
1699 1701 }
1700 1702 mutex_exit(&mg->mg_lock);
1701 1703 if (msp == NULL)
1702 1704 return (-1ULL);
1703 1705
1704 1706 mutex_enter(&msp->ms_lock);
1705 1707
1706 1708 /*
1707 1709 * If we've already reached the allowable number of failed
1708 1710 * allocation attempts on this metaslab group then we
1709 1711 * consider skipping it. We skip it only if we're allowed
1710 1712 * to "fast" gang, the physical size is larger than
1711 1713 * a gang block, and we're attempting to allocate from
1712 1714 * the primary metaslab.
1713 1715 */
1714 1716 if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1715 1717 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1716 1718 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1717 1719 spa_dbgmsg(spa, "%s: skipping metaslab group: "
1718 1720 "vdev %llu, txg %llu, mg %p, msp[%llu] %p, "
1719 1721 "psize %llu, asize %llu, failures %llu",
1720 1722 spa_name(spa), mg->mg_vd->vdev_id, txg, mg,
1721 1723 msp->ms_id, msp, psize, asize,
1722 1724 mg->mg_alloc_failures);
1723 1725 mutex_exit(&msp->ms_lock);
1724 1726 return (-1ULL);
1725 1727 }
1726 1728
1727 1729 /*
1728 1730 * Ensure that the metaslab we have selected is still
1729 1731 * capable of handling our request. It's possible that
1730 1732 * another thread may have changed the weight while we
1731 1733 * were blocked on the metaslab lock.
1732 1734 */
1733 1735 if (msp->ms_weight < asize || (was_active &&
1734 1736 !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1735 1737 activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1736 1738 mutex_exit(&msp->ms_lock);
1737 1739 continue;
1738 1740 }
1739 1741
1740 1742 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1741 1743 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1742 1744 metaslab_passivate(msp,
1743 1745 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1744 1746 mutex_exit(&msp->ms_lock);
1745 1747 continue;
1746 1748 }
1747 1749
1748 1750 if (metaslab_activate(msp, activation_weight) != 0) {
1749 1751 mutex_exit(&msp->ms_lock);
1750 1752 continue;
1751 1753 }
1752 1754
1753 1755 /*
1754 1756 * If this metaslab is currently condensing then pick again as
1755 1757 * we can't manipulate this metaslab until it's committed
1756 1758 * to disk.
1757 1759 */
1758 1760 if (msp->ms_condensing) {
1759 1761 mutex_exit(&msp->ms_lock);
1760 1762 continue;
1761 1763 }
1762 1764
1763 1765 if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
1764 1766 break;
1765 1767
1766 1768 atomic_inc_64(&mg->mg_alloc_failures);
1767 1769
1768 1770 metaslab_passivate(msp, metaslab_block_maxsize(msp));
1769 1771 mutex_exit(&msp->ms_lock);
1770 1772 }
1771 1773
1772 1774 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
1773 1775 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1774 1776
1775 1777 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
1776 1778 msp->ms_access_txg = txg + metaslab_unload_delay;
1777 1779
1778 1780 mutex_exit(&msp->ms_lock);
1779 1781
1780 1782 return (offset);
1781 1783 }
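
The i == d test in the selection loop above is the ditto-block spread filter: a candidate metaslab is acceptable only if it sits at least target_distance away from every DVA already placed for this block. A sketch of just that filter (the distances and the 1 GB target are invented):

#include <stdio.h>
#include <stdint.h>

#define	GB	(1ULL << 30)

int
main(void)
{
	uint64_t target_distance = 1 * GB;
	/* distances from one candidate metaslab to the DVAs placed so far */
	uint64_t dist[] = { 3 * GB, 2 * GB, GB / 2 };
	int d = (int)(sizeof (dist) / sizeof (dist[0]));
	int i;

	for (i = 0; i < d; i++)
		if (dist[i] < target_distance)
			break;

	if (i == d)
		printf("accepted: at least %llu bytes from all %d DVAs\n",
		    (unsigned long long)target_distance, d);
	else
		printf("rejected: only %llu bytes from DVA %d\n",
		    (unsigned long long)dist[i], i);
	return (0);
}
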
1782 1784
1783 1785 /*
1784 1786 * Allocate a block for the specified i/o.
1785 1787 */
1786 1788 static int
1787 1789 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1788 1790 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1789 1791 {
1790 1792 metaslab_group_t *mg, *rotor;
1791 1793 vdev_t *vd;
1792 1794 int dshift = 3;
1793 1795 int all_zero;
1794 1796 int zio_lock = B_FALSE;
1795 1797 boolean_t allocatable;
1796 1798 uint64_t offset = -1ULL;
1797 1799 uint64_t asize;
1798 1800 uint64_t distance;
1799 1801
1800 1802 ASSERT(!DVA_IS_VALID(&dva[d]));
1801 1803
1802 1804 /*
1803 1805 * For testing, make some blocks above a certain size be gang blocks.
1804 1806 */
1805 1807 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1806 1808 return (SET_ERROR(ENOSPC));
1807 1809
1808 1810 /*
1809 1811 * Start at the rotor and loop through all mgs until we find something.
1810 1812 * Note that there's no locking on mc_rotor or mc_aliquot because
1811 1813 * nothing actually breaks if we miss a few updates -- we just won't
1812 1814 * allocate quite as evenly. It all balances out over time.
1813 1815 *
1814 1816 * If we are doing ditto or log blocks, try to spread them across
1815 1817 * consecutive vdevs. If we're forced to reuse a vdev before we've
1816 1818 * allocated all of our ditto blocks, then try and spread them out on
1817 1819 * that vdev as much as possible. If it turns out to not be possible,
1818 1820 * gradually lower our standards until anything becomes acceptable.
1819 1821 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1820 1822 * gives us hope of containing our fault domains to something we're
1821 1823 * able to reason about. Otherwise, any two top-level vdev failures
1822 1824 * will guarantee the loss of data. With consecutive allocation,
1823 1825 * only two adjacent top-level vdev failures will result in data loss.
1824 1826 *
1825 1827 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1826 1828 * ourselves on the same vdev as our gang block header. That
1827 1829 * way, we can hope for locality in vdev_cache, plus it makes our
1828 1830 * fault domains something tractable.
1829 1831 */
1830 1832 if (hintdva) {
1831 1833 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1832 1834
1833 1835 /*
1834 1836 * It's possible the vdev we're using as the hint no
1835 1837 * longer exists (i.e. removed). Consult the rotor when
1836 1838 * all else fails.
1837 1839 */
1838 1840 if (vd != NULL) {
1839 1841 mg = vd->vdev_mg;
1840 1842
1841 1843 if (flags & METASLAB_HINTBP_AVOID &&
1842 1844 mg->mg_next != NULL)
1843 1845 mg = mg->mg_next;
1844 1846 } else {
1845 1847 mg = mc->mc_rotor;
1846 1848 }
1847 1849 } else if (d != 0) {
1848 1850 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1849 1851 mg = vd->vdev_mg->mg_next;
1850 1852 } else {
1851 1853 mg = mc->mc_rotor;
1852 1854 }
1853 1855
1854 1856 /*
1855 1857 * If the hint put us into the wrong metaslab class, or into a
1856 1858 * metaslab group that has been passivated, just follow the rotor.
1857 1859 */
1858 1860 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1859 1861 mg = mc->mc_rotor;
1860 1862
1861 1863 rotor = mg;
1862 1864 top:
1863 1865 all_zero = B_TRUE;
1864 1866 do {
1865 1867 ASSERT(mg->mg_activation_count == 1);
1866 1868
1867 1869 vd = mg->mg_vd;
1868 1870
1869 1871 /*
1870 1872 * Don't allocate from faulted devices.
1871 1873 */
1872 1874 if (zio_lock) {
1873 1875 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1874 1876 allocatable = vdev_allocatable(vd);
1875 1877 spa_config_exit(spa, SCL_ZIO, FTAG);
1876 1878 } else {
1877 1879 allocatable = vdev_allocatable(vd);
1878 1880 }
1879 1881
1880 1882 /*
1881 1883 * Determine if the selected metaslab group is eligible
1882 1884 * for allocations. If we're ganging or have requested
1883 1885 * an allocation for the smallest gang block size
1884 1886 	 * then we don't want to avoid allocating to this
1885 1887 * metaslab group. If we're in this condition we should
1886 1888 * try to allocate from any device possible so that we
1887 1889 * don't inadvertently return ENOSPC and suspend the pool
1888 1890 * even though space is still available.
1889 1891 */
1890 1892 if (allocatable && CAN_FASTGANG(flags) &&
1891 1893 psize > SPA_GANGBLOCKSIZE)
1892 1894 allocatable = metaslab_group_allocatable(mg);
1893 1895
1894 1896 if (!allocatable)
1895 1897 goto next;
1896 1898
1897 1899 /*
1898 1900 * Avoid writing single-copy data to a failing vdev
1899 1901 * unless the user instructs us that it is okay.
1900 1902 */
1901 1903 if ((vd->vdev_stat.vs_write_errors > 0 ||
1902 1904 vd->vdev_state < VDEV_STATE_HEALTHY) &&
1903 1905 d == 0 && dshift == 3 &&
1904 1906 !(zfs_write_to_degraded && vd->vdev_state ==
1905 1907 VDEV_STATE_DEGRADED)) {
1906 1908 all_zero = B_FALSE;
1907 1909 goto next;
1908 1910 }
1909 1911
1910 1912 ASSERT(mg->mg_class == mc);
1911 1913
1912 1914 distance = vd->vdev_asize >> dshift;
1913 1915 if (distance <= (1ULL << vd->vdev_ms_shift))
1914 1916 distance = 0;
1915 1917 else
1916 1918 all_zero = B_FALSE;
1917 1919
1918 1920 asize = vdev_psize_to_asize(vd, psize);
1919 1921 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1920 1922
1921 1923 offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1922 1924 dva, d, flags);
1923 1925 if (offset != -1ULL) {
1924 1926 /*
1925 1927 * If we've just selected this metaslab group,
1926 1928 * figure out whether the corresponding vdev is
1927 1929 * over- or under-used relative to the pool,
1928 1930 * and set an allocation bias to even it out.
1929 1931 */
1930 1932 if (mc->mc_aliquot == 0) {
1931 1933 vdev_stat_t *vs = &vd->vdev_stat;
1932 1934 int64_t vu, cu;
1933 1935
1934 1936 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1935 1937 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1936 1938
1937 1939 /*
1938 1940 * Calculate how much more or less we should
1939 1941 * try to allocate from this device during
1940 1942 * this iteration around the rotor.
1941 1943 * For example, if a device is 80% full
1942 1944 * and the pool is 20% full then we should
1943 1945 * reduce allocations by 60% on this device.
1944 1946 *
1945 1947 * mg_bias = (20 - 80) * 512K / 100 = -307K
1946 1948 *
1947 1949 * This reduces allocations by 307K for this
1948 1950 * iteration.
1949 1951 */
1950 1952 mg->mg_bias = ((cu - vu) *
1951 1953 (int64_t)mg->mg_aliquot) / 100;
1952 1954 }
1953 1955
1954 1956 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1955 1957 mg->mg_aliquot + mg->mg_bias) {
1956 1958 mc->mc_rotor = mg->mg_next;
1957 1959 mc->mc_aliquot = 0;
1958 1960 }
1959 1961
1960 1962 DVA_SET_VDEV(&dva[d], vd->vdev_id);
1961 1963 DVA_SET_OFFSET(&dva[d], offset);
1962 1964 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1963 1965 DVA_SET_ASIZE(&dva[d], asize);
1964 1966
1965 1967 return (0);
1966 1968 }
1967 1969 next:
1968 1970 mc->mc_rotor = mg->mg_next;
1969 1971 mc->mc_aliquot = 0;
1970 1972 } while ((mg = mg->mg_next) != rotor);
1971 1973
1972 1974 if (!all_zero) {
1973 1975 dshift++;
1974 1976 ASSERT(dshift < 64);
1975 1977 goto top;
1976 1978 }
1977 1979
1978 1980 if (!allocatable && !zio_lock) {
1979 1981 dshift = 3;
1980 1982 zio_lock = B_TRUE;
1981 1983 goto top;
1982 1984 }
1983 1985
1984 1986 bzero(&dva[d], sizeof (dva_t));
1985 1987
1986 1988 return (SET_ERROR(ENOSPC));
1987 1989 }
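
The bias term inside the rotor loop is exactly the arithmetic in the comment's worked example, with the "+ 1" terms guarding against division by zero on empty space totals. A stand-alone version using the comment's numbers (an ~80% full vdev in an ~20% full pool and the 512K metaslab_aliquot default; all values invented here):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t vs_alloc = 80, vs_space = 100;	/* vdev is ~80% full */
	uint64_t mc_alloc = 20, mc_space = 100;	/* pool is ~20% full */
	int64_t mg_aliquot = 512 * 1024;	/* metaslab_aliquot default */

	/* "+ 1" avoids dividing by zero on an empty vdev or class */
	int64_t vu = (int64_t)((vs_alloc * 100) / (vs_space + 1));
	int64_t cu = (int64_t)((mc_alloc * 100) / (mc_space + 1));
	int64_t mg_bias = ((cu - vu) * mg_aliquot) / 100;

	printf("vdev use %lld%%, class use %lld%%, bias %lld bytes\n",
	    (long long)vu, (long long)cu, (long long)mg_bias);
	printf("aliquot this rotor pass: %lld bytes\n",
	    (long long)(mg_aliquot + mg_bias));
	return (0);
}

With these inputs the bias comes out to roughly -307K, so the over-full vdev receives about 60% less data than its neighbors on this trip around the rotor.
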
1988 1990
1989 1991 /*
1990 1992 * Free the block represented by DVA in the context of the specified
1991 1993 * transaction group.
1992 1994 */
1993 1995 static void
1994 1996 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1995 1997 {
1996 1998 uint64_t vdev = DVA_GET_VDEV(dva);
1997 1999 uint64_t offset = DVA_GET_OFFSET(dva);
1998 2000 uint64_t size = DVA_GET_ASIZE(dva);
1999 2001 vdev_t *vd;
2000 2002 metaslab_t *msp;
2001 2003
2002 2004 ASSERT(DVA_IS_VALID(dva));
2003 2005
2004 2006 if (txg > spa_freeze_txg(spa))
2005 2007 return;
2006 2008
2007 2009 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2008 2010 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
2009 2011 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
2010 2012 (u_longlong_t)vdev, (u_longlong_t)offset);
2011 2013 ASSERT(0);
2012 2014 return;
2013 2015 }
2014 2016
2015 2017 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2016 2018
2017 2019 if (DVA_GET_GANG(dva))
2018 2020 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2019 2021
2020 2022 mutex_enter(&msp->ms_lock);
2021 2023
2022 2024 if (now) {
2023 2025 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
2024 2026 offset, size);
2025 2027
2026 2028 VERIFY(!msp->ms_condensing);
2027 2029 VERIFY3U(offset, >=, msp->ms_start);
2028 2030 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
2029 2031 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
2030 2032 msp->ms_size);
2031 2033 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2032 2034 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2033 2035 range_tree_add(msp->ms_tree, offset, size);
2034 2036 } else {
2035 2037 if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
2036 2038 vdev_dirty(vd, VDD_METASLAB, msp, txg);
2037 2039 range_tree_add(msp->ms_freetree[txg & TXG_MASK],
2038 2040 offset, size);
2039 2041 }
2040 2042
2041 2043 mutex_exit(&msp->ms_lock);
2042 2044 }
2043 2045
2044 2046 /*
2045 2047 * Intent log support: upon opening the pool after a crash, notify the SPA
2046 2048 * of blocks that the intent log has allocated for immediate write, but
2047 2049 * which are still considered free by the SPA because the last transaction
2048 2050 * group didn't commit yet.
2049 2051 */
2050 2052 static int
2051 2053 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
2052 2054 {
2053 2055 uint64_t vdev = DVA_GET_VDEV(dva);
2054 2056 uint64_t offset = DVA_GET_OFFSET(dva);
2055 2057 uint64_t size = DVA_GET_ASIZE(dva);
2056 2058 vdev_t *vd;
2057 2059 metaslab_t *msp;
2058 2060 int error = 0;
2059 2061
2060 2062 ASSERT(DVA_IS_VALID(dva));
2061 2063
2062 2064 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2063 2065 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
2064 2066 return (SET_ERROR(ENXIO));
2065 2067
2066 2068 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2067 2069
2068 2070 if (DVA_GET_GANG(dva))
2069 2071 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2070 2072
2071 2073 mutex_enter(&msp->ms_lock);
2072 2074
2073 2075 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
2074 2076 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
2075 2077
2076 2078 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
2077 2079 error = SET_ERROR(ENOENT);
2078 2080
2079 2081 if (error || txg == 0) { /* txg == 0 indicates dry run */
2080 2082 mutex_exit(&msp->ms_lock);
2081 2083 return (error);
2082 2084 }
2083 2085
2084 2086 VERIFY(!msp->ms_condensing);
2085 2087 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2086 2088 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2087 2089 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
2088 2090 range_tree_remove(msp->ms_tree, offset, size);
2089 2091
2090 2092 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
2091 2093 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2092 2094 vdev_dirty(vd, VDD_METASLAB, msp, txg);
2093 2095 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2094 2096 }
2095 2097
2096 2098 mutex_exit(&msp->ms_lock);
2097 2099
2098 2100 return (0);
2099 2101 }
2100 2102
2101 2103 int
2102 2104 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2103 2105 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
2104 2106 {
2105 2107 dva_t *dva = bp->blk_dva;
2106 2108 dva_t *hintdva = hintbp->blk_dva;
2107 2109 int error = 0;
2108 2110
2109 2111 ASSERT(bp->blk_birth == 0);
2110 2112 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2111 2113
2112 2114 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2113 2115
2114 2116 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
2115 2117 spa_config_exit(spa, SCL_ALLOC, FTAG);
2116 2118 return (SET_ERROR(ENOSPC));
2117 2119 }
2118 2120
2119 2121 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
2120 2122 ASSERT(BP_GET_NDVAS(bp) == 0);
2121 2123 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2122 2124
2123 2125 for (int d = 0; d < ndvas; d++) {
2124 2126 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2125 2127 txg, flags);
2126 2128 if (error != 0) {
2127 2129 for (d--; d >= 0; d--) {
2128 2130 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2129 2131 bzero(&dva[d], sizeof (dva_t));
2130 2132 }
2131 2133 spa_config_exit(spa, SCL_ALLOC, FTAG);
2132 2134 return (error);
2133 2135 }
2134 2136 }
2135 2137 ASSERT(error == 0);
2136 2138 ASSERT(BP_GET_NDVAS(bp) == ndvas);
2137 2139
2138 2140 spa_config_exit(spa, SCL_ALLOC, FTAG);
2139 2141
2140 2142 BP_SET_BIRTH(bp, txg, txg);
2141 2143
2142 2144 return (0);
2143 2145 }
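
On a partial failure metaslab_alloc() unwinds every DVA it has already placed before returning, so the caller never sees a half-populated block pointer. A toy version of that unwind pattern (the allocator below is invented and only has room for two copies):

#include <stdio.h>
#include <string.h>

#define	NDVAS	3

static int toy_slots_left = 2;		/* pretend only two copies fit */

static int
toy_alloc_dva(int d, int *dva)
{
	if (toy_slots_left == 0)
		return (-1);		/* ENOSPC analogue */
	toy_slots_left--;
	*dva = 100 + d;			/* stand-in for a real DVA */
	return (0);
}

static void
toy_free_dva(int *dva)
{
	toy_slots_left++;
	*dva = 0;
}

int
main(void)
{
	int dva[NDVAS];

	memset(dva, 0, sizeof (dva));
	for (int d = 0; d < NDVAS; d++) {
		if (toy_alloc_dva(d, &dva[d]) != 0) {
			/* unwind the copies already placed, newest first */
			for (d--; d >= 0; d--)
				toy_free_dva(&dva[d]);
			printf("failed, rolled back; %d slots free again\n",
			    toy_slots_left);
			return (1);
		}
	}
	printf("all %d copies placed\n", NDVAS);
	return (0);
}
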
2144 2146
2145 2147 void
2146 2148 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
2147 2149 {
2148 2150 const dva_t *dva = bp->blk_dva;
2149 2151 int ndvas = BP_GET_NDVAS(bp);
2150 2152
2151 2153 ASSERT(!BP_IS_HOLE(bp));
2152 2154 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
2153 2155
2154 2156 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
2155 2157
2156 2158 for (int d = 0; d < ndvas; d++)
2157 2159 metaslab_free_dva(spa, &dva[d], txg, now);
2158 2160
2159 2161 spa_config_exit(spa, SCL_FREE, FTAG);
2160 2162 }
2161 2163
2162 2164 int
2163 2165 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
2164 2166 {
2165 2167 const dva_t *dva = bp->blk_dva;
2166 2168 int ndvas = BP_GET_NDVAS(bp);
2167 2169 int error = 0;
2168 2170
2169 2171 ASSERT(!BP_IS_HOLE(bp));
2170 2172
2171 2173 if (txg != 0) {
2172 2174 /*
2173 2175 * First do a dry run to make sure all DVAs are claimable,
2174 2176 * so we don't have to unwind from partial failures below.
2175 2177 */
2176 2178 if ((error = metaslab_claim(spa, bp, 0)) != 0)
2177 2179 return (error);
2178 2180 }
2179 2181
2180 2182 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2181 2183
2182 2184 for (int d = 0; d < ndvas; d++)
2183 2185 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
2184 2186 break;
2185 2187
2186 2188 spa_config_exit(spa, SCL_ALLOC, FTAG);
2187 2189
2188 2190 ASSERT(error == 0 || txg == 0);
2189 2191
2190 2192 return (error);
2191 2193 }
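
The dry-run step above means the per-DVA loop never has to undo anything: the txg == 0 pass checks every DVA without touching state, and the real pass only runs once all of them are known to be claimable. A sketch of that two-pass shape (the claimable[] table is invented):

#include <stdio.h>

static int claimable[] = { 1, 1, 0 };	/* third DVA was already reused */
#define	NDVAS	(int)(sizeof (claimable) / sizeof (claimable[0]))

static int
toy_claim_dva(int d, unsigned long long txg)
{
	if (!claimable[d])
		return (-1);		/* ENOENT analogue */
	if (txg != 0) {
		/* commit pass: the real code removes the range from ms_tree */
	}
	return (0);
}

static int
toy_claim(unsigned long long txg)
{
	if (txg != 0) {
		int error = toy_claim(0);	/* dry run first */

		if (error != 0)
			return (error);
	}
	for (int d = 0; d < NDVAS; d++) {
		int error = toy_claim_dva(d, txg);

		if (error != 0)
			return (error);
	}
	return (0);
}

int
main(void)
{
	printf("claim %s\n", toy_claim(7) == 0 ?
	    "succeeded" : "refused with no partial state");
	return (0);
}
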
2192 2194
2193 2195 void
2194 2196 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
2195 2197 {
2196 2198 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
2197 2199 return;
2198 2200
2199 2201 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2200 2202 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
2201 2203 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
2202 2204 vdev_t *vd = vdev_lookup_top(spa, vdev);
2203 2205 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
2204 2206 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2205 2207 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2206 2208
2207 2209 if (msp->ms_loaded)
2208 2210 range_tree_verify(msp->ms_tree, offset, size);
2209 2211
2210 2212 for (int j = 0; j < TXG_SIZE; j++)
2211 2213 range_tree_verify(msp->ms_freetree[j], offset, size);
2212 2214 for (int j = 0; j < TXG_DEFER_SIZE; j++)
2213 2215 range_tree_verify(msp->ms_defertree[j], offset, size);
2214 2216 }
2215 2217 spa_config_exit(spa, SCL_VDEV, FTAG);
2216 2218 }