7938 disable LBA weighting on files and SSDs
Reviewed by: Yuri Pankov <yuripv@gmx.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
--- old/usr/src/uts/common/fs/zfs/metaslab.c
+++ new/usr/src/uts/common/fs/zfs/metaslab.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
24 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 25 * Copyright (c) 2014 Integros [integros.com]
26 26 */
27 27
28 28 #include <sys/zfs_context.h>
29 29 #include <sys/dmu.h>
30 30 #include <sys/dmu_tx.h>
31 31 #include <sys/space_map.h>
32 32 #include <sys/metaslab_impl.h>
33 33 #include <sys/vdev_impl.h>
34 34 #include <sys/zio.h>
35 35 #include <sys/spa_impl.h>
36 36 #include <sys/zfeature.h>
37 37
38 38 #define GANG_ALLOCATION(flags) \
39 39 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
40 40
41 41 uint64_t metaslab_aliquot = 512ULL << 10;
42 42 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
43 43
44 44 /*
45 45 * The in-core space map representation is more compact than its on-disk form.
46 46 * The zfs_condense_pct determines how much more compact the in-core
47 47 * space map representation must be before we compact it on-disk.
48 48 * Values should be greater than or equal to 100.
49 49 */
50 50 int zfs_condense_pct = 200;
51 51
52 52 /*
53 53 * Condensing a metaslab is not guaranteed to actually reduce the amount of
54 54 * space used on disk. In particular, a space map uses data in increments of
55 55 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
56 56 * same number of blocks after condensing. Since the goal of condensing is to
57 57 * reduce the number of IOPs required to read the space map, we only want to
58 58 * condense when we can be sure we will reduce the number of blocks used by the
59 59 * space map. Unfortunately, we cannot precisely compute whether or not this is
60 60 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
61 61 * we apply the following heuristic: do not condense a spacemap unless the
62 62 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
63 63 * blocks.
64 64 */
65 65 int zfs_metaslab_condense_block_threshold = 4;
66 66
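
For illustration, the two condensing tunables above combine roughly as follows. This is a minimal sketch only, not the actual metaslab_should_condense() (which is outside this diff); optimal_size and object_size are hypothetical stand-ins for the estimated condensed size and the current on-disk space map size.

static boolean_t
example_should_condense(uint64_t optimal_size, uint64_t object_size,
    uint64_t sm_blksz)
{
	/*
	 * Skip condensing unless the uncondensed space map spans more
	 * than zfs_metaslab_condense_block_threshold on-disk blocks;
	 * otherwise we cannot be sure the rewrite saves any blocks.
	 */
	uint64_t blocks = (object_size + sm_blksz - 1) / sm_blksz;

	if (blocks <= zfs_metaslab_condense_block_threshold)
		return (B_FALSE);

	/*
	 * Condense only when the on-disk size is at least
	 * zfs_condense_pct percent of the estimated condensed size
	 * (2x with the default of 200).
	 */
	return (object_size >= optimal_size * zfs_condense_pct / 100);
}
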
67 67 /*
68 68 * The zfs_mg_noalloc_threshold defines which metaslab groups should
69 69 * be eligible for allocation. The value is defined as a percentage of
70 70 * free space. Metaslab groups that have more free space than
71 71 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
72 72 * a metaslab group's free space is less than or equal to the
73 73 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
74 74 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
75 75 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
76 76 * groups are allowed to accept allocations. Gang blocks are always
77 77 * eligible to allocate on any metaslab group. The default value of 0 means
78 78 * no metaslab group will be excluded based on this criterion.
79 79 */
80 80 int zfs_mg_noalloc_threshold = 0;
81 81
82 82 /*
83 83 * Metaslab groups are considered eligible for allocations if their
84 84 * fragmentation metric (measured as a percentage) is less than or equal to
85 85 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
86 86 * then it will be skipped unless all metaslab groups within the metaslab
87 87 * class have also crossed this threshold.
88 88 */
89 89 int zfs_mg_fragmentation_threshold = 85;
90 90
91 91 /*
92 92 * Allow metaslabs to keep their active state as long as their fragmentation
93 93 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
94 94 * active metaslab that exceeds this threshold will no longer keep its active
95 95 * status allowing better metaslabs to be selected.
96 96 */
97 97 int zfs_metaslab_fragmentation_threshold = 70;
98 98
99 99 /*
100 100 * When set will load all metaslabs when pool is first opened.
101 101 */
102 102 int metaslab_debug_load = 0;
103 103
104 104 /*
105 105 * When set will prevent metaslabs from being unloaded.
106 106 */
107 107 int metaslab_debug_unload = 0;
108 108
109 109 /*
110 110 * Minimum size which forces the dynamic allocator to change
111 111 * its allocation strategy. Once the space map cannot satisfy
112 112 * an allocation of this size then it switches to using a more
113 113 * aggressive strategy (i.e. search by size rather than offset).
114 114 */
115 115 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
116 116
117 117 /*
118 118 * The minimum free space, in percent, which must be available
119 119 * in a space map to continue allocations in a first-fit fashion.
120 120 * Once the space map's free space drops below this level we dynamically
121 121 * switch to using best-fit allocations.
122 122 */
123 123 int metaslab_df_free_pct = 4;
124 124
125 125 /*
126 126 * A metaslab is considered "free" if it contains a contiguous
127 127 * segment which is greater than metaslab_min_alloc_size.
128 128 */
129 129 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
130 130
131 131 /*
132 132 * Percentage of all cpus that can be used by the metaslab taskq.
133 133 */
134 134 int metaslab_load_pct = 50;
135 135
136 136 /*
137 137 * Determines how many txgs a metaslab may remain loaded without having any
138 138 * allocations from it. As long as a metaslab continues to be used we will
139 139 * keep it loaded.
140 140 */
141 141 int metaslab_unload_delay = TXG_SIZE * 2;
142 142
143 143 /*
144 144 * Max number of metaslabs per group to preload.
145 145 */
146 146 int metaslab_preload_limit = SPA_DVAS_PER_BP;
147 147
148 148 /*
149 149 * Enable/disable preloading of metaslabs.
150 150 */
151 151 boolean_t metaslab_preload_enabled = B_TRUE;
152 152
153 153 /*
154 154 * Enable/disable fragmentation weighting on metaslabs.
155 155 */
156 156 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
157 157
158 158 /*
159 159 * Enable/disable lba weighting (i.e. outer tracks are given preference).
160 160 */
161 161 boolean_t metaslab_lba_weighting_enabled = B_TRUE;
162 162
163 163 /*
164 164 * Enable/disable metaslab group biasing.
165 165 */
166 166 boolean_t metaslab_bias_enabled = B_TRUE;
167 167
168 168 /*
169 169 * Enable/disable segment-based metaslab selection.
170 170 */
171 171 boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
172 172
173 173 /*
174 174 * When using segment-based metaslab selection, we will continue
175 175 * allocating from the active metaslab until we have exhausted
176 176 * zfs_metaslab_switch_threshold of its buckets.
177 177 */
178 178 int zfs_metaslab_switch_threshold = 2;
179 179
180 180 /*
181 181 * Internal switch to enable/disable the metaslab allocation tracing
182 182 * facility.
183 183 */
184 184 boolean_t metaslab_trace_enabled = B_TRUE;
185 185
186 186 /*
187 187 * Maximum entries that the metaslab allocation tracing facility will keep
188 188 * in a given list when running in non-debug mode. We limit the number
189 189 * of entries in non-debug mode to prevent us from using up too much memory.
190 190 * The limit should be sufficiently large that we don't expect any allocation
191 191 * to ever exceed this value. In debug mode, the system will panic if this
192 192 * limit is ever reached allowing for further investigation.
193 193 */
194 194 uint64_t metaslab_trace_max_entries = 5000;
195 195
196 196 static uint64_t metaslab_weight(metaslab_t *);
197 197 static void metaslab_set_fragmentation(metaslab_t *);
198 198
199 199 kmem_cache_t *metaslab_alloc_trace_cache;
200 200
201 201 /*
202 202 * ==========================================================================
203 203 * Metaslab classes
204 204 * ==========================================================================
205 205 */
206 206 metaslab_class_t *
207 207 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
208 208 {
209 209 metaslab_class_t *mc;
210 210
211 211 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
212 212
213 213 mc->mc_spa = spa;
214 214 mc->mc_rotor = NULL;
215 215 mc->mc_ops = ops;
216 216 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
217 217 refcount_create_tracked(&mc->mc_alloc_slots);
218 218
219 219 return (mc);
220 220 }
221 221
222 222 void
223 223 metaslab_class_destroy(metaslab_class_t *mc)
224 224 {
225 225 ASSERT(mc->mc_rotor == NULL);
226 226 ASSERT(mc->mc_alloc == 0);
227 227 ASSERT(mc->mc_deferred == 0);
228 228 ASSERT(mc->mc_space == 0);
229 229 ASSERT(mc->mc_dspace == 0);
230 230
231 231 refcount_destroy(&mc->mc_alloc_slots);
232 232 mutex_destroy(&mc->mc_lock);
233 233 kmem_free(mc, sizeof (metaslab_class_t));
234 234 }
235 235
236 236 int
237 237 metaslab_class_validate(metaslab_class_t *mc)
238 238 {
239 239 metaslab_group_t *mg;
240 240 vdev_t *vd;
241 241
242 242 /*
243 243 * Must hold one of the spa_config locks.
244 244 */
245 245 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
246 246 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
247 247
248 248 if ((mg = mc->mc_rotor) == NULL)
249 249 return (0);
250 250
251 251 do {
252 252 vd = mg->mg_vd;
253 253 ASSERT(vd->vdev_mg != NULL);
254 254 ASSERT3P(vd->vdev_top, ==, vd);
255 255 ASSERT3P(mg->mg_class, ==, mc);
256 256 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
257 257 } while ((mg = mg->mg_next) != mc->mc_rotor);
258 258
259 259 return (0);
260 260 }
261 261
262 262 void
263 263 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
264 264 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
265 265 {
266 266 atomic_add_64(&mc->mc_alloc, alloc_delta);
267 267 atomic_add_64(&mc->mc_deferred, defer_delta);
268 268 atomic_add_64(&mc->mc_space, space_delta);
269 269 atomic_add_64(&mc->mc_dspace, dspace_delta);
270 270 }
271 271
272 272 uint64_t
273 273 metaslab_class_get_alloc(metaslab_class_t *mc)
274 274 {
275 275 return (mc->mc_alloc);
276 276 }
277 277
278 278 uint64_t
279 279 metaslab_class_get_deferred(metaslab_class_t *mc)
280 280 {
281 281 return (mc->mc_deferred);
282 282 }
283 283
284 284 uint64_t
285 285 metaslab_class_get_space(metaslab_class_t *mc)
286 286 {
287 287 return (mc->mc_space);
288 288 }
289 289
290 290 uint64_t
291 291 metaslab_class_get_dspace(metaslab_class_t *mc)
292 292 {
293 293 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
294 294 }
295 295
296 296 void
297 297 metaslab_class_histogram_verify(metaslab_class_t *mc)
298 298 {
299 299 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
300 300 uint64_t *mc_hist;
301 301 int i;
302 302
303 303 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
304 304 return;
305 305
306 306 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
307 307 KM_SLEEP);
308 308
309 309 for (int c = 0; c < rvd->vdev_children; c++) {
310 310 vdev_t *tvd = rvd->vdev_child[c];
311 311 metaslab_group_t *mg = tvd->vdev_mg;
312 312
313 313 /*
314 314 * Skip any holes, uninitialized top-levels, or
315 315 * vdevs that are not in this metaslab class.
316 316 */
317 317 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
318 318 mg->mg_class != mc) {
319 319 continue;
320 320 }
321 321
322 322 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
323 323 mc_hist[i] += mg->mg_histogram[i];
324 324 }
325 325
326 326 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
327 327 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
328 328
329 329 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
330 330 }
331 331
332 332 /*
333 333 * Calculate the metaslab class's fragmentation metric. The metric
334 334 * is weighted based on the space contribution of each metaslab group.
335 335 * The return value will be a number between 0 and 100 (inclusive), or
336 336 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
337 337 * zfs_frag_table for more information about the metric.
338 338 */
339 339 uint64_t
340 340 metaslab_class_fragmentation(metaslab_class_t *mc)
341 341 {
342 342 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
343 343 uint64_t fragmentation = 0;
344 344
345 345 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
346 346
347 347 for (int c = 0; c < rvd->vdev_children; c++) {
348 348 vdev_t *tvd = rvd->vdev_child[c];
349 349 metaslab_group_t *mg = tvd->vdev_mg;
350 350
351 351 /*
352 352 * Skip any holes, uninitialized top-levels, or
353 353 * vdevs that are not in this metaslab class.
354 354 */
355 355 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
356 356 mg->mg_class != mc) {
357 357 continue;
358 358 }
359 359
360 360 /*
361 361 * If a metaslab group does not contain a fragmentation
362 362 * metric then just bail out.
363 363 */
364 364 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
365 365 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
366 366 return (ZFS_FRAG_INVALID);
367 367 }
368 368
369 369 /*
370 370 * Determine how much this metaslab_group is contributing
371 371 * to the overall pool fragmentation metric.
372 372 */
373 373 fragmentation += mg->mg_fragmentation *
374 374 metaslab_group_get_space(mg);
375 375 }
376 376 fragmentation /= metaslab_class_get_space(mc);
377 377
378 378 ASSERT3U(fragmentation, <=, 100);
379 379 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
380 380 return (fragmentation);
381 381 }
382 382
383 383 /*
384 384 * Calculate the amount of expandable space that is available in
385 385 * this metaslab class. If a device is expanded then its expandable
386 386 * space will be the amount of allocatable space that is currently not
387 387 * part of this metaslab class.
388 388 */
389 389 uint64_t
390 390 metaslab_class_expandable_space(metaslab_class_t *mc)
391 391 {
392 392 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
393 393 uint64_t space = 0;
394 394
395 395 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
396 396 for (int c = 0; c < rvd->vdev_children; c++) {
397 397 uint64_t tspace;
398 398 vdev_t *tvd = rvd->vdev_child[c];
399 399 metaslab_group_t *mg = tvd->vdev_mg;
400 400
401 401 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
402 402 mg->mg_class != mc) {
403 403 continue;
404 404 }
405 405
406 406 /*
407 407 * Calculate if we have enough space to add additional
408 408 * metaslabs. We report the expandable space in terms
409 409 * of the metaslab size since that's the unit of expansion.
410 410 * Adjust by efi system partition size.
411 411 */
412 412 tspace = tvd->vdev_max_asize - tvd->vdev_asize;
413 413 if (tspace > mc->mc_spa->spa_bootsize) {
414 414 tspace -= mc->mc_spa->spa_bootsize;
415 415 }
416 416 space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
417 417 }
418 418 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
419 419 return (space);
420 420 }
421 421
422 422 static int
423 423 metaslab_compare(const void *x1, const void *x2)
424 424 {
425 425 const metaslab_t *m1 = x1;
426 426 const metaslab_t *m2 = x2;
427 427
428 428 if (m1->ms_weight < m2->ms_weight)
429 429 return (1);
430 430 if (m1->ms_weight > m2->ms_weight)
431 431 return (-1);
432 432
433 433 /*
434 434 * If the weights are identical, use the offset to force uniqueness.
435 435 */
436 436 if (m1->ms_start < m2->ms_start)
437 437 return (-1);
438 438 if (m1->ms_start > m2->ms_start)
439 439 return (1);
440 440
441 441 ASSERT3P(m1, ==, m2);
442 442
443 443 return (0);
444 444 }
445 445
446 446 /*
447 447 * Verify that the space accounting on disk matches the in-core range_trees.
448 448 */
449 449 void
450 450 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
451 451 {
452 452 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
453 453 uint64_t allocated = 0;
454 454 uint64_t sm_free_space, msp_free_space;
455 455
456 456 ASSERT(MUTEX_HELD(&msp->ms_lock));
457 457
458 458 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
459 459 return;
460 460
461 461 /*
462 462 * We can only verify the metaslab space when we're called
463 463 * from syncing context with a loaded metaslab that has an allocated
464 464 * space map. Calling this in non-syncing context does not
465 465 * provide a consistent view of the metaslab since we're performing
466 466 * allocations in the future.
467 467 */
468 468 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
469 469 !msp->ms_loaded)
470 470 return;
471 471
472 472 sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
473 473 space_map_alloc_delta(msp->ms_sm);
474 474
475 475 /*
476 476 * Account for future allocations since we would have already
477 477 * deducted that space from the ms_freetree.
478 478 */
479 479 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
480 480 allocated +=
481 481 range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
482 482 }
483 483
484 484 msp_free_space = range_tree_space(msp->ms_tree) + allocated +
485 485 msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
486 486
487 487 VERIFY3U(sm_free_space, ==, msp_free_space);
488 488 }
489 489
490 490 /*
491 491 * ==========================================================================
492 492 * Metaslab groups
493 493 * ==========================================================================
494 494 */
495 495 /*
496 496 * Update the allocatable flag and the metaslab group's capacity.
497 497 * The allocatable flag is set to true if the capacity is below
498 498 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
499 499 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
500 500 * transitions from allocatable to non-allocatable or vice versa then the
501 501 * metaslab group's class is updated to reflect the transition.
502 502 */
503 503 static void
504 504 metaslab_group_alloc_update(metaslab_group_t *mg)
505 505 {
506 506 vdev_t *vd = mg->mg_vd;
507 507 metaslab_class_t *mc = mg->mg_class;
508 508 vdev_stat_t *vs = &vd->vdev_stat;
509 509 boolean_t was_allocatable;
510 510 boolean_t was_initialized;
511 511
512 512 ASSERT(vd == vd->vdev_top);
513 513
514 514 mutex_enter(&mg->mg_lock);
515 515 was_allocatable = mg->mg_allocatable;
516 516 was_initialized = mg->mg_initialized;
517 517
518 518 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
519 519 (vs->vs_space + 1);
520 520
521 521 mutex_enter(&mc->mc_lock);
522 522
523 523 /*
524 524 * If the metaslab group was just added then it won't
525 525 * have any space until we finish syncing out this txg.
526 526 * At that point we will consider it initialized and available
527 527 * for allocations. We also don't consider non-activated
528 528 * metaslab groups (e.g. vdevs that are in the middle of being removed)
529 529 * to be initialized, because they can't be used for allocation.
530 530 */
531 531 mg->mg_initialized = metaslab_group_initialized(mg);
532 532 if (!was_initialized && mg->mg_initialized) {
533 533 mc->mc_groups++;
534 534 } else if (was_initialized && !mg->mg_initialized) {
535 535 ASSERT3U(mc->mc_groups, >, 0);
536 536 mc->mc_groups--;
537 537 }
538 538 if (mg->mg_initialized)
539 539 mg->mg_no_free_space = B_FALSE;
540 540
541 541 /*
542 542 * A metaslab group is considered allocatable if it has plenty
543 543 * of free space or is not heavily fragmented. We only take
544 544 * fragmentation into account if the metaslab group has a valid
545 545 * fragmentation metric (i.e. a value between 0 and 100).
546 546 */
547 547 mg->mg_allocatable = (mg->mg_activation_count > 0 &&
548 548 mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
549 549 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
550 550 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
551 551
552 552 /*
553 553 * The mc_alloc_groups maintains a count of the number of
554 554 * groups in this metaslab class that are still above the
555 555 * zfs_mg_noalloc_threshold. This is used by the allocating
556 556 * threads to determine if they should avoid allocations to
557 557 * a given group. The allocator will avoid allocations to a group
558 558 * if that group has reached or is below the zfs_mg_noalloc_threshold
559 559 * and there are still other groups that are above the threshold.
560 560 * When a group transitions from allocatable to non-allocatable or
561 561 * vice versa we update the metaslab class to reflect that change.
562 562 * When the mc_alloc_groups value drops to 0 that means that all
563 563 * groups have reached the zfs_mg_noalloc_threshold making all groups
564 564 * eligible for allocations. This effectively means that all devices
565 565 * are balanced again.
566 566 */
567 567 if (was_allocatable && !mg->mg_allocatable)
568 568 mc->mc_alloc_groups--;
569 569 else if (!was_allocatable && mg->mg_allocatable)
570 570 mc->mc_alloc_groups++;
571 571 mutex_exit(&mc->mc_lock);
572 572
573 573 mutex_exit(&mg->mg_lock);
574 574 }
575 575
576 576 metaslab_group_t *
577 577 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
578 578 {
579 579 metaslab_group_t *mg;
580 580
581 581 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
582 582 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
583 583 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
584 584 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
585 585 mg->mg_vd = vd;
586 586 mg->mg_class = mc;
587 587 mg->mg_activation_count = 0;
588 588 mg->mg_initialized = B_FALSE;
589 589 mg->mg_no_free_space = B_TRUE;
590 590 refcount_create_tracked(&mg->mg_alloc_queue_depth);
591 591
592 592 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
593 593 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
594 594
595 595 return (mg);
596 596 }
597 597
598 598 void
599 599 metaslab_group_destroy(metaslab_group_t *mg)
600 600 {
601 601 ASSERT(mg->mg_prev == NULL);
602 602 ASSERT(mg->mg_next == NULL);
603 603 /*
604 604 * We may have gone below zero with the activation count
605 605 * either because we never activated in the first place or
606 606 * because we're done, and possibly removing the vdev.
607 607 */
608 608 ASSERT(mg->mg_activation_count <= 0);
609 609
610 610 taskq_destroy(mg->mg_taskq);
611 611 avl_destroy(&mg->mg_metaslab_tree);
612 612 mutex_destroy(&mg->mg_lock);
613 613 refcount_destroy(&mg->mg_alloc_queue_depth);
614 614 kmem_free(mg, sizeof (metaslab_group_t));
615 615 }
616 616
617 617 void
618 618 metaslab_group_activate(metaslab_group_t *mg)
619 619 {
620 620 metaslab_class_t *mc = mg->mg_class;
621 621 metaslab_group_t *mgprev, *mgnext;
622 622
623 623 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
624 624
625 625 ASSERT(mc->mc_rotor != mg);
626 626 ASSERT(mg->mg_prev == NULL);
627 627 ASSERT(mg->mg_next == NULL);
628 628 ASSERT(mg->mg_activation_count <= 0);
629 629
630 630 if (++mg->mg_activation_count <= 0)
631 631 return;
632 632
633 633 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
634 634 metaslab_group_alloc_update(mg);
635 635
636 636 if ((mgprev = mc->mc_rotor) == NULL) {
637 637 mg->mg_prev = mg;
638 638 mg->mg_next = mg;
639 639 } else {
640 640 mgnext = mgprev->mg_next;
641 641 mg->mg_prev = mgprev;
642 642 mg->mg_next = mgnext;
643 643 mgprev->mg_next = mg;
644 644 mgnext->mg_prev = mg;
645 645 }
646 646 mc->mc_rotor = mg;
647 647 }
648 648
649 649 void
650 650 metaslab_group_passivate(metaslab_group_t *mg)
651 651 {
652 652 metaslab_class_t *mc = mg->mg_class;
653 653 metaslab_group_t *mgprev, *mgnext;
654 654
655 655 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
656 656
657 657 if (--mg->mg_activation_count != 0) {
658 658 ASSERT(mc->mc_rotor != mg);
659 659 ASSERT(mg->mg_prev == NULL);
660 660 ASSERT(mg->mg_next == NULL);
661 661 ASSERT(mg->mg_activation_count < 0);
662 662 return;
663 663 }
664 664
665 665 taskq_wait(mg->mg_taskq);
666 666 metaslab_group_alloc_update(mg);
667 667
668 668 mgprev = mg->mg_prev;
669 669 mgnext = mg->mg_next;
670 670
671 671 if (mg == mgnext) {
672 672 mc->mc_rotor = NULL;
673 673 } else {
674 674 mc->mc_rotor = mgnext;
675 675 mgprev->mg_next = mgnext;
676 676 mgnext->mg_prev = mgprev;
677 677 }
678 678
679 679 mg->mg_prev = NULL;
680 680 mg->mg_next = NULL;
681 681 }
682 682
683 683 boolean_t
684 684 metaslab_group_initialized(metaslab_group_t *mg)
685 685 {
686 686 vdev_t *vd = mg->mg_vd;
687 687 vdev_stat_t *vs = &vd->vdev_stat;
688 688
689 689 return (vs->vs_space != 0 && mg->mg_activation_count > 0);
690 690 }
691 691
692 692 uint64_t
693 693 metaslab_group_get_space(metaslab_group_t *mg)
694 694 {
695 695 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
696 696 }
697 697
698 698 void
699 699 metaslab_group_histogram_verify(metaslab_group_t *mg)
700 700 {
701 701 uint64_t *mg_hist;
702 702 vdev_t *vd = mg->mg_vd;
703 703 uint64_t ashift = vd->vdev_ashift;
704 704 int i;
705 705
706 706 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
707 707 return;
708 708
709 709 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
710 710 KM_SLEEP);
711 711
712 712 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
713 713 SPACE_MAP_HISTOGRAM_SIZE + ashift);
714 714
715 715 for (int m = 0; m < vd->vdev_ms_count; m++) {
716 716 metaslab_t *msp = vd->vdev_ms[m];
717 717
718 718 if (msp->ms_sm == NULL)
719 719 continue;
720 720
721 721 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
722 722 mg_hist[i + ashift] +=
723 723 msp->ms_sm->sm_phys->smp_histogram[i];
724 724 }
725 725
726 726 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
727 727 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
728 728
729 729 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
730 730 }
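
The i + ashift indexing above, and in the add/remove helpers that follow, is the one subtle detail: smp_histogram[i] counts free segments of size 1 << (i + sm_shift), and the space map is opened with the vdev's ashift as its shift (see metaslab_init() later in this file), so space-map bucket i folds into bucket i + ashift of the group and class histograms. A minimal sketch of that mapping, assuming ashift == 12 (4K sectors); the helper name is made up:

/*
 * Sketch only: fold a metaslab's space map histogram into a group
 * histogram. With ashift == 12, smp_hist[0] (4K segments) lands in
 * mg_hist[12] and smp_hist[5] (128K segments) lands in mg_hist[17].
 */
static void
example_fold_histogram(uint64_t *mg_hist, const uint64_t *smp_hist,
    uint64_t ashift)
{
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
		mg_hist[i + ashift] += smp_hist[i];
}
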
731 731
732 732 static void
733 733 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
734 734 {
735 735 metaslab_class_t *mc = mg->mg_class;
736 736 uint64_t ashift = mg->mg_vd->vdev_ashift;
737 737
738 738 ASSERT(MUTEX_HELD(&msp->ms_lock));
739 739 if (msp->ms_sm == NULL)
740 740 return;
741 741
742 742 mutex_enter(&mg->mg_lock);
743 743 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
744 744 mg->mg_histogram[i + ashift] +=
745 745 msp->ms_sm->sm_phys->smp_histogram[i];
746 746 mc->mc_histogram[i + ashift] +=
747 747 msp->ms_sm->sm_phys->smp_histogram[i];
748 748 }
749 749 mutex_exit(&mg->mg_lock);
750 750 }
751 751
752 752 void
753 753 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
754 754 {
755 755 metaslab_class_t *mc = mg->mg_class;
756 756 uint64_t ashift = mg->mg_vd->vdev_ashift;
757 757
758 758 ASSERT(MUTEX_HELD(&msp->ms_lock));
759 759 if (msp->ms_sm == NULL)
760 760 return;
761 761
762 762 mutex_enter(&mg->mg_lock);
763 763 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
764 764 ASSERT3U(mg->mg_histogram[i + ashift], >=,
765 765 msp->ms_sm->sm_phys->smp_histogram[i]);
766 766 ASSERT3U(mc->mc_histogram[i + ashift], >=,
767 767 msp->ms_sm->sm_phys->smp_histogram[i]);
768 768
769 769 mg->mg_histogram[i + ashift] -=
770 770 msp->ms_sm->sm_phys->smp_histogram[i];
771 771 mc->mc_histogram[i + ashift] -=
772 772 msp->ms_sm->sm_phys->smp_histogram[i];
773 773 }
774 774 mutex_exit(&mg->mg_lock);
775 775 }
776 776
777 777 static void
778 778 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
779 779 {
780 780 ASSERT(msp->ms_group == NULL);
781 781 mutex_enter(&mg->mg_lock);
782 782 msp->ms_group = mg;
783 783 msp->ms_weight = 0;
784 784 avl_add(&mg->mg_metaslab_tree, msp);
785 785 mutex_exit(&mg->mg_lock);
786 786
787 787 mutex_enter(&msp->ms_lock);
788 788 metaslab_group_histogram_add(mg, msp);
789 789 mutex_exit(&msp->ms_lock);
790 790 }
791 791
792 792 static void
793 793 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
794 794 {
795 795 mutex_enter(&msp->ms_lock);
796 796 metaslab_group_histogram_remove(mg, msp);
797 797 mutex_exit(&msp->ms_lock);
798 798
799 799 mutex_enter(&mg->mg_lock);
800 800 ASSERT(msp->ms_group == mg);
801 801 avl_remove(&mg->mg_metaslab_tree, msp);
802 802 msp->ms_group = NULL;
803 803 mutex_exit(&mg->mg_lock);
804 804 }
805 805
806 806 static void
807 807 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
808 808 {
809 809 /*
810 810 * Although in principle the weight can be any value, in
811 811 * practice we do not use values in the range [1, 511].
812 812 */
813 813 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
814 814 ASSERT(MUTEX_HELD(&msp->ms_lock));
815 815
816 816 mutex_enter(&mg->mg_lock);
817 817 ASSERT(msp->ms_group == mg);
818 818 avl_remove(&mg->mg_metaslab_tree, msp);
819 819 msp->ms_weight = weight;
820 820 avl_add(&mg->mg_metaslab_tree, msp);
821 821 mutex_exit(&mg->mg_lock);
822 822 }
823 823
824 824 /*
825 825 * Calculate the fragmentation for a given metaslab group. We can use
826 826 * a simple average here since all metaslabs within the group must have
827 827 * the same size. The return value will be a value between 0 and 100
828 828 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
829 829 * group have a fragmentation metric.
830 830 */
831 831 uint64_t
832 832 metaslab_group_fragmentation(metaslab_group_t *mg)
833 833 {
834 834 vdev_t *vd = mg->mg_vd;
835 835 uint64_t fragmentation = 0;
836 836 uint64_t valid_ms = 0;
837 837
838 838 for (int m = 0; m < vd->vdev_ms_count; m++) {
839 839 metaslab_t *msp = vd->vdev_ms[m];
840 840
841 841 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
842 842 continue;
843 843
844 844 valid_ms++;
845 845 fragmentation += msp->ms_fragmentation;
846 846 }
847 847
848 848 if (valid_ms <= vd->vdev_ms_count / 2)
849 849 return (ZFS_FRAG_INVALID);
850 850
851 851 fragmentation /= valid_ms;
852 852 ASSERT3U(fragmentation, <=, 100);
853 853 return (fragmentation);
854 854 }
855 855
856 856 /*
857 857 * Determine if a given metaslab group should skip allocations. A metaslab
858 858 * group should avoid allocations if its free capacity is less than the
859 859 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
860 860 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
861 861 * that can still handle allocations. If the allocation throttle is enabled
862 862 * then we skip allocations to devices that have reached their maximum
863 863 * allocation queue depth unless the selected metaslab group is the only
864 864 * eligible group remaining.
865 865 */
866 866 static boolean_t
867 867 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
868 868 uint64_t psize)
869 869 {
870 870 spa_t *spa = mg->mg_vd->vdev_spa;
871 871 metaslab_class_t *mc = mg->mg_class;
872 872
873 873 /*
874 874 * We can only consider skipping this metaslab group if it's
875 875 * in the normal metaslab class and there are other metaslab
876 876 * groups to select from. Otherwise, we always consider it eligible
877 877 * for allocations.
878 878 */
879 879 if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
880 880 return (B_TRUE);
881 881
882 882 /*
883 883 * If the metaslab group's mg_allocatable flag is set (see comments
884 884 * in metaslab_group_alloc_update() for more information) and
885 885 * the allocation throttle is disabled then allow allocations to this
886 886 * device. However, if the allocation throttle is enabled then
887 887 * check if we have reached our allocation limit (mg_alloc_queue_depth)
888 888 * to determine if we should allow allocations to this metaslab group.
889 889 * If all metaslab groups are no longer considered allocatable
890 890 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
891 891 * gang block size then we allow allocations on this metaslab group
892 892 * regardless of the mg_allocatable or throttle settings.
893 893 */
894 894 if (mg->mg_allocatable) {
895 895 metaslab_group_t *mgp;
896 896 int64_t qdepth;
897 897 uint64_t qmax = mg->mg_max_alloc_queue_depth;
898 898
899 899 if (!mc->mc_alloc_throttle_enabled)
900 900 return (B_TRUE);
901 901
902 902 /*
903 903 * If this metaslab group does not have any free space, then
904 904 * there is no point in looking further.
905 905 */
906 906 if (mg->mg_no_free_space)
907 907 return (B_FALSE);
908 908
909 909 qdepth = refcount_count(&mg->mg_alloc_queue_depth);
910 910
911 911 /*
912 912 * If this metaslab group is below its qmax or it's
913 913 * the only allocatable metaslab group, then attempt
914 914 * to allocate from it.
915 915 */
916 916 if (qdepth < qmax || mc->mc_alloc_groups == 1)
917 917 return (B_TRUE);
918 918 ASSERT3U(mc->mc_alloc_groups, >, 1);
919 919
920 920 /*
921 921 * Since this metaslab group is at or over its qmax, we
922 922 * need to determine if there are metaslab groups after this
923 923 * one that might be able to handle this allocation. This is
924 924 * racy since we can't hold the locks for all metaslab
925 925 * groups at the same time when we make this check.
926 926 */
927 927 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
928 928 qmax = mgp->mg_max_alloc_queue_depth;
929 929
930 930 qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
931 931
932 932 /*
933 933 * If there is another metaslab group that
934 934 * might be able to handle the allocation, then
935 935 * we return false so that we skip this group.
936 936 */
937 937 if (qdepth < qmax && !mgp->mg_no_free_space)
938 938 return (B_FALSE);
939 939 }
940 940
941 941 /*
942 942 * We didn't find another group to handle the allocation
943 943 * so we can't skip this metaslab group even though
944 944 * we are at or over our qmax.
945 945 */
946 946 return (B_TRUE);
947 947
948 948 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
949 949 return (B_TRUE);
950 950 }
951 951 return (B_FALSE);
952 952 }
953 953
954 954 /*
955 955 * ==========================================================================
956 956 * Range tree callbacks
957 957 * ==========================================================================
958 958 */
959 959
960 960 /*
961 961 * Comparison function for the private size-ordered tree. Tree is sorted
962 962 * by size, larger sizes at the end of the tree.
963 963 */
964 964 static int
965 965 metaslab_rangesize_compare(const void *x1, const void *x2)
966 966 {
967 967 const range_seg_t *r1 = x1;
968 968 const range_seg_t *r2 = x2;
969 969 uint64_t rs_size1 = r1->rs_end - r1->rs_start;
970 970 uint64_t rs_size2 = r2->rs_end - r2->rs_start;
971 971
972 972 if (rs_size1 < rs_size2)
973 973 return (-1);
974 974 if (rs_size1 > rs_size2)
975 975 return (1);
976 976
977 977 if (r1->rs_start < r2->rs_start)
978 978 return (-1);
979 979
980 980 if (r1->rs_start > r2->rs_start)
981 981 return (1);
982 982
983 983 return (0);
984 984 }
985 985
986 986 /*
987 987 * Create any block allocator specific components. The current allocators
988 988 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
989 989 */
990 990 static void
991 991 metaslab_rt_create(range_tree_t *rt, void *arg)
992 992 {
993 993 metaslab_t *msp = arg;
994 994
995 995 ASSERT3P(rt->rt_arg, ==, msp);
996 996 ASSERT(msp->ms_tree == NULL);
997 997
998 998 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
999 999 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1000 1000 }
1001 1001
1002 1002 /*
1003 1003 * Destroy the block allocator specific components.
1004 1004 */
1005 1005 static void
1006 1006 metaslab_rt_destroy(range_tree_t *rt, void *arg)
1007 1007 {
1008 1008 metaslab_t *msp = arg;
1009 1009
1010 1010 ASSERT3P(rt->rt_arg, ==, msp);
1011 1011 ASSERT3P(msp->ms_tree, ==, rt);
1012 1012 ASSERT0(avl_numnodes(&msp->ms_size_tree));
1013 1013
1014 1014 avl_destroy(&msp->ms_size_tree);
1015 1015 }
1016 1016
1017 1017 static void
1018 1018 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
1019 1019 {
1020 1020 metaslab_t *msp = arg;
1021 1021
1022 1022 ASSERT3P(rt->rt_arg, ==, msp);
1023 1023 ASSERT3P(msp->ms_tree, ==, rt);
1024 1024 VERIFY(!msp->ms_condensing);
1025 1025 avl_add(&msp->ms_size_tree, rs);
1026 1026 }
1027 1027
1028 1028 static void
1029 1029 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
1030 1030 {
1031 1031 metaslab_t *msp = arg;
1032 1032
1033 1033 ASSERT3P(rt->rt_arg, ==, msp);
1034 1034 ASSERT3P(msp->ms_tree, ==, rt);
1035 1035 VERIFY(!msp->ms_condensing);
1036 1036 avl_remove(&msp->ms_size_tree, rs);
1037 1037 }
1038 1038
1039 1039 static void
1040 1040 metaslab_rt_vacate(range_tree_t *rt, void *arg)
1041 1041 {
1042 1042 metaslab_t *msp = arg;
1043 1043
1044 1044 ASSERT3P(rt->rt_arg, ==, msp);
1045 1045 ASSERT3P(msp->ms_tree, ==, rt);
1046 1046
1047 1047 /*
1048 1048 * Normally one would walk the tree freeing nodes along the way.
1049 1049 * Since the nodes are shared with the range trees we can avoid
1050 1050 * walking all nodes and just reinitialize the avl tree. The nodes
1051 1051 * will be freed by the range tree, so we don't want to free them here.
1052 1052 */
1053 1053 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
1054 1054 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1055 1055 }
1056 1056
1057 1057 static range_tree_ops_t metaslab_rt_ops = {
1058 1058 metaslab_rt_create,
1059 1059 metaslab_rt_destroy,
1060 1060 metaslab_rt_add,
1061 1061 metaslab_rt_remove,
1062 1062 metaslab_rt_vacate
1063 1063 };
1064 1064
1065 1065 /*
1066 1066 * ==========================================================================
1067 1067 * Common allocator routines
1068 1068 * ==========================================================================
1069 1069 */
1070 1070
1071 1071 /*
1072 1072 * Return the maximum contiguous segment within the metaslab.
1073 1073 */
1074 1074 uint64_t
1075 1075 metaslab_block_maxsize(metaslab_t *msp)
1076 1076 {
1077 1077 avl_tree_t *t = &msp->ms_size_tree;
1078 1078 range_seg_t *rs;
1079 1079
1080 1080 if (t == NULL || (rs = avl_last(t)) == NULL)
1081 1081 return (0ULL);
1082 1082
1083 1083 return (rs->rs_end - rs->rs_start);
1084 1084 }
1085 1085
1086 1086 static range_seg_t *
1087 1087 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1088 1088 {
1089 1089 range_seg_t *rs, rsearch;
1090 1090 avl_index_t where;
1091 1091
1092 1092 rsearch.rs_start = start;
1093 1093 rsearch.rs_end = start + size;
1094 1094
1095 1095 rs = avl_find(t, &rsearch, &where);
1096 1096 if (rs == NULL) {
1097 1097 rs = avl_nearest(t, where, AVL_AFTER);
1098 1098 }
1099 1099
1100 1100 return (rs);
1101 1101 }
1102 1102
1103 1103 /*
1104 1104 * This is a helper function that can be used by the allocator to find
1105 1105 * a suitable block to allocate. This will search the specified AVL
1106 1106 * tree looking for a block that matches the specified criteria.
1107 1107 */
1108 1108 static uint64_t
1109 1109 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1110 1110 uint64_t align)
1111 1111 {
1112 1112 range_seg_t *rs = metaslab_block_find(t, *cursor, size);
1113 1113
1114 1114 while (rs != NULL) {
1115 1115 uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1116 1116
1117 1117 if (offset + size <= rs->rs_end) {
1118 1118 *cursor = offset + size;
1119 1119 return (offset);
1120 1120 }
1121 1121 rs = AVL_NEXT(t, rs);
1122 1122 }
1123 1123
1124 1124 /*
1125 1125 * If we know we've searched the whole map (*cursor == 0), give up.
1126 1126 * Otherwise, reset the cursor to the beginning and try again.
1127 1127 */
1128 1128 if (*cursor == 0)
1129 1129 return (-1ULL);
1130 1130
1131 1131 *cursor = 0;
1132 1132 return (metaslab_block_picker(t, cursor, size, align));
1133 1133 }
1134 1134
1135 1135 /*
1136 1136 * ==========================================================================
1137 1137 * The first-fit block allocator
1138 1138 * ==========================================================================
1139 1139 */
1140 1140 static uint64_t
1141 1141 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1142 1142 {
1143 1143 /*
1144 1144 * Find the largest power of 2 block size that evenly divides the
1145 1145 * requested size. This is used to try to allocate blocks with similar
1146 1146 * alignment from the same area of the metaslab (i.e. same cursor
1147 1147 * bucket) but it does not guarantee that other allocation sizes
1148 1148 * will not exist in the same region.
1149 1149 */
1150 1150 uint64_t align = size & -size;
1151 1151 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1152 1152 avl_tree_t *t = &msp->ms_tree->rt_root;
1153 1153
1154 1154 return (metaslab_block_picker(t, cursor, size, align));
1155 1155 }
1156 1156
1157 1157 static metaslab_ops_t metaslab_ff_ops = {
1158 1158 metaslab_ff_alloc
1159 1159 };
1160 1160
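
The size & -size idiom above isolates the lowest set bit of the request, i.e. the largest power of two that evenly divides it, and that alignment selects the cursor bucket. A standalone sketch (the helper name is made up, and a nonzero size is assumed): a 12K (0x3000) request yields a 4K (0x1000) alignment and therefore uses cursor bucket ms_lbas[highbit64(0x1000) - 1] == ms_lbas[12].

/*
 * Sketch only: map an allocation size to its first-fit cursor bucket.
 * size & -size isolates the lowest set bit, so 0x3000 (12K) maps to an
 * alignment of 0x1000 (4K) and bucket index highbit64(0x1000) - 1 == 12.
 * Assumes size != 0.
 */
static int
example_cursor_bucket(uint64_t size)
{
	uint64_t align = size & -size;

	return (highbit64(align) - 1);
}
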
1161 1161 /*
1162 1162 * ==========================================================================
1163 1163 * Dynamic block allocator -
1164 1164 * Uses the first fit allocation scheme until space gets low and then
1165 1165 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1166 1166 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1167 1167 * ==========================================================================
1168 1168 */
1169 1169 static uint64_t
1170 1170 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1171 1171 {
1172 1172 /*
1173 1173 * Find the largest power of 2 block size that evenly divides the
1174 1174 * requested size. This is used to try to allocate blocks with similar
1175 1175 * alignment from the same area of the metaslab (i.e. same cursor
1176 1176 * bucket) but it does not guarantee that other allocations sizes
1177 1177 * may exist in the same region.
1178 1178 */
1179 1179 uint64_t align = size & -size;
1180 1180 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1181 1181 range_tree_t *rt = msp->ms_tree;
1182 1182 avl_tree_t *t = &rt->rt_root;
1183 1183 uint64_t max_size = metaslab_block_maxsize(msp);
1184 1184 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1185 1185
1186 1186 ASSERT(MUTEX_HELD(&msp->ms_lock));
1187 1187 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1188 1188
1189 1189 if (max_size < size)
1190 1190 return (-1ULL);
1191 1191
1192 1192 /*
1193 1193 * If we're running low on space switch to using the size
1194 1194 * sorted AVL tree (best-fit).
1195 1195 */
1196 1196 if (max_size < metaslab_df_alloc_threshold ||
1197 1197 free_pct < metaslab_df_free_pct) {
1198 1198 t = &msp->ms_size_tree;
1199 1199 *cursor = 0;
1200 1200 }
1201 1201
1202 1202 return (metaslab_block_picker(t, cursor, size, 1ULL));
1203 1203 }
1204 1204
1205 1205 static metaslab_ops_t metaslab_df_ops = {
1206 1206 metaslab_df_alloc
1207 1207 };
1208 1208
1209 1209 /*
1210 1210 * ==========================================================================
1211 1211 * Cursor fit block allocator -
1212 1212 * Select the largest region in the metaslab, set the cursor to the beginning
1213 1213 * of the range and the cursor_end to the end of the range. As allocations
1214 1214 * are made advance the cursor. Continue allocating from the cursor until
1215 1215 * the range is exhausted and then find a new range.
1216 1216 * ==========================================================================
1217 1217 */
1218 1218 static uint64_t
1219 1219 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1220 1220 {
1221 1221 range_tree_t *rt = msp->ms_tree;
1222 1222 avl_tree_t *t = &msp->ms_size_tree;
1223 1223 uint64_t *cursor = &msp->ms_lbas[0];
1224 1224 uint64_t *cursor_end = &msp->ms_lbas[1];
1225 1225 uint64_t offset = 0;
1226 1226
1227 1227 ASSERT(MUTEX_HELD(&msp->ms_lock));
1228 1228 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1229 1229
1230 1230 ASSERT3U(*cursor_end, >=, *cursor);
1231 1231
1232 1232 if ((*cursor + size) > *cursor_end) {
1233 1233 range_seg_t *rs;
1234 1234
1235 1235 rs = avl_last(&msp->ms_size_tree);
1236 1236 if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1237 1237 return (-1ULL);
1238 1238
1239 1239 *cursor = rs->rs_start;
1240 1240 *cursor_end = rs->rs_end;
1241 1241 }
1242 1242
1243 1243 offset = *cursor;
1244 1244 *cursor += size;
1245 1245
1246 1246 return (offset);
1247 1247 }
1248 1248
1249 1249 static metaslab_ops_t metaslab_cf_ops = {
1250 1250 metaslab_cf_alloc
1251 1251 };
1252 1252
1253 1253 /*
1254 1254 * ==========================================================================
1255 1255 * New dynamic fit allocator -
1256 1256 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1257 1257 * contiguous blocks. If no region is found then just use the largest segment
1258 1258 * that remains.
1259 1259 * ==========================================================================
1260 1260 */
1261 1261
1262 1262 /*
1263 1263 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1264 1264 * to request from the allocator.
1265 1265 */
1266 1266 uint64_t metaslab_ndf_clump_shift = 4;
1267 1267
1268 1268 static uint64_t
1269 1269 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1270 1270 {
1271 1271 avl_tree_t *t = &msp->ms_tree->rt_root;
1272 1272 avl_index_t where;
1273 1273 range_seg_t *rs, rsearch;
1274 1274 uint64_t hbit = highbit64(size);
1275 1275 uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1276 1276 uint64_t max_size = metaslab_block_maxsize(msp);
1277 1277
1278 1278 ASSERT(MUTEX_HELD(&msp->ms_lock));
1279 1279 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1280 1280
1281 1281 if (max_size < size)
1282 1282 return (-1ULL);
1283 1283
1284 1284 rsearch.rs_start = *cursor;
1285 1285 rsearch.rs_end = *cursor + size;
1286 1286
1287 1287 rs = avl_find(t, &rsearch, &where);
1288 1288 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1289 1289 t = &msp->ms_size_tree;
1290 1290
1291 1291 rsearch.rs_start = 0;
1292 1292 rsearch.rs_end = MIN(max_size,
1293 1293 1ULL << (hbit + metaslab_ndf_clump_shift));
1294 1294 rs = avl_find(t, &rsearch, &where);
1295 1295 if (rs == NULL)
1296 1296 rs = avl_nearest(t, where, AVL_AFTER);
1297 1297 ASSERT(rs != NULL);
1298 1298 }
1299 1299
1300 1300 if ((rs->rs_end - rs->rs_start) >= size) {
1301 1301 *cursor = rs->rs_start + size;
1302 1302 return (rs->rs_start);
1303 1303 }
1304 1304 return (-1ULL);
1305 1305 }
1306 1306
1307 1307 static metaslab_ops_t metaslab_ndf_ops = {
1308 1308 metaslab_ndf_alloc
1309 1309 };
1310 1310
1311 1311 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1312 1312
1313 1313 /*
1314 1314 * ==========================================================================
1315 1315 * Metaslabs
1316 1316 * ==========================================================================
1317 1317 */
1318 1318
1319 1319 /*
1320 1320 * Wait for any in-progress metaslab loads to complete.
1321 1321 */
1322 1322 void
1323 1323 metaslab_load_wait(metaslab_t *msp)
1324 1324 {
1325 1325 ASSERT(MUTEX_HELD(&msp->ms_lock));
1326 1326
1327 1327 while (msp->ms_loading) {
1328 1328 ASSERT(!msp->ms_loaded);
1329 1329 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1330 1330 }
1331 1331 }
1332 1332
1333 1333 int
1334 1334 metaslab_load(metaslab_t *msp)
1335 1335 {
1336 1336 int error = 0;
1337 1337 boolean_t success = B_FALSE;
1338 1338
1339 1339 ASSERT(MUTEX_HELD(&msp->ms_lock));
1340 1340 ASSERT(!msp->ms_loaded);
1341 1341 ASSERT(!msp->ms_loading);
1342 1342
1343 1343 msp->ms_loading = B_TRUE;
1344 1344
1345 1345 /*
1346 1346 * If the space map has not been allocated yet, then treat
1347 1347 * all the space in the metaslab as free and add it to the
1348 1348 * ms_tree.
1349 1349 */
1350 1350 if (msp->ms_sm != NULL)
1351 1351 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1352 1352 else
1353 1353 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1354 1354
1355 1355 success = (error == 0);
1356 1356 msp->ms_loading = B_FALSE;
1357 1357
1358 1358 if (success) {
1359 1359 ASSERT3P(msp->ms_group, !=, NULL);
1360 1360 msp->ms_loaded = B_TRUE;
1361 1361
1362 1362 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1363 1363 range_tree_walk(msp->ms_defertree[t],
1364 1364 range_tree_remove, msp->ms_tree);
1365 1365 }
1366 1366 msp->ms_max_size = metaslab_block_maxsize(msp);
1367 1367 }
1368 1368 cv_broadcast(&msp->ms_load_cv);
1369 1369 return (error);
1370 1370 }
1371 1371
1372 1372 void
1373 1373 metaslab_unload(metaslab_t *msp)
1374 1374 {
1375 1375 ASSERT(MUTEX_HELD(&msp->ms_lock));
1376 1376 range_tree_vacate(msp->ms_tree, NULL, NULL);
1377 1377 msp->ms_loaded = B_FALSE;
1378 1378 msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1379 1379 msp->ms_max_size = 0;
1380 1380 }
1381 1381
1382 1382 int
1383 1383 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1384 1384 metaslab_t **msp)
1385 1385 {
1386 1386 vdev_t *vd = mg->mg_vd;
1387 1387 objset_t *mos = vd->vdev_spa->spa_meta_objset;
1388 1388 metaslab_t *ms;
1389 1389 int error;
1390 1390
1391 1391 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1392 1392 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1393 1393 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1394 1394 ms->ms_id = id;
1395 1395 ms->ms_start = id << vd->vdev_ms_shift;
1396 1396 ms->ms_size = 1ULL << vd->vdev_ms_shift;
1397 1397
1398 1398 /*
1399 1399 * We only open space map objects that already exist. All others
1400 1400 * will be opened when we finally allocate an object for it.
1401 1401 */
1402 1402 if (object != 0) {
1403 1403 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1404 1404 ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
1405 1405
1406 1406 if (error != 0) {
1407 1407 kmem_free(ms, sizeof (metaslab_t));
1408 1408 return (error);
1409 1409 }
1410 1410
1411 1411 ASSERT(ms->ms_sm != NULL);
1412 1412 }
1413 1413
1414 1414 /*
1415 1415 * We create the main range tree here, but we don't create the
1416 1416 * other range trees until metaslab_sync_done(). This serves
1417 1417 * two purposes: it allows metaslab_sync_done() to detect the
1418 1418 * addition of new space; and for debugging, it ensures that we'd
1419 1419 * data fault on any attempt to use this metaslab before it's ready.
1420 1420 */
1421 1421 ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
1422 1422 metaslab_group_add(mg, ms);
1423 1423
1424 1424 metaslab_set_fragmentation(ms);
1425 1425
1426 1426 /*
1427 1427 * If we're opening an existing pool (txg == 0) or creating
1428 1428 * a new one (txg == TXG_INITIAL), all space is available now.
1429 1429 * If we're adding space to an existing pool, the new space
1430 1430 * does not become available until after this txg has synced.
1431 1431 * The metaslab's weight will also be initialized when we sync
1432 1432 * out this txg. This ensures that we don't attempt to allocate
1433 1433 * from it before we have initialized it completely.
1434 1434 */
1435 1435 if (txg <= TXG_INITIAL)
1436 1436 metaslab_sync_done(ms, 0);
1437 1437
1438 1438 /*
1439 1439 * If metaslab_debug_load is set and we're initializing a metaslab
1440 1440 * that has an allocated space map object then load its space
1441 1441 * map so that we can verify frees.
1442 1442 */
1443 1443 if (metaslab_debug_load && ms->ms_sm != NULL) {
1444 1444 mutex_enter(&ms->ms_lock);
1445 1445 VERIFY0(metaslab_load(ms));
1446 1446 mutex_exit(&ms->ms_lock);
1447 1447 }
1448 1448
1449 1449 if (txg != 0) {
1450 1450 vdev_dirty(vd, 0, NULL, txg);
1451 1451 vdev_dirty(vd, VDD_METASLAB, ms, txg);
1452 1452 }
1453 1453
1454 1454 *msp = ms;
1455 1455
1456 1456 return (0);
1457 1457 }
1458 1458
1459 1459 void
1460 1460 metaslab_fini(metaslab_t *msp)
1461 1461 {
1462 1462 metaslab_group_t *mg = msp->ms_group;
1463 1463
1464 1464 metaslab_group_remove(mg, msp);
1465 1465
1466 1466 mutex_enter(&msp->ms_lock);
1467 1467 VERIFY(msp->ms_group == NULL);
1468 1468 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1469 1469 0, -msp->ms_size);
1470 1470 space_map_close(msp->ms_sm);
1471 1471
1472 1472 metaslab_unload(msp);
1473 1473 range_tree_destroy(msp->ms_tree);
1474 1474 range_tree_destroy(msp->ms_freeingtree);
1475 1475 range_tree_destroy(msp->ms_freedtree);
1476 1476
1477 1477 for (int t = 0; t < TXG_SIZE; t++) {
1478 1478 range_tree_destroy(msp->ms_alloctree[t]);
1479 1479 }
1480 1480
1481 1481 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1482 1482 range_tree_destroy(msp->ms_defertree[t]);
1483 1483 }
1484 1484
1485 1485 ASSERT0(msp->ms_deferspace);
1486 1486
1487 1487 mutex_exit(&msp->ms_lock);
1488 1488 cv_destroy(&msp->ms_load_cv);
1489 1489 mutex_destroy(&msp->ms_lock);
1490 1490
1491 1491 kmem_free(msp, sizeof (metaslab_t));
1492 1492 }
1493 1493
1494 1494 #define FRAGMENTATION_TABLE_SIZE 17
1495 1495
1496 1496 /*
1497 1497 * This table defines a segment size based fragmentation metric that will
1498 1498 * allow each metaslab to derive its own fragmentation value. This is done
1499 1499 * by calculating the space in each bucket of the spacemap histogram and
1500 1500 * multiplying that by the fragmentation metric in this table. Doing
1501 1501 * this for all buckets and dividing it by the total amount of free
1502 1502 * space in this metaslab (i.e. the total free space in all buckets) gives
1503 1503 * us the fragmentation metric. This means that a high fragmentation metric
1504 1504 * equates to most of the free space being comprised of small segments.
1505 1505 * Conversely, if the metric is low, then most of the free space is in
1506 1506 * large segments. A 10% change in fragmentation equates to approximately
1507 1507 * double the number of segments.
1508 1508 *
1509 1509 * This table defines 0% fragmented space using 16MB segments. Testing has
1510 1510 * shown that segments that are greater than or equal to 16MB do not suffer
1511 1511 * from drastic performance problems. Using this value, we derive the rest
1512 1512 * of the table. Since the fragmentation value is never stored on disk, it
1513 1513 * is possible to change these calculations in the future.
1514 1514 */
1515 1515 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1516 1516 100, /* 512B */
1517 1517 100, /* 1K */
1518 1518 98, /* 2K */
1519 1519 95, /* 4K */
1520 1520 90, /* 8K */
1521 1521 80, /* 16K */
1522 1522 70, /* 32K */
1523 1523 60, /* 64K */
1524 1524 50, /* 128K */
1525 1525 40, /* 256K */
1526 1526 30, /* 512K */
1527 1527 20, /* 1M */
1528 1528 15, /* 2M */
1529 1529 10, /* 4M */
1530 1530 5, /* 8M */
1531 1531 0 /* 16M */
1532 1532 };
1533 1533
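
As a worked example of the table above (the numbers are hypothetical): if 10 GB of a metaslab's free space sits in 8K segments (factor 90), 30 GB in 1M segments (factor 20), and 60 GB in 16M segments (factor 0), the metric is (10*90 + 30*20 + 60*0) / 100 = 15, i.e. mostly large, healthy segments. A minimal sketch of that weighted average; the real computation walks smp_histogram[] in metaslab_set_fragmentation() below.

/*
 * Sketch only: weighted-average fragmentation over (space, factor)
 * pairs, mirroring the computation described above.
 */
static uint64_t
example_fragmentation(const uint64_t *space, const int *factor, int n)
{
	uint64_t weighted = 0, total = 0;

	for (int i = 0; i < n; i++) {
		weighted += space[i] * factor[i];
		total += space[i];
	}
	return (total == 0 ? 0 : weighted / total);
}
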
1534 1534 /*
1535 1535 * Calculate the metaslab's fragmentation metric and record it in
1536 1536 * ms_fragmentation. A value of ZFS_FRAG_INVALID means that the metaslab
1537 1537 * has not been upgraded and does not support this metric. Otherwise,
1538 1538 * the value should be in the range [0, 100].
1539 1539 */
1540 1540 static void
1541 1541 metaslab_set_fragmentation(metaslab_t *msp)
1542 1542 {
1543 1543 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1544 1544 uint64_t fragmentation = 0;
1545 1545 uint64_t total = 0;
1546 1546 boolean_t feature_enabled = spa_feature_is_enabled(spa,
1547 1547 SPA_FEATURE_SPACEMAP_HISTOGRAM);
1548 1548
1549 1549 if (!feature_enabled) {
1550 1550 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1551 1551 return;
1552 1552 }
1553 1553
1554 1554 /*
1555 1555 * A null space map means that the entire metaslab is free
1556 1556 * and thus is not fragmented.
1557 1557 */
1558 1558 if (msp->ms_sm == NULL) {
1559 1559 msp->ms_fragmentation = 0;
1560 1560 return;
1561 1561 }
1562 1562
1563 1563 /*
1564 1564 * If this metaslab's space map has not been upgraded, flag it
1565 1565 * so that we upgrade next time we encounter it.
1566 1566 */
1567 1567 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1568 1568 uint64_t txg = spa_syncing_txg(spa);
1569 1569 vdev_t *vd = msp->ms_group->mg_vd;
1570 1570
1571 1571 /*
1572 1572 * If we've reached the final dirty txg, then we must
1573 1573 * be shutting down the pool. We don't want to dirty
1574 1574 * any data past this point so skip setting the condense
1575 1575 * flag. We can retry this action the next time the pool
1576 1576 * is imported.
1577 1577 */
1578 1578 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
1579 1579 msp->ms_condense_wanted = B_TRUE;
1580 1580 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1581 1581 spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1582 1582 "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
1583 1583 vd->vdev_id);
1584 1584 }
1585 1585 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1586 1586 return;
1587 1587 }
1588 1588
1589 1589 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1590 1590 uint64_t space = 0;
1591 1591 uint8_t shift = msp->ms_sm->sm_shift;
1592 1592
1593 1593 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1594 1594 FRAGMENTATION_TABLE_SIZE - 1);
1595 1595
1596 1596 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1597 1597 continue;
1598 1598
1599 1599 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1600 1600 total += space;
1601 1601
1602 1602 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1603 1603 fragmentation += space * zfs_frag_table[idx];
1604 1604 }
1605 1605
1606 1606 if (total > 0)
1607 1607 fragmentation /= total;
1608 1608 ASSERT3U(fragmentation, <=, 100);
1609 1609
1610 1610 msp->ms_fragmentation = fragmentation;
1611 1611 }
1612 1612
1613 1613 /*
1614 1614 * Compute a weight -- a selection preference value -- for the given metaslab.
1615 1615 * This is based on the amount of free space, the level of fragmentation,
1616 1616 * the LBA range, and whether the metaslab is loaded.
1617 1617 */
1618 1618 static uint64_t
1619 1619 metaslab_space_weight(metaslab_t *msp)
1620 1620 {
1621 1621 metaslab_group_t *mg = msp->ms_group;
1622 1622 vdev_t *vd = mg->mg_vd;
1623 1623 uint64_t weight, space;
1624 1624
1625 1625 ASSERT(MUTEX_HELD(&msp->ms_lock));
1626 1626 ASSERT(!vd->vdev_removing);
1627 1627
1628 1628 /*
1629 1629 * The baseline weight is the metaslab's free space.
1630 1630 */
1631 1631 space = msp->ms_size - space_map_allocated(msp->ms_sm);
1632 1632
1633 1633 if (metaslab_fragmentation_factor_enabled &&
1634 1634 msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1635 1635 /*
1636 1636 * Use the fragmentation information to inversely scale
1637 1637 * down the baseline weight. We need to ensure that we
1638 1638 * don't exclude this metaslab completely when it's 100%
1639 1639 		 * fragmented. To avoid this we reduce the fragmentation value
1640 1640 * by 1.
1641 1641 */
1642 1642 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1643 1643
1644 1644 /*
1645 1645 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1646 1646 * this metaslab again. The fragmentation metric may have
1647 1647 * decreased the space to something smaller than
1648 1648 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1649 1649 * so that we can consume any remaining space.
1650 1650 */
1651 1651 if (space > 0 && space < SPA_MINBLOCKSIZE)
1652 1652 space = SPA_MINBLOCKSIZE;
1653 1653 }
1654 1654 weight = space;
1655 1655
1656 1656 /*
1657 1657 * Modern disks have uniform bit density and constant angular velocity.
1658 1658 * Therefore, the outer recording zones are faster (higher bandwidth)
1659 1659 * than the inner zones by the ratio of outer to inner track diameter,
1660 1660 * which is typically around 2:1. We account for this by assigning
1661 1661 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1662 1662 * In effect, this means that we'll select the metaslab with the most
1663 1663 * free bandwidth rather than simply the one with the most free space.
1664 1664 */
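	/*
	 * For example (illustrative): on a rotational vdev with 100 metaslabs,
	 * metaslab 0 is weighted at 2x its free space while metaslab 99 gets
	 * roughly 1.01x. Non-rotational vdevs (vd->vdev_nonrot) skip this
	 * adjustment since they have no outer/inner zone bandwidth skew.
	 */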
1665 - if (metaslab_lba_weighting_enabled) {
1665 + if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
1666 1666 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1667 1667 ASSERT(weight >= space && weight <= 2 * space);
1668 1668 }
1669 1669
1670 1670 /*
1671 1671 * If this metaslab is one we're actively using, adjust its
1672 1672 * weight to make it preferable to any inactive metaslab so
1673 1673 * we'll polish it off. If the fragmentation on this metaslab
1674 1674 	 * has exceeded our threshold, then don't mark it active.
1675 1675 */
1676 1676 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1677 1677 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1678 1678 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1679 1679 }
1680 1680
1681 1681 WEIGHT_SET_SPACEBASED(weight);
1682 1682 return (weight);
1683 1683 }
1684 1684
1685 1685 /*
1686 1686 * Return the weight of the specified metaslab, according to the segment-based
1687 1687 * weighting algorithm. The metaslab must be loaded. This function can
1688 1688 * be called within a sync pass since it relies only on the metaslab's
1689 1689 * range tree which is always accurate when the metaslab is loaded.
1690 1690 */
1691 1691 static uint64_t
1692 1692 metaslab_weight_from_range_tree(metaslab_t *msp)
1693 1693 {
1694 1694 uint64_t weight = 0;
1695 1695 uint32_t segments = 0;
1696 1696
1697 1697 ASSERT(msp->ms_loaded);
1698 1698
1699 1699 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
1700 1700 i--) {
1701 1701 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
1702 1702 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1703 1703
1704 1704 segments <<= 1;
1705 1705 segments += msp->ms_tree->rt_histogram[i];
1706 1706
1707 1707 /*
1708 1708 * The range tree provides more precision than the space map
1709 1709 * and must be downgraded so that all values fit within the
1710 1710 * space map's histogram. This allows us to compare loaded
1711 1711 * vs. unloaded metaslabs to determine which metaslab is
1712 1712 * considered "best".
1713 1713 */
1714 1714 if (i > max_idx)
1715 1715 continue;
1716 1716
1717 1717 if (segments != 0) {
1718 1718 WEIGHT_SET_COUNT(weight, segments);
1719 1719 WEIGHT_SET_INDEX(weight, i);
1720 1720 WEIGHT_SET_ACTIVE(weight, 0);
1721 1721 break;
1722 1722 }
1723 1723 }
1724 1724 return (weight);
1725 1725 }
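/*
 * For example (illustrative): if the highest non-empty bucket in the range
 * tree histogram is index 20 (1M-2M segments) and that is below max_idx, the
 * weight encodes index 20 with a count equal to the number of segments in
 * that bucket. Segments in buckets above max_idx are folded down (doubling
 * once per bucket) into the max_idx bucket, keeping loaded and unloaded
 * metaslabs comparable.
 */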
1726 1726
1727 1727 /*
1728 1728 * Calculate the weight based on the on-disk histogram. This should only
1729 1729 * be called after a sync pass has completely finished since the on-disk
1730 1730 * information is updated in metaslab_sync().
1731 1731 */
1732 1732 static uint64_t
1733 1733 metaslab_weight_from_spacemap(metaslab_t *msp)
1734 1734 {
1735 1735 uint64_t weight = 0;
1736 1736
1737 1737 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1738 1738 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1739 1739 WEIGHT_SET_COUNT(weight,
1740 1740 msp->ms_sm->sm_phys->smp_histogram[i]);
1741 1741 WEIGHT_SET_INDEX(weight, i +
1742 1742 msp->ms_sm->sm_shift);
1743 1743 WEIGHT_SET_ACTIVE(weight, 0);
1744 1744 break;
1745 1745 }
1746 1746 }
1747 1747 return (weight);
1748 1748 }
1749 1749
1750 1750 /*
1751 1751 * Compute a segment-based weight for the specified metaslab. The weight
1752 1752  * is determined by the highest bucket in the histogram. The information
1753 1753 * for the highest bucket is encoded into the weight value.
1754 1754 */
1755 1755 static uint64_t
1756 1756 metaslab_segment_weight(metaslab_t *msp)
1757 1757 {
1758 1758 metaslab_group_t *mg = msp->ms_group;
1759 1759 uint64_t weight = 0;
1760 1760 uint8_t shift = mg->mg_vd->vdev_ashift;
1761 1761
1762 1762 ASSERT(MUTEX_HELD(&msp->ms_lock));
1763 1763
1764 1764 /*
1765 1765 * The metaslab is completely free.
1766 1766 */
1767 1767 if (space_map_allocated(msp->ms_sm) == 0) {
1768 1768 int idx = highbit64(msp->ms_size) - 1;
1769 1769 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1770 1770
1771 1771 if (idx < max_idx) {
1772 1772 WEIGHT_SET_COUNT(weight, 1ULL);
1773 1773 WEIGHT_SET_INDEX(weight, idx);
1774 1774 } else {
1775 1775 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
1776 1776 WEIGHT_SET_INDEX(weight, max_idx);
1777 1777 }
1778 1778 WEIGHT_SET_ACTIVE(weight, 0);
1779 1779 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
1780 1780
1781 1781 return (weight);
1782 1782 }
1783 1783
1784 1784 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1785 1785
1786 1786 /*
1787 1787 * If the metaslab is fully allocated then just make the weight 0.
1788 1788 */
1789 1789 if (space_map_allocated(msp->ms_sm) == msp->ms_size)
1790 1790 return (0);
1791 1791 /*
1792 1792 * If the metaslab is already loaded, then use the range tree to
1793 1793 * determine the weight. Otherwise, we rely on the space map information
1794 1794 * to generate the weight.
1795 1795 */
1796 1796 if (msp->ms_loaded) {
1797 1797 weight = metaslab_weight_from_range_tree(msp);
1798 1798 } else {
1799 1799 weight = metaslab_weight_from_spacemap(msp);
1800 1800 }
1801 1801
1802 1802 /*
1803 1803 * If the metaslab was active the last time we calculated its weight
1804 1804 * then keep it active. We want to consume the entire region that
1805 1805 * is associated with this weight.
1806 1806 */
1807 1807 if (msp->ms_activation_weight != 0 && weight != 0)
1808 1808 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
1809 1809 return (weight);
1810 1810 }
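/*
 * For example (illustrative): a completely free 8GB metaslab yields
 * idx = highbit64(8GB) - 1 = 33, which is normally below max_idx, so its
 * weight encodes a single segment (count 1) at index 33.
 */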
1811 1811
1812 1812 /*
1813 1813 * Determine if we should attempt to allocate from this metaslab. If the
1814 1814 * metaslab has a maximum size then we can quickly determine if the desired
1815 1815 * allocation size can be satisfied. Otherwise, if we're using segment-based
1816 1816 * weighting then we can determine the maximum allocation that this metaslab
1817 1817 * can accommodate based on the index encoded in the weight. If we're using
1818 1818 * space-based weights then rely on the entire weight (excluding the weight
1819 1819  * space-based weights then we rely on the entire weight (excluding the weight
1820 1820 */
1821 1821 boolean_t
1822 1822 metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
1823 1823 {
1824 1824 boolean_t should_allocate;
1825 1825
1826 1826 if (msp->ms_max_size != 0)
1827 1827 return (msp->ms_max_size >= asize);
1828 1828
1829 1829 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
1830 1830 /*
1831 1831 * The metaslab segment weight indicates segments in the
1832 1832 * range [2^i, 2^(i+1)), where i is the index in the weight.
1833 1833 * Since the asize might be in the middle of the range, we
1834 1834 * should attempt the allocation if asize < 2^(i+1).
1835 1835 */
1836 1836 should_allocate = (asize <
1837 1837 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
1838 1838 } else {
1839 1839 should_allocate = (asize <=
1840 1840 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
1841 1841 }
1842 1842 return (should_allocate);
1843 1843 }
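/*
 * For example (illustrative): a segment-based weight whose index is 17
 * indicates free segments in the 128K-256K range, so any allocation with
 * asize < 256K is worth attempting.
 */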
1844 1844
1845 1845 static uint64_t
1846 1846 metaslab_weight(metaslab_t *msp)
1847 1847 {
1848 1848 vdev_t *vd = msp->ms_group->mg_vd;
1849 1849 spa_t *spa = vd->vdev_spa;
1850 1850 uint64_t weight;
1851 1851
1852 1852 ASSERT(MUTEX_HELD(&msp->ms_lock));
1853 1853
1854 1854 /*
1855 1855 * This vdev is in the process of being removed so there is nothing
1856 1856 * for us to do here.
1857 1857 */
1858 1858 if (vd->vdev_removing) {
1859 1859 ASSERT0(space_map_allocated(msp->ms_sm));
1860 1860 ASSERT0(vd->vdev_ms_shift);
1861 1861 return (0);
1862 1862 }
1863 1863
1864 1864 metaslab_set_fragmentation(msp);
1865 1865
1866 1866 /*
1867 1867 * Update the maximum size if the metaslab is loaded. This will
1868 1868 * ensure that we get an accurate maximum size if newly freed space
1869 1869 * has been added back into the free tree.
1870 1870 */
1871 1871 if (msp->ms_loaded)
1872 1872 msp->ms_max_size = metaslab_block_maxsize(msp);
1873 1873
1874 1874 /*
1875 1875 * Segment-based weighting requires space map histogram support.
1876 1876 */
1877 1877 if (zfs_metaslab_segment_weight_enabled &&
1878 1878 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
1879 1879 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
1880 1880 sizeof (space_map_phys_t))) {
1881 1881 weight = metaslab_segment_weight(msp);
1882 1882 } else {
1883 1883 weight = metaslab_space_weight(msp);
1884 1884 }
1885 1885 return (weight);
1886 1886 }
1887 1887
1888 1888 static int
1889 1889 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1890 1890 {
1891 1891 ASSERT(MUTEX_HELD(&msp->ms_lock));
1892 1892
1893 1893 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1894 1894 metaslab_load_wait(msp);
1895 1895 if (!msp->ms_loaded) {
1896 1896 int error = metaslab_load(msp);
1897 1897 if (error) {
1898 1898 metaslab_group_sort(msp->ms_group, msp, 0);
1899 1899 return (error);
1900 1900 }
1901 1901 }
1902 1902
1903 1903 msp->ms_activation_weight = msp->ms_weight;
1904 1904 metaslab_group_sort(msp->ms_group, msp,
1905 1905 msp->ms_weight | activation_weight);
1906 1906 }
1907 1907 ASSERT(msp->ms_loaded);
1908 1908 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1909 1909
1910 1910 return (0);
1911 1911 }
1912 1912
1913 1913 static void
1914 1914 metaslab_passivate(metaslab_t *msp, uint64_t weight)
1915 1915 {
1916 1916 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
1917 1917
1918 1918 /*
1919 1919 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1920 1920 * this metaslab again. In that case, it had better be empty,
1921 1921 * or we would be leaving space on the table.
1922 1922 */
1923 1923 ASSERT(size >= SPA_MINBLOCKSIZE ||
1924 1924 range_tree_space(msp->ms_tree) == 0);
1925 1925 ASSERT0(weight & METASLAB_ACTIVE_MASK);
1926 1926
1927 1927 msp->ms_activation_weight = 0;
1928 1928 metaslab_group_sort(msp->ms_group, msp, weight);
1929 1929 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1930 1930 }
1931 1931
1932 1932 /*
1933 1933 * Segment-based metaslabs are activated once and remain active until
1934 1934 * we either fail an allocation attempt (similar to space-based metaslabs)
1935 1935 * or have exhausted the free space in zfs_metaslab_switch_threshold
1936 1936 * buckets since the metaslab was activated. This function checks to see
1937 1937  * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
1938 1938  * metaslab and passivates it proactively. This will allow us to select a
1939 1939  * metaslab with a larger contiguous region, if any remains within this
1940 1940  * metaslab group. If we're in sync pass > 1, then we continue using this
1941 1941  * metaslab so that we don't dirty more blocks and cause more sync passes.
1942 1942 */
1943 1943 void
1944 1944 metaslab_segment_may_passivate(metaslab_t *msp)
1945 1945 {
1946 1946 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1947 1947
1948 1948 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
1949 1949 return;
1950 1950
1951 1951 /*
1952 1952 * Since we are in the middle of a sync pass, the most accurate
1953 1953 * information that is accessible to us is the in-core range tree
1954 1954 * histogram; calculate the new weight based on that information.
1955 1955 */
1956 1956 uint64_t weight = metaslab_weight_from_range_tree(msp);
1957 1957 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
1958 1958 int current_idx = WEIGHT_GET_INDEX(weight);
1959 1959
1960 1960 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
1961 1961 metaslab_passivate(msp, weight);
1962 1962 }
1963 1963
1964 1964 static void
1965 1965 metaslab_preload(void *arg)
1966 1966 {
1967 1967 metaslab_t *msp = arg;
1968 1968 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1969 1969
1970 1970 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1971 1971
1972 1972 mutex_enter(&msp->ms_lock);
1973 1973 metaslab_load_wait(msp);
1974 1974 if (!msp->ms_loaded)
1975 1975 (void) metaslab_load(msp);
1976 1976 msp->ms_selected_txg = spa_syncing_txg(spa);
1977 1977 mutex_exit(&msp->ms_lock);
1978 1978 }
1979 1979
1980 1980 static void
1981 1981 metaslab_group_preload(metaslab_group_t *mg)
1982 1982 {
1983 1983 spa_t *spa = mg->mg_vd->vdev_spa;
1984 1984 metaslab_t *msp;
1985 1985 avl_tree_t *t = &mg->mg_metaslab_tree;
1986 1986 int m = 0;
1987 1987
1988 1988 if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1989 1989 taskq_wait(mg->mg_taskq);
1990 1990 return;
1991 1991 }
1992 1992
1993 1993 mutex_enter(&mg->mg_lock);
1994 1994 /*
1995 1995 * Load the next potential metaslabs
1996 1996 */
1997 1997 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
1998 1998 /*
1999 1999 * We preload only the maximum number of metaslabs specified
2000 2000 * by metaslab_preload_limit. If a metaslab is being forced
2001 2001 * to condense then we preload it too. This will ensure
2002 2002 * that force condensing happens in the next txg.
2003 2003 */
2004 2004 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
2005 2005 continue;
2006 2006 }
2007 2007
2008 2008 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
2009 2009 msp, TQ_SLEEP) != NULL);
2010 2010 }
2011 2011 mutex_exit(&mg->mg_lock);
2012 2012 }
2013 2013
2014 2014 /*
2015 2015 * Determine if the space map's on-disk footprint is past our tolerance
2016 2016 * for inefficiency. We would like to use the following criteria to make
2017 2017 * our decision:
2018 2018 *
2019 2019 * 1. The size of the space map object should not dramatically increase as a
2020 2020 * result of writing out the free space range tree.
2021 2021 *
2022 2022 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
2023 2023  * times the size of the free space range tree representation
2024 2024  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
2025 2025 *
2026 2026 * 3. The on-disk size of the space map should actually decrease.
2027 2027 *
2028 2028 * Checking the first condition is tricky since we don't want to walk
2029 2029 * the entire AVL tree calculating the estimated on-disk size. Instead we
2030 2030 * use the size-ordered range tree in the metaslab and calculate the
2031 2031 * size required to write out the largest segment in our free tree. If the
2032 2032 * size required to represent that segment on disk is larger than the space
2033 2033 * map object then we avoid condensing this map.
2034 2034 *
2035 2035 * To determine the second criterion we use a best-case estimate and assume
2036 2036 * each segment can be represented on-disk as a single 64-bit entry. We refer
2037 2037 * to this best-case estimate as the space map's minimal form.
2038 2038 *
2039 2039 * Unfortunately, we cannot compute the on-disk size of the space map in this
2040 2040 * context because we cannot accurately compute the effects of compression, etc.
2041 2041 * Instead, we apply the heuristic described in the block comment for
2042 2042 * zfs_metaslab_condense_block_threshold - we only condense if the space used
2043 2043 * is greater than a threshold number of blocks.
2044 2044 */
2045 2045 static boolean_t
2046 2046 metaslab_should_condense(metaslab_t *msp)
2047 2047 {
2048 2048 space_map_t *sm = msp->ms_sm;
2049 2049 range_seg_t *rs;
2050 2050 uint64_t size, entries, segsz, object_size, optimal_size, record_size;
2051 2051 dmu_object_info_t doi;
2052 2052 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
2053 2053
2054 2054 ASSERT(MUTEX_HELD(&msp->ms_lock));
2055 2055 ASSERT(msp->ms_loaded);
2056 2056
2057 2057 /*
2058 2058 * Use the ms_size_tree range tree, which is ordered by size, to
2059 2059 * obtain the largest segment in the free tree. We always condense
2060 2060 * metaslabs that are empty and metaslabs for which a condense
2061 2061 * request has been made.
2062 2062 */
2063 2063 rs = avl_last(&msp->ms_size_tree);
2064 2064 if (rs == NULL || msp->ms_condense_wanted)
2065 2065 return (B_TRUE);
2066 2066
2067 2067 /*
2068 2068 * Calculate the number of 64-bit entries this segment would
2069 2069 * require when written to disk. If this single segment would be
2070 2070 * larger on-disk than the entire current on-disk structure, then
2071 2071 * clearly condensing will increase the on-disk structure size.
2072 2072 */
2073 2073 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
2074 2074 entries = size / (MIN(size, SM_RUN_MAX));
2075 2075 segsz = entries * sizeof (uint64_t);
2076 2076
2077 2077 optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
2078 2078 object_size = space_map_length(msp->ms_sm);
2079 2079
2080 2080 dmu_object_info_from_db(sm->sm_dbuf, &doi);
2081 2081 record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
2082 2082
2083 2083 return (segsz <= object_size &&
2084 2084 object_size >= (optimal_size * zfs_condense_pct / 100) &&
2085 2085 object_size > zfs_metaslab_condense_block_threshold * record_size);
2086 2086 }
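/*
 * A worked example of the checks above (illustrative only): with
 * zfs_condense_pct = 200 and a loaded free tree of 1000 segments, the minimal
 * form is 1000 * 8 bytes = 8000 bytes, so we condense only if the current
 * space map object is at least 16000 bytes long, exceeds
 * zfs_metaslab_condense_block_threshold (4) records, and is no smaller than
 * the on-disk size of the largest single free segment.
 */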
2087 2087
2088 2088 /*
2089 2089 * Condense the on-disk space map representation to its minimized form.
2090 2090 * The minimized form consists of a small number of allocations followed by
2091 2091 * the entries of the free range tree.
2092 2092 */
2093 2093 static void
2094 2094 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
2095 2095 {
2096 2096 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2097 2097 range_tree_t *condense_tree;
2098 2098 space_map_t *sm = msp->ms_sm;
2099 2099
2100 2100 ASSERT(MUTEX_HELD(&msp->ms_lock));
2101 2101 ASSERT3U(spa_sync_pass(spa), ==, 1);
2102 2102 ASSERT(msp->ms_loaded);
2103 2103
2104 2104
2105 2105 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
2106 2106 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
2107 2107 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
2108 2108 msp->ms_group->mg_vd->vdev_spa->spa_name,
2109 2109 space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
2110 2110 msp->ms_condense_wanted ? "TRUE" : "FALSE");
2111 2111
2112 2112 msp->ms_condense_wanted = B_FALSE;
2113 2113
2114 2114 /*
2115 2115 	 * Create a range tree that is 100% allocated. We remove segments
2116 2116 	 * that have been freed in this txg, any deferred frees that exist,
2117 2117 	 * and any allocations in the future. Removing segments should be
2118 2118 * a relatively inexpensive operation since we expect these trees to
2119 2119 * have a small number of nodes.
2120 2120 */
2121 2121 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
2122 2122 range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
2123 2123
2124 2124 /*
2125 2125 * Remove what's been freed in this txg from the condense_tree.
2126 2126 * Since we're in sync_pass 1, we know that all the frees from
2127 2127 * this txg are in the freeingtree.
2128 2128 */
2129 2129 range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
2130 2130
2131 2131 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2132 2132 range_tree_walk(msp->ms_defertree[t],
2133 2133 range_tree_remove, condense_tree);
2134 2134 }
2135 2135
2136 2136 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2137 2137 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
2138 2138 range_tree_remove, condense_tree);
2139 2139 }
2140 2140
2141 2141 /*
2142 2142 * We're about to drop the metaslab's lock thus allowing
2143 2143 	 * other consumers to change its content. Set the
2144 2144 * metaslab's ms_condensing flag to ensure that
2145 2145 * allocations on this metaslab do not occur while we're
2146 2146 * in the middle of committing it to disk. This is only critical
2147 2147 * for the ms_tree as all other range trees use per txg
2148 2148 * views of their content.
2149 2149 */
2150 2150 msp->ms_condensing = B_TRUE;
2151 2151
2152 2152 mutex_exit(&msp->ms_lock);
2153 2153 space_map_truncate(sm, tx);
2154 2154 mutex_enter(&msp->ms_lock);
2155 2155
2156 2156 /*
2157 2157 * While we would ideally like to create a space map representation
2158 2158 * that consists only of allocation records, doing so can be
2159 2159 * prohibitively expensive because the in-core free tree can be
2160 2160 * large, and therefore computationally expensive to subtract
2161 2161 * from the condense_tree. Instead we sync out two trees, a cheap
2162 2162 * allocation only tree followed by the in-core free tree. While not
2163 2163 * optimal, this is typically close to optimal, and much cheaper to
2164 2164 * compute.
2165 2165 */
2166 2166 space_map_write(sm, condense_tree, SM_ALLOC, tx);
2167 2167 range_tree_vacate(condense_tree, NULL, NULL);
2168 2168 range_tree_destroy(condense_tree);
2169 2169
2170 2170 space_map_write(sm, msp->ms_tree, SM_FREE, tx);
2171 2171 msp->ms_condensing = B_FALSE;
2172 2172 }
2173 2173
2174 2174 /*
2175 2175 * Write a metaslab to disk in the context of the specified transaction group.
2176 2176 */
2177 2177 void
2178 2178 metaslab_sync(metaslab_t *msp, uint64_t txg)
2179 2179 {
2180 2180 metaslab_group_t *mg = msp->ms_group;
2181 2181 vdev_t *vd = mg->mg_vd;
2182 2182 spa_t *spa = vd->vdev_spa;
2183 2183 objset_t *mos = spa_meta_objset(spa);
2184 2184 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
2185 2185 dmu_tx_t *tx;
2186 2186 uint64_t object = space_map_object(msp->ms_sm);
2187 2187
2188 2188 ASSERT(!vd->vdev_ishole);
2189 2189
2190 2190 /*
2191 2191 * This metaslab has just been added so there's no work to do now.
2192 2192 */
2193 2193 if (msp->ms_freeingtree == NULL) {
2194 2194 ASSERT3P(alloctree, ==, NULL);
2195 2195 return;
2196 2196 }
2197 2197
2198 2198 ASSERT3P(alloctree, !=, NULL);
2199 2199 ASSERT3P(msp->ms_freeingtree, !=, NULL);
2200 2200 ASSERT3P(msp->ms_freedtree, !=, NULL);
2201 2201
2202 2202 /*
2203 2203 * Normally, we don't want to process a metaslab if there
2204 2204 * are no allocations or frees to perform. However, if the metaslab
2205 2205 * is being forced to condense and it's loaded, we need to let it
2206 2206 * through.
2207 2207 */
2208 2208 if (range_tree_space(alloctree) == 0 &&
2209 2209 range_tree_space(msp->ms_freeingtree) == 0 &&
2210 2210 !(msp->ms_loaded && msp->ms_condense_wanted))
2211 2211 return;
2212 2212
2213 2213
2214 2214 VERIFY(txg <= spa_final_dirty_txg(spa));
2215 2215
2216 2216 /*
2217 2217 * The only state that can actually be changing concurrently with
2218 2218 * metaslab_sync() is the metaslab's ms_tree. No other thread can
2219 2219 * be modifying this txg's alloctree, freeingtree, freedtree, or
2220 2220 	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
2221 2221 * space map ASSERTs. We drop it whenever we call into the DMU,
2222 2222 * because the DMU can call down to us (e.g. via zio_free()) at
2223 2223 * any time.
2224 2224 */
2225 2225
2226 2226 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2227 2227
2228 2228 if (msp->ms_sm == NULL) {
2229 2229 uint64_t new_object;
2230 2230
2231 2231 new_object = space_map_alloc(mos, tx);
2232 2232 VERIFY3U(new_object, !=, 0);
2233 2233
2234 2234 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2235 2235 msp->ms_start, msp->ms_size, vd->vdev_ashift,
2236 2236 &msp->ms_lock));
2237 2237 ASSERT(msp->ms_sm != NULL);
2238 2238 }
2239 2239
2240 2240 mutex_enter(&msp->ms_lock);
2241 2241
2242 2242 /*
2243 2243 * Note: metaslab_condense() clears the space map's histogram.
2244 2244 * Therefore we must verify and remove this histogram before
2245 2245 * condensing.
2246 2246 */
2247 2247 metaslab_group_histogram_verify(mg);
2248 2248 metaslab_class_histogram_verify(mg->mg_class);
2249 2249 metaslab_group_histogram_remove(mg, msp);
2250 2250
2251 2251 if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
2252 2252 metaslab_should_condense(msp)) {
2253 2253 metaslab_condense(msp, txg, tx);
2254 2254 } else {
2255 2255 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
2256 2256 space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
2257 2257 }
2258 2258
2259 2259 if (msp->ms_loaded) {
2260 2260 /*
2261 2261 		 * When the space map is loaded, we have an accurate
2262 2262 * histogram in the range tree. This gives us an opportunity
2263 2263 * to bring the space map's histogram up-to-date so we clear
2264 2264 * it first before updating it.
2265 2265 */
2266 2266 space_map_histogram_clear(msp->ms_sm);
2267 2267 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
2268 2268
2269 2269 /*
2270 2270 * Since we've cleared the histogram we need to add back
2271 2271 * any free space that has already been processed, plus
2272 2272 * any deferred space. This allows the on-disk histogram
2273 2273 * to accurately reflect all free space even if some space
2274 2274 * is not yet available for allocation (i.e. deferred).
2275 2275 */
2276 2276 space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
2277 2277
2278 2278 /*
2279 2279 * Add back any deferred free space that has not been
2280 2280 * added back into the in-core free tree yet. This will
2281 2281 * ensure that we don't end up with a space map histogram
2282 2282 * that is completely empty unless the metaslab is fully
2283 2283 * allocated.
2284 2284 */
2285 2285 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2286 2286 space_map_histogram_add(msp->ms_sm,
2287 2287 msp->ms_defertree[t], tx);
2288 2288 }
2289 2289 }
2290 2290
2291 2291 /*
2292 2292 * Always add the free space from this sync pass to the space
2293 2293 * map histogram. We want to make sure that the on-disk histogram
2294 2294 * accounts for all free space. If the space map is not loaded,
2295 2295 * then we will lose some accuracy but will correct it the next
2296 2296 * time we load the space map.
2297 2297 */
2298 2298 space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
2299 2299
2300 2300 metaslab_group_histogram_add(mg, msp);
2301 2301 metaslab_group_histogram_verify(mg);
2302 2302 metaslab_class_histogram_verify(mg->mg_class);
2303 2303
2304 2304 /*
2305 2305 * For sync pass 1, we avoid traversing this txg's free range tree
2306 2306 * and instead will just swap the pointers for freeingtree and
2307 2307 * freedtree. We can safely do this since the freed_tree is
2308 2308 * guaranteed to be empty on the initial pass.
2309 2309 */
2310 2310 if (spa_sync_pass(spa) == 1) {
2311 2311 range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
2312 2312 } else {
2313 2313 range_tree_vacate(msp->ms_freeingtree,
2314 2314 range_tree_add, msp->ms_freedtree);
2315 2315 }
2316 2316 range_tree_vacate(alloctree, NULL, NULL);
2317 2317
2318 2318 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2319 2319 ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
2320 2320 ASSERT0(range_tree_space(msp->ms_freeingtree));
2321 2321
2322 2322 mutex_exit(&msp->ms_lock);
2323 2323
2324 2324 if (object != space_map_object(msp->ms_sm)) {
2325 2325 object = space_map_object(msp->ms_sm);
2326 2326 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2327 2327 msp->ms_id, sizeof (uint64_t), &object, tx);
2328 2328 }
2329 2329 dmu_tx_commit(tx);
2330 2330 }
2331 2331
2332 2332 /*
2333 2333 * Called after a transaction group has completely synced to mark
2334 2334 * all of the metaslab's free space as usable.
2335 2335 */
2336 2336 void
2337 2337 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2338 2338 {
2339 2339 metaslab_group_t *mg = msp->ms_group;
2340 2340 vdev_t *vd = mg->mg_vd;
2341 2341 spa_t *spa = vd->vdev_spa;
2342 2342 range_tree_t **defer_tree;
2343 2343 int64_t alloc_delta, defer_delta;
2344 2344 boolean_t defer_allowed = B_TRUE;
2345 2345
2346 2346 ASSERT(!vd->vdev_ishole);
2347 2347
2348 2348 mutex_enter(&msp->ms_lock);
2349 2349
2350 2350 /*
2351 2351 * If this metaslab is just becoming available, initialize its
2352 2352 * range trees and add its capacity to the vdev.
2353 2353 */
2354 2354 if (msp->ms_freedtree == NULL) {
2355 2355 for (int t = 0; t < TXG_SIZE; t++) {
2356 2356 ASSERT(msp->ms_alloctree[t] == NULL);
2357 2357
2358 2358 msp->ms_alloctree[t] = range_tree_create(NULL, msp,
2359 2359 &msp->ms_lock);
2360 2360 }
2361 2361
2362 2362 ASSERT3P(msp->ms_freeingtree, ==, NULL);
2363 2363 msp->ms_freeingtree = range_tree_create(NULL, msp,
2364 2364 &msp->ms_lock);
2365 2365
2366 2366 ASSERT3P(msp->ms_freedtree, ==, NULL);
2367 2367 msp->ms_freedtree = range_tree_create(NULL, msp,
2368 2368 &msp->ms_lock);
2369 2369
2370 2370 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2371 2371 ASSERT(msp->ms_defertree[t] == NULL);
2372 2372
2373 2373 msp->ms_defertree[t] = range_tree_create(NULL, msp,
2374 2374 &msp->ms_lock);
2375 2375 }
2376 2376
2377 2377 vdev_space_update(vd, 0, 0, msp->ms_size);
2378 2378 }
2379 2379
2380 2380 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
2381 2381
2382 2382 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
2383 2383 metaslab_class_get_alloc(spa_normal_class(spa));
2384 2384 if (free_space <= spa_get_slop_space(spa)) {
2385 2385 defer_allowed = B_FALSE;
2386 2386 }
2387 2387
2388 2388 defer_delta = 0;
2389 2389 alloc_delta = space_map_alloc_delta(msp->ms_sm);
2390 2390 if (defer_allowed) {
2391 2391 defer_delta = range_tree_space(msp->ms_freedtree) -
2392 2392 range_tree_space(*defer_tree);
2393 2393 } else {
2394 2394 defer_delta -= range_tree_space(*defer_tree);
2395 2395 }
2396 2396
2397 2397 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2398 2398
2399 2399 /*
2400 2400 * If there's a metaslab_load() in progress, wait for it to complete
2401 2401 * so that we have a consistent view of the in-core space map.
2402 2402 */
2403 2403 metaslab_load_wait(msp);
2404 2404
2405 2405 /*
2406 2406 * Move the frees from the defer_tree back to the free
2407 2407 * range tree (if it's loaded). Swap the freed_tree and the
2408 2408 * defer_tree -- this is safe to do because we've just emptied out
2409 2409 * the defer_tree.
2410 2410 */
2411 2411 range_tree_vacate(*defer_tree,
2412 2412 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2413 2413 if (defer_allowed) {
2414 2414 range_tree_swap(&msp->ms_freedtree, defer_tree);
2415 2415 } else {
2416 2416 range_tree_vacate(msp->ms_freedtree,
2417 2417 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2418 2418 }
2419 2419
2420 2420 space_map_update(msp->ms_sm);
2421 2421
2422 2422 msp->ms_deferspace += defer_delta;
2423 2423 ASSERT3S(msp->ms_deferspace, >=, 0);
2424 2424 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2425 2425 if (msp->ms_deferspace != 0) {
2426 2426 /*
2427 2427 * Keep syncing this metaslab until all deferred frees
2428 2428 * are back in circulation.
2429 2429 */
2430 2430 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2431 2431 }
2432 2432
2433 2433 /*
2434 2434 * Calculate the new weights before unloading any metaslabs.
2435 2435 * This will give us the most accurate weighting.
2436 2436 */
2437 2437 metaslab_group_sort(mg, msp, metaslab_weight(msp));
2438 2438
2439 2439 /*
2440 2440 * If the metaslab is loaded and we've not tried to load or allocate
2441 2441 * from it in 'metaslab_unload_delay' txgs, then unload it.
2442 2442 */
2443 2443 if (msp->ms_loaded &&
2444 2444 msp->ms_selected_txg + metaslab_unload_delay < txg) {
2445 2445 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2446 2446 VERIFY0(range_tree_space(
2447 2447 msp->ms_alloctree[(txg + t) & TXG_MASK]));
2448 2448 }
2449 2449
2450 2450 if (!metaslab_debug_unload)
2451 2451 metaslab_unload(msp);
2452 2452 }
2453 2453
2454 2454 mutex_exit(&msp->ms_lock);
2455 2455 }
2456 2456
2457 2457 void
2458 2458 metaslab_sync_reassess(metaslab_group_t *mg)
2459 2459 {
2460 2460 metaslab_group_alloc_update(mg);
2461 2461 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2462 2462
2463 2463 /*
2464 2464 * Preload the next potential metaslabs
2465 2465 */
2466 2466 metaslab_group_preload(mg);
2467 2467 }
2468 2468
2469 2469 static uint64_t
2470 2470 metaslab_distance(metaslab_t *msp, dva_t *dva)
2471 2471 {
2472 2472 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2473 2473 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2474 2474 uint64_t start = msp->ms_id;
2475 2475
2476 2476 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2477 2477 return (1ULL << 63);
2478 2478
2479 2479 if (offset < start)
2480 2480 return ((start - offset) << ms_shift);
2481 2481 if (offset > start)
2482 2482 return ((offset - start) << ms_shift);
2483 2483 return (0);
2484 2484 }
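/*
 * For example (illustrative): with 16GB metaslabs (ms_shift = 34), a DVA on
 * the same vdev that lands three metaslabs away yields a distance of
 * 3 << 34 = 48GB, while a DVA on a different vdev returns 2^63 and thus
 * never constrains placement.
 */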
2485 2485
2486 2486 /*
2487 2487 * ==========================================================================
2488 2488 * Metaslab allocation tracing facility
2489 2489 * ==========================================================================
2490 2490 */
2491 2491 kstat_t *metaslab_trace_ksp;
2492 2492 kstat_named_t metaslab_trace_over_limit;
2493 2493
2494 2494 void
2495 2495 metaslab_alloc_trace_init(void)
2496 2496 {
2497 2497 ASSERT(metaslab_alloc_trace_cache == NULL);
2498 2498 metaslab_alloc_trace_cache = kmem_cache_create(
2499 2499 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
2500 2500 0, NULL, NULL, NULL, NULL, NULL, 0);
2501 2501 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
2502 2502 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
2503 2503 if (metaslab_trace_ksp != NULL) {
2504 2504 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
2505 2505 kstat_named_init(&metaslab_trace_over_limit,
2506 2506 "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
2507 2507 kstat_install(metaslab_trace_ksp);
2508 2508 }
2509 2509 }
2510 2510
2511 2511 void
2512 2512 metaslab_alloc_trace_fini(void)
2513 2513 {
2514 2514 if (metaslab_trace_ksp != NULL) {
2515 2515 kstat_delete(metaslab_trace_ksp);
2516 2516 metaslab_trace_ksp = NULL;
2517 2517 }
2518 2518 kmem_cache_destroy(metaslab_alloc_trace_cache);
2519 2519 metaslab_alloc_trace_cache = NULL;
2520 2520 }
2521 2521
2522 2522 /*
2523 2523 * Add an allocation trace element to the allocation tracing list.
2524 2524 */
2525 2525 static void
2526 2526 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2527 2527 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
2528 2528 {
2529 2529 if (!metaslab_trace_enabled)
2530 2530 return;
2531 2531
2532 2532 /*
2533 2533 * When the tracing list reaches its maximum we remove
2534 2534 * the second element in the list before adding a new one.
2535 2535 * By removing the second element we preserve the original
2536 2536 * entry as a clue to what allocations steps have already been
2537 2537 	 * entry as a clue to what allocation steps have already been
2538 2538 */
2539 2539 if (zal->zal_size == metaslab_trace_max_entries) {
2540 2540 metaslab_alloc_trace_t *mat_next;
2541 2541 #ifdef DEBUG
2542 2542 panic("too many entries in allocation list");
2543 2543 #endif
2544 2544 atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
2545 2545 zal->zal_size--;
2546 2546 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
2547 2547 list_remove(&zal->zal_list, mat_next);
2548 2548 kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
2549 2549 }
2550 2550
2551 2551 metaslab_alloc_trace_t *mat =
2552 2552 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2553 2553 list_link_init(&mat->mat_list_node);
2554 2554 mat->mat_mg = mg;
2555 2555 mat->mat_msp = msp;
2556 2556 mat->mat_size = psize;
2557 2557 mat->mat_dva_id = dva_id;
2558 2558 mat->mat_offset = offset;
2559 2559 mat->mat_weight = 0;
2560 2560
2561 2561 if (msp != NULL)
2562 2562 mat->mat_weight = msp->ms_weight;
2563 2563
2564 2564 /*
2565 2565 * The list is part of the zio so locking is not required. Only
2566 2566 * a single thread will perform allocations for a given zio.
2567 2567 */
2568 2568 list_insert_tail(&zal->zal_list, mat);
2569 2569 zal->zal_size++;
2570 2570
2571 2571 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
2572 2572 }
2573 2573
2574 2574 void
2575 2575 metaslab_trace_init(zio_alloc_list_t *zal)
2576 2576 {
2577 2577 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
2578 2578 offsetof(metaslab_alloc_trace_t, mat_list_node));
2579 2579 zal->zal_size = 0;
2580 2580 }
2581 2581
2582 2582 void
2583 2583 metaslab_trace_fini(zio_alloc_list_t *zal)
2584 2584 {
2585 2585 metaslab_alloc_trace_t *mat;
2586 2586
2587 2587 while ((mat = list_remove_head(&zal->zal_list)) != NULL)
2588 2588 kmem_cache_free(metaslab_alloc_trace_cache, mat);
2589 2589 list_destroy(&zal->zal_list);
2590 2590 zal->zal_size = 0;
2591 2591 }
2592 2592
2593 2593 /*
2594 2594 * ==========================================================================
2595 2595 * Metaslab block operations
2596 2596 * ==========================================================================
2597 2597 */
2598 2598
2599 2599 static void
2600 2600 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2601 2601 {
2602 2602 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2603 2603 flags & METASLAB_DONT_THROTTLE)
2604 2604 return;
2605 2605
2606 2606 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2607 2607 if (!mg->mg_class->mc_alloc_throttle_enabled)
2608 2608 return;
2609 2609
2610 2610 (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2611 2611 }
2612 2612
2613 2613 void
2614 2614 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2615 2615 {
2616 2616 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2617 2617 flags & METASLAB_DONT_THROTTLE)
2618 2618 return;
2619 2619
2620 2620 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2621 2621 if (!mg->mg_class->mc_alloc_throttle_enabled)
2622 2622 return;
2623 2623
2624 2624 (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2625 2625 }
2626 2626
2627 2627 void
2628 2628 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2629 2629 {
2630 2630 #ifdef ZFS_DEBUG
2631 2631 const dva_t *dva = bp->blk_dva;
2632 2632 int ndvas = BP_GET_NDVAS(bp);
2633 2633
2634 2634 for (int d = 0; d < ndvas; d++) {
2635 2635 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2636 2636 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2637 2637 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2638 2638 }
2639 2639 #endif
2640 2640 }
2641 2641
2642 2642 static uint64_t
2643 2643 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
2644 2644 {
2645 2645 uint64_t start;
2646 2646 range_tree_t *rt = msp->ms_tree;
2647 2647 metaslab_class_t *mc = msp->ms_group->mg_class;
2648 2648
2649 2649 VERIFY(!msp->ms_condensing);
2650 2650
2651 2651 start = mc->mc_ops->msop_alloc(msp, size);
2652 2652 if (start != -1ULL) {
2653 2653 metaslab_group_t *mg = msp->ms_group;
2654 2654 vdev_t *vd = mg->mg_vd;
2655 2655
2656 2656 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
2657 2657 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2658 2658 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
2659 2659 range_tree_remove(rt, start, size);
2660 2660
2661 2661 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2662 2662 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2663 2663
2664 2664 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
2665 2665
2666 2666 /* Track the last successful allocation */
2667 2667 msp->ms_alloc_txg = txg;
2668 2668 metaslab_verify_space(msp, txg);
2669 2669 }
2670 2670
2671 2671 /*
2672 2672 * Now that we've attempted the allocation we need to update the
2673 2673 * metaslab's maximum block size since it may have changed.
2674 2674 */
2675 2675 msp->ms_max_size = metaslab_block_maxsize(msp);
2676 2676 return (start);
2677 2677 }
2678 2678
2679 2679 static uint64_t
2680 2680 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
2681 2681 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2682 2682 {
2683 2683 metaslab_t *msp = NULL;
2684 2684 uint64_t offset = -1ULL;
2685 2685 uint64_t activation_weight;
2686 2686 uint64_t target_distance;
2687 2687 int i;
2688 2688
2689 2689 activation_weight = METASLAB_WEIGHT_PRIMARY;
2690 2690 for (i = 0; i < d; i++) {
2691 2691 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2692 2692 activation_weight = METASLAB_WEIGHT_SECONDARY;
2693 2693 break;
2694 2694 }
2695 2695 }
2696 2696
2697 2697 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
2698 2698 search->ms_weight = UINT64_MAX;
2699 2699 search->ms_start = 0;
2700 2700 for (;;) {
2701 2701 boolean_t was_active;
2702 2702 avl_tree_t *t = &mg->mg_metaslab_tree;
2703 2703 avl_index_t idx;
2704 2704
2705 2705 mutex_enter(&mg->mg_lock);
2706 2706
2707 2707 /*
2708 2708 * Find the metaslab with the highest weight that is less
2709 2709 * than what we've already tried. In the common case, this
2710 2710 * means that we will examine each metaslab at most once.
2711 2711 * Note that concurrent callers could reorder metaslabs
2712 2712 * by activation/passivation once we have dropped the mg_lock.
2713 2713 * If a metaslab is activated by another thread, and we fail
2714 2714 * to allocate from the metaslab we have selected, we may
2715 2715 * not try the newly-activated metaslab, and instead activate
2716 2716 * another metaslab. This is not optimal, but generally
2717 2717 * does not cause any problems (a possible exception being
2718 2718 * if every metaslab is completely full except for the
2719 2719 		 * newly-activated metaslab which we fail to examine).
2720 2720 */
2721 2721 msp = avl_find(t, search, &idx);
2722 2722 if (msp == NULL)
2723 2723 msp = avl_nearest(t, idx, AVL_AFTER);
2724 2724 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
2725 2725
2726 2726 if (!metaslab_should_allocate(msp, asize)) {
2727 2727 metaslab_trace_add(zal, mg, msp, asize, d,
2728 2728 TRACE_TOO_SMALL);
2729 2729 continue;
2730 2730 }
2731 2731
2732 2732 /*
2733 2733 * If the selected metaslab is condensing, skip it.
2734 2734 */
2735 2735 if (msp->ms_condensing)
2736 2736 continue;
2737 2737
2738 2738 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2739 2739 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2740 2740 break;
2741 2741
2742 2742 target_distance = min_distance +
2743 2743 (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2744 2744 min_distance >> 1);
2745 2745
2746 2746 for (i = 0; i < d; i++) {
2747 2747 if (metaslab_distance(msp, &dva[i]) <
2748 2748 target_distance)
2749 2749 break;
2750 2750 }
2751 2751 if (i == d)
2752 2752 break;
2753 2753 }
2754 2754 mutex_exit(&mg->mg_lock);
2755 2755 if (msp == NULL) {
2756 2756 kmem_free(search, sizeof (*search));
2757 2757 return (-1ULL);
2758 2758 }
2759 2759 search->ms_weight = msp->ms_weight;
2760 2760 search->ms_start = msp->ms_start + 1;
2761 2761
2762 2762 mutex_enter(&msp->ms_lock);
2763 2763
2764 2764 /*
2765 2765 * Ensure that the metaslab we have selected is still
2766 2766 * capable of handling our request. It's possible that
2767 2767 * another thread may have changed the weight while we
2768 2768 * were blocked on the metaslab lock. We check the
2769 2769 * active status first to see if we need to reselect
2770 2770 * a new metaslab.
2771 2771 */
2772 2772 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
2773 2773 mutex_exit(&msp->ms_lock);
2774 2774 continue;
2775 2775 }
2776 2776
2777 2777 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2778 2778 activation_weight == METASLAB_WEIGHT_PRIMARY) {
2779 2779 metaslab_passivate(msp,
2780 2780 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2781 2781 mutex_exit(&msp->ms_lock);
2782 2782 continue;
2783 2783 }
2784 2784
2785 2785 if (metaslab_activate(msp, activation_weight) != 0) {
2786 2786 mutex_exit(&msp->ms_lock);
2787 2787 continue;
2788 2788 }
2789 2789 msp->ms_selected_txg = txg;
2790 2790
2791 2791 /*
2792 2792 * Now that we have the lock, recheck to see if we should
2793 2793 * continue to use this metaslab for this allocation. The
2794 2794 		 * metaslab is now loaded so metaslab_should_allocate() can
2795 2795 * accurately determine if the allocation attempt should
2796 2796 * proceed.
2797 2797 */
2798 2798 if (!metaslab_should_allocate(msp, asize)) {
2799 2799 /* Passivate this metaslab and select a new one. */
2800 2800 metaslab_trace_add(zal, mg, msp, asize, d,
2801 2801 TRACE_TOO_SMALL);
2802 2802 goto next;
2803 2803 }
2804 2804
2805 2805 /*
2806 2806 * If this metaslab is currently condensing then pick again as
2807 2807 * we can't manipulate this metaslab until it's committed
2808 2808 * to disk.
2809 2809 */
2810 2810 if (msp->ms_condensing) {
2811 2811 metaslab_trace_add(zal, mg, msp, asize, d,
2812 2812 TRACE_CONDENSING);
2813 2813 mutex_exit(&msp->ms_lock);
2814 2814 continue;
2815 2815 }
2816 2816
2817 2817 offset = metaslab_block_alloc(msp, asize, txg);
2818 2818 metaslab_trace_add(zal, mg, msp, asize, d, offset);
2819 2819
2820 2820 if (offset != -1ULL) {
2821 2821 /* Proactively passivate the metaslab, if needed */
2822 2822 metaslab_segment_may_passivate(msp);
2823 2823 break;
2824 2824 }
2825 2825 next:
2826 2826 ASSERT(msp->ms_loaded);
2827 2827
2828 2828 /*
2829 2829 * We were unable to allocate from this metaslab so determine
2830 2830 * a new weight for this metaslab. Now that we have loaded
2831 2831 * the metaslab we can provide a better hint to the metaslab
2832 2832 * selector.
2833 2833 *
2834 2834 * For space-based metaslabs, we use the maximum block size.
2835 2835 * This information is only available when the metaslab
2836 2836 * is loaded and is more accurate than the generic free
2837 2837 * space weight that was calculated by metaslab_weight().
2838 2838 * This information allows us to quickly compare the maximum
2839 2839 * available allocation in the metaslab to the allocation
2840 2840 * size being requested.
2841 2841 *
2842 2842 * For segment-based metaslabs, determine the new weight
2843 2843 * based on the highest bucket in the range tree. We
2844 2844 * explicitly use the loaded segment weight (i.e. the range
2845 2845 * tree histogram) since it contains the space that is
2846 2846 * currently available for allocation and is accurate
2847 2847 * even within a sync pass.
2848 2848 */
2849 2849 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
2850 2850 uint64_t weight = metaslab_block_maxsize(msp);
2851 2851 WEIGHT_SET_SPACEBASED(weight);
2852 2852 metaslab_passivate(msp, weight);
2853 2853 } else {
2854 2854 metaslab_passivate(msp,
2855 2855 metaslab_weight_from_range_tree(msp));
2856 2856 }
2857 2857
2858 2858 /*
2859 2859 * We have just failed an allocation attempt, check
2860 2860 * that metaslab_should_allocate() agrees. Otherwise,
2861 2861 * we may end up in an infinite loop retrying the same
2862 2862 * metaslab.
2863 2863 */
2864 2864 ASSERT(!metaslab_should_allocate(msp, asize));
2865 2865 mutex_exit(&msp->ms_lock);
2866 2866 }
2867 2867 mutex_exit(&msp->ms_lock);
2868 2868 kmem_free(search, sizeof (*search));
2869 2869 return (offset);
2870 2870 }
2871 2871
2872 2872 static uint64_t
2873 2873 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
2874 2874 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2875 2875 {
2876 2876 uint64_t offset;
2877 2877 ASSERT(mg->mg_initialized);
2878 2878
2879 2879 offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
2880 2880 min_distance, dva, d);
2881 2881
2882 2882 mutex_enter(&mg->mg_lock);
2883 2883 if (offset == -1ULL) {
2884 2884 mg->mg_failed_allocations++;
2885 2885 metaslab_trace_add(zal, mg, NULL, asize, d,
2886 2886 TRACE_GROUP_FAILURE);
2887 2887 if (asize == SPA_GANGBLOCKSIZE) {
2888 2888 /*
2889 2889 * This metaslab group was unable to allocate
2890 2890 * the minimum gang block size so it must be out of
2891 2891 * space. We must notify the allocation throttle
2892 2892 * to start skipping allocation attempts to this
2893 2893 * metaslab group until more space becomes available.
2894 2894 * Note: this failure cannot be caused by the
2895 2895 * allocation throttle since the allocation throttle
2896 2896 * is only responsible for skipping devices and
2897 2897 * not failing block allocations.
2898 2898 */
2899 2899 mg->mg_no_free_space = B_TRUE;
2900 2900 }
2901 2901 }
2902 2902 mg->mg_allocations++;
2903 2903 mutex_exit(&mg->mg_lock);
2904 2904 return (offset);
2905 2905 }
2906 2906
2907 2907 /*
2908 2908 * If we have to write a ditto block (i.e. more than one DVA for a given BP)
2909 2909 * on the same vdev as an existing DVA of this BP, then try to allocate it
2910 2910 * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
2911 2911 * existing DVAs.
2912 2912 */
2913 2913 int ditto_same_vdev_distance_shift = 3;
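/*
 * For example (illustrative): on a 1TB top-level vdev the minimum ditto
 * distance is 1TB >> 3 = 128GB; when we are trying hard, the distance
 * requirement is dropped entirely (distance = 0).
 */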
2914 2914
2915 2915 /*
2916 2916 * Allocate a block for the specified i/o.
2917 2917 */
2918 2918 static int
2919 2919 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2920 2920 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
2921 2921 zio_alloc_list_t *zal)
2922 2922 {
2923 2923 metaslab_group_t *mg, *rotor;
2924 2924 vdev_t *vd;
2925 2925 boolean_t try_hard = B_FALSE;
2926 2926
2927 2927 ASSERT(!DVA_IS_VALID(&dva[d]));
2928 2928
2929 2929 /*
2930 2930 * For testing, make some blocks above a certain size be gang blocks.
2931 2931 */
2932 2932 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
2933 2933 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
2934 2934 return (SET_ERROR(ENOSPC));
2935 2935 }
2936 2936
2937 2937 /*
2938 2938 * Start at the rotor and loop through all mgs until we find something.
2939 2939 * Note that there's no locking on mc_rotor or mc_aliquot because
2940 2940 * nothing actually breaks if we miss a few updates -- we just won't
2941 2941 * allocate quite as evenly. It all balances out over time.
2942 2942 *
2943 2943 * If we are doing ditto or log blocks, try to spread them across
2944 2944 * consecutive vdevs. If we're forced to reuse a vdev before we've
2945 2945 * allocated all of our ditto blocks, then try and spread them out on
2946 2946 * that vdev as much as possible. If it turns out to not be possible,
2947 2947 * gradually lower our standards until anything becomes acceptable.
2948 2948 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
2949 2949 * gives us hope of containing our fault domains to something we're
2950 2950 * able to reason about. Otherwise, any two top-level vdev failures
2951 2951 * will guarantee the loss of data. With consecutive allocation,
2952 2952 * only two adjacent top-level vdev failures will result in data loss.
2953 2953 *
2954 2954 * If we are doing gang blocks (hintdva is non-NULL), try to keep
2955 2955 * ourselves on the same vdev as our gang block header. That
2956 2956 * way, we can hope for locality in vdev_cache, plus it makes our
2957 2957 * fault domains something tractable.
2958 2958 */
2959 2959 if (hintdva) {
2960 2960 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
2961 2961
2962 2962 /*
2963 2963 * It's possible the vdev we're using as the hint no
2964 2964 * longer exists (i.e. removed). Consult the rotor when
2965 2965 * all else fails.
2966 2966 */
2967 2967 if (vd != NULL) {
2968 2968 mg = vd->vdev_mg;
2969 2969
2970 2970 if (flags & METASLAB_HINTBP_AVOID &&
2971 2971 mg->mg_next != NULL)
2972 2972 mg = mg->mg_next;
2973 2973 } else {
2974 2974 mg = mc->mc_rotor;
2975 2975 }
2976 2976 } else if (d != 0) {
2977 2977 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
2978 2978 mg = vd->vdev_mg->mg_next;
2979 2979 } else {
2980 2980 mg = mc->mc_rotor;
2981 2981 }
2982 2982
2983 2983 /*
2984 2984 * If the hint put us into the wrong metaslab class, or into a
2985 2985 * metaslab group that has been passivated, just follow the rotor.
2986 2986 */
2987 2987 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
2988 2988 mg = mc->mc_rotor;
2989 2989
2990 2990 rotor = mg;
2991 2991 top:
2992 2992 do {
2993 2993 boolean_t allocatable;
2994 2994
2995 2995 ASSERT(mg->mg_activation_count == 1);
2996 2996 vd = mg->mg_vd;
2997 2997
2998 2998 /*
2999 2999 * Don't allocate from faulted devices.
3000 3000 */
3001 3001 if (try_hard) {
3002 3002 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
3003 3003 allocatable = vdev_allocatable(vd);
3004 3004 spa_config_exit(spa, SCL_ZIO, FTAG);
3005 3005 } else {
3006 3006 allocatable = vdev_allocatable(vd);
3007 3007 }
3008 3008
3009 3009 /*
3010 3010 * Determine if the selected metaslab group is eligible
3011 3011 * for allocations. If we're ganging then don't allow
3012 3012 * this metaslab group to skip allocations since that would
3013 3013 * inadvertently return ENOSPC and suspend the pool
3014 3014 * even though space is still available.
3015 3015 */
3016 3016 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3017 3017 allocatable = metaslab_group_allocatable(mg, rotor,
3018 3018 psize);
3019 3019 }
3020 3020
3021 3021 if (!allocatable) {
3022 3022 metaslab_trace_add(zal, mg, NULL, psize, d,
3023 3023 TRACE_NOT_ALLOCATABLE);
3024 3024 goto next;
3025 3025 }
3026 3026
3027 3027 ASSERT(mg->mg_initialized);
3028 3028
3029 3029 /*
3030 3030 * Avoid writing single-copy data to a failing,
3031 3031 * non-redundant vdev, unless we've already tried all
3032 3032 * other vdevs.
3033 3033 */
3034 3034 if ((vd->vdev_stat.vs_write_errors > 0 ||
3035 3035 vd->vdev_state < VDEV_STATE_HEALTHY) &&
3036 3036 d == 0 && !try_hard && vd->vdev_children == 0) {
3037 3037 metaslab_trace_add(zal, mg, NULL, psize, d,
3038 3038 TRACE_VDEV_ERROR);
3039 3039 goto next;
3040 3040 }
3041 3041
3042 3042 ASSERT(mg->mg_class == mc);
3043 3043
3044 3044 /*
3045 3045 * If we don't need to try hard, then require that the
3046 3046 * block be 1/8th of the device away from any other DVAs
3047 3047 * in this BP. If we are trying hard, allow any offset
3048 3048 * to be used (distance=0).
3049 3049 */
3050 3050 uint64_t distance = 0;
3051 3051 if (!try_hard) {
3052 3052 distance = vd->vdev_asize >>
3053 3053 ditto_same_vdev_distance_shift;
3054 3054 if (distance <= (1ULL << vd->vdev_ms_shift))
3055 3055 distance = 0;
3056 3056 }
3057 3057
3058 3058 uint64_t asize = vdev_psize_to_asize(vd, psize);
3059 3059 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3060 3060
3061 3061 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3062 3062 distance, dva, d);
3063 3063
3064 3064 if (offset != -1ULL) {
3065 3065 /*
3066 3066 * If we've just selected this metaslab group,
3067 3067 * figure out whether the corresponding vdev is
3068 3068 * over- or under-used relative to the pool,
3069 3069 * and set an allocation bias to even it out.
3070 3070 */
3071 3071 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3072 3072 vdev_stat_t *vs = &vd->vdev_stat;
3073 3073 int64_t vu, cu;
3074 3074
3075 3075 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
3076 3076 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
3077 3077
3078 3078 /*
3079 3079 * Calculate how much more or less we should
3080 3080 * try to allocate from this device during
3081 3081 * this iteration around the rotor.
3082 3082 * For example, if a device is 80% full
3083 3083 * and the pool is 20% full then we should
3084 3084 * reduce allocations by 60% on this device.
3085 3085 *
3086 3086 * mg_bias = (20 - 80) * 512K / 100 = -307K
3087 3087 *
3088 3088 * This reduces allocations by 307K for this
3089 3089 * iteration.
3090 3090 */
3091 3091 mg->mg_bias = ((cu - vu) *
3092 3092 (int64_t)mg->mg_aliquot) / 100;
3093 3093 } else if (!metaslab_bias_enabled) {
3094 3094 mg->mg_bias = 0;
3095 3095 }
3096 3096
3097 3097 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3098 3098 mg->mg_aliquot + mg->mg_bias) {
3099 3099 mc->mc_rotor = mg->mg_next;
3100 3100 mc->mc_aliquot = 0;
3101 3101 }
3102 3102
3103 3103 DVA_SET_VDEV(&dva[d], vd->vdev_id);
3104 3104 DVA_SET_OFFSET(&dva[d], offset);
3105 3105 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
3106 3106 DVA_SET_ASIZE(&dva[d], asize);
3107 3107
3108 3108 return (0);
3109 3109 }
3110 3110 next:
3111 3111 mc->mc_rotor = mg->mg_next;
3112 3112 mc->mc_aliquot = 0;
3113 3113 } while ((mg = mg->mg_next) != rotor);
3114 3114
3115 3115 /*
3116 3116 * If we haven't tried hard, do so now.
3117 3117 */
3118 3118 if (!try_hard) {
3119 3119 try_hard = B_TRUE;
3120 3120 goto top;
3121 3121 }
3122 3122
3123 3123 bzero(&dva[d], sizeof (dva_t));
3124 3124
3125 3125 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
3126 3126 return (SET_ERROR(ENOSPC));
3127 3127 }
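
The bias comment inside metaslab_alloc_dva() above works through one case (device 80% full, pool 20% full, 512K aliquot). Below is a minimal stand-alone sketch of that same arithmetic to confirm the -307K figure; metaslab_calc_bias() is a hypothetical helper invented for the illustration, not code from this change.

#include <assert.h>
#include <stdint.h>

/*
 * Stand-alone version of the mg_bias formula from the comment above:
 *	bias = (class_used% - vdev_used%) * aliquot / 100
 */
static int64_t
metaslab_calc_bias(int64_t vdev_used_pct, int64_t class_used_pct,
    int64_t aliquot)
{
	return ((class_used_pct - vdev_used_pct) * aliquot) / 100;
}

int
main(void)
{
	/* Device 80% full, pool 20% full, 512K aliquot. */
	int64_t bias = metaslab_calc_bias(80, 20, 512LL << 10);

	/* (20 - 80) * 512K / 100 = -314572 bytes, i.e. roughly -307K. */
	assert(bias == -314572);
	return (0);
}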
3128 3128
3129 3129 /*
3130 3130 * Free the block represented by DVA in the context of the specified
3131 3131 * transaction group.
3132 3132 */
3133 3133 static void
3134 3134 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
3135 3135 {
3136 3136 uint64_t vdev = DVA_GET_VDEV(dva);
3137 3137 uint64_t offset = DVA_GET_OFFSET(dva);
3138 3138 uint64_t size = DVA_GET_ASIZE(dva);
3139 3139 vdev_t *vd;
3140 3140 metaslab_t *msp;
3141 3141
3142 3142 ASSERT(DVA_IS_VALID(dva));
3143 3143
3144 3144 if (txg > spa_freeze_txg(spa))
3145 3145 return;
3146 3146
3147 3147 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3148 3148 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
3149 3149 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
3150 3150 (u_longlong_t)vdev, (u_longlong_t)offset);
3151 3151 ASSERT(0);
3152 3152 return;
3153 3153 }
3154 3154
3155 3155 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3156 3156
3157 3157 if (DVA_GET_GANG(dva))
3158 3158 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3159 3159
3160 3160 mutex_enter(&msp->ms_lock);
3161 3161
3162 3162 if (now) {
3163 3163 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
3164 3164 offset, size);
3165 3165
3166 3166 VERIFY(!msp->ms_condensing);
3167 3167 VERIFY3U(offset, >=, msp->ms_start);
3168 3168 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
3169 3169 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
3170 3170 msp->ms_size);
3171 3171 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3172 3172 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3173 3173 range_tree_add(msp->ms_tree, offset, size);
3174 3174 msp->ms_max_size = metaslab_block_maxsize(msp);
3175 3175 } else {
3176 3176 VERIFY3U(txg, ==, spa->spa_syncing_txg);
3177 3177 if (range_tree_space(msp->ms_freeingtree) == 0)
3178 3178 vdev_dirty(vd, VDD_METASLAB, msp, txg);
3179 3179 range_tree_add(msp->ms_freeingtree, offset, size);
3180 3180 }
3181 3181
3182 3182 mutex_exit(&msp->ms_lock);
3183 3183 }
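
metaslab_free_dva() indexes ms_alloctree[] with txg & TXG_MASK, so the per-txg trees form a small ring; TXG_SIZE is 4 (and TXG_MASK therefore 3) in the stock sys/txg.h, which is assumed by the constants copied into this tiny stand-alone sketch of the indexing.

#include <stdio.h>
#include <stdint.h>

/* Assumed values of TXG_SIZE/TXG_MASK from sys/txg.h. */
#define	EX_TXG_SIZE	4
#define	EX_TXG_MASK	(EX_TXG_SIZE - 1)

int
main(void)
{
	/*
	 * Consecutive txgs reuse a ring of EX_TXG_SIZE per-txg slots,
	 * which is how ms_alloctree[txg & TXG_MASK] is indexed above.
	 */
	for (uint64_t txg = 100; txg < 108; txg++)
		printf("txg %llu -> slot %llu\n",
		    (unsigned long long)txg,
		    (unsigned long long)(txg & EX_TXG_MASK));
	return (0);
}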
3184 3184
3185 3185 /*
3186 3186 * Intent log support: upon opening the pool after a crash, notify the SPA
3187 3187 * of blocks that the intent log has allocated for immediate write, but
3188 3188 * which are still considered free by the SPA because the last transaction
3189 3189 * group didn't commit yet.
3190 3190 */
3191 3191 static int
3192 3192 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3193 3193 {
3194 3194 uint64_t vdev = DVA_GET_VDEV(dva);
3195 3195 uint64_t offset = DVA_GET_OFFSET(dva);
3196 3196 uint64_t size = DVA_GET_ASIZE(dva);
3197 3197 vdev_t *vd;
3198 3198 metaslab_t *msp;
3199 3199 int error = 0;
3200 3200
3201 3201 ASSERT(DVA_IS_VALID(dva));
3202 3202
3203 3203 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3204 3204 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
3205 3205 return (SET_ERROR(ENXIO));
3206 3206
3207 3207 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3208 3208
3209 3209 if (DVA_GET_GANG(dva))
3210 3210 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3211 3211
3212 3212 mutex_enter(&msp->ms_lock);
3213 3213
3214 3214 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3215 3215 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
3216 3216
3217 3217 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
3218 3218 error = SET_ERROR(ENOENT);
3219 3219
3220 3220 if (error || txg == 0) { /* txg == 0 indicates dry run */
3221 3221 mutex_exit(&msp->ms_lock);
3222 3222 return (error);
3223 3223 }
3224 3224
3225 3225 VERIFY(!msp->ms_condensing);
3226 3226 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3227 3227 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3228 3228 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
3229 3229 range_tree_remove(msp->ms_tree, offset, size);
3230 3230
3231 3231 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
3232 3232 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
3233 3233 vdev_dirty(vd, VDD_METASLAB, msp, txg);
3234 3234 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
3235 3235 }
3236 3236
3237 3237 mutex_exit(&msp->ms_lock);
3238 3238
3239 3239 return (0);
3240 3240 }
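
Both metaslab_free_dva() and metaslab_claim_dva() recompute a gang header's size from SPA_GANGBLOCKSIZE (the 512-byte minimum block size) rather than trusting the DVA's asize, which describes the whole gang. As a rough illustration only: for a non-raidz top-level vdev, vdev_psize_to_asize() amounts to rounding up to the allocation unit, so a gang header consumes one full sector. The sketch below models just that case and ignores raidz parity overhead; ex_psize_to_asize() is an invented name.

#include <assert.h>
#include <stdint.h>

/*
 * Simplified model of vdev_psize_to_asize() for a non-raidz vdev:
 * round the physical size up to the allocation unit (1 << ashift).
 */
static uint64_t
ex_psize_to_asize(uint64_t psize, uint64_t ashift)
{
	uint64_t align = 1ULL << ashift;

	return ((psize + align - 1) & ~(align - 1));
}

int
main(void)
{
	uint64_t gangblocksize = 512;	/* SPA_GANGBLOCKSIZE */

	/* 512-byte sectors (ashift=9): the header fills one sector. */
	assert(ex_psize_to_asize(gangblocksize, 9) == 512);

	/* 4K sectors (ashift=12): it still consumes a full 4K block. */
	assert(ex_psize_to_asize(gangblocksize, 12) == 4096);
	return (0);
}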
3241 3241
3242 3242 /*
3243 3243 * Reserve some allocation slots. The reservation system must be called
3244 3244 * before we call into the allocator. If there aren't any available slots
3245 3245 * then the I/O will be throttled until an I/O completes and its slots are
3246 3246 * freed up. The function returns true if it was successful in placing
3247 3247 * the reservation.
3248 3248 */
3249 3249 boolean_t
3250 3250 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
3251 3251 int flags)
3252 3252 {
3253 3253 uint64_t available_slots = 0;
3254 3254 boolean_t slot_reserved = B_FALSE;
3255 3255
3256 3256 ASSERT(mc->mc_alloc_throttle_enabled);
3257 3257 mutex_enter(&mc->mc_lock);
3258 3258
3259 3259 uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
3260 3260 if (reserved_slots < mc->mc_alloc_max_slots)
3261 3261 available_slots = mc->mc_alloc_max_slots - reserved_slots;
3262 3262
3263 3263 if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3264 3264 /*
3265 3265 * We reserve the slots individually so that we can unreserve
3266 3266 * them individually when an I/O completes.
3267 3267 */
3268 3268 for (int d = 0; d < slots; d++) {
3269 3269 reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
3270 3270 }
3271 3271 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3272 3272 slot_reserved = B_TRUE;
3273 3273 }
3274 3274
3275 3275 mutex_exit(&mc->mc_lock);
3276 3276 return (slot_reserved);
3277 3277 }
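
As a sanity check on the reserve/unreserve pair above, here is a small user-space model of the slot accounting: a reservation succeeds while it fits under the maximum, gang allocations are let through regardless (mirroring the GANG_ALLOCATION() check), and completed I/Os hand their slots back. The names and types are invented for the sketch; the real code tracks each zio in a refcount_t rather than a bare counter.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct {
	uint64_t	used;	/* currently reserved slots */
	uint64_t	max;	/* analogue of mc_alloc_max_slots */
} ex_throttle_t;

static bool
ex_reserve(ex_throttle_t *t, int slots, bool is_gang)
{
	/* Gang allocations are reserved even if they exceed the limit. */
	if (t->used + slots <= t->max || is_gang) {
		t->used += slots;
		return (true);
	}
	return (false);
}

static void
ex_unreserve(ex_throttle_t *t, int slots)
{
	assert(t->used >= (uint64_t)slots);
	t->used -= slots;
}

int
main(void)
{
	ex_throttle_t t = { .used = 0, .max = 2 };

	assert(ex_reserve(&t, 2, false));	/* fits under the max */
	assert(!ex_reserve(&t, 1, false));	/* throttled */
	assert(ex_reserve(&t, 1, true));	/* gang bypasses the limit */
	ex_unreserve(&t, 3);
	return (0);
}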
3278 3278
3279 3279 void
3280 3280 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
3281 3281 {
3282 3282 ASSERT(mc->mc_alloc_throttle_enabled);
3283 3283 mutex_enter(&mc->mc_lock);
3284 3284 for (int d = 0; d < slots; d++) {
3285 3285 (void) refcount_remove(&mc->mc_alloc_slots, zio);
3286 3286 }
3287 3287 mutex_exit(&mc->mc_lock);
3288 3288 }
3289 3289
3290 3290 int
3291 3291 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
3292 3292 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
3293 3293 zio_alloc_list_t *zal, zio_t *zio)
3294 3294 {
3295 3295 dva_t *dva = bp->blk_dva;
3296 3296 dva_t *hintdva = hintbp->blk_dva;
3297 3297 int error = 0;
3298 3298
3299 3299 ASSERT(bp->blk_birth == 0);
3300 3300 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
3301 3301
3302 3302 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3303 3303
3304 3304 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
3305 3305 spa_config_exit(spa, SCL_ALLOC, FTAG);
3306 3306 return (SET_ERROR(ENOSPC));
3307 3307 }
3308 3308
3309 3309 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
3310 3310 ASSERT(BP_GET_NDVAS(bp) == 0);
3311 3311 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
3312 3312 ASSERT3P(zal, !=, NULL);
3313 3313
3314 3314 for (int d = 0; d < ndvas; d++) {
3315 3315 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
3316 3316 txg, flags, zal);
3317 3317 if (error != 0) {
3318 3318 for (d--; d >= 0; d--) {
3319 3319 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
3320 3320 metaslab_group_alloc_decrement(spa,
3321 3321 DVA_GET_VDEV(&dva[d]), zio, flags);
3322 3322 bzero(&dva[d], sizeof (dva_t));
3323 3323 }
3324 3324 spa_config_exit(spa, SCL_ALLOC, FTAG);
3325 3325 return (error);
3326 3326 } else {
3327 3327 /*
3328 3328 * Update the metaslab group's queue depth
3329 3329 * based on the newly allocated dva.
3330 3330 */
3331 3331 metaslab_group_alloc_increment(spa,
3332 3332 DVA_GET_VDEV(&dva[d]), zio, flags);
3333 3333 }
3334 3334
3335 3335 }
3336 3336 ASSERT(error == 0);
3337 3337 ASSERT(BP_GET_NDVAS(bp) == ndvas);
3338 3338
3339 3339 spa_config_exit(spa, SCL_ALLOC, FTAG);
3340 3340
3341 3341 BP_SET_BIRTH(bp, txg, txg);
3342 3342
3343 3343 return (0);
3344 3344 }
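
The DVA loop in metaslab_alloc() unwinds in reverse on failure so a partially filled block pointer is never returned to the caller. A stripped-down model of that idiom follows; ex_alloc_one() and ex_free_one() are stand-ins invented for the sketch.

#include <stdio.h>
#include <string.h>

#define	EX_NDVAS	3

static int
ex_alloc_one(int d, unsigned long long *dva)
{
	if (d == 2)
		return (-1);	/* simulate ENOSPC on the third copy */
	*dva = 0x1000ULL * (d + 1);
	return (0);
}

static void
ex_free_one(unsigned long long *dva)
{
	printf("freeing dva %#llx\n", *dva);
}

int
main(void)
{
	unsigned long long dva[EX_NDVAS] = { 0 };

	for (int d = 0; d < EX_NDVAS; d++) {
		if (ex_alloc_one(d, &dva[d]) != 0) {
			/* Undo the copies already placed, newest first. */
			for (d--; d >= 0; d--) {
				ex_free_one(&dva[d]);
				memset(&dva[d], 0, sizeof (dva[d]));
			}
			return (1);	/* propagate the error */
		}
	}
	return (0);
}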
3345 3345
3346 3346 void
3347 3347 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
3348 3348 {
3349 3349 const dva_t *dva = bp->blk_dva;
3350 3350 int ndvas = BP_GET_NDVAS(bp);
3351 3351
3352 3352 ASSERT(!BP_IS_HOLE(bp));
3353 3353 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
3354 3354
3355 3355 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
3356 3356
3357 3357 for (int d = 0; d < ndvas; d++)
3358 3358 metaslab_free_dva(spa, &dva[d], txg, now);
3359 3359
3360 3360 spa_config_exit(spa, SCL_FREE, FTAG);
3361 3361 }
3362 3362
3363 3363 int
3364 3364 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
3365 3365 {
3366 3366 const dva_t *dva = bp->blk_dva;
3367 3367 int ndvas = BP_GET_NDVAS(bp);
3368 3368 int error = 0;
3369 3369
3370 3370 ASSERT(!BP_IS_HOLE(bp));
3371 3371
3372 3372 if (txg != 0) {
3373 3373 /*
3374 3374 * First do a dry run to make sure all DVAs are claimable,
3375 3375 * so we don't have to unwind from partial failures below.
3376 3376 */
3377 3377 if ((error = metaslab_claim(spa, bp, 0)) != 0)
3378 3378 return (error);
3379 3379 }
3380 3380
3381 3381 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3382 3382
3383 3383 for (int d = 0; d < ndvas; d++)
3384 3384 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
3385 3385 break;
3386 3386
3387 3387 spa_config_exit(spa, SCL_ALLOC, FTAG);
3388 3388
3389 3389 ASSERT(error == 0 || txg == 0);
3390 3390
3391 3391 return (error);
3392 3392 }
3393 3393
3394 3394 void
3395 3395 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
3396 3396 {
3397 3397 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
3398 3398 return;
3399 3399
3400 3400 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3401 3401 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
3402 3402 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
3403 3403 vdev_t *vd = vdev_lookup_top(spa, vdev);
3404 3404 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
3405 3405 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
3406 3406 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3407 3407
3408 3408 if (msp->ms_loaded)
3409 3409 range_tree_verify(msp->ms_tree, offset, size);
3410 3410
3411 3411 range_tree_verify(msp->ms_freeingtree, offset, size);
3412 3412 range_tree_verify(msp->ms_freedtree, offset, size);
3413 3413 for (int j = 0; j < TXG_DEFER_SIZE; j++)
3414 3414 range_tree_verify(msp->ms_defertree[j], offset, size);
3415 3415 }
3416 3416 spa_config_exit(spa, SCL_VDEV, FTAG);
3417 3417 }
1742 lines elided