7938 Port ZOL #3712 disable LBA weighting on files and SSDs
--- old/usr/src/uts/common/fs/zfs/metaslab.c
+++ new/usr/src/uts/common/fs/zfs/metaslab.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
24 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 25 * Copyright (c) 2014 Integros [integros.com]
26 26 */
27 27
28 28 #include <sys/zfs_context.h>
29 29 #include <sys/dmu.h>
30 30 #include <sys/dmu_tx.h>
31 31 #include <sys/space_map.h>
32 32 #include <sys/metaslab_impl.h>
33 33 #include <sys/vdev_impl.h>
34 34 #include <sys/zio.h>
35 35 #include <sys/spa_impl.h>
36 36 #include <sys/zfeature.h>
37 37
38 38 #define GANG_ALLOCATION(flags) \
39 39 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
40 40
41 41 uint64_t metaslab_aliquot = 512ULL << 10;
42 42 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
43 43
44 44 /*
45 45 * The in-core space map representation is more compact than its on-disk form.
46 46 * The zfs_condense_pct determines how much more compact the in-core
47 47 * space map representation must be before we compact it on-disk.
48 48 * Values should be greater than or equal to 100.
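 * For example, with the default of 200 the on-disk space map must be
 * (roughly) at least twice the size it would be if rewritten optimally
 * from the in-core representation before we condense it.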
49 49 */
50 50 int zfs_condense_pct = 200;
51 51
52 52 /*
53 53 * Condensing a metaslab is not guaranteed to actually reduce the amount of
54 54 * space used on disk. In particular, a space map uses data in increments of
55 55 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
56 56 * same number of blocks after condensing. Since the goal of condensing is to
57 57 * reduce the number of IOPs required to read the space map, we only want to
58 58 * condense when we can be sure we will reduce the number of blocks used by the
59 59 * space map. Unfortunately, we cannot precisely compute whether or not this is
60 60 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
61 61 * we apply the following heuristic: do not condense a spacemap unless the
62 62 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
63 63 * blocks.
64 64 */
65 65 int zfs_metaslab_condense_block_threshold = 4;
66 66
67 67 /*
68 68 * The zfs_mg_noalloc_threshold defines which metaslab groups should
69 69 * be eligible for allocation. The value is defined as a percentage of
70 70 * free space. Metaslab groups that have more free space than
71 71 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
72 72 * a metaslab group's free space is less than or equal to the
73 73 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
74 74 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
75 75 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
76 76 * groups are allowed to accept allocations. Gang blocks are always
77 77 * eligible to allocate on any metaslab group. The default value of 0 means
78 78 * no metaslab group will be excluded based on this criterion.
79 79 */
80 80 int zfs_mg_noalloc_threshold = 0;
81 81
82 82 /*
83 83 * Metaslab groups are considered eligible for allocations if their
84 84 * fragmentation metric (measured as a percentage) is less than or equal to
85 85 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
86 86 * then it will be skipped unless all metaslab groups within the metaslab
87 87 * class have also crossed this threshold.
88 88 */
89 89 int zfs_mg_fragmentation_threshold = 85;
90 90
91 91 /*
92 92 * Allow metaslabs to keep their active state as long as their fragmentation
93 93 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
94 94 * active metaslab that exceeds this threshold will no longer keep its active
95 95 * status allowing better metaslabs to be selected.
96 96 */
97 97 int zfs_metaslab_fragmentation_threshold = 70;
98 98
99 99 /*
100 100 * When set, all metaslabs are loaded when the pool is first opened.
101 101 */
102 102 int metaslab_debug_load = 0;
103 103
104 104 /*
105 105 * When set, metaslabs are prevented from being unloaded.
106 106 */
107 107 int metaslab_debug_unload = 0;
108 108
109 109 /*
110 110 * Minimum size which forces the dynamic allocator to change
111 111 * its allocation strategy. Once the space map cannot satisfy
112 112 * an allocation of this size then it switches to using a more
113 113 * aggressive strategy (i.e. search by size rather than offset).
114 114 */
115 115 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
116 116
117 117 /*
118 118 * The minimum free space, in percent, which must be available
119 119 * in a space map to continue allocations in a first-fit fashion.
120 120 * Once the space map's free space drops below this level we dynamically
121 121 * switch to using best-fit allocations.
122 122 */
123 123 int metaslab_df_free_pct = 4;
124 124
125 125 /*
126 126 * A metaslab is considered "free" if it contains a contiguous
127 127 * segment which is greater than metaslab_min_alloc_size.
128 128 */
129 129 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
130 130
131 131 /*
132 132 * Percentage of all cpus that can be used by the metaslab taskq.
133 133 */
134 134 int metaslab_load_pct = 50;
135 135
136 136 /*
137 137 * Determines how many txgs a metaslab may remain loaded without having any
138 138 * allocations from it. As long as a metaslab continues to be used we will
139 139 * keep it loaded.
140 140 */
141 141 int metaslab_unload_delay = TXG_SIZE * 2;
142 142
143 143 /*
144 144 * Max number of metaslabs per group to preload.
145 145 */
146 146 int metaslab_preload_limit = SPA_DVAS_PER_BP;
147 147
148 148 /*
149 149 * Enable/disable preloading of metaslabs.
150 150 */
151 151 boolean_t metaslab_preload_enabled = B_TRUE;
152 152
153 153 /*
154 154 * Enable/disable fragmentation weighting on metaslabs.
155 155 */
156 156 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
157 157
158 158 /*
159 159 * Enable/disable lba weighting (i.e. outer tracks are given preference).
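 * Regardless of this setting, the weighting is skipped for non-rotational
 * vdevs (vdev_nonrot), i.e. files and SSDs; see metaslab_space_weight().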
160 160 */
161 161 boolean_t metaslab_lba_weighting_enabled = B_TRUE;
162 162
163 163 /*
164 164 * Enable/disable metaslab group biasing.
165 165 */
166 166 boolean_t metaslab_bias_enabled = B_TRUE;
167 167
168 168 /*
169 169 * Enable/disable segment-based metaslab selection.
170 170 */
171 171 boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
172 172
173 173 /*
174 174 * When using segment-based metaslab selection, we will continue
175 175 * allocating from the active metaslab until we have exhausted
176 176 * zfs_metaslab_switch_threshold of its buckets.
177 177 */
178 178 int zfs_metaslab_switch_threshold = 2;
179 179
180 180 /*
181 181 * Internal switch to enable/disable the metaslab allocation tracing
182 182 * facility.
183 183 */
184 184 boolean_t metaslab_trace_enabled = B_TRUE;
185 185
186 186 /*
187 187 * Maximum entries that the metaslab allocation tracing facility will keep
188 188 * in a given list when running in non-debug mode. We limit the number
189 189 * of entries in non-debug mode to prevent us from using up too much memory.
190 190 * The limit should be sufficiently large that we don't expect any allocation
191 191 * to ever exceed this value. In debug mode, the system will panic if this
192 192 * limit is ever reached, allowing for further investigation.
193 193 */
194 194 uint64_t metaslab_trace_max_entries = 5000;
195 195
196 196 static uint64_t metaslab_weight(metaslab_t *);
197 197 static void metaslab_set_fragmentation(metaslab_t *);
198 198
199 199 kmem_cache_t *metaslab_alloc_trace_cache;
200 200
201 201 /*
202 202 * ==========================================================================
203 203 * Metaslab classes
204 204 * ==========================================================================
205 205 */
206 206 metaslab_class_t *
207 207 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
208 208 {
209 209 metaslab_class_t *mc;
210 210
211 211 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
212 212
213 213 mc->mc_spa = spa;
214 214 mc->mc_rotor = NULL;
215 215 mc->mc_ops = ops;
216 216 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
217 217 refcount_create_tracked(&mc->mc_alloc_slots);
218 218
219 219 return (mc);
220 220 }
221 221
222 222 void
223 223 metaslab_class_destroy(metaslab_class_t *mc)
224 224 {
225 225 ASSERT(mc->mc_rotor == NULL);
226 226 ASSERT(mc->mc_alloc == 0);
227 227 ASSERT(mc->mc_deferred == 0);
228 228 ASSERT(mc->mc_space == 0);
229 229 ASSERT(mc->mc_dspace == 0);
230 230
231 231 refcount_destroy(&mc->mc_alloc_slots);
232 232 mutex_destroy(&mc->mc_lock);
233 233 kmem_free(mc, sizeof (metaslab_class_t));
234 234 }
235 235
236 236 int
237 237 metaslab_class_validate(metaslab_class_t *mc)
238 238 {
239 239 metaslab_group_t *mg;
240 240 vdev_t *vd;
241 241
242 242 /*
243 243 * Must hold one of the spa_config locks.
244 244 */
245 245 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
246 246 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
247 247
248 248 if ((mg = mc->mc_rotor) == NULL)
249 249 return (0);
250 250
251 251 do {
252 252 vd = mg->mg_vd;
253 253 ASSERT(vd->vdev_mg != NULL);
254 254 ASSERT3P(vd->vdev_top, ==, vd);
255 255 ASSERT3P(mg->mg_class, ==, mc);
256 256 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
257 257 } while ((mg = mg->mg_next) != mc->mc_rotor);
258 258
259 259 return (0);
260 260 }
261 261
262 262 void
263 263 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
264 264 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
265 265 {
266 266 atomic_add_64(&mc->mc_alloc, alloc_delta);
267 267 atomic_add_64(&mc->mc_deferred, defer_delta);
268 268 atomic_add_64(&mc->mc_space, space_delta);
269 269 atomic_add_64(&mc->mc_dspace, dspace_delta);
270 270 }
271 271
272 272 uint64_t
273 273 metaslab_class_get_alloc(metaslab_class_t *mc)
274 274 {
275 275 return (mc->mc_alloc);
276 276 }
277 277
278 278 uint64_t
279 279 metaslab_class_get_deferred(metaslab_class_t *mc)
280 280 {
281 281 return (mc->mc_deferred);
282 282 }
283 283
284 284 uint64_t
285 285 metaslab_class_get_space(metaslab_class_t *mc)
286 286 {
287 287 return (mc->mc_space);
288 288 }
289 289
290 290 uint64_t
291 291 metaslab_class_get_dspace(metaslab_class_t *mc)
292 292 {
293 293 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
294 294 }
295 295
296 296 void
297 297 metaslab_class_histogram_verify(metaslab_class_t *mc)
298 298 {
299 299 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
300 300 uint64_t *mc_hist;
301 301 int i;
302 302
303 303 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
304 304 return;
305 305
306 306 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
307 307 KM_SLEEP);
308 308
309 309 for (int c = 0; c < rvd->vdev_children; c++) {
310 310 vdev_t *tvd = rvd->vdev_child[c];
311 311 metaslab_group_t *mg = tvd->vdev_mg;
312 312
313 313 /*
314 314 * Skip any holes, uninitialized top-levels, or
315 315 * vdevs that are not in this metaslab class.
316 316 */
317 317 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
318 318 mg->mg_class != mc) {
319 319 continue;
320 320 }
321 321
322 322 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
323 323 mc_hist[i] += mg->mg_histogram[i];
324 324 }
325 325
326 326 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
327 327 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
328 328
329 329 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
330 330 }
331 331
332 332 /*
333 333 * Calculate the metaslab class's fragmentation metric. The metric
334 334 * is weighted based on the space contribution of each metaslab group.
335 335 * The return value will be a number between 0 and 100 (inclusive), or
336 336 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
337 337 * zfs_frag_table for more information about the metric.
338 338 */
339 339 uint64_t
340 340 metaslab_class_fragmentation(metaslab_class_t *mc)
341 341 {
342 342 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
343 343 uint64_t fragmentation = 0;
344 344
345 345 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
346 346
347 347 for (int c = 0; c < rvd->vdev_children; c++) {
348 348 vdev_t *tvd = rvd->vdev_child[c];
349 349 metaslab_group_t *mg = tvd->vdev_mg;
350 350
351 351 /*
352 352 * Skip any holes, uninitialized top-levels, or
353 353 * vdevs that are not in this metaslab class.
354 354 */
355 355 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
356 356 mg->mg_class != mc) {
357 357 continue;
358 358 }
359 359
360 360 /*
361 361 * If a metaslab group does not contain a fragmentation
362 362 * metric then just bail out.
363 363 */
364 364 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
365 365 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
366 366 return (ZFS_FRAG_INVALID);
367 367 }
368 368
369 369 /*
370 370 * Determine how much this metaslab_group is contributing
371 371 * to the overall pool fragmentation metric.
372 372 */
373 373 fragmentation += mg->mg_fragmentation *
374 374 metaslab_group_get_space(mg);
375 375 }
376 376 fragmentation /= metaslab_class_get_space(mc);
377 377
378 378 ASSERT3U(fragmentation, <=, 100);
379 379 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
380 380 return (fragmentation);
381 381 }
382 382
383 383 /*
384 384 * Calculate the amount of expandable space that is available in
385 385 * this metaslab class. If a device is expanded then its expandable
386 386 * space will be the amount of allocatable space that is currently not
387 387 * part of this metaslab class.
388 388 */
389 389 uint64_t
390 390 metaslab_class_expandable_space(metaslab_class_t *mc)
391 391 {
392 392 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
393 393 uint64_t space = 0;
394 394
395 395 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
396 396 for (int c = 0; c < rvd->vdev_children; c++) {
397 397 vdev_t *tvd = rvd->vdev_child[c];
398 398 metaslab_group_t *mg = tvd->vdev_mg;
399 399
400 400 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
401 401 mg->mg_class != mc) {
402 402 continue;
403 403 }
404 404
405 405 /*
406 406 * Calculate if we have enough space to add additional
407 407 * metaslabs. We report the expandable space in terms
408 408 * of the metaslab size since that's the unit of expansion.
409 409 */
410 410 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
411 411 1ULL << tvd->vdev_ms_shift);
412 412 }
413 413 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
414 414 return (space);
415 415 }
416 416
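/*
 * Comparison function for the per-group metaslab AVL tree: metaslabs are
 * ordered from highest to lowest weight, with ties broken by start offset,
 * so the most desirable allocation candidate sorts first.
 */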
417 417 static int
418 418 metaslab_compare(const void *x1, const void *x2)
419 419 {
420 420 const metaslab_t *m1 = x1;
421 421 const metaslab_t *m2 = x2;
422 422
423 423 if (m1->ms_weight < m2->ms_weight)
424 424 return (1);
425 425 if (m1->ms_weight > m2->ms_weight)
426 426 return (-1);
427 427
428 428 /*
429 429 * If the weights are identical, use the offset to force uniqueness.
430 430 */
431 431 if (m1->ms_start < m2->ms_start)
432 432 return (-1);
433 433 if (m1->ms_start > m2->ms_start)
434 434 return (1);
435 435
436 436 ASSERT3P(m1, ==, m2);
437 437
438 438 return (0);
439 439 }
440 440
441 441 /*
442 442 * Verify that the space accounting on disk matches the in-core range_trees.
443 443 */
444 444 void
445 445 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
446 446 {
447 447 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
448 448 uint64_t allocated = 0;
449 449 uint64_t sm_free_space, msp_free_space;
450 450
451 451 ASSERT(MUTEX_HELD(&msp->ms_lock));
452 452
453 453 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
454 454 return;
455 455
456 456 /*
457 457 * We can only verify the metaslab space when we're called
458 458 * from syncing context with a loaded metaslab that has an allocated
459 459 * space map. Calling this in non-syncing context does not
460 460 * provide a consistent view of the metaslab since we're performing
461 461 * allocations in the future.
462 462 */
463 463 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
464 464 !msp->ms_loaded)
465 465 return;
466 466
467 467 sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
468 468 space_map_alloc_delta(msp->ms_sm);
469 469
470 470 /*
471 471 * Account for future allocations since we would have already
472 472 * deducted that space from the ms_freetree.
473 473 */
474 474 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
475 475 allocated +=
476 476 range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
477 477 }
478 478
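	/*
	 * The space map's notion of free space must match the in-core view:
	 * the ms_tree plus the pending allocations accounted above, plus
	 * deferred frees and the space freed in this txg.
	 */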
479 479 msp_free_space = range_tree_space(msp->ms_tree) + allocated +
480 480 msp->ms_deferspace + range_tree_space(msp->ms_freedtree);
481 481
482 482 VERIFY3U(sm_free_space, ==, msp_free_space);
483 483 }
484 484
485 485 /*
486 486 * ==========================================================================
487 487 * Metaslab groups
488 488 * ==========================================================================
489 489 */
490 490 /*
491 491 * Update the allocatable flag and the metaslab group's capacity.
492 492 * The allocatable flag is set to true if the free capacity is above
493 493 * the zfs_mg_noalloc_threshold and the fragmentation value, if valid, is
494 494 * less than or equal to zfs_mg_fragmentation_threshold. If a metaslab group
495 495 * transitions from allocatable to non-allocatable or vice versa then the
496 496 * metaslab group's class is updated to reflect the transition.
497 497 */
498 498 static void
499 499 metaslab_group_alloc_update(metaslab_group_t *mg)
500 500 {
501 501 vdev_t *vd = mg->mg_vd;
502 502 metaslab_class_t *mc = mg->mg_class;
503 503 vdev_stat_t *vs = &vd->vdev_stat;
504 504 boolean_t was_allocatable;
505 505 boolean_t was_initialized;
506 506
507 507 ASSERT(vd == vd->vdev_top);
508 508
509 509 mutex_enter(&mg->mg_lock);
510 510 was_allocatable = mg->mg_allocatable;
511 511 was_initialized = mg->mg_initialized;
512 512
513 513 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
514 514 (vs->vs_space + 1);
515 515
516 516 mutex_enter(&mc->mc_lock);
517 517
518 518 /*
519 519 * If the metaslab group was just added then it won't
520 520 * have any space until we finish syncing out this txg.
521 521 * At that point we will consider it initialized and available
522 522 * for allocations. We also don't consider non-activated
523 523 * metaslab groups (e.g. vdevs that are in the middle of being removed)
524 524 * to be initialized, because they can't be used for allocation.
525 525 */
526 526 mg->mg_initialized = metaslab_group_initialized(mg);
527 527 if (!was_initialized && mg->mg_initialized) {
528 528 mc->mc_groups++;
529 529 } else if (was_initialized && !mg->mg_initialized) {
530 530 ASSERT3U(mc->mc_groups, >, 0);
531 531 mc->mc_groups--;
532 532 }
533 533 if (mg->mg_initialized)
534 534 mg->mg_no_free_space = B_FALSE;
535 535
536 536 /*
537 537 * A metaslab group is considered allocatable if it has plenty
538 538 * of free space or is not heavily fragmented. We only take
539 539 * fragmentation into account if the metaslab group has a valid
540 540 * fragmentation metric (i.e. a value between 0 and 100).
541 541 */
542 542 mg->mg_allocatable = (mg->mg_activation_count > 0 &&
543 543 mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
544 544 (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
545 545 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
546 546
547 547 /*
548 548 * The mc_alloc_groups maintains a count of the number of
549 549 * groups in this metaslab class that are still above the
550 550 * zfs_mg_noalloc_threshold. This is used by the allocating
551 551 * threads to determine if they should avoid allocations to
552 552 * a given group. The allocator will avoid allocations to a group
553 553 * if that group has reached or is below the zfs_mg_noalloc_threshold
554 554 * and there are still other groups that are above the threshold.
555 555 * When a group transitions from allocatable to non-allocatable or
556 556 * vice versa we update the metaslab class to reflect that change.
557 557 * When the mc_alloc_groups value drops to 0 that means that all
558 558 * groups have reached the zfs_mg_noalloc_threshold making all groups
559 559 * eligible for allocations. This effectively means that all devices
560 560 * are balanced again.
561 561 */
562 562 if (was_allocatable && !mg->mg_allocatable)
563 563 mc->mc_alloc_groups--;
564 564 else if (!was_allocatable && mg->mg_allocatable)
565 565 mc->mc_alloc_groups++;
566 566 mutex_exit(&mc->mc_lock);
567 567
568 568 mutex_exit(&mg->mg_lock);
569 569 }
570 570
571 571 metaslab_group_t *
572 572 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
573 573 {
574 574 metaslab_group_t *mg;
575 575
576 576 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
577 577 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
578 578 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
579 579 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
580 580 mg->mg_vd = vd;
581 581 mg->mg_class = mc;
582 582 mg->mg_activation_count = 0;
583 583 mg->mg_initialized = B_FALSE;
584 584 mg->mg_no_free_space = B_TRUE;
585 585 refcount_create_tracked(&mg->mg_alloc_queue_depth);
586 586
587 587 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
588 588 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
589 589
590 590 return (mg);
591 591 }
592 592
593 593 void
594 594 metaslab_group_destroy(metaslab_group_t *mg)
595 595 {
596 596 ASSERT(mg->mg_prev == NULL);
597 597 ASSERT(mg->mg_next == NULL);
598 598 /*
599 599 * We may have gone below zero with the activation count
600 600 * either because we never activated in the first place or
601 601 * because we're done, and possibly removing the vdev.
602 602 */
603 603 ASSERT(mg->mg_activation_count <= 0);
604 604
605 605 taskq_destroy(mg->mg_taskq);
606 606 avl_destroy(&mg->mg_metaslab_tree);
607 607 mutex_destroy(&mg->mg_lock);
608 608 refcount_destroy(&mg->mg_alloc_queue_depth);
609 609 kmem_free(mg, sizeof (metaslab_group_t));
610 610 }
611 611
612 612 void
613 613 metaslab_group_activate(metaslab_group_t *mg)
614 614 {
615 615 metaslab_class_t *mc = mg->mg_class;
616 616 metaslab_group_t *mgprev, *mgnext;
617 617
618 618 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
619 619
620 620 ASSERT(mc->mc_rotor != mg);
621 621 ASSERT(mg->mg_prev == NULL);
622 622 ASSERT(mg->mg_next == NULL);
623 623 ASSERT(mg->mg_activation_count <= 0);
624 624
625 625 if (++mg->mg_activation_count <= 0)
626 626 return;
627 627
628 628 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
629 629 metaslab_group_alloc_update(mg);
630 630
631 631 if ((mgprev = mc->mc_rotor) == NULL) {
632 632 mg->mg_prev = mg;
633 633 mg->mg_next = mg;
634 634 } else {
635 635 mgnext = mgprev->mg_next;
636 636 mg->mg_prev = mgprev;
637 637 mg->mg_next = mgnext;
638 638 mgprev->mg_next = mg;
639 639 mgnext->mg_prev = mg;
640 640 }
641 641 mc->mc_rotor = mg;
642 642 }
643 643
644 644 void
645 645 metaslab_group_passivate(metaslab_group_t *mg)
646 646 {
647 647 metaslab_class_t *mc = mg->mg_class;
648 648 metaslab_group_t *mgprev, *mgnext;
649 649
650 650 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
651 651
652 652 if (--mg->mg_activation_count != 0) {
653 653 ASSERT(mc->mc_rotor != mg);
654 654 ASSERT(mg->mg_prev == NULL);
655 655 ASSERT(mg->mg_next == NULL);
656 656 ASSERT(mg->mg_activation_count < 0);
657 657 return;
658 658 }
659 659
660 660 taskq_wait(mg->mg_taskq);
661 661 metaslab_group_alloc_update(mg);
662 662
663 663 mgprev = mg->mg_prev;
664 664 mgnext = mg->mg_next;
665 665
666 666 if (mg == mgnext) {
667 667 mc->mc_rotor = NULL;
668 668 } else {
669 669 mc->mc_rotor = mgnext;
670 670 mgprev->mg_next = mgnext;
671 671 mgnext->mg_prev = mgprev;
672 672 }
673 673
674 674 mg->mg_prev = NULL;
675 675 mg->mg_next = NULL;
676 676 }
677 677
678 678 boolean_t
679 679 metaslab_group_initialized(metaslab_group_t *mg)
680 680 {
681 681 vdev_t *vd = mg->mg_vd;
682 682 vdev_stat_t *vs = &vd->vdev_stat;
683 683
684 684 return (vs->vs_space != 0 && mg->mg_activation_count > 0);
685 685 }
686 686
687 687 uint64_t
688 688 metaslab_group_get_space(metaslab_group_t *mg)
689 689 {
690 690 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
691 691 }
692 692
693 693 void
694 694 metaslab_group_histogram_verify(metaslab_group_t *mg)
695 695 {
696 696 uint64_t *mg_hist;
697 697 vdev_t *vd = mg->mg_vd;
698 698 uint64_t ashift = vd->vdev_ashift;
699 699 int i;
700 700
701 701 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
702 702 return;
703 703
704 704 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
705 705 KM_SLEEP);
706 706
707 707 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
708 708 SPACE_MAP_HISTOGRAM_SIZE + ashift);
709 709
710 710 for (int m = 0; m < vd->vdev_ms_count; m++) {
711 711 metaslab_t *msp = vd->vdev_ms[m];
712 712
713 713 if (msp->ms_sm == NULL)
714 714 continue;
715 715
716 716 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
717 717 mg_hist[i + ashift] +=
718 718 msp->ms_sm->sm_phys->smp_histogram[i];
719 719 }
720 720
721 721 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
722 722 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
723 723
724 724 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
725 725 }
726 726
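/*
 * Space map histograms start at the vdev's ashift, so bucket i of
 * smp_histogram (segments of roughly 2^(i + ashift) bytes) is accumulated
 * into bucket (i + ashift) of the group and class histograms.
 */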
727 727 static void
728 728 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
729 729 {
730 730 metaslab_class_t *mc = mg->mg_class;
731 731 uint64_t ashift = mg->mg_vd->vdev_ashift;
732 732
733 733 ASSERT(MUTEX_HELD(&msp->ms_lock));
734 734 if (msp->ms_sm == NULL)
735 735 return;
736 736
737 737 mutex_enter(&mg->mg_lock);
738 738 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
739 739 mg->mg_histogram[i + ashift] +=
740 740 msp->ms_sm->sm_phys->smp_histogram[i];
741 741 mc->mc_histogram[i + ashift] +=
742 742 msp->ms_sm->sm_phys->smp_histogram[i];
743 743 }
744 744 mutex_exit(&mg->mg_lock);
745 745 }
746 746
747 747 void
748 748 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
749 749 {
750 750 metaslab_class_t *mc = mg->mg_class;
751 751 uint64_t ashift = mg->mg_vd->vdev_ashift;
752 752
753 753 ASSERT(MUTEX_HELD(&msp->ms_lock));
754 754 if (msp->ms_sm == NULL)
755 755 return;
756 756
757 757 mutex_enter(&mg->mg_lock);
758 758 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
759 759 ASSERT3U(mg->mg_histogram[i + ashift], >=,
760 760 msp->ms_sm->sm_phys->smp_histogram[i]);
761 761 ASSERT3U(mc->mc_histogram[i + ashift], >=,
762 762 msp->ms_sm->sm_phys->smp_histogram[i]);
763 763
764 764 mg->mg_histogram[i + ashift] -=
765 765 msp->ms_sm->sm_phys->smp_histogram[i];
766 766 mc->mc_histogram[i + ashift] -=
767 767 msp->ms_sm->sm_phys->smp_histogram[i];
768 768 }
769 769 mutex_exit(&mg->mg_lock);
770 770 }
771 771
772 772 static void
773 773 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
774 774 {
775 775 ASSERT(msp->ms_group == NULL);
776 776 mutex_enter(&mg->mg_lock);
777 777 msp->ms_group = mg;
778 778 msp->ms_weight = 0;
779 779 avl_add(&mg->mg_metaslab_tree, msp);
780 780 mutex_exit(&mg->mg_lock);
781 781
782 782 mutex_enter(&msp->ms_lock);
783 783 metaslab_group_histogram_add(mg, msp);
784 784 mutex_exit(&msp->ms_lock);
785 785 }
786 786
787 787 static void
788 788 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
789 789 {
790 790 mutex_enter(&msp->ms_lock);
791 791 metaslab_group_histogram_remove(mg, msp);
792 792 mutex_exit(&msp->ms_lock);
793 793
794 794 mutex_enter(&mg->mg_lock);
795 795 ASSERT(msp->ms_group == mg);
796 796 avl_remove(&mg->mg_metaslab_tree, msp);
797 797 msp->ms_group = NULL;
798 798 mutex_exit(&mg->mg_lock);
799 799 }
800 800
801 801 static void
802 802 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
803 803 {
804 804 /*
805 805 * Although in principle the weight can be any value, in
806 806 * practice we do not use values in the range [1, 511].
807 807 */
808 808 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
809 809 ASSERT(MUTEX_HELD(&msp->ms_lock));
810 810
811 811 mutex_enter(&mg->mg_lock);
812 812 ASSERT(msp->ms_group == mg);
813 813 avl_remove(&mg->mg_metaslab_tree, msp);
814 814 msp->ms_weight = weight;
815 815 avl_add(&mg->mg_metaslab_tree, msp);
816 816 mutex_exit(&mg->mg_lock);
817 817 }
818 818
819 819 /*
820 820 * Calculate the fragmentation for a given metaslab group. We can use
821 821 * a simple average here since all metaslabs within the group must have
822 822 * the same size. The return value will be a value between 0 and 100
823 823 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
824 824 * group have a fragmentation metric.
825 825 */
826 826 uint64_t
827 827 metaslab_group_fragmentation(metaslab_group_t *mg)
828 828 {
829 829 vdev_t *vd = mg->mg_vd;
830 830 uint64_t fragmentation = 0;
831 831 uint64_t valid_ms = 0;
832 832
833 833 for (int m = 0; m < vd->vdev_ms_count; m++) {
834 834 metaslab_t *msp = vd->vdev_ms[m];
835 835
836 836 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
837 837 continue;
838 838
839 839 valid_ms++;
840 840 fragmentation += msp->ms_fragmentation;
841 841 }
842 842
843 843 if (valid_ms <= vd->vdev_ms_count / 2)
844 844 return (ZFS_FRAG_INVALID);
845 845
846 846 fragmentation /= valid_ms;
847 847 ASSERT3U(fragmentation, <=, 100);
848 848 return (fragmentation);
849 849 }
850 850
851 851 /*
852 852 * Determine if a given metaslab group should skip allocations. A metaslab
853 853 * group should avoid allocations if its free capacity is less than the
854 854 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
855 855 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
856 856 * that can still handle allocations. If the allocation throttle is enabled
857 857 * then we skip allocations to devices that have reached their maximum
858 858 * allocation queue depth unless the selected metaslab group is the only
859 859 * eligible group remaining.
860 860 */
861 861 static boolean_t
862 862 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
863 863 uint64_t psize)
864 864 {
865 865 spa_t *spa = mg->mg_vd->vdev_spa;
866 866 metaslab_class_t *mc = mg->mg_class;
867 867
868 868 /*
869 869 * We can only consider skipping this metaslab group if it's
870 870 * in the normal metaslab class and there are other metaslab
871 871 * groups to select from. Otherwise, we always consider it eligible
872 872 * for allocations.
873 873 */
874 874 if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
875 875 return (B_TRUE);
876 876
877 877 /*
878 878 * If the metaslab group's mg_allocatable flag is set (see comments
879 879 * in metaslab_group_alloc_update() for more information) and
880 880 * the allocation throttle is disabled then allow allocations to this
881 881 * device. However, if the allocation throttle is enabled then
882 882 * check if we have reached our allocation limit (mg_alloc_queue_depth)
883 883 * to determine if we should allow allocations to this metaslab group.
884 884 * If all metaslab groups are no longer considered allocatable
885 885 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
886 886 * gang block size then we allow allocations on this metaslab group
887 887 * regardless of the mg_allocatable or throttle settings.
888 888 */
889 889 if (mg->mg_allocatable) {
890 890 metaslab_group_t *mgp;
891 891 int64_t qdepth;
892 892 uint64_t qmax = mg->mg_max_alloc_queue_depth;
893 893
894 894 if (!mc->mc_alloc_throttle_enabled)
895 895 return (B_TRUE);
896 896
897 897 /*
898 898 * If this metaslab group does not have any free space, then
899 899 * there is no point in looking further.
900 900 */
901 901 if (mg->mg_no_free_space)
902 902 return (B_FALSE);
903 903
904 904 qdepth = refcount_count(&mg->mg_alloc_queue_depth);
905 905
906 906 /*
907 907 * If this metaslab group is below its qmax or it's
908 908 * the only allocatable metaslab group, then attempt
909 909 * to allocate from it.
910 910 */
911 911 if (qdepth < qmax || mc->mc_alloc_groups == 1)
912 912 return (B_TRUE);
913 913 ASSERT3U(mc->mc_alloc_groups, >, 1);
914 914
915 915 /*
916 916 * Since this metaslab group is at or over its qmax, we
917 917 * need to determine if there are metaslab groups after this
918 918 * one that might be able to handle this allocation. This is
919 919 * racy since we can't hold the locks for all metaslab
920 920 * groups at the same time when we make this check.
921 921 */
922 922 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
923 923 qmax = mgp->mg_max_alloc_queue_depth;
924 924
925 925 qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
926 926
927 927 /*
928 928 * If there is another metaslab group that
929 929 * might be able to handle the allocation, then
930 930 * we return false so that we skip this group.
931 931 */
932 932 if (qdepth < qmax && !mgp->mg_no_free_space)
933 933 return (B_FALSE);
934 934 }
935 935
936 936 /*
937 937 * We didn't find another group to handle the allocation
938 938 * so we can't skip this metaslab group even though
939 939 * we are at or over our qmax.
940 940 */
941 941 return (B_TRUE);
942 942
943 943 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
944 944 return (B_TRUE);
945 945 }
946 946 return (B_FALSE);
947 947 }
948 948
949 949 /*
950 950 * ==========================================================================
951 951 * Range tree callbacks
952 952 * ==========================================================================
953 953 */
954 954
955 955 /*
956 956 * Comparison function for the private size-ordered tree. Tree is sorted
957 957 * by size, larger sizes at the end of the tree.
958 958 */
959 959 static int
960 960 metaslab_rangesize_compare(const void *x1, const void *x2)
961 961 {
962 962 const range_seg_t *r1 = x1;
963 963 const range_seg_t *r2 = x2;
964 964 uint64_t rs_size1 = r1->rs_end - r1->rs_start;
965 965 uint64_t rs_size2 = r2->rs_end - r2->rs_start;
966 966
967 967 if (rs_size1 < rs_size2)
968 968 return (-1);
969 969 if (rs_size1 > rs_size2)
970 970 return (1);
971 971
972 972 if (r1->rs_start < r2->rs_start)
973 973 return (-1);
974 974
975 975 if (r1->rs_start > r2->rs_start)
976 976 return (1);
977 977
978 978 return (0);
979 979 }
980 980
981 981 /*
982 982 * Create any block allocator specific components. The current allocators
983 983 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
984 984 */
985 985 static void
986 986 metaslab_rt_create(range_tree_t *rt, void *arg)
987 987 {
988 988 metaslab_t *msp = arg;
989 989
990 990 ASSERT3P(rt->rt_arg, ==, msp);
991 991 ASSERT(msp->ms_tree == NULL);
992 992
993 993 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
994 994 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
995 995 }
996 996
997 997 /*
998 998 * Destroy the block allocator specific components.
999 999 */
1000 1000 static void
1001 1001 metaslab_rt_destroy(range_tree_t *rt, void *arg)
1002 1002 {
1003 1003 metaslab_t *msp = arg;
1004 1004
1005 1005 ASSERT3P(rt->rt_arg, ==, msp);
1006 1006 ASSERT3P(msp->ms_tree, ==, rt);
1007 1007 ASSERT0(avl_numnodes(&msp->ms_size_tree));
1008 1008
1009 1009 avl_destroy(&msp->ms_size_tree);
1010 1010 }
1011 1011
1012 1012 static void
1013 1013 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
1014 1014 {
1015 1015 metaslab_t *msp = arg;
1016 1016
1017 1017 ASSERT3P(rt->rt_arg, ==, msp);
1018 1018 ASSERT3P(msp->ms_tree, ==, rt);
1019 1019 VERIFY(!msp->ms_condensing);
1020 1020 avl_add(&msp->ms_size_tree, rs);
1021 1021 }
1022 1022
1023 1023 static void
1024 1024 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
1025 1025 {
1026 1026 metaslab_t *msp = arg;
1027 1027
1028 1028 ASSERT3P(rt->rt_arg, ==, msp);
1029 1029 ASSERT3P(msp->ms_tree, ==, rt);
1030 1030 VERIFY(!msp->ms_condensing);
1031 1031 avl_remove(&msp->ms_size_tree, rs);
1032 1032 }
1033 1033
1034 1034 static void
1035 1035 metaslab_rt_vacate(range_tree_t *rt, void *arg)
1036 1036 {
1037 1037 metaslab_t *msp = arg;
1038 1038
1039 1039 ASSERT3P(rt->rt_arg, ==, msp);
1040 1040 ASSERT3P(msp->ms_tree, ==, rt);
1041 1041
1042 1042 /*
1043 1043 * Normally one would walk the tree freeing nodes along the way.
1044 1044 * Since the nodes are shared with the range trees we can avoid
1045 1045 * walking all nodes and just reinitialize the avl tree. The nodes
1046 1046 * will be freed by the range tree, so we don't want to free them here.
1047 1047 */
1048 1048 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
1049 1049 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1050 1050 }
1051 1051
1052 1052 static range_tree_ops_t metaslab_rt_ops = {
1053 1053 metaslab_rt_create,
1054 1054 metaslab_rt_destroy,
1055 1055 metaslab_rt_add,
1056 1056 metaslab_rt_remove,
1057 1057 metaslab_rt_vacate
1058 1058 };
1059 1059
1060 1060 /*
1061 1061 * ==========================================================================
1062 1062 * Common allocator routines
1063 1063 * ==========================================================================
1064 1064 */
1065 1065
1066 1066 /*
1067 1067 * Return the maximum contiguous segment within the metaslab.
1068 1068 */
1069 1069 uint64_t
1070 1070 metaslab_block_maxsize(metaslab_t *msp)
1071 1071 {
1072 1072 avl_tree_t *t = &msp->ms_size_tree;
1073 1073 range_seg_t *rs;
1074 1074
1075 1075 if (t == NULL || (rs = avl_last(t)) == NULL)
1076 1076 return (0ULL);
1077 1077
1078 1078 return (rs->rs_end - rs->rs_start);
1079 1079 }
1080 1080
1081 1081 static range_seg_t *
1082 1082 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1083 1083 {
1084 1084 range_seg_t *rs, rsearch;
1085 1085 avl_index_t where;
1086 1086
1087 1087 rsearch.rs_start = start;
1088 1088 rsearch.rs_end = start + size;
1089 1089
1090 1090 rs = avl_find(t, &rsearch, &where);
1091 1091 if (rs == NULL) {
1092 1092 rs = avl_nearest(t, where, AVL_AFTER);
1093 1093 }
1094 1094
1095 1095 return (rs);
1096 1096 }
1097 1097
1098 1098 /*
1099 1099 * This is a helper function that can be used by the allocator to find
1100 1100 * a suitable block to allocate. This will search the specified AVL
1101 1101 * tree looking for a block that matches the specified criteria.
1102 1102 */
1103 1103 static uint64_t
1104 1104 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1105 1105 uint64_t align)
1106 1106 {
1107 1107 range_seg_t *rs = metaslab_block_find(t, *cursor, size);
1108 1108
1109 1109 while (rs != NULL) {
1110 1110 uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1111 1111
1112 1112 if (offset + size <= rs->rs_end) {
1113 1113 *cursor = offset + size;
1114 1114 return (offset);
1115 1115 }
1116 1116 rs = AVL_NEXT(t, rs);
1117 1117 }
1118 1118
1119 1119 /*
1120 1120 * If we know we've searched the whole map (*cursor == 0), give up.
1121 1121 * Otherwise, reset the cursor to the beginning and try again.
1122 1122 */
1123 1123 if (*cursor == 0)
1124 1124 return (-1ULL);
1125 1125
1126 1126 *cursor = 0;
1127 1127 return (metaslab_block_picker(t, cursor, size, align));
1128 1128 }
1129 1129
1130 1130 /*
1131 1131 * ==========================================================================
1132 1132 * The first-fit block allocator
1133 1133 * ==========================================================================
1134 1134 */
1135 1135 static uint64_t
1136 1136 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1137 1137 {
1138 1138 /*
1139 1139 * Find the largest power of 2 block size that evenly divides the
1140 1140 * requested size. This is used to try to allocate blocks with similar
1141 1141 * alignment from the same area of the metaslab (i.e. same cursor
1142 1142 * bucket), but it does not guarantee that allocations of other sizes
1143 1143 * will not occur in the same region.
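 * For example, a 12K (0x3000) request yields align = 4K (size & -size),
 * so it shares a cursor bucket with other 4K-aligned allocations.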
1144 1144 */
1145 1145 uint64_t align = size & -size;
1146 1146 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1147 1147 avl_tree_t *t = &msp->ms_tree->rt_root;
1148 1148
1149 1149 return (metaslab_block_picker(t, cursor, size, align));
1150 1150 }
1151 1151
1152 1152 static metaslab_ops_t metaslab_ff_ops = {
1153 1153 metaslab_ff_alloc
1154 1154 };
1155 1155
1156 1156 /*
1157 1157 * ==========================================================================
1158 1158 * Dynamic block allocator -
1159 1159 * Uses the first-fit allocation scheme until space gets low and then
1160 1160 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1161 1161 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
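 * With the default tunables this means switching to best-fit once the
 * largest free segment is smaller than 128K (SPA_OLD_MAXBLOCKSIZE) or
 * less than 4% of the metaslab is free.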
1162 1162 * ==========================================================================
1163 1163 */
1164 1164 static uint64_t
1165 1165 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1166 1166 {
1167 1167 /*
1168 1168 * Find the largest power of 2 block size that evenly divides the
1169 1169 * requested size. This is used to try to allocate blocks with similar
1170 1170 * alignment from the same area of the metaslab (i.e. same cursor
1171 1171 * bucket), but it does not guarantee that allocations of other sizes
1172 1172 * will not occur in the same region.
1173 1173 */
1174 1174 uint64_t align = size & -size;
1175 1175 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1176 1176 range_tree_t *rt = msp->ms_tree;
1177 1177 avl_tree_t *t = &rt->rt_root;
1178 1178 uint64_t max_size = metaslab_block_maxsize(msp);
1179 1179 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1180 1180
1181 1181 ASSERT(MUTEX_HELD(&msp->ms_lock));
1182 1182 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1183 1183
1184 1184 if (max_size < size)
1185 1185 return (-1ULL);
1186 1186
1187 1187 /*
1188 1188 * If we're running low on space switch to using the size
1189 1189 * sorted AVL tree (best-fit).
1190 1190 */
1191 1191 if (max_size < metaslab_df_alloc_threshold ||
1192 1192 free_pct < metaslab_df_free_pct) {
1193 1193 t = &msp->ms_size_tree;
1194 1194 *cursor = 0;
1195 1195 }
1196 1196
1197 1197 return (metaslab_block_picker(t, cursor, size, 1ULL));
1198 1198 }
1199 1199
1200 1200 static metaslab_ops_t metaslab_df_ops = {
1201 1201 metaslab_df_alloc
1202 1202 };
1203 1203
1204 1204 /*
1205 1205 * ==========================================================================
1206 1206 * Cursor fit block allocator -
1207 1207 * Select the largest region in the metaslab, set the cursor to the beginning
1208 1208 * of the range and the cursor_end to the end of the range. As allocations
1209 1209 * are made advance the cursor. Continue allocating from the cursor until
1210 1210 * the range is exhausted and then find a new range.
1211 1211 * ==========================================================================
1212 1212 */
1213 1213 static uint64_t
1214 1214 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1215 1215 {
1216 1216 range_tree_t *rt = msp->ms_tree;
1217 1217 avl_tree_t *t = &msp->ms_size_tree;
1218 1218 uint64_t *cursor = &msp->ms_lbas[0];
1219 1219 uint64_t *cursor_end = &msp->ms_lbas[1];
1220 1220 uint64_t offset = 0;
1221 1221
1222 1222 ASSERT(MUTEX_HELD(&msp->ms_lock));
1223 1223 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1224 1224
1225 1225 ASSERT3U(*cursor_end, >=, *cursor);
1226 1226
1227 1227 if ((*cursor + size) > *cursor_end) {
1228 1228 range_seg_t *rs;
1229 1229
1230 1230 rs = avl_last(&msp->ms_size_tree);
1231 1231 if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1232 1232 return (-1ULL);
1233 1233
1234 1234 *cursor = rs->rs_start;
1235 1235 *cursor_end = rs->rs_end;
1236 1236 }
1237 1237
1238 1238 offset = *cursor;
1239 1239 *cursor += size;
1240 1240
1241 1241 return (offset);
1242 1242 }
1243 1243
1244 1244 static metaslab_ops_t metaslab_cf_ops = {
1245 1245 metaslab_cf_alloc
1246 1246 };
1247 1247
1248 1248 /*
1249 1249 * ==========================================================================
1250 1250 * New dynamic fit allocator -
1251 1251 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1252 1252 * contiguous blocks. If no region is found then just use the largest segment
1253 1253 * that remains.
1254 1254 * ==========================================================================
1255 1255 */
1256 1256
1257 1257 /*
1258 1258 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1259 1259 * to request from the allocator.
1260 1260 */
1261 1261 uint64_t metaslab_ndf_clump_shift = 4;
1262 1262
1263 1263 static uint64_t
1264 1264 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1265 1265 {
1266 1266 avl_tree_t *t = &msp->ms_tree->rt_root;
1267 1267 avl_index_t where;
1268 1268 range_seg_t *rs, rsearch;
1269 1269 uint64_t hbit = highbit64(size);
1270 1270 uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1271 1271 uint64_t max_size = metaslab_block_maxsize(msp);
1272 1272
1273 1273 ASSERT(MUTEX_HELD(&msp->ms_lock));
1274 1274 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1275 1275
1276 1276 if (max_size < size)
1277 1277 return (-1ULL);
1278 1278
1279 1279 rsearch.rs_start = *cursor;
1280 1280 rsearch.rs_end = *cursor + size;
1281 1281
1282 1282 rs = avl_find(t, &rsearch, &where);
1283 1283 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1284 1284 t = &msp->ms_size_tree;
1285 1285
1286 1286 rsearch.rs_start = 0;
1287 1287 rsearch.rs_end = MIN(max_size,
1288 1288 1ULL << (hbit + metaslab_ndf_clump_shift));
1289 1289 rs = avl_find(t, &rsearch, &where);
1290 1290 if (rs == NULL)
1291 1291 rs = avl_nearest(t, where, AVL_AFTER);
1292 1292 ASSERT(rs != NULL);
1293 1293 }
1294 1294
1295 1295 if ((rs->rs_end - rs->rs_start) >= size) {
1296 1296 *cursor = rs->rs_start + size;
1297 1297 return (rs->rs_start);
1298 1298 }
1299 1299 return (-1ULL);
1300 1300 }
1301 1301
1302 1302 static metaslab_ops_t metaslab_ndf_ops = {
1303 1303 metaslab_ndf_alloc
1304 1304 };
1305 1305
1306 1306 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1307 1307
1308 1308 /*
1309 1309 * ==========================================================================
1310 1310 * Metaslabs
1311 1311 * ==========================================================================
1312 1312 */
1313 1313
1314 1314 /*
1315 1315 * Wait for any in-progress metaslab loads to complete.
1316 1316 */
1317 1317 void
1318 1318 metaslab_load_wait(metaslab_t *msp)
1319 1319 {
1320 1320 ASSERT(MUTEX_HELD(&msp->ms_lock));
1321 1321
1322 1322 while (msp->ms_loading) {
1323 1323 ASSERT(!msp->ms_loaded);
1324 1324 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1325 1325 }
1326 1326 }
1327 1327
1328 1328 int
1329 1329 metaslab_load(metaslab_t *msp)
1330 1330 {
1331 1331 int error = 0;
1332 1332 boolean_t success = B_FALSE;
1333 1333
1334 1334 ASSERT(MUTEX_HELD(&msp->ms_lock));
1335 1335 ASSERT(!msp->ms_loaded);
1336 1336 ASSERT(!msp->ms_loading);
1337 1337
1338 1338 msp->ms_loading = B_TRUE;
1339 1339
1340 1340 /*
1341 1341 * If the space map has not been allocated yet, then treat
1342 1342 * all the space in the metaslab as free and add it to the
1343 1343 * ms_tree.
1344 1344 */
1345 1345 if (msp->ms_sm != NULL)
1346 1346 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1347 1347 else
1348 1348 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1349 1349
1350 1350 success = (error == 0);
1351 1351 msp->ms_loading = B_FALSE;
1352 1352
1353 1353 if (success) {
1354 1354 ASSERT3P(msp->ms_group, !=, NULL);
1355 1355 msp->ms_loaded = B_TRUE;
1356 1356
1357 1357 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1358 1358 range_tree_walk(msp->ms_defertree[t],
1359 1359 range_tree_remove, msp->ms_tree);
1360 1360 }
1361 1361 msp->ms_max_size = metaslab_block_maxsize(msp);
1362 1362 }
1363 1363 cv_broadcast(&msp->ms_load_cv);
1364 1364 return (error);
1365 1365 }
1366 1366
1367 1367 void
1368 1368 metaslab_unload(metaslab_t *msp)
1369 1369 {
1370 1370 ASSERT(MUTEX_HELD(&msp->ms_lock));
1371 1371 range_tree_vacate(msp->ms_tree, NULL, NULL);
1372 1372 msp->ms_loaded = B_FALSE;
1373 1373 msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1374 1374 msp->ms_max_size = 0;
1375 1375 }
1376 1376
1377 1377 int
1378 1378 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1379 1379 metaslab_t **msp)
1380 1380 {
1381 1381 vdev_t *vd = mg->mg_vd;
1382 1382 objset_t *mos = vd->vdev_spa->spa_meta_objset;
1383 1383 metaslab_t *ms;
1384 1384 int error;
1385 1385
1386 1386 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1387 1387 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1388 1388 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1389 1389 ms->ms_id = id;
1390 1390 ms->ms_start = id << vd->vdev_ms_shift;
1391 1391 ms->ms_size = 1ULL << vd->vdev_ms_shift;
1392 1392
1393 1393 /*
1394 1394 * We only open space map objects that already exist. All others
1395 1395 * will be opened when we finally allocate an object for it.
1396 1396 */
1397 1397 if (object != 0) {
1398 1398 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1399 1399 ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
1400 1400
1401 1401 if (error != 0) {
1402 1402 kmem_free(ms, sizeof (metaslab_t));
1403 1403 return (error);
1404 1404 }
1405 1405
1406 1406 ASSERT(ms->ms_sm != NULL);
1407 1407 }
1408 1408
1409 1409 /*
1410 1410 * We create the main range tree here, but we don't create the
1411 1411 * other range trees until metaslab_sync_done(). This serves
1412 1412 * two purposes: it allows metaslab_sync_done() to detect the
1413 1413 * addition of new space; and for debugging, it ensures that we'd
1414 1414 * data fault on any attempt to use this metaslab before it's ready.
1415 1415 */
1416 1416 ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
1417 1417 metaslab_group_add(mg, ms);
1418 1418
1419 1419 metaslab_set_fragmentation(ms);
1420 1420
1421 1421 /*
1422 1422 * If we're opening an existing pool (txg == 0) or creating
1423 1423 * a new one (txg == TXG_INITIAL), all space is available now.
1424 1424 * If we're adding space to an existing pool, the new space
1425 1425 * does not become available until after this txg has synced.
1426 1426 * The metaslab's weight will also be initialized when we sync
1427 1427 * out this txg. This ensures that we don't attempt to allocate
1428 1428 * from it before we have initialized it completely.
1429 1429 */
1430 1430 if (txg <= TXG_INITIAL)
1431 1431 metaslab_sync_done(ms, 0);
1432 1432
1433 1433 /*
1434 1434 * If metaslab_debug_load is set and we're initializing a metaslab
1435 1435 * that has an allocated space map object, then load its space
1436 1436 * map so that we can verify frees.
1437 1437 */
1438 1438 if (metaslab_debug_load && ms->ms_sm != NULL) {
1439 1439 mutex_enter(&ms->ms_lock);
1440 1440 VERIFY0(metaslab_load(ms));
1441 1441 mutex_exit(&ms->ms_lock);
1442 1442 }
1443 1443
1444 1444 if (txg != 0) {
1445 1445 vdev_dirty(vd, 0, NULL, txg);
1446 1446 vdev_dirty(vd, VDD_METASLAB, ms, txg);
1447 1447 }
1448 1448
1449 1449 *msp = ms;
1450 1450
1451 1451 return (0);
1452 1452 }
1453 1453
1454 1454 void
1455 1455 metaslab_fini(metaslab_t *msp)
1456 1456 {
1457 1457 metaslab_group_t *mg = msp->ms_group;
1458 1458
1459 1459 metaslab_group_remove(mg, msp);
1460 1460
1461 1461 mutex_enter(&msp->ms_lock);
1462 1462 VERIFY(msp->ms_group == NULL);
1463 1463 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1464 1464 0, -msp->ms_size);
1465 1465 space_map_close(msp->ms_sm);
1466 1466
1467 1467 metaslab_unload(msp);
1468 1468 range_tree_destroy(msp->ms_tree);
1469 1469 range_tree_destroy(msp->ms_freeingtree);
1470 1470 range_tree_destroy(msp->ms_freedtree);
1471 1471
1472 1472 for (int t = 0; t < TXG_SIZE; t++) {
1473 1473 range_tree_destroy(msp->ms_alloctree[t]);
1474 1474 }
1475 1475
1476 1476 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1477 1477 range_tree_destroy(msp->ms_defertree[t]);
1478 1478 }
1479 1479
1480 1480 ASSERT0(msp->ms_deferspace);
1481 1481
1482 1482 mutex_exit(&msp->ms_lock);
1483 1483 cv_destroy(&msp->ms_load_cv);
1484 1484 mutex_destroy(&msp->ms_lock);
1485 1485
1486 1486 kmem_free(msp, sizeof (metaslab_t));
1487 1487 }
1488 1488
1489 1489 #define FRAGMENTATION_TABLE_SIZE 17
1490 1490
1491 1491 /*
1492 1492 * This table defines a segment size based fragmentation metric that will
1493 1493 * allow each metaslab to derive its own fragmentation value. This is done
1494 1494 * by calculating the space in each bucket of the spacemap histogram and
1495 1495 * multiplying that by the fragmentation metric in this table. Doing
1496 1496 * this for all buckets and dividing it by the total amount of free
1497 1497 * space in this metaslab (i.e. the total free space in all buckets) gives
1498 1498 * us the fragmentation metric. This means that a high fragmentation metric
1499 1499 * equates to most of the free space being comprised of small segments.
1500 1500 * Conversely, if the metric is low, then most of the free space is in
1501 1501 * large segments. A 10% change in fragmentation equates to approximately
1502 1502 * double the number of segments.
1503 1503 *
1504 1504 * This table defines 0% fragmented space using 16MB segments. Testing has
1505 1505 * shown that segments that are greater than or equal to 16MB do not suffer
1506 1506 * from drastic performance problems. Using this value, we derive the rest
1507 1507 * of the table. Since the fragmentation value is never stored on disk, it
1508 1508 * is possible to change these calculations in the future.
1509 1509 */
1510 1510 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1511 1511 100, /* 512B */
1512 1512 100, /* 1K */
1513 1513 98, /* 2K */
1514 1514 95, /* 4K */
1515 1515 90, /* 8K */
1516 1516 80, /* 16K */
1517 1517 70, /* 32K */
1518 1518 60, /* 64K */
1519 1519 50, /* 128K */
1520 1520 40, /* 256K */
1521 1521 30, /* 512K */
1522 1522 20, /* 1M */
1523 1523 15, /* 2M */
1524 1524 10, /* 4M */
1525 1525 5, /* 8M */
1526 1526 0 /* 16M */
1527 1527 };
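/*
 * Illustration only (not part of this change; example_fragmentation() is a
 * hypothetical sketch). The table is applied as a space-weighted average
 * over the free-segment histogram: e.g. 60% of the free space in 128K
 * segments (factor 50) and 40% in 2M segments (factor 15) gives
 * 0.6 * 50 + 0.4 * 15 = 36. metaslab_set_fragmentation() below does the
 * same computation while also accounting for the space map's shift.
 */
static uint64_t
example_fragmentation(const uint64_t *bucket_space /* bytes per table row */)
{
	uint64_t frag = 0, total = 0;

	for (int i = 0; i < FRAGMENTATION_TABLE_SIZE; i++) {
		frag += bucket_space[i] * zfs_frag_table[i];
		total += bucket_space[i];
	}
	return (total == 0 ? 0 : frag / total);
}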
1528 1528
1529 1529 /*
1530 1530 * Calculate the metaslab's fragmentation metric and store it in
1531 1531 * ms_fragmentation. A value of ZFS_FRAG_INVALID means that the metaslab
1532 1532 * has not been upgraded and does not support this metric. Otherwise,
1533 1533 * the value should be in the range [0, 100].
1534 1534 */
1535 1535 static void
1536 1536 metaslab_set_fragmentation(metaslab_t *msp)
1537 1537 {
1538 1538 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1539 1539 uint64_t fragmentation = 0;
1540 1540 uint64_t total = 0;
1541 1541 boolean_t feature_enabled = spa_feature_is_enabled(spa,
1542 1542 SPA_FEATURE_SPACEMAP_HISTOGRAM);
1543 1543
1544 1544 if (!feature_enabled) {
1545 1545 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1546 1546 return;
1547 1547 }
1548 1548
1549 1549 /*
1550 1550 * A null space map means that the entire metaslab is free
1551 1551 * and thus is not fragmented.
1552 1552 */
1553 1553 if (msp->ms_sm == NULL) {
1554 1554 msp->ms_fragmentation = 0;
1555 1555 return;
1556 1556 }
1557 1557
1558 1558 /*
1559 1559 * If this metaslab's space map has not been upgraded, flag it
1560 1560 * so that we upgrade next time we encounter it.
1561 1561 */
1562 1562 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1563 1563 uint64_t txg = spa_syncing_txg(spa);
1564 1564 vdev_t *vd = msp->ms_group->mg_vd;
1565 1565
1566 1566 if (spa_writeable(spa)) {
1567 1567 msp->ms_condense_wanted = B_TRUE;
1568 1568 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1569 1569 spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1570 1570 "msp %p, vd %p", txg, msp, vd);
1571 1571 }
1572 1572 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1573 1573 return;
1574 1574 }
1575 1575
1576 1576 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1577 1577 uint64_t space = 0;
1578 1578 uint8_t shift = msp->ms_sm->sm_shift;
1579 1579
1580 1580 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1581 1581 FRAGMENTATION_TABLE_SIZE - 1);
1582 1582
1583 1583 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1584 1584 continue;
1585 1585
1586 1586 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1587 1587 total += space;
1588 1588
1589 1589 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1590 1590 fragmentation += space * zfs_frag_table[idx];
1591 1591 }
1592 1592
1593 1593 if (total > 0)
1594 1594 fragmentation /= total;
1595 1595 ASSERT3U(fragmentation, <=, 100);
1596 1596
1597 1597 msp->ms_fragmentation = fragmentation;
1598 1598 }
1599 1599
1600 1600 /*
1601 1601 * Compute a weight -- a selection preference value -- for the given metaslab.
1602 1602 * This is based on the amount of free space, the level of fragmentation,
1603 1603 * the LBA range, and whether the metaslab is loaded.
1604 1604 */
1605 1605 static uint64_t
1606 1606 metaslab_space_weight(metaslab_t *msp)
1607 1607 {
1608 1608 metaslab_group_t *mg = msp->ms_group;
1609 1609 vdev_t *vd = mg->mg_vd;
1610 1610 uint64_t weight, space;
1611 1611
1612 1612 ASSERT(MUTEX_HELD(&msp->ms_lock));
1613 1613 ASSERT(!vd->vdev_removing);
1614 1614
1615 1615 /*
1616 1616 * The baseline weight is the metaslab's free space.
1617 1617 */
1618 1618 space = msp->ms_size - space_map_allocated(msp->ms_sm);
1619 1619
1620 1620 if (metaslab_fragmentation_factor_enabled &&
1621 1621 msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1622 1622 /*
1623 1623 * Use the fragmentation information to inversely scale
1624 1624 * down the baseline weight. We need to ensure that we
1625 1625 * don't exclude this metaslab completely when it's 100%
1626 1626 * fragmented. To avoid this we reduce the fragmented value
1627 1627 * by 1.
1628 1628 */
1629 1629 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1630 1630
1631 1631 /*
1632 1632 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1633 1633 * this metaslab again. The fragmentation metric may have
1634 1634 * decreased the space to something smaller than
1635 1635 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1636 1636 * so that we can consume any remaining space.
1637 1637 */
1638 1638 if (space > 0 && space < SPA_MINBLOCKSIZE)
1639 1639 space = SPA_MINBLOCKSIZE;
1640 1640 }
1641 1641 weight = space;
1642 1642
1643 1643 /*
1644 1644 * Modern disks have uniform bit density and constant angular velocity.
1645 1645 * Therefore, the outer recording zones are faster (higher bandwidth)
1646 1646 * than the inner zones by the ratio of outer to inner track diameter,
1647 1647 * which is typically around 2:1. We account for this by assigning
1648 1648 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1649 1649 * In effect, this means that we'll select the metaslab with the most
1650 1650 * free bandwidth rather than simply the one with the most free space.
1651 1651 */
1652 - if (metaslab_lba_weighting_enabled) {
1652 + if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
1653 1653 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1654 1654 ASSERT(weight >= space && weight <= 2 * space);
1655 1655 }
1656 1656
1657 1657 /*
1658 1658 * If this metaslab is one we're actively using, adjust its
1659 1659 * weight to make it preferable to any inactive metaslab so
1660 1660 * we'll polish it off. If the fragmentation on this metaslab
1661 1661 * has exceeded our threshold, then don't mark it active.
1662 1662 */
1663 1663 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1664 1664 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1665 1665 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1666 1666 }
1667 1667
1668 1668 WEIGHT_SET_SPACEBASED(weight);
1669 1669 return (weight);
1670 1670 }
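/*
 * Two rough examples of the scaling in metaslab_space_weight() above,
 * using hypothetical numbers. Fragmentation: with 10GB free and a
 * fragmentation metric of 50, the baseline becomes
 *
 *	space = (10GB * (100 - 49)) / 100 = 5.1GB
 *
 * and even at 100% fragmentation a metaslab keeps 1% of its free
 * space as weight, so it is never excluded outright. LBA weighting
 * (rotational vdevs only, per the vdev_nonrot check above): with
 * vdev_ms_count = 200, metaslab 0 is weighted at 2x its free space,
 * metaslab 100 at 1.5x, and metaslab 199 at just over 1x.
 */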
1671 1671
1672 1672 /*
1673 1673 * Return the weight of the specified metaslab, according to the segment-based
1674 1674 * weighting algorithm. The metaslab must be loaded. This function can
1675 1675 * be called within a sync pass since it relies only on the metaslab's
1676 1676 * range tree which is always accurate when the metaslab is loaded.
1677 1677 */
1678 1678 static uint64_t
1679 1679 metaslab_weight_from_range_tree(metaslab_t *msp)
1680 1680 {
1681 1681 uint64_t weight = 0;
1682 1682 uint32_t segments = 0;
1683 1683
1684 1684 ASSERT(msp->ms_loaded);
1685 1685
1686 1686 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
1687 1687 i--) {
1688 1688 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
1689 1689 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1690 1690
1691 1691 segments <<= 1;
1692 1692 segments += msp->ms_tree->rt_histogram[i];
1693 1693
1694 1694 /*
1695 1695 * The range tree provides more precision than the space map
1696 1696 * and must be downgraded so that all values fit within the
1697 1697 * space map's histogram. This allows us to compare loaded
1698 1698 * vs. unloaded metaslabs to determine which metaslab is
1699 1699 * considered "best".
1700 1700 */
1701 1701 if (i > max_idx)
1702 1702 continue;
1703 1703
1704 1704 if (segments != 0) {
1705 1705 WEIGHT_SET_COUNT(weight, segments);
1706 1706 WEIGHT_SET_INDEX(weight, i);
1707 1707 WEIGHT_SET_ACTIVE(weight, 0);
1708 1708 break;
1709 1709 }
1710 1710 }
1711 1711 return (weight);
1712 1712 }
1713 1713
1714 1714 /*
1715 1715 * Calculate the weight based on the on-disk histogram. This should only
1716 1716 * be called after a sync pass has completely finished since the on-disk
1717 1717 * information is updated in metaslab_sync().
1718 1718 */
1719 1719 static uint64_t
1720 1720 metaslab_weight_from_spacemap(metaslab_t *msp)
1721 1721 {
1722 1722 uint64_t weight = 0;
1723 1723
1724 1724 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1725 1725 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1726 1726 WEIGHT_SET_COUNT(weight,
1727 1727 msp->ms_sm->sm_phys->smp_histogram[i]);
1728 1728 WEIGHT_SET_INDEX(weight, i +
1729 1729 msp->ms_sm->sm_shift);
1730 1730 WEIGHT_SET_ACTIVE(weight, 0);
1731 1731 break;
1732 1732 }
1733 1733 }
1734 1734 return (weight);
1735 1735 }
1736 1736
1737 1737 /*
1738 1738 * Compute a segment-based weight for the specified metaslab. The weight
1739 1739 * is determined by highest bucket in the histogram. The information
1740 1740 * for the highest bucket is encoded into the weight value.
1741 1741 */
1742 1742 static uint64_t
1743 1743 metaslab_segment_weight(metaslab_t *msp)
1744 1744 {
1745 1745 metaslab_group_t *mg = msp->ms_group;
1746 1746 uint64_t weight = 0;
1747 1747 uint8_t shift = mg->mg_vd->vdev_ashift;
1748 1748
1749 1749 ASSERT(MUTEX_HELD(&msp->ms_lock));
1750 1750
1751 1751 /*
1752 1752 * The metaslab is completely free.
1753 1753 */
1754 1754 if (space_map_allocated(msp->ms_sm) == 0) {
1755 1755 int idx = highbit64(msp->ms_size) - 1;
1756 1756 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1757 1757
1758 1758 if (idx < max_idx) {
1759 1759 WEIGHT_SET_COUNT(weight, 1ULL);
1760 1760 WEIGHT_SET_INDEX(weight, idx);
1761 1761 } else {
1762 1762 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
1763 1763 WEIGHT_SET_INDEX(weight, max_idx);
1764 1764 }
1765 1765 WEIGHT_SET_ACTIVE(weight, 0);
1766 1766 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
1767 1767
1768 1768 return (weight);
1769 1769 }
1770 1770
1771 1771 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1772 1772
1773 1773 /*
1774 1774 * If the metaslab is fully allocated then just make the weight 0.
1775 1775 */
1776 1776 if (space_map_allocated(msp->ms_sm) == msp->ms_size)
1777 1777 return (0);
1778 1778 /*
1779 1779 * If the metaslab is already loaded, then use the range tree to
1780 1780 * determine the weight. Otherwise, we rely on the space map information
1781 1781 * to generate the weight.
1782 1782 */
1783 1783 if (msp->ms_loaded) {
1784 1784 weight = metaslab_weight_from_range_tree(msp);
1785 1785 } else {
1786 1786 weight = metaslab_weight_from_spacemap(msp);
1787 1787 }
1788 1788
1789 1789 /*
1790 1790 * If the metaslab was active the last time we calculated its weight
1791 1791 * then keep it active. We want to consume the entire region that
1792 1792 * is associated with this weight.
1793 1793 */
1794 1794 if (msp->ms_activation_weight != 0 && weight != 0)
1795 1795 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
1796 1796 return (weight);
1797 1797 }
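/*
 * For illustration (hypothetical sizes): a completely free 8GB
 * metaslab has highbit64(8GB) - 1 = 33, so, provided index 33 fits
 * within the space map histogram, the weight encodes a count of 1 at
 * index 33 -- i.e. one free segment in the 2^33 bucket. A partially
 * used metaslab instead reports its highest non-empty histogram
 * bucket, from the range tree if loaded or from the space map if not.
 */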
1798 1798
1799 1799 /*
1800 1800 * Determine if we should attempt to allocate from this metaslab. If the
1801 1801 * metaslab has a maximum size then we can quickly determine if the desired
1802 1802 * allocation size can be satisfied. Otherwise, if we're using segment-based
1803 1803 * weighting then we can determine the maximum allocation that this metaslab
1804 1804 * can accommodate based on the index encoded in the weight. If we're using
1805 1805 * space-based weights then rely on the entire weight (excluding the weight
1806 1806 * type bit).
1807 1807 */
1808 1808 boolean_t
1809 1809 metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
1810 1810 {
1811 1811 boolean_t should_allocate;
1812 1812
1813 1813 if (msp->ms_max_size != 0)
1814 1814 return (msp->ms_max_size >= asize);
1815 1815
1816 1816 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
1817 1817 /*
1818 1818 * The metaslab segment weight indicates segments in the
1819 1819 * range [2^i, 2^(i+1)), where i is the index in the weight.
1820 1820 * Since the asize might be in the middle of the range, we
1821 1821 * should attempt the allocation if asize < 2^(i+1).
1822 1822 */
1823 1823 should_allocate = (asize <
1824 1824 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
1825 1825 } else {
1826 1826 should_allocate = (asize <=
1827 1827 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
1828 1828 }
1829 1829 return (should_allocate);
1830 1830 }
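/*
 * For example (hypothetical weights): a segment-based weight with
 * index 17 means the largest free segments fall in [128K, 256K), so
 * any allocation with asize < 256K is attempted. A space-based weight
 * whose low bits hold 300MB allows any asize <= 300MB. A cached
 * ms_max_size, when available, short-circuits both checks.
 */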
1831 1831
1832 1832 static uint64_t
1833 1833 metaslab_weight(metaslab_t *msp)
1834 1834 {
1835 1835 vdev_t *vd = msp->ms_group->mg_vd;
1836 1836 spa_t *spa = vd->vdev_spa;
1837 1837 uint64_t weight;
1838 1838
1839 1839 ASSERT(MUTEX_HELD(&msp->ms_lock));
1840 1840
1841 1841 /*
1842 1842 * This vdev is in the process of being removed so there is nothing
1843 1843 * for us to do here.
1844 1844 */
1845 1845 if (vd->vdev_removing) {
1846 1846 ASSERT0(space_map_allocated(msp->ms_sm));
1847 1847 ASSERT0(vd->vdev_ms_shift);
1848 1848 return (0);
1849 1849 }
1850 1850
1851 1851 metaslab_set_fragmentation(msp);
1852 1852
1853 1853 /*
1854 1854 * Update the maximum size if the metaslab is loaded. This will
1855 1855 * ensure that we get an accurate maximum size if newly freed space
1856 1856 * has been added back into the free tree.
1857 1857 */
1858 1858 if (msp->ms_loaded)
1859 1859 msp->ms_max_size = metaslab_block_maxsize(msp);
1860 1860
1861 1861 /*
1862 1862 * Segment-based weighting requires space map histogram support.
1863 1863 */
1864 1864 if (zfs_metaslab_segment_weight_enabled &&
1865 1865 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
1866 1866 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
1867 1867 sizeof (space_map_phys_t))) {
1868 1868 weight = metaslab_segment_weight(msp);
1869 1869 } else {
1870 1870 weight = metaslab_space_weight(msp);
1871 1871 }
1872 1872 return (weight);
1873 1873 }
1874 1874
1875 1875 static int
1876 1876 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1877 1877 {
1878 1878 ASSERT(MUTEX_HELD(&msp->ms_lock));
1879 1879
1880 1880 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1881 1881 metaslab_load_wait(msp);
1882 1882 if (!msp->ms_loaded) {
1883 1883 int error = metaslab_load(msp);
1884 1884 if (error) {
1885 1885 metaslab_group_sort(msp->ms_group, msp, 0);
1886 1886 return (error);
1887 1887 }
1888 1888 }
1889 1889
1890 1890 msp->ms_activation_weight = msp->ms_weight;
1891 1891 metaslab_group_sort(msp->ms_group, msp,
1892 1892 msp->ms_weight | activation_weight);
1893 1893 }
1894 1894 ASSERT(msp->ms_loaded);
1895 1895 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1896 1896
1897 1897 return (0);
1898 1898 }
1899 1899
1900 1900 static void
1901 1901 metaslab_passivate(metaslab_t *msp, uint64_t weight)
1902 1902 {
1903 1903 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
1904 1904
1905 1905 /*
1906 1906 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1907 1907 * this metaslab again. In that case, it had better be empty,
1908 1908 * or we would be leaving space on the table.
1909 1909 */
1910 1910 ASSERT(size >= SPA_MINBLOCKSIZE ||
1911 1911 range_tree_space(msp->ms_tree) == 0);
1912 1912 ASSERT0(weight & METASLAB_ACTIVE_MASK);
1913 1913
1914 1914 msp->ms_activation_weight = 0;
1915 1915 metaslab_group_sort(msp->ms_group, msp, weight);
1916 1916 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1917 1917 }
1918 1918
1919 1919 /*
1920 1920 * Segment-based metaslabs are activated once and remain active until
1921 1921 * we either fail an allocation attempt (similar to space-based metaslabs)
1922 1922 * or have exhausted the free space in zfs_metaslab_switch_threshold
1923 1923 * buckets since the metaslab was activated. This function checks to see
1924 1924 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
1925 1925 * metaslab and passivates it proactively. This will allow us to select a
1926 1926 * metaslab with a larger contiguous region, if any remain within this
1927 1927 * metaslab group. If we're in sync pass > 1, then we continue using this
1928 1928 * metaslab so that we don't dirty more blocks and cause more sync passes.
1929 1929 */
1930 1930 void
1931 1931 metaslab_segment_may_passivate(metaslab_t *msp)
1932 1932 {
1933 1933 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1934 1934
1935 1935 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
1936 1936 return;
1937 1937
1938 1938 /*
1939 1939 * Since we are in the middle of a sync pass, the most accurate
1940 1940 * information that is accessible to us is the in-core range tree
1941 1941 * histogram; calculate the new weight based on that information.
1942 1942 */
1943 1943 uint64_t weight = metaslab_weight_from_range_tree(msp);
1944 1944 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
1945 1945 int current_idx = WEIGHT_GET_INDEX(weight);
1946 1946
1947 1947 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
1948 1948 metaslab_passivate(msp, weight);
1949 1949 }
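/*
 * For example (hypothetical values): if the metaslab was activated
 * with its largest free segments in the index-20 bucket (1M-2M) and
 * zfs_metaslab_switch_threshold is 2, it is passivated once the
 * largest remaining bucket drops to index 18 (256K-512K) or below,
 * so we can move on to a metaslab with larger contiguous space.
 */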
1950 1950
1951 1951 static void
1952 1952 metaslab_preload(void *arg)
1953 1953 {
1954 1954 metaslab_t *msp = arg;
1955 1955 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1956 1956
1957 1957 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1958 1958
1959 1959 mutex_enter(&msp->ms_lock);
1960 1960 metaslab_load_wait(msp);
1961 1961 if (!msp->ms_loaded)
1962 1962 (void) metaslab_load(msp);
1963 1963 msp->ms_selected_txg = spa_syncing_txg(spa);
1964 1964 mutex_exit(&msp->ms_lock);
1965 1965 }
1966 1966
1967 1967 static void
1968 1968 metaslab_group_preload(metaslab_group_t *mg)
1969 1969 {
1970 1970 spa_t *spa = mg->mg_vd->vdev_spa;
1971 1971 metaslab_t *msp;
1972 1972 avl_tree_t *t = &mg->mg_metaslab_tree;
1973 1973 int m = 0;
1974 1974
1975 1975 if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1976 1976 taskq_wait(mg->mg_taskq);
1977 1977 return;
1978 1978 }
1979 1979
1980 1980 mutex_enter(&mg->mg_lock);
1981 1981 /*
1982 1982 * Load the next potential metaslabs
1983 1983 */
1984 1984 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
1985 1985 /*
1986 1986 * We preload only the maximum number of metaslabs specified
1987 1987 * by metaslab_preload_limit. If a metaslab is being forced
1988 1988 * to condense then we preload it too. This will ensure
1989 1989 * that force condensing happens in the next txg.
1990 1990 */
1991 1991 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
1992 1992 continue;
1993 1993 }
1994 1994
1995 1995 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1996 1996 msp, TQ_SLEEP) != NULL);
1997 1997 }
1998 1998 mutex_exit(&mg->mg_lock);
1999 1999 }
2000 2000
2001 2001 /*
2002 2002 * Determine if the space map's on-disk footprint is past our tolerance
2003 2003 * for inefficiency. We would like to use the following criteria to make
2004 2004 * our decision:
2005 2005 *
2006 2006 * 1. The size of the space map object should not dramatically increase as a
2007 2007 * result of writing out the free space range tree.
2008 2008 *
2009 2009 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
2010 2010 * times the size of the free space range tree representation
2011 2011 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
2012 2012 *
2013 2013 * 3. The on-disk size of the space map should actually decrease.
2014 2014 *
2015 2015 * Checking the first condition is tricky since we don't want to walk
2016 2016 * the entire AVL tree calculating the estimated on-disk size. Instead we
2017 2017 * use the size-ordered range tree in the metaslab and calculate the
2018 2018 * size required to write out the largest segment in our free tree. If the
2019 2019 * size required to represent that segment on disk is larger than the space
2020 2020 * map object then we avoid condensing this map.
2021 2021 *
2022 2022 * To determine the second criterion we use a best-case estimate and assume
2023 2023 * each segment can be represented on-disk as a single 64-bit entry. We refer
2024 2024 * to this best-case estimate as the space map's minimal form.
2025 2025 *
2026 2026 * Unfortunately, we cannot compute the on-disk size of the space map in this
2027 2027 * context because we cannot accurately compute the effects of compression, etc.
2028 2028 * Instead, we apply the heuristic described in the block comment for
2029 2029 * zfs_metaslab_condense_block_threshold - we only condense if the space used
2030 2030 * is greater than a threshold number of blocks.
2031 2031 */
2032 2032 static boolean_t
2033 2033 metaslab_should_condense(metaslab_t *msp)
2034 2034 {
2035 2035 space_map_t *sm = msp->ms_sm;
2036 2036 range_seg_t *rs;
2037 2037 uint64_t size, entries, segsz, object_size, optimal_size, record_size;
2038 2038 dmu_object_info_t doi;
2039 2039 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
2040 2040
2041 2041 ASSERT(MUTEX_HELD(&msp->ms_lock));
2042 2042 ASSERT(msp->ms_loaded);
2043 2043
2044 2044 /*
2045 2045 * Use the ms_size_tree range tree, which is ordered by size, to
2046 2046 * obtain the largest segment in the free tree. We always condense
2047 2047 * metaslabs that are empty and metaslabs for which a condense
2048 2048 * request has been made.
2049 2049 */
2050 2050 rs = avl_last(&msp->ms_size_tree);
2051 2051 if (rs == NULL || msp->ms_condense_wanted)
2052 2052 return (B_TRUE);
2053 2053
2054 2054 /*
2055 2055 * Calculate the number of 64-bit entries this segment would
2056 2056 * require when written to disk. If this single segment would be
2057 2057 * larger on-disk than the entire current on-disk structure, then
2058 2058 * clearly condensing will increase the on-disk structure size.
2059 2059 */
2060 2060 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
2061 2061 entries = size / (MIN(size, SM_RUN_MAX));
2062 2062 segsz = entries * sizeof (uint64_t);
2063 2063
2064 2064 optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
2065 2065 object_size = space_map_length(msp->ms_sm);
2066 2066
2067 2067 dmu_object_info_from_db(sm->sm_dbuf, &doi);
2068 2068 record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
2069 2069
2070 2070 return (segsz <= object_size &&
2071 2071 object_size >= (optimal_size * zfs_condense_pct / 100) &&
2072 2072 object_size > zfs_metaslab_condense_block_threshold * record_size);
2073 2073 }
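/*
 * A rough numeric sketch (hypothetical sizes): a loaded metaslab with
 * 1,000 free segments has a minimal form of 8,000 bytes. With
 * zfs_condense_pct = 200 and a 4K space map block size, we condense
 * only if the current on-disk space map is at least 16,000 bytes,
 * larger than the 4-block threshold (16K here), and no smaller than
 * what the single largest free segment alone would need on disk.
 */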
2074 2074
2075 2075 /*
2076 2076 * Condense the on-disk space map representation to its minimized form.
2077 2077 * The minimized form consists of a small number of allocations followed by
2078 2078 * the entries of the free range tree.
2079 2079 */
2080 2080 static void
2081 2081 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
2082 2082 {
2083 2083 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2084 2084 range_tree_t *condense_tree;
2085 2085 space_map_t *sm = msp->ms_sm;
2086 2086
2087 2087 ASSERT(MUTEX_HELD(&msp->ms_lock));
2088 2088 ASSERT3U(spa_sync_pass(spa), ==, 1);
2089 2089 ASSERT(msp->ms_loaded);
2090 2090
2091 2091
2092 2092 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
2093 2093 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
2094 2094 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
2095 2095 msp->ms_group->mg_vd->vdev_spa->spa_name,
2096 2096 space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
2097 2097 msp->ms_condense_wanted ? "TRUE" : "FALSE");
2098 2098
2099 2099 msp->ms_condense_wanted = B_FALSE;
2100 2100
2101 2101 /*
2102 2102 * Create a range tree that is 100% allocated. We remove segments
2103 2103 * that have been freed in this txg, any deferred frees that exist,
2104 2104 * and any allocation in the future. Removing segments should be
2105 2105 * a relatively inexpensive operation since we expect these trees to
2106 2106 * have a small number of nodes.
2107 2107 */
2108 2108 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
2109 2109 range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
2110 2110
2111 2111 /*
2112 2112 * Remove what's been freed in this txg from the condense_tree.
2113 2113 * Since we're in sync_pass 1, we know that all the frees from
2114 2114 * this txg are in the freeingtree.
2115 2115 */
2116 2116 range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);
2117 2117
2118 2118 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2119 2119 range_tree_walk(msp->ms_defertree[t],
2120 2120 range_tree_remove, condense_tree);
2121 2121 }
2122 2122
2123 2123 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2124 2124 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
2125 2125 range_tree_remove, condense_tree);
2126 2126 }
2127 2127
2128 2128 /*
2129 2129 * We're about to drop the metaslab's lock thus allowing
2130 2130 * other consumers to change its content. Set the
2131 2131 * metaslab's ms_condensing flag to ensure that
2132 2132 * allocations on this metaslab do not occur while we're
2133 2133 * in the middle of committing it to disk. This is only critical
2134 2134 * for the ms_tree as all other range trees use per txg
2135 2135 * views of their content.
2136 2136 */
2137 2137 msp->ms_condensing = B_TRUE;
2138 2138
2139 2139 mutex_exit(&msp->ms_lock);
2140 2140 space_map_truncate(sm, tx);
2141 2141 mutex_enter(&msp->ms_lock);
2142 2142
2143 2143 /*
2144 2144 * While we would ideally like to create a space map representation
2145 2145 * that consists only of allocation records, doing so can be
2146 2146 * prohibitively expensive because the in-core free tree can be
2147 2147 * large, and therefore computationally expensive to subtract
2148 2148 * from the condense_tree. Instead we sync out two trees, a cheap
2149 2149 * allocation only tree followed by the in-core free tree. While not
2150 2150 * optimal, this is typically close to optimal, and much cheaper to
2151 2151 * compute.
2152 2152 */
2153 2153 space_map_write(sm, condense_tree, SM_ALLOC, tx);
2154 2154 range_tree_vacate(condense_tree, NULL, NULL);
2155 2155 range_tree_destroy(condense_tree);
2156 2156
2157 2157 space_map_write(sm, msp->ms_tree, SM_FREE, tx);
2158 2158 msp->ms_condensing = B_FALSE;
2159 2159 }
2160 2160
2161 2161 /*
2162 2162 * Write a metaslab to disk in the context of the specified transaction group.
2163 2163 */
2164 2164 void
2165 2165 metaslab_sync(metaslab_t *msp, uint64_t txg)
2166 2166 {
2167 2167 metaslab_group_t *mg = msp->ms_group;
2168 2168 vdev_t *vd = mg->mg_vd;
2169 2169 spa_t *spa = vd->vdev_spa;
2170 2170 objset_t *mos = spa_meta_objset(spa);
2171 2171 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
2172 2172 dmu_tx_t *tx;
2173 2173 uint64_t object = space_map_object(msp->ms_sm);
2174 2174
2175 2175 ASSERT(!vd->vdev_ishole);
2176 2176
2177 2177 /*
2178 2178 * This metaslab has just been added so there's no work to do now.
2179 2179 */
2180 2180 if (msp->ms_freeingtree == NULL) {
2181 2181 ASSERT3P(alloctree, ==, NULL);
2182 2182 return;
2183 2183 }
2184 2184
2185 2185 ASSERT3P(alloctree, !=, NULL);
2186 2186 ASSERT3P(msp->ms_freeingtree, !=, NULL);
2187 2187 ASSERT3P(msp->ms_freedtree, !=, NULL);
2188 2188
2189 2189 /*
2190 2190 * Normally, we don't want to process a metaslab if there
2191 2191 * are no allocations or frees to perform. However, if the metaslab
2192 2192 * is being forced to condense we need to let it through.
2193 2193 */
2194 2194 if (range_tree_space(alloctree) == 0 &&
2195 2195 range_tree_space(msp->ms_freeingtree) == 0 &&
2196 2196 !msp->ms_condense_wanted)
2197 2197 return;
2198 2198
2199 2199 /*
2200 2200 * The only state that can actually be changing concurrently with
2201 2201 * metaslab_sync() is the metaslab's ms_tree. No other thread can
2202 2202 * be modifying this txg's alloctree, freeingtree, freedtree, or
2203 2203 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
2204 2204 * space map ASSERTs. We drop it whenever we call into the DMU,
2205 2205 * because the DMU can call down to us (e.g. via zio_free()) at
2206 2206 * any time.
2207 2207 */
2208 2208
2209 2209 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2210 2210
2211 2211 if (msp->ms_sm == NULL) {
2212 2212 uint64_t new_object;
2213 2213
2214 2214 new_object = space_map_alloc(mos, tx);
2215 2215 VERIFY3U(new_object, !=, 0);
2216 2216
2217 2217 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2218 2218 msp->ms_start, msp->ms_size, vd->vdev_ashift,
2219 2219 &msp->ms_lock));
2220 2220 ASSERT(msp->ms_sm != NULL);
2221 2221 }
2222 2222
2223 2223 mutex_enter(&msp->ms_lock);
2224 2224
2225 2225 /*
2226 2226 * Note: metaslab_condense() clears the space map's histogram.
2227 2227 * Therefore we must verify and remove this histogram before
2228 2228 * condensing.
2229 2229 */
2230 2230 metaslab_group_histogram_verify(mg);
2231 2231 metaslab_class_histogram_verify(mg->mg_class);
2232 2232 metaslab_group_histogram_remove(mg, msp);
2233 2233
2234 2234 if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
2235 2235 metaslab_should_condense(msp)) {
2236 2236 metaslab_condense(msp, txg, tx);
2237 2237 } else {
2238 2238 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
2239 2239 space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
2240 2240 }
2241 2241
2242 2242 if (msp->ms_loaded) {
2243 2243 /*
2244 2244 * When the space map is loaded, we have an accurate
2245 2245 * histogram in the range tree. This gives us an opportunity
2246 2246 * to bring the space map's histogram up-to-date so we clear
2247 2247 * it first before updating it.
2248 2248 */
2249 2249 space_map_histogram_clear(msp->ms_sm);
2250 2250 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
2251 2251
2252 2252 /*
2253 2253 * Since we've cleared the histogram we need to add back
2254 2254 * any free space that has already been processed, plus
2255 2255 * any deferred space. This allows the on-disk histogram
2256 2256 * to accurately reflect all free space even if some space
2257 2257 * is not yet available for allocation (i.e. deferred).
2258 2258 */
2259 2259 space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);
2260 2260
2261 2261 /*
2262 2262 * Add back any deferred free space that has not been
2263 2263 * added back into the in-core free tree yet. This will
2264 2264 * ensure that we don't end up with a space map histogram
2265 2265 * that is completely empty unless the metaslab is fully
2266 2266 * allocated.
2267 2267 */
2268 2268 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2269 2269 space_map_histogram_add(msp->ms_sm,
2270 2270 msp->ms_defertree[t], tx);
2271 2271 }
2272 2272 }
2273 2273
2274 2274 /*
2275 2275 * Always add the free space from this sync pass to the space
2276 2276 * map histogram. We want to make sure that the on-disk histogram
2277 2277 * accounts for all free space. If the space map is not loaded,
2278 2278 * then we will lose some accuracy but will correct it the next
2279 2279 * time we load the space map.
2280 2280 */
2281 2281 space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);
2282 2282
2283 2283 metaslab_group_histogram_add(mg, msp);
2284 2284 metaslab_group_histogram_verify(mg);
2285 2285 metaslab_class_histogram_verify(mg->mg_class);
2286 2286
2287 2287 /*
2288 2288 * For sync pass 1, we avoid traversing this txg's free range tree
2289 2289 * and instead will just swap the pointers for freeingtree and
2290 2290 * freedtree. We can safely do this since the freedtree is
2291 2291 * guaranteed to be empty on the initial pass.
2292 2292 */
2293 2293 if (spa_sync_pass(spa) == 1) {
2294 2294 range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
2295 2295 } else {
2296 2296 range_tree_vacate(msp->ms_freeingtree,
2297 2297 range_tree_add, msp->ms_freedtree);
2298 2298 }
2299 2299 range_tree_vacate(alloctree, NULL, NULL);
2300 2300
2301 2301 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2302 2302 ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
2303 2303 ASSERT0(range_tree_space(msp->ms_freeingtree));
2304 2304
2305 2305 mutex_exit(&msp->ms_lock);
2306 2306
2307 2307 if (object != space_map_object(msp->ms_sm)) {
2308 2308 object = space_map_object(msp->ms_sm);
2309 2309 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2310 2310 msp->ms_id, sizeof (uint64_t), &object, tx);
2311 2311 }
2312 2312 dmu_tx_commit(tx);
2313 2313 }
2314 2314
2315 2315 /*
2316 2316 * Called after a transaction group has completely synced to mark
2317 2317 * all of the metaslab's free space as usable.
2318 2318 */
2319 2319 void
2320 2320 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2321 2321 {
2322 2322 metaslab_group_t *mg = msp->ms_group;
2323 2323 vdev_t *vd = mg->mg_vd;
2324 2324 spa_t *spa = vd->vdev_spa;
2325 2325 range_tree_t **defer_tree;
2326 2326 int64_t alloc_delta, defer_delta;
2327 2327 boolean_t defer_allowed = B_TRUE;
2328 2328
2329 2329 ASSERT(!vd->vdev_ishole);
2330 2330
2331 2331 mutex_enter(&msp->ms_lock);
2332 2332
2333 2333 /*
2334 2334 * If this metaslab is just becoming available, initialize its
2335 2335 * range trees and add its capacity to the vdev.
2336 2336 */
2337 2337 if (msp->ms_freedtree == NULL) {
2338 2338 for (int t = 0; t < TXG_SIZE; t++) {
2339 2339 ASSERT(msp->ms_alloctree[t] == NULL);
2340 2340
2341 2341 msp->ms_alloctree[t] = range_tree_create(NULL, msp,
2342 2342 &msp->ms_lock);
2343 2343 }
2344 2344
2345 2345 ASSERT3P(msp->ms_freeingtree, ==, NULL);
2346 2346 msp->ms_freeingtree = range_tree_create(NULL, msp,
2347 2347 &msp->ms_lock);
2348 2348
2349 2349 ASSERT3P(msp->ms_freedtree, ==, NULL);
2350 2350 msp->ms_freedtree = range_tree_create(NULL, msp,
2351 2351 &msp->ms_lock);
2352 2352
2353 2353 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2354 2354 ASSERT(msp->ms_defertree[t] == NULL);
2355 2355
2356 2356 msp->ms_defertree[t] = range_tree_create(NULL, msp,
2357 2357 &msp->ms_lock);
2358 2358 }
2359 2359
2360 2360 vdev_space_update(vd, 0, 0, msp->ms_size);
2361 2361 }
2362 2362
2363 2363 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
2364 2364
2365 2365 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
2366 2366 metaslab_class_get_alloc(spa_normal_class(spa));
2367 2367 if (free_space <= spa_get_slop_space(spa)) {
2368 2368 defer_allowed = B_FALSE;
2369 2369 }
2370 2370
2371 2371 defer_delta = 0;
2372 2372 alloc_delta = space_map_alloc_delta(msp->ms_sm);
2373 2373 if (defer_allowed) {
2374 2374 defer_delta = range_tree_space(msp->ms_freedtree) -
2375 2375 range_tree_space(*defer_tree);
2376 2376 } else {
2377 2377 defer_delta -= range_tree_space(*defer_tree);
2378 2378 }
2379 2379
2380 2380 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2381 2381
2382 2382 /*
2383 2383 * If there's a metaslab_load() in progress, wait for it to complete
2384 2384 * so that we have a consistent view of the in-core space map.
2385 2385 */
2386 2386 metaslab_load_wait(msp);
2387 2387
2388 2388 /*
2389 2389 * Move the frees from the defer_tree back to the free
2390 2390 * range tree (if it's loaded). Swap the freed_tree and the
2391 2391 * defer_tree -- this is safe to do because we've just emptied out
2392 2392 * the defer_tree.
2393 2393 */
2394 2394 range_tree_vacate(*defer_tree,
2395 2395 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2396 2396 if (defer_allowed) {
2397 2397 range_tree_swap(&msp->ms_freedtree, defer_tree);
2398 2398 } else {
2399 2399 range_tree_vacate(msp->ms_freedtree,
2400 2400 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2401 2401 }
2402 2402
2403 2403 space_map_update(msp->ms_sm);
2404 2404
2405 2405 msp->ms_deferspace += defer_delta;
2406 2406 ASSERT3S(msp->ms_deferspace, >=, 0);
2407 2407 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2408 2408 if (msp->ms_deferspace != 0) {
2409 2409 /*
2410 2410 * Keep syncing this metaslab until all deferred frees
2411 2411 * are back in circulation.
2412 2412 */
2413 2413 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2414 2414 }
2415 2415
2416 2416 /*
2417 2417 * Calculate the new weights before unloading any metaslabs.
2418 2418 * This will give us the most accurate weighting.
2419 2419 */
2420 2420 metaslab_group_sort(mg, msp, metaslab_weight(msp));
2421 2421
2422 2422 /*
2423 2423 * If the metaslab is loaded and we've not tried to load or allocate
2424 2424 * from it in 'metaslab_unload_delay' txgs, then unload it.
2425 2425 */
2426 2426 if (msp->ms_loaded &&
2427 2427 msp->ms_selected_txg + metaslab_unload_delay < txg) {
2428 2428 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2429 2429 VERIFY0(range_tree_space(
2430 2430 msp->ms_alloctree[(txg + t) & TXG_MASK]));
2431 2431 }
2432 2432
2433 2433 if (!metaslab_debug_unload)
2434 2434 metaslab_unload(msp);
2435 2435 }
2436 2436
2437 2437 mutex_exit(&msp->ms_lock);
2438 2438 }
2439 2439
2440 2440 void
2441 2441 metaslab_sync_reassess(metaslab_group_t *mg)
2442 2442 {
2443 2443 metaslab_group_alloc_update(mg);
2444 2444 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2445 2445
2446 2446 /*
2447 2447 * Preload the next potential metaslabs
2448 2448 */
2449 2449 metaslab_group_preload(mg);
2450 2450 }
2451 2451
2452 2452 static uint64_t
2453 2453 metaslab_distance(metaslab_t *msp, dva_t *dva)
2454 2454 {
2455 2455 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2456 2456 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2457 2457 uint64_t start = msp->ms_id;
2458 2458
2459 2459 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2460 2460 return (1ULL << 63);
2461 2461
2462 2462 if (offset < start)
2463 2463 return ((start - offset) << ms_shift);
2464 2464 if (offset > start)
2465 2465 return ((offset - start) << ms_shift);
2466 2466 return (0);
2467 2467 }
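/*
 * For example (hypothetical layout): with 1GB metaslabs, a candidate
 * metaslab four slots away from an existing DVA on the same vdev has
 * a distance of 4GB, while a DVA on another vdev counts as 2^63,
 * i.e. effectively infinite. metaslab_group_alloc_normal() compares
 * this value against its target_distance when spreading ditto copies.
 */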
2468 2468
2469 2469 /*
2470 2470 * ==========================================================================
2471 2471 * Metaslab allocation tracing facility
2472 2472 * ==========================================================================
2473 2473 */
2474 2474 kstat_t *metaslab_trace_ksp;
2475 2475 kstat_named_t metaslab_trace_over_limit;
2476 2476
2477 2477 void
2478 2478 metaslab_alloc_trace_init(void)
2479 2479 {
2480 2480 ASSERT(metaslab_alloc_trace_cache == NULL);
2481 2481 metaslab_alloc_trace_cache = kmem_cache_create(
2482 2482 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
2483 2483 0, NULL, NULL, NULL, NULL, NULL, 0);
2484 2484 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
2485 2485 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
2486 2486 if (metaslab_trace_ksp != NULL) {
2487 2487 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
2488 2488 kstat_named_init(&metaslab_trace_over_limit,
2489 2489 "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
2490 2490 kstat_install(metaslab_trace_ksp);
2491 2491 }
2492 2492 }
2493 2493
2494 2494 void
2495 2495 metaslab_alloc_trace_fini(void)
2496 2496 {
2497 2497 if (metaslab_trace_ksp != NULL) {
2498 2498 kstat_delete(metaslab_trace_ksp);
2499 2499 metaslab_trace_ksp = NULL;
2500 2500 }
2501 2501 kmem_cache_destroy(metaslab_alloc_trace_cache);
2502 2502 metaslab_alloc_trace_cache = NULL;
2503 2503 }
2504 2504
2505 2505 /*
2506 2506 * Add an allocation trace element to the allocation tracing list.
2507 2507 */
2508 2508 static void
2509 2509 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2510 2510 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
2511 2511 {
2512 2512 if (!metaslab_trace_enabled)
2513 2513 return;
2514 2514
2515 2515 /*
2516 2516 * When the tracing list reaches its maximum we remove
2517 2517 * the second element in the list before adding a new one.
2518 2518 * By removing the second element we preserve the original
2519 2519 * entry as a clue to what allocation steps have already been
2520 2520 * performed.
2521 2521 */
2522 2522 if (zal->zal_size == metaslab_trace_max_entries) {
2523 2523 metaslab_alloc_trace_t *mat_next;
2524 2524 #ifdef DEBUG
2525 2525 panic("too many entries in allocation list");
2526 2526 #endif
2527 2527 atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
2528 2528 zal->zal_size--;
2529 2529 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
2530 2530 list_remove(&zal->zal_list, mat_next);
2531 2531 kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
2532 2532 }
2533 2533
2534 2534 metaslab_alloc_trace_t *mat =
2535 2535 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
2536 2536 list_link_init(&mat->mat_list_node);
2537 2537 mat->mat_mg = mg;
2538 2538 mat->mat_msp = msp;
2539 2539 mat->mat_size = psize;
2540 2540 mat->mat_dva_id = dva_id;
2541 2541 mat->mat_offset = offset;
2542 2542 mat->mat_weight = 0;
2543 2543
2544 2544 if (msp != NULL)
2545 2545 mat->mat_weight = msp->ms_weight;
2546 2546
2547 2547 /*
2548 2548 * The list is part of the zio so locking is not required. Only
2549 2549 * a single thread will perform allocations for a given zio.
2550 2550 */
2551 2551 list_insert_tail(&zal->zal_list, mat);
2552 2552 zal->zal_size++;
2553 2553
2554 2554 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
2555 2555 }
2556 2556
2557 2557 void
2558 2558 metaslab_trace_init(zio_alloc_list_t *zal)
2559 2559 {
2560 2560 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
2561 2561 offsetof(metaslab_alloc_trace_t, mat_list_node));
2562 2562 zal->zal_size = 0;
2563 2563 }
2564 2564
2565 2565 void
2566 2566 metaslab_trace_fini(zio_alloc_list_t *zal)
2567 2567 {
2568 2568 metaslab_alloc_trace_t *mat;
2569 2569
2570 2570 while ((mat = list_remove_head(&zal->zal_list)) != NULL)
2571 2571 kmem_cache_free(metaslab_alloc_trace_cache, mat);
2572 2572 list_destroy(&zal->zal_list);
2573 2573 zal->zal_size = 0;
2574 2574 }
2575 2575
2576 2576 /*
2577 2577 * ==========================================================================
2578 2578 * Metaslab block operations
2579 2579 * ==========================================================================
2580 2580 */
2581 2581
2582 2582 static void
2583 2583 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2584 2584 {
2585 2585 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2586 2586 flags & METASLAB_DONT_THROTTLE)
2587 2587 return;
2588 2588
2589 2589 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2590 2590 if (!mg->mg_class->mc_alloc_throttle_enabled)
2591 2591 return;
2592 2592
2593 2593 (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2594 2594 }
2595 2595
2596 2596 void
2597 2597 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2598 2598 {
2599 2599 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2600 2600 flags & METASLAB_DONT_THROTTLE)
2601 2601 return;
2602 2602
2603 2603 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2604 2604 if (!mg->mg_class->mc_alloc_throttle_enabled)
2605 2605 return;
2606 2606
2607 2607 (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2608 2608 }
2609 2609
2610 2610 void
2611 2611 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2612 2612 {
2613 2613 #ifdef ZFS_DEBUG
2614 2614 const dva_t *dva = bp->blk_dva;
2615 2615 int ndvas = BP_GET_NDVAS(bp);
2616 2616
2617 2617 for (int d = 0; d < ndvas; d++) {
2618 2618 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2619 2619 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2620 2620 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2621 2621 }
2622 2622 #endif
2623 2623 }
2624 2624
2625 2625 static uint64_t
2626 2626 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
2627 2627 {
2628 2628 uint64_t start;
2629 2629 range_tree_t *rt = msp->ms_tree;
2630 2630 metaslab_class_t *mc = msp->ms_group->mg_class;
2631 2631
2632 2632 VERIFY(!msp->ms_condensing);
2633 2633
2634 2634 start = mc->mc_ops->msop_alloc(msp, size);
2635 2635 if (start != -1ULL) {
2636 2636 metaslab_group_t *mg = msp->ms_group;
2637 2637 vdev_t *vd = mg->mg_vd;
2638 2638
2639 2639 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
2640 2640 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2641 2641 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
2642 2642 range_tree_remove(rt, start, size);
2643 2643
2644 2644 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2645 2645 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2646 2646
2647 2647 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
2648 2648
2649 2649 /* Track the last successful allocation */
2650 2650 msp->ms_alloc_txg = txg;
2651 2651 metaslab_verify_space(msp, txg);
2652 2652 }
2653 2653
2654 2654 /*
2655 2655 * Now that we've attempted the allocation we need to update the
2656 2656 * metaslab's maximum block size since it may have changed.
2657 2657 */
2658 2658 msp->ms_max_size = metaslab_block_maxsize(msp);
2659 2659 return (start);
2660 2660 }
2661 2661
2662 2662 static uint64_t
2663 2663 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
2664 2664 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2665 2665 {
2666 2666 metaslab_t *msp = NULL;
2667 2667 uint64_t offset = -1ULL;
2668 2668 uint64_t activation_weight;
2669 2669 uint64_t target_distance;
2670 2670 int i;
2671 2671
2672 2672 activation_weight = METASLAB_WEIGHT_PRIMARY;
2673 2673 for (i = 0; i < d; i++) {
2674 2674 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2675 2675 activation_weight = METASLAB_WEIGHT_SECONDARY;
2676 2676 break;
2677 2677 }
2678 2678 }
2679 2679
2680 2680 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
2681 2681 search->ms_weight = UINT64_MAX;
2682 2682 search->ms_start = 0;
2683 2683 for (;;) {
2684 2684 boolean_t was_active;
2685 2685 avl_tree_t *t = &mg->mg_metaslab_tree;
2686 2686 avl_index_t idx;
2687 2687
2688 2688 mutex_enter(&mg->mg_lock);
2689 2689
2690 2690 /*
2691 2691 * Find the metaslab with the highest weight that is less
2692 2692 * than what we've already tried. In the common case, this
2693 2693 * means that we will examine each metaslab at most once.
2694 2694 * Note that concurrent callers could reorder metaslabs
2695 2695 * by activation/passivation once we have dropped the mg_lock.
2696 2696 * If a metaslab is activated by another thread, and we fail
2697 2697 * to allocate from the metaslab we have selected, we may
2698 2698 * not try the newly-activated metaslab, and instead activate
2699 2699 * another metaslab. This is not optimal, but generally
2700 2700 * does not cause any problems (a possible exception being
2701 2701 * if every metaslab is completely full except for the
2702 2702 * newly-activated metaslab which we fail to examine).
2703 2703 */
2704 2704 msp = avl_find(t, search, &idx);
2705 2705 if (msp == NULL)
2706 2706 msp = avl_nearest(t, idx, AVL_AFTER);
2707 2707 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
2708 2708
2709 2709 if (!metaslab_should_allocate(msp, asize)) {
2710 2710 metaslab_trace_add(zal, mg, msp, asize, d,
2711 2711 TRACE_TOO_SMALL);
2712 2712 continue;
2713 2713 }
2714 2714
2715 2715 /*
2716 2716 * If the selected metaslab is condensing, skip it.
2717 2717 */
2718 2718 if (msp->ms_condensing)
2719 2719 continue;
2720 2720
2721 2721 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2722 2722 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2723 2723 break;
2724 2724
2725 2725 target_distance = min_distance +
2726 2726 (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2727 2727 min_distance >> 1);
2728 2728
2729 2729 for (i = 0; i < d; i++) {
2730 2730 if (metaslab_distance(msp, &dva[i]) <
2731 2731 target_distance)
2732 2732 break;
2733 2733 }
2734 2734 if (i == d)
2735 2735 break;
2736 2736 }
2737 2737 mutex_exit(&mg->mg_lock);
2738 2738 if (msp == NULL) {
2739 2739 kmem_free(search, sizeof (*search));
2740 2740 return (-1ULL);
2741 2741 }
2742 2742 search->ms_weight = msp->ms_weight;
2743 2743 search->ms_start = msp->ms_start + 1;
2744 2744
2745 2745 mutex_enter(&msp->ms_lock);
2746 2746
2747 2747 /*
2748 2748 * Ensure that the metaslab we have selected is still
2749 2749 * capable of handling our request. It's possible that
2750 2750 * another thread may have changed the weight while we
2751 2751 * were blocked on the metaslab lock. We check the
2752 2752 * active status first to see if we need to reselect
2753 2753 * a new metaslab.
2754 2754 */
2755 2755 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
2756 2756 mutex_exit(&msp->ms_lock);
2757 2757 continue;
2758 2758 }
2759 2759
2760 2760 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2761 2761 activation_weight == METASLAB_WEIGHT_PRIMARY) {
2762 2762 metaslab_passivate(msp,
2763 2763 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2764 2764 mutex_exit(&msp->ms_lock);
2765 2765 continue;
2766 2766 }
2767 2767
2768 2768 if (metaslab_activate(msp, activation_weight) != 0) {
2769 2769 mutex_exit(&msp->ms_lock);
2770 2770 continue;
2771 2771 }
2772 2772 msp->ms_selected_txg = txg;
2773 2773
2774 2774 /*
2775 2775 * Now that we have the lock, recheck to see if we should
2776 2776 * continue to use this metaslab for this allocation. The
2777 2777 * metaslab is now loaded so metaslab_should_allocate() can
2778 2778 * accurately determine if the allocation attempt should
2779 2779 * proceed.
2780 2780 */
2781 2781 if (!metaslab_should_allocate(msp, asize)) {
2782 2782 /* Passivate this metaslab and select a new one. */
2783 2783 metaslab_trace_add(zal, mg, msp, asize, d,
2784 2784 TRACE_TOO_SMALL);
2785 2785 goto next;
2786 2786 }
2787 2787
2788 2788 /*
2789 2789 * If this metaslab is currently condensing then pick again as
2790 2790 * we can't manipulate this metaslab until it's committed
2791 2791 * to disk.
2792 2792 */
2793 2793 if (msp->ms_condensing) {
2794 2794 metaslab_trace_add(zal, mg, msp, asize, d,
2795 2795 TRACE_CONDENSING);
2796 2796 mutex_exit(&msp->ms_lock);
2797 2797 continue;
2798 2798 }
2799 2799
2800 2800 offset = metaslab_block_alloc(msp, asize, txg);
2801 2801 metaslab_trace_add(zal, mg, msp, asize, d, offset);
2802 2802
2803 2803 if (offset != -1ULL) {
2804 2804 /* Proactively passivate the metaslab, if needed */
2805 2805 metaslab_segment_may_passivate(msp);
2806 2806 break;
2807 2807 }
2808 2808 next:
2809 2809 ASSERT(msp->ms_loaded);
2810 2810
2811 2811 /*
2812 2812 * We were unable to allocate from this metaslab so determine
2813 2813 * a new weight for this metaslab. Now that we have loaded
2814 2814 * the metaslab we can provide a better hint to the metaslab
2815 2815 * selector.
2816 2816 *
2817 2817 * For space-based metaslabs, we use the maximum block size.
2818 2818 * This information is only available when the metaslab
2819 2819 * is loaded and is more accurate than the generic free
2820 2820 * space weight that was calculated by metaslab_weight().
2821 2821 * This information allows us to quickly compare the maximum
2822 2822 * available allocation in the metaslab to the allocation
2823 2823 * size being requested.
2824 2824 *
2825 2825 * For segment-based metaslabs, determine the new weight
2826 2826 * based on the highest bucket in the range tree. We
2827 2827 * explicitly use the loaded segment weight (i.e. the range
2828 2828 * tree histogram) since it contains the space that is
2829 2829 * currently available for allocation and is accurate
2830 2830 * even within a sync pass.
2831 2831 */
2832 2832 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
2833 2833 uint64_t weight = metaslab_block_maxsize(msp);
2834 2834 WEIGHT_SET_SPACEBASED(weight);
2835 2835 metaslab_passivate(msp, weight);
2836 2836 } else {
2837 2837 metaslab_passivate(msp,
2838 2838 metaslab_weight_from_range_tree(msp));
2839 2839 }
2840 2840
2841 2841 /*
2842 2842 * We have just failed an allocation attempt, check
2843 2843 * that metaslab_should_allocate() agrees. Otherwise,
2844 2844 * we may end up in an infinite loop retrying the same
2845 2845 * metaslab.
2846 2846 */
2847 2847 ASSERT(!metaslab_should_allocate(msp, asize));
2848 2848 mutex_exit(&msp->ms_lock);
2849 2849 }
2850 2850 mutex_exit(&msp->ms_lock);
2851 2851 kmem_free(search, sizeof (*search));
2852 2852 return (offset);
2853 2853 }
2854 2854
2855 2855 static uint64_t
2856 2856 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
2857 2857 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2858 2858 {
2859 2859 uint64_t offset;
2860 2860 ASSERT(mg->mg_initialized);
2861 2861
2862 2862 offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
2863 2863 min_distance, dva, d);
2864 2864
2865 2865 mutex_enter(&mg->mg_lock);
2866 2866 if (offset == -1ULL) {
2867 2867 mg->mg_failed_allocations++;
2868 2868 metaslab_trace_add(zal, mg, NULL, asize, d,
2869 2869 TRACE_GROUP_FAILURE);
2870 2870 if (asize == SPA_GANGBLOCKSIZE) {
2871 2871 /*
2872 2872 * This metaslab group was unable to allocate
2873 2873 * the minimum gang block size so it must be out of
2874 2874 * space. We must notify the allocation throttle
2875 2875 * to start skipping allocation attempts to this
2876 2876 * metaslab group until more space becomes available.
2877 2877 * Note: this failure cannot be caused by the
2878 2878 * allocation throttle since the allocation throttle
2879 2879 * is only responsible for skipping devices and
2880 2880 * not failing block allocations.
2881 2881 */
2882 2882 mg->mg_no_free_space = B_TRUE;
2883 2883 }
2884 2884 }
2885 2885 mg->mg_allocations++;
2886 2886 mutex_exit(&mg->mg_lock);
2887 2887 return (offset);
2888 2888 }
2889 2889
2890 2890 /*
2891 2891 * If we have to write a ditto block (i.e. more than one DVA for a given BP)
2892 2892 * on the same vdev as an existing DVA of this BP, then try to allocate it
2893 2893 * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
2894 2894 * existing DVAs.
2895 2895 */
2896 2896 int ditto_same_vdev_distance_shift = 3;
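/*
 * For example (hypothetical vdev): with the default shift of 3, a 1TB
 * top-level vdev asks ditto copies to land at least 1TB >> 3 = 128GB
 * apart. If that distance is no larger than a single metaslab it is
 * dropped to 0, and it is always 0 once we are trying hard.
 */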
2897 2897
2898 2898 /*
2899 2899 * Allocate a block for the specified i/o.
2900 2900 */
2901 2901 static int
2902 2902 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2903 2903 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
2904 2904 zio_alloc_list_t *zal)
2905 2905 {
2906 2906 metaslab_group_t *mg, *rotor;
2907 2907 vdev_t *vd;
2908 2908 boolean_t try_hard = B_FALSE;
2909 2909
2910 2910 ASSERT(!DVA_IS_VALID(&dva[d]));
2911 2911
2912 2912 /*
2913 2913 * For testing, make some blocks above a certain size be gang blocks.
2914 2914 */
2915 2915 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
2916 2916 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
2917 2917 return (SET_ERROR(ENOSPC));
2918 2918 }
2919 2919
2920 2920 /*
2921 2921 * Start at the rotor and loop through all mgs until we find something.
2922 2922 * Note that there's no locking on mc_rotor or mc_aliquot because
2923 2923 * nothing actually breaks if we miss a few updates -- we just won't
2924 2924 * allocate quite as evenly. It all balances out over time.
2925 2925 *
2926 2926 * If we are doing ditto or log blocks, try to spread them across
2927 2927 * consecutive vdevs. If we're forced to reuse a vdev before we've
2928 2928 * allocated all of our ditto blocks, then try and spread them out on
2929 2929 * that vdev as much as possible. If it turns out to not be possible,
2930 2930 * gradually lower our standards until anything becomes acceptable.
2931 2931 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
2932 2932 * gives us hope of containing our fault domains to something we're
2933 2933 * able to reason about. Otherwise, any two top-level vdev failures
2934 2934 * will guarantee the loss of data. With consecutive allocation,
2935 2935 * only two adjacent top-level vdev failures will result in data loss.
2936 2936 *
2937 2937 * If we are doing gang blocks (hintdva is non-NULL), try to keep
2938 2938 * ourselves on the same vdev as our gang block header. That
2939 2939 * way, we can hope for locality in vdev_cache, plus it makes our
2940 2940 * fault domains something tractable.
2941 2941 */
2942 2942 if (hintdva) {
2943 2943 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
2944 2944
2945 2945 /*
2946 2946 * It's possible the vdev we're using as the hint no
2947 2947 * longer exists (i.e. removed). Consult the rotor when
2948 2948 * all else fails.
2949 2949 */
2950 2950 if (vd != NULL) {
2951 2951 mg = vd->vdev_mg;
2952 2952
2953 2953 if (flags & METASLAB_HINTBP_AVOID &&
2954 2954 mg->mg_next != NULL)
2955 2955 mg = mg->mg_next;
2956 2956 } else {
2957 2957 mg = mc->mc_rotor;
2958 2958 }
2959 2959 } else if (d != 0) {
2960 2960 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
2961 2961 mg = vd->vdev_mg->mg_next;
2962 2962 } else {
2963 2963 mg = mc->mc_rotor;
2964 2964 }
2965 2965
2966 2966 /*
2967 2967 * If the hint put us into the wrong metaslab class, or into a
2968 2968 * metaslab group that has been passivated, just follow the rotor.
2969 2969 */
2970 2970 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
2971 2971 mg = mc->mc_rotor;
2972 2972
2973 2973 rotor = mg;
2974 2974 top:
2975 2975 do {
2976 2976 boolean_t allocatable;
2977 2977
2978 2978 ASSERT(mg->mg_activation_count == 1);
2979 2979 vd = mg->mg_vd;
2980 2980
2981 2981 /*
2982 2982 * Don't allocate from faulted devices.
2983 2983 */
2984 2984 if (try_hard) {
2985 2985 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2986 2986 allocatable = vdev_allocatable(vd);
2987 2987 spa_config_exit(spa, SCL_ZIO, FTAG);
2988 2988 } else {
2989 2989 allocatable = vdev_allocatable(vd);
2990 2990 }
2991 2991
2992 2992 /*
2993 2993 * Determine if the selected metaslab group is eligible
2994 2994 * for allocations. If we're ganging then don't allow
2995 2995 * this metaslab group to skip allocations since that would
2996 2996 * inadvertently return ENOSPC and suspend the pool
2997 2997 * even though space is still available.
2998 2998 */
2999 2999 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3000 3000 allocatable = metaslab_group_allocatable(mg, rotor,
3001 3001 psize);
3002 3002 }
3003 3003
3004 3004 if (!allocatable) {
3005 3005 metaslab_trace_add(zal, mg, NULL, psize, d,
3006 3006 TRACE_NOT_ALLOCATABLE);
3007 3007 goto next;
3008 3008 }
3009 3009
3010 3010 ASSERT(mg->mg_initialized);
3011 3011
3012 3012 /*
3013 3013 * Avoid writing single-copy data to a failing,
3014 3014 * non-redundant vdev, unless we've already tried all
3015 3015 * other vdevs.
3016 3016 */
3017 3017 if ((vd->vdev_stat.vs_write_errors > 0 ||
3018 3018 vd->vdev_state < VDEV_STATE_HEALTHY) &&
3019 3019 d == 0 && !try_hard && vd->vdev_children == 0) {
3020 3020 metaslab_trace_add(zal, mg, NULL, psize, d,
3021 3021 TRACE_VDEV_ERROR);
3022 3022 goto next;
3023 3023 }
3024 3024
3025 3025 ASSERT(mg->mg_class == mc);
3026 3026
3027 3027 /*
3028 3028 * If we don't need to try hard, then require that the
3029 3029 * block be 1/8th of the device away from any other DVAs
3030 3030 * in this BP. If we are trying hard, allow any offset
3031 3031 * to be used (distance=0).
3032 3032 */
3033 3033 uint64_t distance = 0;
3034 3034 if (!try_hard) {
3035 3035 distance = vd->vdev_asize >>
3036 3036 ditto_same_vdev_distance_shift;
3037 3037 if (distance <= (1ULL << vd->vdev_ms_shift))
3038 3038 distance = 0;
3039 3039 }
3040 3040
3041 3041 uint64_t asize = vdev_psize_to_asize(vd, psize);
3042 3042 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3043 3043
3044 3044 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3045 3045 distance, dva, d);
3046 3046
3047 3047 if (offset != -1ULL) {
3048 3048 /*
3049 3049 * If we've just selected this metaslab group,
3050 3050 * figure out whether the corresponding vdev is
3051 3051 * over- or under-used relative to the pool,
3052 3052 * and set an allocation bias to even it out.
3053 3053 */
3054 3054 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3055 3055 vdev_stat_t *vs = &vd->vdev_stat;
3056 3056 int64_t vu, cu;
3057 3057
3058 3058 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
3059 3059 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
3060 3060
3061 3061 /*
3062 3062 * Calculate how much more or less we should
3063 3063 * try to allocate from this device during
3064 3064 * this iteration around the rotor.
3065 3065 * For example, if a device is 80% full
3066 3066 * and the pool is 20% full then we should
3067 3067 * reduce allocations by 60% on this device.
3068 3068 *
3069 3069 * mg_bias = (20 - 80) * 512K / 100 = -307K
3070 3070 *
3071 3071 * This reduces allocations by 307K for this
3072 3072 * iteration.
3073 3073 */
3074 3074 mg->mg_bias = ((cu - vu) *
3075 3075 (int64_t)mg->mg_aliquot) / 100;
3076 3076 } else if (!metaslab_bias_enabled) {
3077 3077 mg->mg_bias = 0;
3078 3078 }
3079 3079
3080 3080 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3081 3081 mg->mg_aliquot + mg->mg_bias) {
3082 3082 mc->mc_rotor = mg->mg_next;
3083 3083 mc->mc_aliquot = 0;
3084 3084 }
3085 3085
3086 3086 DVA_SET_VDEV(&dva[d], vd->vdev_id);
3087 3087 DVA_SET_OFFSET(&dva[d], offset);
3088 3088 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
3089 3089 DVA_SET_ASIZE(&dva[d], asize);
3090 3090
3091 3091 return (0);
3092 3092 }
3093 3093 next:
3094 3094 mc->mc_rotor = mg->mg_next;
3095 3095 mc->mc_aliquot = 0;
3096 3096 } while ((mg = mg->mg_next) != rotor);
3097 3097
3098 3098 /*
3099 3099 * If we haven't tried hard, do so now.
3100 3100 */
3101 3101 if (!try_hard) {
3102 3102 try_hard = B_TRUE;
3103 3103 goto top;
3104 3104 }
3105 3105
3106 3106 bzero(&dva[d], sizeof (dva_t));
3107 3107
3108 3108 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
3109 3109 return (SET_ERROR(ENOSPC));
3110 3110 }
3111 3111
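/*
 * Illustrative sketches (hypothetical helpers, not part of this webrev):
 * the two bits of arithmetic above, isolated so the numbers are easy to
 * check.  They assume the defaults defined earlier in this file -- a 512K
 * metaslab_aliquot and a ditto distance shift of 3, i.e. the 1/8th rule
 * described in the comment above.
 */

/*
 * Rotor bias: with the device 80% full and the pool 20% full,
 * (20 - 80) * 512K / 100 = -307K, so the rotor advances after only
 * 512K - 307K = 205K has been allocated from this group in this pass.
 */
static int64_t
metaslab_bias_example(int64_t vu, int64_t cu, int64_t aliquot)
{
        return (((cu - vu) * aliquot) / 100);
}

/*
 * Ditto distance: a 1 TB top-level vdev gives 1 TB >> 3 = 128 GB, so a
 * second DVA must land at least 128 GB from the first -- unless a single
 * metaslab already spans that much, in which case any other metaslab
 * satisfies the requirement and the distance is dropped to 0.
 */
static uint64_t
ditto_distance_example(uint64_t vdev_asize, uint64_t ms_size)
{
        uint64_t distance = vdev_asize >> 3;    /* 1/8th of the device */

        return (distance <= ms_size ? 0 : distance);
}
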
3112 3112 /*
3113 3113 * Free the block represented by DVA in the context of the specified
3114 3114 * transaction group.
3115 3115 */
3116 3116 static void
3117 3117 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
3118 3118 {
3119 3119 uint64_t vdev = DVA_GET_VDEV(dva);
3120 3120 uint64_t offset = DVA_GET_OFFSET(dva);
3121 3121 uint64_t size = DVA_GET_ASIZE(dva);
3122 3122 vdev_t *vd;
3123 3123 metaslab_t *msp;
3124 3124
3125 3125 ASSERT(DVA_IS_VALID(dva));
3126 3126
3127 3127 if (txg > spa_freeze_txg(spa))
3128 3128 return;
3129 3129
3130 3130 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3131 3131 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
3132 3132 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
3133 3133 (u_longlong_t)vdev, (u_longlong_t)offset);
3134 3134 ASSERT(0);
3135 3135 return;
3136 3136 }
3137 3137
3138 3138 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3139 3139
3140 3140 if (DVA_GET_GANG(dva))
3141 3141 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3142 3142
3143 3143 mutex_enter(&msp->ms_lock);
3144 3144
3145 3145 if (now) {
3146 3146 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
3147 3147 offset, size);
3148 3148
3149 3149 VERIFY(!msp->ms_condensing);
3150 3150 VERIFY3U(offset, >=, msp->ms_start);
3151 3151 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
3152 3152 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
3153 3153 msp->ms_size);
3154 3154 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3155 3155 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3156 3156 range_tree_add(msp->ms_tree, offset, size);
3157 3157 msp->ms_max_size = metaslab_block_maxsize(msp);
3158 3158 } else {
3159 3159 VERIFY3U(txg, ==, spa->spa_syncing_txg);
3160 3160 if (range_tree_space(msp->ms_freeingtree) == 0)
3161 3161 vdev_dirty(vd, VDD_METASLAB, msp, txg);
3162 3162 range_tree_add(msp->ms_freeingtree, offset, size);
3163 3163 }
3164 3164
3165 3165 mutex_exit(&msp->ms_lock);
3166 3166 }
3167 3167
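/*
 * Illustrative sketch (hypothetical helper, not part of this webrev):
 * the per-txg trees touched above (ms_alloctree[txg & TXG_MASK]) live in
 * a small ring indexed by the low bits of the txg.  Assuming the usual
 * TXG_SIZE of 4, txg 107 maps to slot 3 and txg 108 wraps back to slot 0.
 */
static int
txg_ring_slot_example(uint64_t txg)
{
        return ((int)(txg & TXG_MASK));
}
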
3168 3168 /*
3169 3169 * Intent log support: upon opening the pool after a crash, notify the SPA
3170 3170 * of blocks that the intent log has allocated for immediate write, but
3171 3171 * which are still considered free by the SPA because the last transaction
3172 3172 * group didn't commit yet.
3173 3173 */
3174 3174 static int
3175 3175 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3176 3176 {
3177 3177 uint64_t vdev = DVA_GET_VDEV(dva);
3178 3178 uint64_t offset = DVA_GET_OFFSET(dva);
3179 3179 uint64_t size = DVA_GET_ASIZE(dva);
3180 3180 vdev_t *vd;
3181 3181 metaslab_t *msp;
3182 3182 int error = 0;
3183 3183
3184 3184 ASSERT(DVA_IS_VALID(dva));
3185 3185
3186 3186 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3187 3187 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
3188 3188 return (SET_ERROR(ENXIO));
3189 3189
3190 3190 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3191 3191
3192 3192 if (DVA_GET_GANG(dva))
3193 3193 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3194 3194
3195 3195 mutex_enter(&msp->ms_lock);
3196 3196
3197 3197 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3198 3198 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
3199 3199
3200 3200 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
3201 3201 error = SET_ERROR(ENOENT);
3202 3202
3203 3203 if (error || txg == 0) { /* txg == 0 indicates dry run */
3204 3204 mutex_exit(&msp->ms_lock);
3205 3205 return (error);
3206 3206 }
3207 3207
3208 3208 VERIFY(!msp->ms_condensing);
3209 3209 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3210 3210 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3211 3211 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
3212 3212 range_tree_remove(msp->ms_tree, offset, size);
3213 3213
3214 3214 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
3215 3215 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
3216 3216 vdev_dirty(vd, VDD_METASLAB, msp, txg);
3217 3217 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
3218 3218 }
3219 3219
3220 3220 mutex_exit(&msp->ms_lock);
3221 3221
3222 3222 return (0);
3223 3223 }
3224 3224
3225 3225 /*
3226 3226 * Reserve some allocation slots. The reservation system must be called
3227 3227 * before we call into the allocator. If there aren't any available slots
3228 3228 * then the I/O will be throttled until an I/O completes and its slots are
3229 3229 * freed up. The function returns true if it was successful in placing
3230 3230 * the reservation.
3231 3231 */
3232 3232 boolean_t
3233 3233 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
3234 3234 int flags)
3235 3235 {
3236 3236 uint64_t available_slots = 0;
3237 3237 boolean_t slot_reserved = B_FALSE;
3238 3238
3239 3239 ASSERT(mc->mc_alloc_throttle_enabled);
3240 3240 mutex_enter(&mc->mc_lock);
3241 3241
3242 3242 uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
3243 3243 if (reserved_slots < mc->mc_alloc_max_slots)
3244 3244 available_slots = mc->mc_alloc_max_slots - reserved_slots;
3245 3245
3246 3246 if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3247 3247 /*
3248 3248 * We reserve the slots individually so that we can unreserve
3249 3249 * them individually when an I/O completes.
3250 3250 */
3251 3251 for (int d = 0; d < slots; d++) {
3252 3252 reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
3253 3253 }
3254 3254 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3255 3255 slot_reserved = B_TRUE;
3256 3256 }
3257 3257
3258 3258 mutex_exit(&mc->mc_lock);
3259 3259 return (slot_reserved);
3260 3260 }
3261 3261
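/*
 * Illustrative sketch (hypothetical helper, not part of this webrev):
 * the admission test above in isolation.  With mc_alloc_max_slots at 64
 * and 60 slots already reserved, a 3-slot request fits (3 <= 4) while a
 * 5-slot request is only admitted if it is part of a gang allocation.
 */
static boolean_t
throttle_admit_example(uint64_t max_slots, uint64_t reserved, uint64_t slots,
    boolean_t is_gang)
{
        uint64_t available = (reserved < max_slots) ?
            max_slots - reserved : 0;

        return ((slots <= available || is_gang) ? B_TRUE : B_FALSE);
}
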
3262 3262 void
3263 3263 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
3264 3264 {
3265 3265 ASSERT(mc->mc_alloc_throttle_enabled);
3266 3266 mutex_enter(&mc->mc_lock);
3267 3267 for (int d = 0; d < slots; d++) {
3268 3268 (void) refcount_remove(&mc->mc_alloc_slots, zio);
3269 3269 }
3270 3270 mutex_exit(&mc->mc_lock);
3271 3271 }
3272 3272
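/*
 * Illustrative usage (hypothetical caller, not part of this webrev):
 * a reservation taken before issuing an allocating I/O must be returned
 * slot for slot when that I/O completes, using the same zio as the
 * reference holder in both calls.
 */
static void
throttle_pair_example(metaslab_class_t *mc, zio_t *zio, int copies)
{
        if (metaslab_class_throttle_reserve(mc, copies, zio, 0)) {
                /* ... issue the allocation; then, on I/O completion ... */
                metaslab_class_throttle_unreserve(mc, copies, zio);
        }
}
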
3273 3273 int
3274 3274 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
3275 3275 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
3276 3276 zio_alloc_list_t *zal, zio_t *zio)
3277 3277 {
3278 3278 dva_t *dva = bp->blk_dva;
3279 3279 dva_t *hintdva = hintbp->blk_dva;
3280 3280 int error = 0;
3281 3281
3282 3282 ASSERT(bp->blk_birth == 0);
3283 3283 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
3284 3284
3285 3285 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3286 3286
3287 3287 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
3288 3288 spa_config_exit(spa, SCL_ALLOC, FTAG);
3289 3289 return (SET_ERROR(ENOSPC));
3290 3290 }
3291 3291
3292 3292 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
3293 3293 ASSERT(BP_GET_NDVAS(bp) == 0);
3294 3294 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
3295 3295 ASSERT3P(zal, !=, NULL);
3296 3296
3297 3297 for (int d = 0; d < ndvas; d++) {
3298 3298 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
3299 3299 txg, flags, zal);
3300 3300 if (error != 0) {
3301 3301 for (d--; d >= 0; d--) {
3302 3302 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
3303 3303 metaslab_group_alloc_decrement(spa,
3304 3304 DVA_GET_VDEV(&dva[d]), zio, flags);
3305 3305 bzero(&dva[d], sizeof (dva_t));
3306 3306 }
3307 3307 spa_config_exit(spa, SCL_ALLOC, FTAG);
3308 3308 return (error);
3309 3309 } else {
3310 3310 /*
3311 3311 * Update the metaslab group's queue depth
3312 3312 * based on the newly allocated dva.
3313 3313 */
3314 3314 metaslab_group_alloc_increment(spa,
3315 3315 DVA_GET_VDEV(&dva[d]), zio, flags);
3316 3316 }
3317 3317
3318 3318 }
3319 3319 ASSERT(error == 0);
3320 3320 ASSERT(BP_GET_NDVAS(bp) == ndvas);
3321 3321
3322 3322 spa_config_exit(spa, SCL_ALLOC, FTAG);
3323 3323
3324 3324 BP_SET_BIRTH(bp, txg, txg);
3325 3325
3326 3326 return (0);
3327 3327 }
3328 3328
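/*
 * Illustrative sketch (hypothetical helper, not part of this webrev):
 * the all-or-nothing shape used in metaslab_alloc() above.  If DVA d
 * fails, every DVA allocated before it is undone so the caller never
 * sees a partially filled block pointer.
 */
static int
alloc_all_or_nothing_example(int ndvas, int (*try_alloc)(int),
    void (*undo)(int))
{
        for (int d = 0; d < ndvas; d++) {
                int error = try_alloc(d);
                if (error != 0) {
                        while (--d >= 0)
                                undo(d);
                        return (error);
                }
        }
        return (0);
}
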
3329 3329 void
3330 3330 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
3331 3331 {
3332 3332 const dva_t *dva = bp->blk_dva;
3333 3333 int ndvas = BP_GET_NDVAS(bp);
3334 3334
3335 3335 ASSERT(!BP_IS_HOLE(bp));
3336 3336 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
3337 3337
3338 3338 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
3339 3339
3340 3340 for (int d = 0; d < ndvas; d++)
3341 3341 metaslab_free_dva(spa, &dva[d], txg, now);
3342 3342
3343 3343 spa_config_exit(spa, SCL_FREE, FTAG);
3344 3344 }
3345 3345
3346 3346 int
3347 3347 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
3348 3348 {
3349 3349 const dva_t *dva = bp->blk_dva;
3350 3350 int ndvas = BP_GET_NDVAS(bp);
3351 3351 int error = 0;
3352 3352
3353 3353 ASSERT(!BP_IS_HOLE(bp));
3354 3354
3355 3355 if (txg != 0) {
3356 3356 /*
3357 3357 * First do a dry run to make sure all DVAs are claimable,
3358 3358 * so we don't have to unwind from partial failures below.
3359 3359 */
3360 3360 if ((error = metaslab_claim(spa, bp, 0)) != 0)
3361 3361 return (error);
3362 3362 }
3363 3363
3364 3364 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3365 3365
3366 3366 for (int d = 0; d < ndvas; d++)
3367 3367 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
3368 3368 break;
3369 3369
3370 3370 spa_config_exit(spa, SCL_ALLOC, FTAG);
3371 3371
3372 3372 ASSERT(error == 0 || txg == 0);
3373 3373
3374 3374 return (error);
3375 3375 }
3376 3376
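/*
 * Illustrative sketch (hypothetical caller, not part of this webrev):
 * log-replay style claiming that leans on the dry-run pass above.
 * Because metaslab_claim() verifies every DVA with txg == 0 before
 * doing the real claim, a failure here leaves nothing to unwind.
 */
static int
claim_log_blocks_example(spa_t *spa, const blkptr_t *bps, int nbps)
{
        uint64_t txg = spa_first_txg(spa);

        for (int i = 0; i < nbps; i++) {
                int error = metaslab_claim(spa, &bps[i], txg);
                if (error != 0)
                        return (error);
        }
        return (0);
}
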
3377 3377 void
3378 3378 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
3379 3379 {
3380 3380 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
3381 3381 return;
3382 3382
3383 3383 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3384 3384 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
3385 3385 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
3386 3386 vdev_t *vd = vdev_lookup_top(spa, vdev);
3387 3387 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
3388 3388 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
3389 3389 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3390 3390
3391 3391 if (msp->ms_loaded)
3392 3392 range_tree_verify(msp->ms_tree, offset, size);
3393 3393
3394 3394 range_tree_verify(msp->ms_freeingtree, offset, size);
3395 3395 range_tree_verify(msp->ms_freedtree, offset, size);
3396 3396 for (int j = 0; j < TXG_DEFER_SIZE; j++)
3397 3397 range_tree_verify(msp->ms_defertree[j], offset, size);
3398 3398 }
3399 3399 spa_config_exit(spa, SCL_VDEV, FTAG);
3400 3400 }