dlpx-os-diff Wdiff usr/src/uts/common/fs/zfs/metaslab.c

Print this page

3954 metaslabs continue to load even after hitting zfs_mg_alloc_failure limit
4080 zpool clear fails to clear pool
4081 need zfs_mg_noalloc_threshold
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/metaslab.c
          +++ new/usr/src/uts/common/fs/zfs/metaslab.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2013 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/zfs_context.h>
  28   28  #include <sys/dmu.h>
  29   29  #include <sys/dmu_tx.h>
  30   30  #include <sys/space_map.h>
  31   31  #include <sys/metaslab_impl.h>
  32   32  #include <sys/vdev_impl.h>
  33   33  #include <sys/zio.h>
  34   34  
  35   35  /*
  36   36   * Allow allocations to switch to gang blocks quickly. We do this to
  37   37   * avoid having to load lots of space_maps in a given txg. There are,
  38   38   * however, some cases where we want to avoid "fast" ganging and instead
  39   39   * we want to do an exhaustive search of all metaslabs on this device.
  40   40   * Currently we don't allow any gang, zil, or dump device related allocations
  41   41   * to "fast" gang.
  42   42   */
  43   43  #define CAN_FASTGANG(flags) \
  44   44          (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
  45   45          METASLAB_GANG_AVOID)))
  46   46  
  47   47  uint64_t metaslab_aliquot = 512ULL << 10;
  48   48  uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;     /* force gang blocks */
  49   49  
  50   50  /*

↓ open down ↓

50 lines elided

↑ open up ↑

  51   51   * The in-core space map representation is more compact than its on-disk form.
  52   52   * The zfs_condense_pct determines how much more compact the in-core
  53   53   * space_map representation must be before we compact it on-disk.
  54   54   * Values should be greater than or equal to 100.
  55   55   */
  56   56  int zfs_condense_pct = 200;
  57   57  
  58   58  /*
  59   59   * This value defines the number of allowed allocation failures per vdev.
  60   60   * If a device reaches this threshold in a given txg then we consider skipping
  61      - * allocations on that device.
       61 + * allocations on that device. The value of zfs_mg_alloc_failures is computed
       62 + * in zio_init() unless it has been overridden in /etc/system.
  62   63   */
  63      -int zfs_mg_alloc_failures;
       64 +int zfs_mg_alloc_failures = 0;
  64   65  
  65   66  /*
       67 + * The zfs_mg_noalloc_threshold defines which metaslab groups should
       68 + * be eligible for allocation. The value is defined as a percentage of
       69 + * free space. Metaslab groups that have more free space than
       70 + * zfs_mg_noalloc_threshold are always eligible for allocations. Once
       71 + * a metaslab group's free space is less than or equal to the
       72 + * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
       73 + * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
       74 + * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
       75 + * groups are allowed to accept allocations. Gang blocks are always
       76 + * eligible to allocate on any metaslab group. The default value of 0 means
       77 + * no metaslab group will be excluded based on this criterion.
       78 + */
       79 +int zfs_mg_noalloc_threshold = 0;
       80 +
       81 +/*
  66   82   * Metaslab debugging: when set, keeps all space maps in core to verify frees.
  67   83   */
  68   84  static int metaslab_debug = 0;
  69   85  
  70   86  /*
  71   87   * Minimum size which forces the dynamic allocator to change
  72   88   * its allocation strategy.  Once the space map cannot satisfy
  73   89   * an allocation of this size then it switches to using a more
  74   90   * aggressive strategy (i.e. search by size rather than offset).
  75   91   */

  76   92  uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
  77   93  
  78   94  /*
  79   95   * The minimum free space, in percent, which must be available
  80   96   * in a space map to continue allocations in a first-fit fashion.
  81   97   * Once the space_map's free space drops below this level we dynamically
  82   98   * switch to using best-fit allocations.
  83   99   */
  84  100  int metaslab_df_free_pct = 4;
  85  101  
  86  102  /*
  87  103   * A metaslab is considered "free" if it contains a contiguous
  88  104   * segment which is greater than metaslab_min_alloc_size.
  89  105   */
  90  106  uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
  91  107  
  92  108  /*
  93  109   * Max number of space_maps to prefetch.
  94  110   */
  95  111  int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
  96  112  
  97  113  /*
  98  114   * Percentage bonus multiplier for metaslabs that are in the bonus area.
  99  115   */
 100  116  int metaslab_smo_bonus_pct = 150;
 101  117  
 102  118  /*
 103  119   * Should we be willing to write data to degraded vdevs?
 104  120   */
 105  121  boolean_t zfs_write_to_degraded = B_FALSE;
 106  122  
 107  123  /*
 108  124   * ==========================================================================
 109  125   * Metaslab classes
 110  126   * ==========================================================================
 111  127   */
 112  128  metaslab_class_t *
 113  129  metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
 114  130  {
 115  131          metaslab_class_t *mc;
 116  132  
 117  133          mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
 118  134  
 119  135          mc->mc_spa = spa;
 120  136          mc->mc_rotor = NULL;
 121  137          mc->mc_ops = ops;
 122  138  
 123  139          return (mc);
 124  140  }
 125  141  
 126  142  void
 127  143  metaslab_class_destroy(metaslab_class_t *mc)
 128  144  {
 129  145          ASSERT(mc->mc_rotor == NULL);
 130  146          ASSERT(mc->mc_alloc == 0);
 131  147          ASSERT(mc->mc_deferred == 0);
 132  148          ASSERT(mc->mc_space == 0);
 133  149          ASSERT(mc->mc_dspace == 0);
 134  150  
 135  151          kmem_free(mc, sizeof (metaslab_class_t));
 136  152  }
 137  153  
 138  154  int
 139  155  metaslab_class_validate(metaslab_class_t *mc)
 140  156  {
 141  157          metaslab_group_t *mg;
 142  158          vdev_t *vd;
 143  159  
 144  160          /*
 145  161           * Must hold one of the spa_config locks.
 146  162           */
 147  163          ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 148  164              spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 149  165  
 150  166          if ((mg = mc->mc_rotor) == NULL)
 151  167                  return (0);
 152  168  
 153  169          do {
 154  170                  vd = mg->mg_vd;
 155  171                  ASSERT(vd->vdev_mg != NULL);
 156  172                  ASSERT3P(vd->vdev_top, ==, vd);
 157  173                  ASSERT3P(mg->mg_class, ==, mc);
 158  174                  ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 159  175          } while ((mg = mg->mg_next) != mc->mc_rotor);
 160  176  
 161  177          return (0);
 162  178  }
 163  179  
 164  180  void
 165  181  metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
 166  182      int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 167  183  {
 168  184          atomic_add_64(&mc->mc_alloc, alloc_delta);
 169  185          atomic_add_64(&mc->mc_deferred, defer_delta);
 170  186          atomic_add_64(&mc->mc_space, space_delta);
 171  187          atomic_add_64(&mc->mc_dspace, dspace_delta);
 172  188  }
 173  189  
 174  190  uint64_t
 175  191  metaslab_class_get_alloc(metaslab_class_t *mc)
 176  192  {
 177  193          return (mc->mc_alloc);
 178  194  }
 179  195  
 180  196  uint64_t
 181  197  metaslab_class_get_deferred(metaslab_class_t *mc)
 182  198  {
 183  199          return (mc->mc_deferred);
 184  200  }
 185  201  
 186  202  uint64_t
 187  203  metaslab_class_get_space(metaslab_class_t *mc)
 188  204  {
 189  205          return (mc->mc_space);
 190  206  }
 191  207  
 192  208  uint64_t
 193  209  metaslab_class_get_dspace(metaslab_class_t *mc)
 194  210  {
 195  211          return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 196  212  }
 197  213  
 198  214  /*
 199  215   * ==========================================================================
 200  216   * Metaslab groups
 201  217   * ==========================================================================
 202  218   */
 203  219  static int
 204  220  metaslab_compare(const void *x1, const void *x2)
 205  221  {
 206  222          const metaslab_t *m1 = x1;
 207  223          const metaslab_t *m2 = x2;
 208  224  
 209  225          if (m1->ms_weight < m2->ms_weight)
 210  226                  return (1);
 211  227          if (m1->ms_weight > m2->ms_weight)
 212  228                  return (-1);
 213  229  
 214  230          /*
 215  231           * If the weights are identical, use the offset to force uniqueness.
 216  232           */

↓ open down ↓

141 lines elided

↑ open up ↑

 217  233          if (m1->ms_map->sm_start < m2->ms_map->sm_start)
 218  234                  return (-1);
 219  235          if (m1->ms_map->sm_start > m2->ms_map->sm_start)
 220  236                  return (1);
 221  237  
 222  238          ASSERT3P(m1, ==, m2);
 223  239  
 224  240          return (0);
 225  241  }
 226  242  
      243 +/*
      244 + * Update the allocatable flag and the metaslab group's free capacity.
      245 + * The allocatable flag is set to true if the free capacity is above
      246 + * the zfs_mg_noalloc_threshold. If a metaslab group transitions
      247 + * from allocatable to non-allocatable or vice versa then the metaslab
      248 + * group's class is updated to reflect the transition.
      249 + */
      250 +static void
      251 +metaslab_group_alloc_update(metaslab_group_t *mg)
      252 +{
      253 +        vdev_t *vd = mg->mg_vd;
      254 +        metaslab_class_t *mc = mg->mg_class;
      255 +        vdev_stat_t *vs = &vd->vdev_stat;
      256 +        boolean_t was_allocatable;
      257 +
      258 +        ASSERT(vd == vd->vdev_top);
      259 +
      260 +        mutex_enter(&mg->mg_lock);
      261 +        was_allocatable = mg->mg_allocatable;
      262 +
      263 +        mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
      264 +            (vs->vs_space + 1);
      265 +
      266 +        mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
      267 +
      268 +        /*
      269 +         * The mc_alloc_groups maintains a count of the number of
      270 +         * groups in this metaslab class that are still above the
      271 +         * zfs_mg_noalloc_threshold. This is used by the allocating
      272 +         * threads to determine if they should avoid allocations to
      273 +         * a given group. The allocator will avoid allocations to a group
      274 +         * if that group has reached or is below the zfs_mg_noalloc_threshold
      275 +         * and there are still other groups that are above the threshold.
      276 +         * When a group transitions from allocatable to non-allocatable or
      277 +         * vice versa we update the metaslab class to reflect that change.
      278 +         * When the mc_alloc_groups value drops to 0 that means that all
      279 +         * groups have reached the zfs_mg_noalloc_threshold making all groups
      280 +         * eligible for allocations. This effectively means that all devices
      281 +         * are balanced again.
      282 +         */
      283 +        if (was_allocatable && !mg->mg_allocatable)
      284 +                mc->mc_alloc_groups--;
      285 +        else if (!was_allocatable && mg->mg_allocatable)
      286 +                mc->mc_alloc_groups++;
      287 +        mutex_exit(&mg->mg_lock);
      288 +}
      289 +
 227  290  metaslab_group_t *
 228  291  metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 229  292  {
 230  293          metaslab_group_t *mg;
 231  294  
 232  295          mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 233  296          mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 234  297          avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 235  298              sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
 236  299          mg->mg_vd = vd;

 237  300          mg->mg_class = mc;
 238  301          mg->mg_activation_count = 0;
 239  302  
 240  303          return (mg);
 241  304  }
 242  305  
 243  306  void
 244  307  metaslab_group_destroy(metaslab_group_t *mg)
 245  308  {
 246  309          ASSERT(mg->mg_prev == NULL);
 247  310          ASSERT(mg->mg_next == NULL);
 248  311          /*
 249  312           * We may have gone below zero with the activation count
 250  313           * either because we never activated in the first place or
 251  314           * because we're done, and possibly removing the vdev.
 252  315           */
 253  316          ASSERT(mg->mg_activation_count <= 0);
 254  317  
 255  318          avl_destroy(&mg->mg_metaslab_tree);
 256  319          mutex_destroy(&mg->mg_lock);
 257  320          kmem_free(mg, sizeof (metaslab_group_t));
 258  321  }
 259  322  
 260  323  void
 261  324  metaslab_group_activate(metaslab_group_t *mg)
 262  325  {
 263  326          metaslab_class_t *mc = mg->mg_class;
 264  327          metaslab_group_t *mgprev, *mgnext;
 265  328  
 266  329          ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

↓ open down ↓

30 lines elided

↑ open up ↑

 267  330  
 268  331          ASSERT(mc->mc_rotor != mg);
 269  332          ASSERT(mg->mg_prev == NULL);
 270  333          ASSERT(mg->mg_next == NULL);
 271  334          ASSERT(mg->mg_activation_count <= 0);
 272  335  
 273  336          if (++mg->mg_activation_count <= 0)
 274  337                  return;
 275  338  
 276  339          mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
      340 +        metaslab_group_alloc_update(mg);
 277  341  
 278  342          if ((mgprev = mc->mc_rotor) == NULL) {
 279  343                  mg->mg_prev = mg;
 280  344                  mg->mg_next = mg;
 281  345          } else {
 282  346                  mgnext = mgprev->mg_next;
 283  347                  mg->mg_prev = mgprev;
 284  348                  mg->mg_next = mgnext;
 285  349                  mgprev->mg_next = mg;
 286  350                  mgnext->mg_prev = mg;

 287  351          }
 288  352          mc->mc_rotor = mg;
 289  353  }
 290  354  
 291  355  void
 292  356  metaslab_group_passivate(metaslab_group_t *mg)
 293  357  {
 294  358          metaslab_class_t *mc = mg->mg_class;
 295  359          metaslab_group_t *mgprev, *mgnext;
 296  360  
 297  361          ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
 298  362  
 299  363          if (--mg->mg_activation_count != 0) {
 300  364                  ASSERT(mc->mc_rotor != mg);
 301  365                  ASSERT(mg->mg_prev == NULL);
 302  366                  ASSERT(mg->mg_next == NULL);
 303  367                  ASSERT(mg->mg_activation_count < 0);
 304  368                  return;
 305  369          }
 306  370  
 307  371          mgprev = mg->mg_prev;
 308  372          mgnext = mg->mg_next;
 309  373  
 310  374          if (mg == mgnext) {
 311  375                  mc->mc_rotor = NULL;
 312  376          } else {
 313  377                  mc->mc_rotor = mgnext;
 314  378                  mgprev->mg_next = mgnext;
 315  379                  mgnext->mg_prev = mgprev;
 316  380          }
 317  381  
 318  382          mg->mg_prev = NULL;
 319  383          mg->mg_next = NULL;
 320  384  }
 321  385  
 322  386  static void
 323  387  metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 324  388  {
 325  389          mutex_enter(&mg->mg_lock);
 326  390          ASSERT(msp->ms_group == NULL);
 327  391          msp->ms_group = mg;
 328  392          msp->ms_weight = 0;
 329  393          avl_add(&mg->mg_metaslab_tree, msp);
 330  394          mutex_exit(&mg->mg_lock);
 331  395  }
 332  396  
 333  397  static void
 334  398  metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 335  399  {
 336  400          mutex_enter(&mg->mg_lock);
 337  401          ASSERT(msp->ms_group == mg);
 338  402          avl_remove(&mg->mg_metaslab_tree, msp);
 339  403          msp->ms_group = NULL;
 340  404          mutex_exit(&mg->mg_lock);
 341  405  }
 342  406  
 343  407  static void
 344  408  metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 345  409  {
 346  410          /*
 347  411           * Although in principle the weight can be any value, in
 348  412           * practice we do not use values in the range [1, 510].
 349  413           */
 350  414          ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
 351  415          ASSERT(MUTEX_HELD(&msp->ms_lock));

↓ open down ↓

65 lines elided

↑ open up ↑

 352  416  
 353  417          mutex_enter(&mg->mg_lock);
 354  418          ASSERT(msp->ms_group == mg);
 355  419          avl_remove(&mg->mg_metaslab_tree, msp);
 356  420          msp->ms_weight = weight;
 357  421          avl_add(&mg->mg_metaslab_tree, msp);
 358  422          mutex_exit(&mg->mg_lock);
 359  423  }
 360  424  
 361  425  /*
      426 + * Determine if a given metaslab group should skip allocations. A metaslab
      427 + * group should avoid allocations if its free capacity is at or below the
      428 + * zfs_mg_noalloc_threshold and there is at least one metaslab group
      429 + * that can still handle allocations.
      430 + */
      431 +static boolean_t
      432 +metaslab_group_allocatable(metaslab_group_t *mg)
      433 +{
      434 +        vdev_t *vd = mg->mg_vd;
      435 +        spa_t *spa = vd->vdev_spa;
      436 +        metaslab_class_t *mc = mg->mg_class;
      437 +
      438 +        /*
      439 +         * A metaslab group is considered allocatable if its free capacity
      440 +         * is greater than the set value of zfs_mg_noalloc_threshold, it's
      441 +         * associated with a slog, or there are no other metaslab groups
      442 +         * with free capacity greater than zfs_mg_noalloc_threshold.
      443 +         */
      444 +        return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
      445 +            mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
      446 +}
      447 +
      448 +/*
 362  449   * ==========================================================================
 363  450   * Common allocator routines
 364  451   * ==========================================================================
 365  452   */
 366  453  static int
 367  454  metaslab_segsize_compare(const void *x1, const void *x2)
 368  455  {
 369  456          const space_seg_t *s1 = x1;
 370  457          const space_seg_t *s2 = x2;
 371  458          uint64_t ss_size1 = s1->ss_end - s1->ss_start;

 372  459          uint64_t ss_size2 = s2->ss_end - s2->ss_start;
 373  460  
 374  461          if (ss_size1 < ss_size2)
 375  462                  return (-1);
 376  463          if (ss_size1 > ss_size2)
 377  464                  return (1);
 378  465  
 379  466          if (s1->ss_start < s2->ss_start)
 380  467                  return (-1);
 381  468          if (s1->ss_start > s2->ss_start)
 382  469                  return (1);
 383  470  
 384  471          return (0);
 385  472  }
 386  473  
 387  474  /*
 388  475   * This is a helper function that can be used by the allocator to find
 389  476   * a suitable block to allocate. This will search the specified AVL
 390  477   * tree looking for a block that matches the specified criteria.
 391  478   */
 392  479  static uint64_t
 393  480  metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
 394  481      uint64_t align)
 395  482  {
 396  483          space_seg_t *ss, ssearch;
 397  484          avl_index_t where;
 398  485  
 399  486          ssearch.ss_start = *cursor;
 400  487          ssearch.ss_end = *cursor + size;
 401  488  
 402  489          ss = avl_find(t, &ssearch, &where);
 403  490          if (ss == NULL)
 404  491                  ss = avl_nearest(t, where, AVL_AFTER);
 405  492  
 406  493          while (ss != NULL) {
 407  494                  uint64_t offset = P2ROUNDUP(ss->ss_start, align);
 408  495  
 409  496                  if (offset + size <= ss->ss_end) {
 410  497                          *cursor = offset + size;
 411  498                          return (offset);
 412  499                  }
 413  500                  ss = AVL_NEXT(t, ss);
 414  501          }
 415  502  
 416  503          /*
 417  504           * If we know we've searched the whole map (*cursor == 0), give up.
 418  505           * Otherwise, reset the cursor to the beginning and try again.
 419  506           */
 420  507          if (*cursor == 0)
 421  508                  return (-1ULL);
 422  509  
 423  510          *cursor = 0;
 424  511          return (metaslab_block_picker(t, cursor, size, align));
 425  512  }
 426  513  
 427  514  static void
 428  515  metaslab_pp_load(space_map_t *sm)
 429  516  {
 430  517          space_seg_t *ss;
 431  518  
 432  519          ASSERT(sm->sm_ppd == NULL);
 433  520          sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
 434  521  
 435  522          sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
 436  523          avl_create(sm->sm_pp_root, metaslab_segsize_compare,
 437  524              sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
 438  525  
 439  526          for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
 440  527                  avl_add(sm->sm_pp_root, ss);
 441  528  }
 442  529  
 443  530  static void
 444  531  metaslab_pp_unload(space_map_t *sm)
 445  532  {
 446  533          void *cookie = NULL;
 447  534  
 448  535          kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
 449  536          sm->sm_ppd = NULL;
 450  537  
 451  538          while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
 452  539                  /* tear down the tree */
 453  540          }
 454  541  
 455  542          avl_destroy(sm->sm_pp_root);
 456  543          kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
 457  544          sm->sm_pp_root = NULL;
 458  545  }
 459  546  
 460  547  /* ARGSUSED */
 461  548  static void
 462  549  metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
 463  550  {
 464  551          /* No need to update cursor */
 465  552  }
 466  553  
 467  554  /* ARGSUSED */
 468  555  static void
 469  556  metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
 470  557  {
 471  558          /* No need to update cursor */
 472  559  }
 473  560  
 474  561  /*
 475  562   * Return the maximum contiguous segment within the metaslab.
 476  563   */
 477  564  uint64_t
 478  565  metaslab_pp_maxsize(space_map_t *sm)
 479  566  {
 480  567          avl_tree_t *t = sm->sm_pp_root;
 481  568          space_seg_t *ss;
 482  569  
 483  570          if (t == NULL || (ss = avl_last(t)) == NULL)
 484  571                  return (0ULL);
 485  572  
 486  573          return (ss->ss_end - ss->ss_start);
 487  574  }
 488  575  
 489  576  /*
 490  577   * ==========================================================================
 491  578   * The first-fit block allocator
 492  579   * ==========================================================================
 493  580   */
 494  581  static uint64_t
 495  582  metaslab_ff_alloc(space_map_t *sm, uint64_t size)
 496  583  {
 497  584          avl_tree_t *t = &sm->sm_root;
 498  585          uint64_t align = size & -size;
 499  586          uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
 500  587  
 501  588          return (metaslab_block_picker(t, cursor, size, align));
 502  589  }
 503  590  
 504  591  /* ARGSUSED */
 505  592  boolean_t
 506  593  metaslab_ff_fragmented(space_map_t *sm)
 507  594  {
 508  595          return (B_TRUE);
 509  596  }
 510  597  
 511  598  static space_map_ops_t metaslab_ff_ops = {
 512  599          metaslab_pp_load,
 513  600          metaslab_pp_unload,
 514  601          metaslab_ff_alloc,
 515  602          metaslab_pp_claim,
 516  603          metaslab_pp_free,
 517  604          metaslab_pp_maxsize,
 518  605          metaslab_ff_fragmented
 519  606  };
 520  607  
 521  608  /*
 522  609   * ==========================================================================
 523  610   * Dynamic block allocator -
 524  611   * Uses the first fit allocation scheme until space gets low and then
 525  612   * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 526  613   * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 527  614   * ==========================================================================
 528  615   */
 529  616  static uint64_t
 530  617  metaslab_df_alloc(space_map_t *sm, uint64_t size)
 531  618  {
 532  619          avl_tree_t *t = &sm->sm_root;
 533  620          uint64_t align = size & -size;
 534  621          uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
 535  622          uint64_t max_size = metaslab_pp_maxsize(sm);
 536  623          int free_pct = sm->sm_space * 100 / sm->sm_size;
 537  624  
 538  625          ASSERT(MUTEX_HELD(sm->sm_lock));
 539  626          ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
 540  627  
 541  628          if (max_size < size)
 542  629                  return (-1ULL);
 543  630  
 544  631          /*
 545  632           * If we're running low on space switch to using the size
 546  633           * sorted AVL tree (best-fit).
 547  634           */
 548  635          if (max_size < metaslab_df_alloc_threshold ||
 549  636              free_pct < metaslab_df_free_pct) {
 550  637                  t = sm->sm_pp_root;
 551  638                  *cursor = 0;
 552  639          }
 553  640  
 554  641          return (metaslab_block_picker(t, cursor, size, 1ULL));
 555  642  }
 556  643  
 557  644  static boolean_t
 558  645  metaslab_df_fragmented(space_map_t *sm)
 559  646  {
 560  647          uint64_t max_size = metaslab_pp_maxsize(sm);
 561  648          int free_pct = sm->sm_space * 100 / sm->sm_size;
 562  649  
 563  650          if (max_size >= metaslab_df_alloc_threshold &&
 564  651              free_pct >= metaslab_df_free_pct)
 565  652                  return (B_FALSE);
 566  653  
 567  654          return (B_TRUE);
 568  655  }
 569  656  
 570  657  static space_map_ops_t metaslab_df_ops = {
 571  658          metaslab_pp_load,
 572  659          metaslab_pp_unload,
 573  660          metaslab_df_alloc,
 574  661          metaslab_pp_claim,
 575  662          metaslab_pp_free,
 576  663          metaslab_pp_maxsize,
 577  664          metaslab_df_fragmented
 578  665  };
 579  666  
 580  667  /*
 581  668   * ==========================================================================
 582  669   * Other experimental allocators
 583  670   * ==========================================================================
 584  671   */
 585  672  static uint64_t
 586  673  metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
 587  674  {
 588  675          avl_tree_t *t = &sm->sm_root;
 589  676          uint64_t *cursor = (uint64_t *)sm->sm_ppd;
 590  677          uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
 591  678          uint64_t max_size = metaslab_pp_maxsize(sm);
 592  679          uint64_t rsize = size;
 593  680          uint64_t offset = 0;
 594  681  
 595  682          ASSERT(MUTEX_HELD(sm->sm_lock));
 596  683          ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
 597  684  
 598  685          if (max_size < size)
 599  686                  return (-1ULL);
 600  687  
 601  688          ASSERT3U(*extent_end, >=, *cursor);
 602  689  
 603  690          /*
 604  691           * If the request does not fit in what remains of the current
 605  692           * extent, pick a new extent from the size sorted AVL tree.
 606  693           */
 607  694          if ((*cursor + size) > *extent_end) {
 608  695  
 609  696                  t = sm->sm_pp_root;
 610  697                  *cursor = *extent_end = 0;
 611  698  
 612  699                  if (max_size > 2 * SPA_MAXBLOCKSIZE)
 613  700                          rsize = MIN(metaslab_min_alloc_size, max_size);
 614  701                  offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
 615  702                  if (offset != -1)
 616  703                          *cursor = offset + size;
 617  704          } else {
 618  705                  offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
 619  706          }
 620  707          ASSERT3U(*cursor, <=, *extent_end);
 621  708          return (offset);
 622  709  }
 623  710  
 624  711  static boolean_t
 625  712  metaslab_cdf_fragmented(space_map_t *sm)
 626  713  {
 627  714          uint64_t max_size = metaslab_pp_maxsize(sm);
 628  715  
 629  716          if (max_size > (metaslab_min_alloc_size * 10))
 630  717                  return (B_FALSE);
 631  718          return (B_TRUE);
 632  719  }
 633  720  
 634  721  static space_map_ops_t metaslab_cdf_ops = {
 635  722          metaslab_pp_load,
 636  723          metaslab_pp_unload,
 637  724          metaslab_cdf_alloc,
 638  725          metaslab_pp_claim,
 639  726          metaslab_pp_free,
 640  727          metaslab_pp_maxsize,
 641  728          metaslab_cdf_fragmented
 642  729  };
 643  730  
 644  731  uint64_t metaslab_ndf_clump_shift = 4;
 645  732  
           /*
            * ndf allocation callback. Looks for a segment that can hold 'size'
            * bytes and returns its starting offset, or -1ULL if none is found.
            *
            * A separate cursor is kept for each value of highbit(size); the
            * cursors live in the uint64_t array that sm->sm_ppd points to. The
            * search first probes sm_root at the cursor, and if that does not
            * yield a large enough segment it falls back to sm_pp_root. On success
            * the cursor is advanced to just past the returned range.
            *
            * The caller must hold sm->sm_lock.
            */
 646  733  static uint64_t
 647  734  metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
 648  735  {
 649  736          avl_tree_t *t = &sm->sm_root;
 650  737          avl_index_t where;
 651  738          space_seg_t *ss, ssearch;
 652  739          uint64_t hbit = highbit(size);
 653  740          uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
 654  741          uint64_t max_size = metaslab_pp_maxsize(sm);
 655  742  
 656  743          ASSERT(MUTEX_HELD(sm->sm_lock));
 657  744          ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
 658  745  
                   /* Nothing in this map is big enough; fail without searching. */
 659  746          if (max_size < size)
 660  747                  return (-1ULL);
 661  748  
                   /* First try: a segment matching [cursor, cursor + size). */
 662  749          ssearch.ss_start = *cursor;
 663  750          ssearch.ss_end = *cursor + size;
 664  751  
 665  752          ss = avl_find(t, &ssearch, &where);
 666  753          if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
                           /*
                            * Fall back to sm_pp_root, searching with a segment
                            * whose length is the smaller of max_size and
                            * 2^(hbit + metaslab_ndf_clump_shift). If there is no
                            * exact match, take the next node after the insertion
                            * point.
                            */
 667  754                  t = sm->sm_pp_root;
 668  755  
 669  756                  ssearch.ss_start = 0;
 670  757                  ssearch.ss_end = MIN(max_size,
 671  758                      1ULL << (hbit + metaslab_ndf_clump_shift));
 672  759                  ss = avl_find(t, &ssearch, &where);
 673  760                  if (ss == NULL)
 674  761                          ss = avl_nearest(t, where, AVL_AFTER);
 675  762                  ASSERT(ss != NULL);
 676  763          }
 677  764  
                   /*
                    * Re-check the chosen segment's length; the NULL test also
                    * covers non-debug builds where the ASSERT above is compiled out.
                    */
 678  765          if (ss != NULL) {
 679  766                  if (ss->ss_start + size <= ss->ss_end) {
 680  767                          *cursor = ss->ss_start + size;
 681  768                          return (ss->ss_start);
 682  769                  }
 683  770          }
 684  771          return (-1ULL);
 685  772  }
 686  773  
           /*
            * Fragmentation test for the ndf allocator: returns B_FALSE when the
            * value returned by metaslab_pp_maxsize() exceeds
            * metaslab_min_alloc_size << metaslab_ndf_clump_shift, and B_TRUE
            * otherwise.
            */
 687  774  static boolean_t
 688  775  metaslab_ndf_fragmented(space_map_t *sm)
 689  776  {
 690  777          uint64_t max_size = metaslab_pp_maxsize(sm);
 691  778  
 692  779          if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
 693  780                  return (B_FALSE);
 694  781          return (B_TRUE);
 695  782  }
 696  783  
 697  784  
           /*
            * Space map operations table for the ndf allocator. Every entry is the
            * shared metaslab_pp_* routine except the allocation and fragmentation
            * callbacks, which are ndf-specific.
            * NOTE(review): slot order must match the space_map_ops_t declaration,
            * which is not visible here.
            */
 698  785  static space_map_ops_t metaslab_ndf_ops = {
 699  786          metaslab_pp_load,
 700  787          metaslab_pp_unload,
 701  788          metaslab_ndf_alloc,
 702  789          metaslab_pp_claim,
 703  790          metaslab_pp_free,
 704  791          metaslab_pp_maxsize,
 705  792          metaslab_ndf_fragmented
 706  793  };
 707  794  
           /*
            * Globally visible pointer to a space map operations table; initialized
            * to the df allocator's table.
            */
 708  795  space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 709  796  
 710  797  /*
 711  798   * ==========================================================================
 712  799   * Metaslabs
 713  800   * ==========================================================================
 714  801   */
           /*
            * Allocate and initialize an in-core metaslab for group 'mg'.
            *
            * mg    - metaslab group the new metaslab is added to
            * smo   - space map object description; copied into ms_smo_syncing
            * start - start passed to space_map_create() for the main map
            * size  - size passed to space_map_create() for the main map
            * txg   - 0 when opening an existing pool, TXG_INITIAL when creating
            *         one, otherwise the txg in which the space is being added
            *
            * Returns the new metaslab (allocations use KM_SLEEP).
            */
 715  802  metaslab_t *
 716  803  metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
 717  804          uint64_t start, uint64_t size, uint64_t txg)
 718  805  {
 719  806          vdev_t *vd = mg->mg_vd;
 720  807          metaslab_t *msp;
 721  808  
 722  809          msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
 723  810          mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
 724  811  
 725  812          msp->ms_smo_syncing = *smo;
 726  813  
 727  814          /*
 728  815           * We create the main space map here, but we don't create the
 729  816           * allocmaps and freemaps until metaslab_sync_done().  This serves
 730  817           * two purposes: it allows metaslab_sync_done() to detect the
 731  818           * addition of new space; and for debugging, it ensures that we'd
 732  819           * data fault on any attempt to use this metaslab before it's ready.
 733  820           */
 734  821          msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
 735  822          space_map_create(msp->ms_map, start, size,
 736  823              vd->vdev_ashift, &msp->ms_lock);
 737  824  
 738  825          metaslab_group_add(mg, msp);
 739  826  
                   /*
                    * With metaslab_debug set, load the space map right away if an
                    * on-disk object already exists; a load failure is fatal (VERIFY).
                    */
 740  827          if (metaslab_debug && smo->smo_object != 0) {
 741  828                  mutex_enter(&msp->ms_lock);
 742  829                  VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops,
 743  830                      SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
 744  831                  mutex_exit(&msp->ms_lock);
 745  832          }
 746  833  
 747  834          /*
 748  835           * If we're opening an existing pool (txg == 0) or creating
 749  836           * a new one (txg == TXG_INITIAL), all space is available now.
 750  837           * If we're adding space to an existing pool, the new space
 751  838           * does not become available until after this txg has synced.
 752  839           */
 753  840          if (txg <= TXG_INITIAL)
 754  841                  metaslab_sync_done(msp, 0);
 755  842  
                   /* For any non-zero txg, dirty the vdev and this metaslab in it. */
 756  843          if (txg != 0) {
 757  844                  vdev_dirty(vd, 0, NULL, txg);
 758  845                  vdev_dirty(vd, VDD_METASLAB, msp, txg);
 759  846          }
 760  847  
 761  848          return (msp);
 762  849  }
 763  850  
           /*
            * Tear down a metaslab: subtract its allocated space and map size from
            * the vdev's space accounting, remove it from its group, unload and
            * destroy the main space map, destroy the per-txg alloc/free maps and
            * the defer maps, and finally free the metaslab itself.
            *
            * NOTE(review): the alloc, free and defer maps are only allocated in
            * metaslab_sync_done(), yet they are passed to space_map_destroy()
            * unconditionally here -- confirm this is never reached for a metaslab
            * that has not been through metaslab_sync_done().
            */
 764  851  void
 765  852  metaslab_fini(metaslab_t *msp)
 766  853  {
 767  854          metaslab_group_t *mg = msp->ms_group;
 768  855  
 769  856          vdev_space_update(mg->mg_vd,
 770  857              -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size);
 771  858  
 772  859          metaslab_group_remove(mg, msp);
 773  860  
 774  861          mutex_enter(&msp->ms_lock);
 775  862  
 776  863          space_map_unload(msp->ms_map);
 777  864          space_map_destroy(msp->ms_map);
 778  865          kmem_free(msp->ms_map, sizeof (*msp->ms_map));
 779  866  
 780  867          for (int t = 0; t < TXG_SIZE; t++) {
 781  868                  space_map_destroy(msp->ms_allocmap[t]);
 782  869                  space_map_destroy(msp->ms_freemap[t]);
 783  870                  kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t]));
 784  871                  kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t]));
 785  872          }
 786  873  
 787  874          for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 788  875                  space_map_destroy(msp->ms_defermap[t]);
 789  876                  kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t]));
 790  877          }
 791  878  
 792  879          ASSERT0(msp->ms_deferspace);
 793  880  
 794  881          mutex_exit(&msp->ms_lock);
 795  882          mutex_destroy(&msp->ms_lock);
 796  883  
 797  884          kmem_free(msp, sizeof (metaslab_t));
 798  885  }
 799  886  
           /*
            * The two highest bits of a metaslab's 64-bit weight are flags marking
            * it as activated (primary or secondary); METASLAB_ACTIVE_MASK selects
            * both of them.
            */
 800  887  #define METASLAB_WEIGHT_PRIMARY         (1ULL << 63)
 801  888  #define METASLAB_WEIGHT_SECONDARY       (1ULL << 62)
 802  889  #define METASLAB_ACTIVE_MASK            \
 803  890          (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
 804  891  
 805  892  static uint64_t
 806  893  metaslab_weight(metaslab_t *msp)
 807  894  {
 808  895          metaslab_group_t *mg = msp->ms_group;
 809  896          space_map_t *sm = msp->ms_map;
 810  897          space_map_obj_t *smo = &msp->ms_smo;
 811  898          vdev_t *vd = mg->mg_vd;
 812  899          uint64_t weight, space;
 813  900  
 814  901          ASSERT(MUTEX_HELD(&msp->ms_lock));
 815  902  
 816  903          /*
 817  904           * This vdev is in the process of being removed so there is nothing
 818  905           * for us to do here.
 819  906           */
 820  907          if (vd->vdev_removing) {
 821  908                  ASSERT0(smo->smo_alloc);
 822  909                  ASSERT0(vd->vdev_ms_shift);
 823  910                  return (0);
 824  911          }
 825  912  
 826  913          /*
 827  914           * The baseline weight is the metaslab's free space.
 828  915           */
 829  916          space = sm->sm_size - smo->smo_alloc;
 830  917          weight = space;
 831  918  
 832  919          /*
 833  920           * Modern disks have uniform bit density and constant angular velocity.
 834  921           * Therefore, the outer recording zones are faster (higher bandwidth)
 835  922           * than the inner zones by the ratio of outer to inner track diameter,
 836  923           * which is typically around 2:1.  We account for this by assigning
 837  924           * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
 838  925           * In effect, this means that we'll select the metaslab with the most
 839  926           * free bandwidth rather than simply the one with the most free space.
 840  927           */
 841  928          weight = 2 * weight -
 842  929              ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
 843  930          ASSERT(weight >= space && weight <= 2 * space);
 844  931  
 845  932          /*
 846  933           * For locality, assign higher weight to metaslabs which have
 847  934           * a lower offset than what we've already activated.
 848  935           */
 849  936          if (sm->sm_start <= mg->mg_bonus_area)
 850  937                  weight *= (metaslab_smo_bonus_pct / 100);
 851  938          ASSERT(weight >= space &&
 852  939              weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
 853  940  
 854  941          if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
 855  942                  /*
 856  943                   * If this metaslab is one we're actively using, adjust its
 857  944                   * weight to make it preferable to any inactive metaslab so
 858  945                   * we'll polish it off.
 859  946                   */
 860  947                  weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
 861  948          }
 862  949          return (weight);
 863  950  }
 864  951  
           /*
            * Issue DMU prefetches for the space map objects of the first
            * metaslab_prefetch_limit metaslabs in the group's metaslab tree,
            * skipping any that are already loaded or have no on-disk object.
            */
 865  952  static void
 866  953  metaslab_prefetch(metaslab_group_t *mg)
 867  954  {
 868  955          spa_t *spa = mg->mg_vd->vdev_spa;
 869  956          metaslab_t *msp;
 870  957          avl_tree_t *t = &mg->mg_metaslab_tree;
 871  958          int m;
 872  959  
 873  960          mutex_enter(&mg->mg_lock);
 874  961  
 875  962          /*
 876  963           * Prefetch the next potential metaslabs
 877  964           */
 878  965          for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
 879  966                  space_map_t *sm = msp->ms_map;
 880  967                  space_map_obj_t *smo = &msp->ms_smo;
 881  968  
 882  969                  /* If we have reached our prefetch limit then we're done */
 883  970                  if (m >= metaslab_prefetch_limit)
 884  971                          break;
 885  972  
                           /*
                            * mg_lock is dropped around the call into the DMU.
                            * NOTE(review): the tree can presumably be re-sorted
                            * while the lock is dropped -- confirm that continuing
                            * the walk from msp afterwards is acceptable.
                            */
 886  973                  if (!sm->sm_loaded && smo->smo_object != 0) {
 887  974                          mutex_exit(&mg->mg_lock);
 888  975                          dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
 889  976                              0ULL, smo->smo_objsize);
 890  977                          mutex_enter(&mg->mg_lock);
 891  978                  }
 892  979          }
 893  980          mutex_exit(&mg->mg_lock);
 894  981  }
 895  982  
           /*
            * Mark a metaslab as active for allocation.
            *
            * If no active bit is set in ms_weight yet, wait for any load already
            * in progress, load the space map if it is still not loaded, advance
            * the group's bonus area if needed, and re-sort the metaslab with
            * 'activation_weight' or'ed into its weight.
            *
            * Returns 0 on success. If space_map_load() fails, the metaslab is
            * re-sorted with weight 0 and the load error is returned. The caller
            * must hold msp->ms_lock.
            */
 896  983  static int
 897  984  metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 898  985  {
 899  986          metaslab_group_t *mg = msp->ms_group;
 900  987          space_map_t *sm = msp->ms_map;
 901  988          space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
 902  989  
 903  990          ASSERT(MUTEX_HELD(&msp->ms_lock));
 904  991  
 905  992          if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
 906  993                  space_map_load_wait(sm);
 907  994                  if (!sm->sm_loaded) {
 908  995                          space_map_obj_t *smo = &msp->ms_smo;
 909  996  
 910  997                          int error = space_map_load(sm, sm_ops, SM_FREE, smo,
 911  998                              spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
 912  999                          if (error)  {
 913 1000                                  metaslab_group_sort(msp->ms_group, msp, 0);
 914 1001                                  return (error);
 915 1002                          }
                                   /*
                                    * Walk every defer map, applying space_map_claim
                                    * to the freshly loaded map for each segment.
                                    */
 916 1003                          for (int t = 0; t < TXG_DEFER_SIZE; t++)
 917 1004                                  space_map_walk(msp->ms_defermap[t],
 918 1005                                      space_map_claim, sm);
 919 1006  
 920 1007                  }
 921 1008  
 922 1009                  /*
 923 1010                   * Track the bonus area as we activate new metaslabs.
 924 1011                   */
 925 1012                  if (sm->sm_start > mg->mg_bonus_area) {
 926 1013                          mutex_enter(&mg->mg_lock);
 927 1014                          mg->mg_bonus_area = sm->sm_start;
 928 1015                          mutex_exit(&mg->mg_lock);
 929 1016                  }
 930 1017  
 931 1018                  metaslab_group_sort(msp->ms_group, msp,
 932 1019                      msp->ms_weight | activation_weight);
 933 1020          }
 934 1021          ASSERT(sm->sm_loaded);
 935 1022          ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 936 1023  
 937 1024          return (0);
 938 1025  }
 939 1026  
           /*
            * Deactivate a metaslab by re-sorting it with a weight of
            * MIN(current weight, size). The trailing ASSERT expects that the
            * resulting weight has no active bits set.
            */
 940 1027  static void
 941 1028  metaslab_passivate(metaslab_t *msp, uint64_t size)
 942 1029  {
 943 1030          /*
 944 1031           * If size < SPA_MINBLOCKSIZE, then we will not allocate from
 945 1032           * this metaslab again.  In that case, it had better be empty,
 946 1033           * or we would be leaving space on the table.
 947 1034           */
 948 1035          ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0);
 949 1036          metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
 950 1037          ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
 951 1038  }
 952 1039  
 953 1040  /*
 954 1041   * Determine if the in-core space map representation can be condensed on-disk.
 955 1042   * We would like to use the following criteria to make our decision:
 956 1043   *
 957 1044   * 1. The size of the space map object should not dramatically increase as a
 958 1045   * result of writing out our in-core free map.
 959 1046   *
 960 1047   * 2. The on-disk space map object is at least zfs_condense_pct/100 times
 961 1048   * the size of the minimal form of the in-core map (e.g. with
 962 1049   * zfs_condense_pct = 110 and a minimal form of 1MB, the on-disk object
            * must be at least 1.1MB).
 963 1050   *
 964 1051   * Checking the first condition is tricky since we don't want to walk
 965 1052   * the entire AVL tree calculating the estimated on-disk size. Instead we
 966 1053   * use the size-ordered AVL tree in the space map and calculate the
 967 1054   * size required for the largest segment in our in-core free map. If the
 968 1055   * size required to represent that segment on disk is larger than the space
 969 1056   * map object then we avoid condensing this map.
 970 1057   *
 971 1058   * To determine the second criterion we use a best-case estimate and assume
 972 1059   * each segment can be represented on-disk as a single 64-bit entry. We refer
 973 1060   * to this best-case estimate as the space map's minimal form.
            *
            * The caller must hold msp->ms_lock and the space map must be loaded.
 974 1061   */
 975 1062  static boolean_t
 976 1063  metaslab_should_condense(metaslab_t *msp)
 977 1064  {
 978 1065          space_map_t *sm = msp->ms_map;
 979 1066          space_map_obj_t *smo = &msp->ms_smo_syncing;
 980 1067          space_seg_t *ss;
 981 1068          uint64_t size, entries, segsz;
 982 1069  
 983 1070          ASSERT(MUTEX_HELD(&msp->ms_lock));
 984 1071          ASSERT(sm->sm_loaded);
 985 1072  
 986 1073          /*
 987 1074           * Use the sm_pp_root AVL tree, which is ordered by size, to obtain
 988 1075           * the largest segment in the in-core free map. If the tree is
 989 1076           * empty then we should condense the map.
 990 1077           */
 991 1078          ss = avl_last(sm->sm_pp_root);
 992 1079          if (ss == NULL)
 993 1080                  return (B_TRUE);
 994 1081  
 995 1082          /*
 996 1083           * Calculate the number of 64-bit entries this segment would
 997 1084           * require when written to disk. If this single segment would be
 998 1085           * larger on-disk than the entire current on-disk structure, then
 999 1086           * clearly condensing will increase the on-disk structure size.
1000 1087           */
1001 1088          size = (ss->ss_end - ss->ss_start) >> sm->sm_shift;
1002 1089          entries = size / (MIN(size, SM_RUN_MAX));
1003 1090          segsz = entries * sizeof (uint64_t);
1004 1091  
                   /*
                    * Both criteria must hold: the largest segment fits within the
                    * current object size, and the object is at least
                    * zfs_condense_pct percent of one 64-bit entry per segment.
                    */
1005 1092          return (segsz <= smo->smo_objsize &&
1006 1093              smo->smo_objsize >= (zfs_condense_pct *
1007 1094              sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100);
1008 1095  }
1009 1096  
1010 1097  /*
1011 1098   * Condense the on-disk space map representation to its minimized form.
1012 1099   * The minimized form consists of a small number of allocations followed by
1013 1100   * the in-core free map.
            *
            * The caller must hold msp->ms_lock, the space map must be loaded, and
            * this must be sync pass 1. The lock is dropped and re-acquired around
            * the truncation of the on-disk object.
1014 1101   */
1015 1102  static void
1016 1103  metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1017 1104  {
1018 1105          spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1019 1106          space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK];
1020 1107          space_map_t condense_map;
1021 1108          space_map_t *sm = msp->ms_map;
1022 1109          objset_t *mos = spa_meta_objset(spa);
1023 1110          space_map_obj_t *smo = &msp->ms_smo_syncing;
1024 1111  
1025 1112          ASSERT(MUTEX_HELD(&msp->ms_lock));
1026 1113          ASSERT3U(spa_sync_pass(spa), ==, 1);
1027 1114          ASSERT(sm->sm_loaded);
1028 1115  
1029 1116          spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1030 1117              "smo size %llu, segments %lu", txg,
1031 1118              (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
1032 1119              smo->smo_objsize, avl_numnodes(&sm->sm_root));
1033 1120  
1034 1121          /*
1035 1122           * Create a map that is a 100% allocated map. We remove segments
1036 1123           * that have been freed in this txg, any deferred frees that exist,
1037 1124           * and any allocation in the future. Removing segments should be
1038 1125           * a relatively inexpensive operation since we expect these maps to
1039 1126           * contain a small number of nodes.
1040 1127           */
1041 1128          space_map_create(&condense_map, sm->sm_start, sm->sm_size,
1042 1129              sm->sm_shift, sm->sm_lock);
1043 1130          space_map_add(&condense_map, condense_map.sm_start,
1044 1131              condense_map.sm_size);
1045 1132  
1046 1133          /*
1047 1134           * Remove what's been freed in this txg from the condense_map.
1048 1135           * Since we're in sync_pass 1, we know that all the frees from
1049 1136           * this txg are in the freemap.
1050 1137           */
1051 1138          space_map_walk(freemap, space_map_remove, &condense_map);
1052 1139  
1053 1140          for (int t = 0; t < TXG_DEFER_SIZE; t++)
1054 1141                  space_map_walk(msp->ms_defermap[t],
1055 1142                      space_map_remove, &condense_map);
1056 1143  
1057 1144          for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1058 1145                  space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK],
1059 1146                      space_map_remove, &condense_map);
1060 1147  
1061 1148          /*
1062 1149           * We're about to drop the metaslab's lock thus allowing
1063 1150           * other consumers to change its content. Set the
1064 1151           * space_map's sm_condensing flag to ensure that
1065 1152           * allocations on this metaslab do not occur while we're
1066 1153           * in the middle of committing it to disk. This is only critical
1067 1154           * for the ms_map as all other space_maps use per txg
1068 1155           * views of their content.
1069 1156           */
1070 1157          sm->sm_condensing = B_TRUE;
1071 1158  
1072 1159          mutex_exit(&msp->ms_lock);
1073 1160          space_map_truncate(smo, mos, tx);
1074 1161          mutex_enter(&msp->ms_lock);
1075 1162  
1076 1163          /*
1077 1164           * While we would ideally like to create a space_map representation
1078 1165           * that consists only of allocation records, doing so can be
1079 1166           * prohibitively expensive because the in-core free map can be
1080 1167           * large, and therefore computationally expensive to subtract
1081 1168           * from the condense_map. Instead we sync out two maps, a cheap
1082 1169           * allocation only map followed by the in-core free map. While not
1083 1170           * optimal, this is typically close to optimal, and much cheaper to
1084 1171           * compute.
1085 1172           */
1086 1173          space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx);
1087 1174          space_map_vacate(&condense_map, NULL, NULL);
1088 1175          space_map_destroy(&condense_map);
1089 1176  
1090 1177          space_map_sync(sm, SM_FREE, smo, mos, tx);
1091 1178          sm->sm_condensing = B_FALSE;
1092 1179  
1093 1180          spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, "
1094 1181              "smo size %llu", txg,
1095 1182              (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
1096 1183              smo->smo_objsize);
1097 1184  }
1098 1185  
1099 1186  /*
1100 1187   * Write a metaslab to disk in the context of the specified transaction group.
            *
            * Returns early when the metaslab has no per-txg maps yet or when this
            * txg recorded neither allocations nor frees. Otherwise the space map
            * object is created if needed, the changes are written (or the map is
            * condensed), and the updated header is copied into the object's bonus
            * buffer.
1101 1188   */
1102 1189  void
1103 1190  metaslab_sync(metaslab_t *msp, uint64_t txg)
1104 1191  {
1105 1192          vdev_t *vd = msp->ms_group->mg_vd;
1106 1193          spa_t *spa = vd->vdev_spa;
1107 1194          objset_t *mos = spa_meta_objset(spa);
1108 1195          space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK];
1109 1196          space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK];
1110 1197          space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1111 1198          space_map_t *sm = msp->ms_map;
1112 1199          space_map_obj_t *smo = &msp->ms_smo_syncing;
1113 1200          dmu_buf_t *db;
1114 1201          dmu_tx_t *tx;
1115 1202  
1116 1203          ASSERT(!vd->vdev_ishole);
1117 1204  
1118 1205          /*
1119 1206           * This metaslab has just been added so there's no work to do now.
1120 1207           */
1121 1208          if (*freemap == NULL) {
1122 1209                  ASSERT3P(allocmap, ==, NULL);
1123 1210                  return;
1124 1211          }
1125 1212  
1126 1213          ASSERT3P(allocmap, !=, NULL);
1127 1214          ASSERT3P(*freemap, !=, NULL);
1128 1215          ASSERT3P(*freed_map, !=, NULL);
1129 1216  
                   /* Nothing was allocated or freed in this txg. */
1130 1217          if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0)
1131 1218                  return;
1132 1219  
1133 1220          /*
1134 1221           * The only state that can actually be changing concurrently with
1135 1222           * metaslab_sync() is the metaslab's ms_map.  No other thread can
1136 1223           * be modifying this txg's allocmap, freemap, freed_map, or smo.
1137 1224           * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
1138 1225           * We drop it whenever we call into the DMU, because the DMU
1139 1226           * can call down to us (e.g. via zio_free()) at any time.
1140 1227           */
1141 1228  
1142 1229          tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1143 1230  
                   /*
                    * First write for this metaslab: allocate the space map object
                    * and record its object number in the vdev's metaslab array at
                    * this metaslab's index.
                    */
1144 1231          if (smo->smo_object == 0) {
1145 1232                  ASSERT(smo->smo_objsize == 0);
1146 1233                  ASSERT(smo->smo_alloc == 0);
1147 1234                  smo->smo_object = dmu_object_alloc(mos,
1148 1235                      DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
1149 1236                      DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
1150 1237                  ASSERT(smo->smo_object != 0);
1151 1238                  dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1152 1239                      (sm->sm_start >> vd->vdev_ms_shift),
1153 1240                      sizeof (uint64_t), &smo->smo_object, tx);
1154 1241          }
1155 1242  
1156 1243          mutex_enter(&msp->ms_lock);
1157 1244  
                   /*
                    * Condensing is only attempted for a loaded map in sync pass 1;
                    * otherwise this txg's allocations and frees are written out.
                    */
1158 1245          if (sm->sm_loaded && spa_sync_pass(spa) == 1 &&
1159 1246              metaslab_should_condense(msp)) {
1160 1247                  metaslab_condense(msp, txg, tx);
1161 1248          } else {
1162 1249                  space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
1163 1250                  space_map_sync(*freemap, SM_FREE, smo, mos, tx);
1164 1251          }
1165 1252  
1166 1253          space_map_vacate(allocmap, NULL, NULL);
1167 1254  
1168 1255          /*
1169 1256           * For sync pass 1, we avoid walking the entire space map and
1170 1257           * instead will just swap the pointers for freemap and
1171 1258           * freed_map. We can safely do this since the freed_map is
1172 1259           * guaranteed to be empty on the initial pass.
1173 1260           */
1174 1261          if (spa_sync_pass(spa) == 1) {
1175 1262                  ASSERT0((*freed_map)->sm_space);
1176 1263                  ASSERT0(avl_numnodes(&(*freed_map)->sm_root));
1177 1264                  space_map_swap(freemap, freed_map);
1178 1265          } else {
1179 1266                  space_map_vacate(*freemap, space_map_add, *freed_map);
1180 1267          }
1181 1268  
1182 1269          ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space);
1183 1270          ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space);
1184 1271  
1185 1272          mutex_exit(&msp->ms_lock);
1186 1273  
                   /* Copy the updated header into the object's bonus buffer. */
1187 1274          VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
1188 1275          dmu_buf_will_dirty(db, tx);
1189 1276          ASSERT3U(db->db_size, >=, sizeof (*smo));
1190 1277          bcopy(smo, db->db_data, sizeof (*smo));
1191 1278          dmu_buf_rele(db, FTAG);
1192 1279  
1193 1280          dmu_tx_commit(tx);
1194 1281  }
1195 1282  
1196 1283  /*
1197 1284   * Called after a transaction group has completely synced to mark
1198 1285   * all of the metaslab's free space as usable.
            *
            * On the first call for a metaslab this also allocates its per-txg
            * alloc/free maps and its defer maps and adds its size to the vdev.
            * Takes and releases msp->ms_lock.
1199 1286   */
1200 1287  void
1201 1288  metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1202 1289  {
1203 1290          space_map_obj_t *smo = &msp->ms_smo;
1204 1291          space_map_obj_t *smosync = &msp->ms_smo_syncing;
1205 1292          space_map_t *sm = msp->ms_map;
1206 1293          space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1207 1294          space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
1208 1295          metaslab_group_t *mg = msp->ms_group;
1209 1296          vdev_t *vd = mg->mg_vd;
1210 1297          int64_t alloc_delta, defer_delta;
1211 1298  
1212 1299          ASSERT(!vd->vdev_ishole);
1213 1300  
1214 1301          mutex_enter(&msp->ms_lock);
1215 1302  
1216 1303          /*
1217 1304           * If this metaslab is just becoming available, initialize its
1218 1305           * allocmaps, freemaps, and defermap and add its capacity to the vdev.
1219 1306           */
1220 1307          if (*freed_map == NULL) {
1221 1308                  ASSERT(*defer_map == NULL);
1222 1309                  for (int t = 0; t < TXG_SIZE; t++) {
1223 1310                          msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t),
1224 1311                              KM_SLEEP);
1225 1312                          space_map_create(msp->ms_allocmap[t], sm->sm_start,
1226 1313                              sm->sm_size, sm->sm_shift, sm->sm_lock);
1227 1314                          msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t),
1228 1315                              KM_SLEEP);
1229 1316                          space_map_create(msp->ms_freemap[t], sm->sm_start,
1230 1317                              sm->sm_size, sm->sm_shift, sm->sm_lock);
1231 1318                  }
1232 1319  
1233 1320                  for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1234 1321                          msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t),
1235 1322                              KM_SLEEP);
1236 1323                          space_map_create(msp->ms_defermap[t], sm->sm_start,
1237 1324                              sm->sm_size, sm->sm_shift, sm->sm_lock);
1238 1325                  }
1239 1326  
1240 1327                  freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1241 1328                  defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
1242 1329  
1243 1330                  vdev_space_update(vd, 0, 0, sm->sm_size);
1244 1331          }
1245 1332  
                   /*
                    * alloc_delta is the change in allocated space between the synced
                    * header and the in-core header; defer_delta is the space just
                    * freed minus the space leaving the defer map.
                    */
1246 1333          alloc_delta = smosync->smo_alloc - smo->smo_alloc;
1247 1334          defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space;
1248 1335  
1249 1336          vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1250 1337  
1251 1338          ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0);
1252 1339          ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0);
1253 1340  
1254 1341          /*
1255 1342           * If there's a space_map_load() in progress, wait for it to complete
1256 1343           * so that we have a consistent view of the in-core space map.
1257 1344           */
1258 1345          space_map_load_wait(sm);
1259 1346  
1260 1347          /*
1261 1348           * Move the frees from the defer_map to this map (if it's loaded).
1262 1349           * Swap the freed_map and the defer_map -- this is safe to do
1263 1350           * because we've just emptied out the defer_map.
1264 1351           */
1265 1352          space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
1266 1353          ASSERT0((*defer_map)->sm_space);
1267 1354          ASSERT0(avl_numnodes(&(*defer_map)->sm_root));
1268 1355          space_map_swap(freed_map, defer_map);
1269 1356  
                   /* The synced header now becomes the in-core header. */
1270 1357          *smo = *smosync;
1271 1358  
1272 1359          msp->ms_deferspace += defer_delta;
1273 1360          ASSERT3S(msp->ms_deferspace, >=, 0);
1274 1361          ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
1275 1362          if (msp->ms_deferspace != 0) {
1276 1363                  /*
1277 1364                   * Keep syncing this metaslab until all deferred frees
1278 1365                   * are back in circulation.
1279 1366                   */
1280 1367                  vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1281 1368          }
1282 1369  
1283 1370          /*
1284 1371           * If the map is loaded but no longer active, evict it as soon as all
1285 1372           * future allocations have synced.  (If we unloaded it now and then
1286 1373           * loaded a moment later, the map wouldn't reflect those allocations.)
1287 1374           */
1288 1375          if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1289 1376                  int evictable = 1;
1290 1377  
1291 1378                  for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1292 1379                          if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
1293 1380                                  evictable = 0;
1294 1381  
1295 1382                  if (evictable && !metaslab_debug)
1296 1383                          space_map_unload(sm);
1297 1384          }
1298 1385  
                   /* Re-sort the metaslab within its group using its new weight. */
1299 1386          metaslab_group_sort(mg, msp, metaslab_weight(msp));

↓ open down ↓

928 lines elided

↑ open up ↑

1300 1387  
1301 1388          mutex_exit(&msp->ms_lock);
1302 1389  }
1303 1390  
           /*
            * Per-group reassessment: calls metaslab_group_alloc_update(),
            * recomputes the weight of every metaslab that starts at or below the
            * group's bonus area, subtracts the failure count sampled on entry from
            * mg_alloc_failures, and prefetches upcoming metaslabs.
            */
1304 1391  void
1305 1392  metaslab_sync_reassess(metaslab_group_t *mg)
1306 1393  {
1307 1394          vdev_t *vd = mg->mg_vd;
                   /*
                    * Sample the counter once; only this amount is subtracted below,
                    * so increments made after this point are retained.
                    */
1308 1395          int64_t failures = mg->mg_alloc_failures;
1309 1396  
     1397 +        metaslab_group_alloc_update(mg);
     1398 +
1310 1399          /*
1311 1400           * Re-evaluate all metaslabs which have lower offsets than the
1312 1401           * bonus area.
1313 1402           */
1314 1403          for (int m = 0; m < vd->vdev_ms_count; m++) {
1315 1404                  metaslab_t *msp = vd->vdev_ms[m];
1316 1405  
                           /* Stop at the first metaslab past the bonus area. */
1317 1406                  if (msp->ms_map->sm_start > mg->mg_bonus_area)
1318 1407                          break;
1319 1408

1320 1409                  mutex_enter(&msp->ms_lock);
1321 1410                  metaslab_group_sort(mg, msp, metaslab_weight(msp));
1322 1411                  mutex_exit(&msp->ms_lock);
1323 1412          }
1324 1413  
1325 1414          atomic_add_64(&mg->mg_alloc_failures, -failures);
1326 1415  
1327 1416          /*
1328 1417           * Prefetch the next potential metaslabs
1329 1418           */
1330 1419          metaslab_prefetch(mg);
1331 1420  }
1332 1421  
           /*
            * Return the distance between a metaslab and a DVA on the same vdev.
            * Both offsets are first reduced to metaslab indices (shifted right by
            * vdev_ms_shift), so the result is a whole multiple of the metaslab
            * size and is 0 when the DVA falls inside this metaslab's index.
            *
            * If the DVA is on a different vdev, 1ULL << 63 is returned.
            */
1333 1422  static uint64_t
1334 1423  metaslab_distance(metaslab_t *msp, dva_t *dva)
1335 1424  {
1336 1425          uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1337 1426          uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1338 1427          uint64_t start = msp->ms_map->sm_start >> ms_shift;
1339 1428  
1340 1429          if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1341 1430                  return (1ULL << 63);
1342 1431  
1343 1432          if (offset < start)
1344 1433                  return ((start - offset) << ms_shift);
1345 1434          if (offset > start)
1346 1435                  return ((offset - start) << ms_shift);
1347 1436          return (0);
1348 1437  }
1349 1438  
           /*
            * Try to allocate asize bytes from one of the metaslabs in group mg.
            *
            * psize is only used for the fast-gang skip test and for debug
            * messages; asize is the size actually requested from the space map.
            * dva[0 .. d-1] are the DVAs already chosen for this block and
            * min_distance is the separation wanted from them when this group's
            * vdev already holds one of them.
            *
            * Returns the allocated offset, or -1ULL if the group could not (or
            * should not) satisfy the request.  On success the range has also
            * been added to the chosen metaslab's allocmap for txg.
            */
1350 1439  static uint64_t
1351 1440  metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1352 1441      uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
1353 1442  {
1354 1443          spa_t *spa = mg->mg_vd->vdev_spa;
1355 1444          metaslab_t *msp = NULL;
1356 1445          uint64_t offset = -1ULL;
1357 1446          avl_tree_t *t = &mg->mg_metaslab_tree;
1358 1447          uint64_t activation_weight;
1359 1448          uint64_t target_distance;
1360 1449          int i;
1361 1450  
                   /*
                    * The allocation is "secondary" when one of the DVAs already
                    * chosen for this block lives on this group's vdev.
                    */
1362 1451          activation_weight = METASLAB_WEIGHT_PRIMARY;
1363 1452          for (i = 0; i < d; i++) {
1364 1453                  if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1365 1454                          activation_weight = METASLAB_WEIGHT_SECONDARY;
1366 1455                          break;
1367 1456                  }
1368 1457          }
1369 1458  
                   /* Each pass picks a candidate under mg_lock, then tries it. */
1370 1459          for (;;) {
1371 1460                  boolean_t was_active;
1372 1461  
1373 1462                  mutex_enter(&mg->mg_lock);
1374 1463                  for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
                                   /*
                                    * NOTE(review): giving up on the whole group here,
                                    * rather than moving to the next node, presumably
                                    * relies on the tree being ordered by descending
                                    * weight -- confirm against the tree comparator.
                                    */
1375 1464                          if (msp->ms_weight < asize) {
1376 1465                                  spa_dbgmsg(spa, "%s: failed to meet weight "
1377 1466                                      "requirement: vdev %llu, txg %llu, mg %p, "
1378 1467                                      "msp %p, psize %llu, asize %llu, "
1379 1468                                      "failures %llu, weight %llu",
1380 1469                                      spa_name(spa), mg->mg_vd->vdev_id, txg,
1381 1470                                      mg, msp, psize, asize,
1382 1471                                      mg->mg_alloc_failures, msp->ms_weight);
1383 1472                                  mutex_exit(&mg->mg_lock);
1384 1473                                  return (-1ULL);
1385 1474                          }
1386 1475  
1387 1476                          /*
1388 1477                           * If the selected metaslab is condensing, skip it.
1389 1478                           */
1390 1479                          if (msp->ms_map->sm_condensing)
1391 1480                                  continue;
1392 1481  
                                   /*
                                    * Remember whether the candidate was active when it
                                    * was picked; re-checked once ms_lock is held.
                                    */
1393 1482                          was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1394 1483                          if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1395 1484                                  break;
1396 1485  
                                   /*
                                    * Secondary allocation: a metaslab whose smo_alloc
                                    * is zero must be 1.5 * min_distance away, any other
                                    * only min_distance.
                                    */
1397 1486                          target_distance = min_distance +
1398 1487                              (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
1399 1488  
                                   /*
                                    * Accept the candidate only if none of the earlier
                                    * DVAs is closer than target_distance (i == d).
                                    */
1400 1489                          for (i = 0; i < d; i++)

↓ open down ↓

81 lines elided

↑ open up ↑

1401 1490                                  if (metaslab_distance(msp, &dva[i]) <
1402 1491                                      target_distance)
1403 1492                                          break;
1404 1493                          if (i == d)
1405 1494                                  break;
1406 1495                  }
1407 1496                  mutex_exit(&mg->mg_lock);
1408 1497                  if (msp == NULL)
1409 1498                          return (-1ULL);
1410 1499  
                           /*
                            * In the new revision ms_lock is taken before the
                            * failure-limit test (the '-' lines further down show where
                            * it used to be taken), so the early return below has to
                            * drop it.
                            */
     1500 +                mutex_enter(&msp->ms_lock);
     1501 +
1411 1502                  /*
1412 1503                   * If we've already reached the allowable number of failed
1413 1504                   * allocation attempts on this metaslab group then we
1414 1505                   * consider skipping it. We skip it only if we're allowed
1415 1506                   * to "fast" gang, the physical size is larger than
1416 1507                   * a gang block, and we're attempting to allocate from
1417 1508                   * the primary metaslab.
1418 1509                   */
1419 1510                  if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1420 1511                      CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1421 1512                      activation_weight == METASLAB_WEIGHT_PRIMARY) {
1422 1513                          spa_dbgmsg(spa, "%s: skipping metaslab group: "
1423 1514                              "vdev %llu, txg %llu, mg %p, psize %llu, "
1424 1515                              "asize %llu, failures %llu", spa_name(spa),
1425 1516                              mg->mg_vd->vdev_id, txg, mg, psize, asize,
1426 1517                              mg->mg_alloc_failures);
     1518 +                        mutex_exit(&msp->ms_lock);
1427 1519                          return (-1ULL);
1428 1520                  }
1429 1521  
1430      -                mutex_enter(&msp->ms_lock);
1431      -
1432 1522                  /*
1433 1523                   * Ensure that the metaslab we have selected is still
1434 1524                   * capable of handling our request. It's possible that
1435 1525                   * another thread may have changed the weight while we
1436 1526                   * were blocked on the metaslab lock.
1437 1527                   */
1438 1528                  if (msp->ms_weight < asize || (was_active &&
1439 1529                      !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1440 1530                      activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1441 1531                          mutex_exit(&msp->ms_lock);

1442 1532                          continue;
1443 1533                  }
1444 1534  
                           /*
                            * A primary allocation landed on a metaslab that carries
                            * the secondary weight bit: passivate it with its active
                            * bits cleared and pick again.
                            */
1445 1535                  if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1446 1536                      activation_weight == METASLAB_WEIGHT_PRIMARY) {
1447 1537                          metaslab_passivate(msp,
1448 1538                              msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1449 1539                          mutex_exit(&msp->ms_lock);
1450 1540                          continue;
1451 1541                  }
1452 1542  
1453 1543                  if (metaslab_activate(msp, activation_weight) != 0) {
1454 1544                          mutex_exit(&msp->ms_lock);
1455 1545                          continue;
1456 1546                  }
1457 1547  
1458 1548                  /*
1459 1549                   * If this metaslab is currently condensing then pick again as
1460 1550                   * we can't manipulate this metaslab until it's committed
1461 1551                   * to disk.
1462 1552                   */
1463 1553                  if (msp->ms_map->sm_condensing) {
1464 1554                          mutex_exit(&msp->ms_lock);
1465 1555                          continue;
1466 1556                  }
1467 1557  
                           /* Success leaves the loop with ms_lock still held. */
1468 1558                  if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL)
1469 1559                          break;
1470 1560  
                           /*
                            * The space map could not satisfy the request: count the
                            * failure against the group and passivate the metaslab
                            * with space_map_maxsize() as its new weight.
                            */
1471 1561                  atomic_inc_64(&mg->mg_alloc_failures);
1472 1562  
1473 1563                  metaslab_passivate(msp, space_map_maxsize(msp->ms_map));
1474 1564  
1475 1565                  mutex_exit(&msp->ms_lock);
1476 1566          }
1477 1567  
                   /*
                    * Nothing recorded yet in this txg's allocmap for the metaslab:
                    * mark it dirty before adding the new range.
                    */
1478 1568          if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
1479 1569                  vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1480 1570  
1481 1571          space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize);
1482 1572  
1483 1573          mutex_exit(&msp->ms_lock);
1484 1574  
1485 1575          return (offset);
1486 1576  }
1487 1577  
1488 1578  /*
1489 1579   * Allocate a block for the specified i/o.
            *
            * Walks the metaslab groups of class mc in a ring (mg_next), starting
            * from a group chosen from hintdva, from the previous DVA, or from
            * mc_rotor, and asks each in turn for space via
            * metaslab_group_alloc().  On success dva[d] is filled in (vdev,
            * offset, gang bit, asize) and 0 is returned.  If every attempt
            * fails, dva[d] is zeroed and SET_ERROR(ENOSPC) is returned.
1490 1580   */
1491 1581  static int
1492 1582  metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1493 1583      dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1494 1584  {
1495 1585          metaslab_group_t *mg, *rotor;
1496 1586          vdev_t *vd;
                   /*
                    * Requested separation is vdev_asize >> dshift; dshift grows by
                    * one on every retry from "top", halving that separation.
                    */
1497 1587          int dshift = 3;
1498 1588          int all_zero;
1499 1589          int zio_lock = B_FALSE;
1500 1590          boolean_t allocatable;
1501 1591          uint64_t offset = -1ULL;
1502 1592          uint64_t asize;
1503 1593          uint64_t distance;
1504 1594  
1505 1595          ASSERT(!DVA_IS_VALID(&dva[d]));
1506 1596  
1507 1597          /*
1508 1598           * For testing, make some blocks above a certain size be gang blocks.
1509 1599           */
1510 1600          if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1511 1601                  return (SET_ERROR(ENOSPC));
1512 1602  
1513 1603          /*
1514 1604           * Start at the rotor and loop through all mgs until we find something.
1515 1605           * Note that there's no locking on mc_rotor or mc_aliquot because
1516 1606           * nothing actually breaks if we miss a few updates -- we just won't
1517 1607           * allocate quite as evenly.  It all balances out over time.
1518 1608           *
1519 1609           * If we are doing ditto or log blocks, try to spread them across
1520 1610           * consecutive vdevs.  If we're forced to reuse a vdev before we've
1521 1611           * allocated all of our ditto blocks, then try and spread them out on
1522 1612           * that vdev as much as possible.  If it turns out to not be possible,
1523 1613           * gradually lower our standards until anything becomes acceptable.
1524 1614           * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1525 1615           * gives us hope of containing our fault domains to something we're
1526 1616           * able to reason about.  Otherwise, any two top-level vdev failures
1527 1617           * will guarantee the loss of data.  With consecutive allocation,
1528 1618           * only two adjacent top-level vdev failures will result in data loss.
1529 1619           *
1530 1620           * If we are doing gang blocks (hintdva is non-NULL), try to keep
1531 1621           * ourselves on the same vdev as our gang block header.  That
1532 1622           * way, we can hope for locality in vdev_cache, plus it makes our
1533 1623           * fault domains something tractable.
1534 1624           */
1535 1625          if (hintdva) {
1536 1626                  vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1537 1627  
1538 1628                  /*
1539 1629                   * It's possible the vdev we're using as the hint no
1540 1630                   * longer exists (i.e. removed). Consult the rotor when
1541 1631                   * all else fails.
1542 1632                   */
1543 1633                  if (vd != NULL) {
1544 1634                          mg = vd->vdev_mg;
1545 1635  
1546 1636                          if (flags & METASLAB_HINTBP_AVOID &&
1547 1637                              mg->mg_next != NULL)
1548 1638                                  mg = mg->mg_next;
1549 1639                  } else {
1550 1640                          mg = mc->mc_rotor;
1551 1641                  }
1552 1642          } else if (d != 0) {
                           /* No hint: start with the group after the previous DVA's. */
1553 1643                  vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1554 1644                  mg = vd->vdev_mg->mg_next;
1555 1645          } else {
1556 1646                  mg = mc->mc_rotor;
1557 1647          }
1558 1648  
1559 1649          /*
1560 1650           * If the hint put us into the wrong metaslab class, or into a
1561 1651           * metaslab group that has been passivated, just follow the rotor.
1562 1652           */
1563 1653          if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1564 1654                  mg = mc->mc_rotor;
1565 1655  
1566 1656          rotor = mg;
1567 1657  top:
1568 1658          all_zero = B_TRUE;
1569 1659          do {
1570 1660                  ASSERT(mg->mg_activation_count == 1);
1571 1661  
1572 1662                  vd = mg->mg_vd;
1573 1663

↓ open down ↓

132 lines elided

↑ open up ↑

1574 1664                  /*
1575 1665                   * Don't allocate from faulted devices.
1576 1666                   */
1577 1667                  if (zio_lock) {
1578 1668                          spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1579 1669                          allocatable = vdev_allocatable(vd);
1580 1670                          spa_config_exit(spa, SCL_ZIO, FTAG);
1581 1671                  } else {
1582 1672                          allocatable = vdev_allocatable(vd);
1583 1673                  }
     1674 +
     1675 +                /*
     1676 +                 * Determine if the selected metaslab group is eligible
     1677 +                 * for allocations. If we're ganging or have requested
     1678 +                 * an allocation for the smallest gang block size
     1679 +                 * then we don't want to avoid allocating to this
     1680 +                 * metaslab group. If we're in this condition we should
     1681 +                 * try to allocate from any device possible so that we
     1682 +                 * don't inadvertently return ENOSPC and suspend the pool
     1683 +                 * even though space is still available.
     1684 +                 */
     1685 +                if (allocatable && CAN_FASTGANG(flags) &&
     1686 +                    psize > SPA_GANGBLOCKSIZE)
     1687 +                        allocatable = metaslab_group_allocatable(mg);
     1688 +
1584 1689                  if (!allocatable)
1585 1690                          goto next;
1586 1691  
1587 1692                  /*
1588 1693                   * Avoid writing single-copy data to a failing vdev
1589 1694                   * unless the user instructs us that it is okay.
1590 1695                   */
1591 1696                  if ((vd->vdev_stat.vs_write_errors > 0 ||
1592 1697                      vd->vdev_state < VDEV_STATE_HEALTHY) &&
1593 1698                      d == 0 && dshift == 3 &&

1594 1699                      !(zfs_write_to_degraded && vd->vdev_state ==
1595 1700                      VDEV_STATE_DEGRADED)) {
1596 1701                          all_zero = B_FALSE;
1597 1702                          goto next;
1598 1703                  }
1599 1704  
1600 1705                  ASSERT(mg->mg_class == mc);
1601 1706  
                           /*
                            * A separation of at most one metaslab is treated as no
                            * requirement at all.  all_zero stays true only if that
                            * holds for every group tried, which ends the dshift
                            * retries below.
                            */
1602 1707                  distance = vd->vdev_asize >> dshift;
1603 1708                  if (distance <= (1ULL << vd->vdev_ms_shift))
1604 1709                          distance = 0;
1605 1710                  else
1606 1711                          all_zero = B_FALSE;
1607 1712  
1608 1713                  asize = vdev_psize_to_asize(vd, psize);
1609 1714                  ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1610 1715  
1611 1716                  offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1612 1717                      dva, d, flags);
1613 1718                  if (offset != -1ULL) {
1614 1719                          /*
1615 1720                           * If we've just selected this metaslab group,
1616 1721                           * figure out whether the corresponding vdev is
1617 1722                           * over- or under-used relative to the pool,
1618 1723                           * and set an allocation bias to even it out.
1619 1724                           */
1620 1725                          if (mc->mc_aliquot == 0) {
1621 1726                                  vdev_stat_t *vs = &vd->vdev_stat;
1622 1727                                  int64_t vu, cu;
1623 1728  
                                           /* "+ 1" keeps the divisor non-zero. */
1624 1729                                  vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1625 1730                                  cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1626 1731  
1627 1732                                  /*
1628 1733                                   * Calculate how much more or less we should
1629 1734                                   * try to allocate from this device during
1630 1735                                   * this iteration around the rotor.
1631 1736                                   * For example, if a device is 80% full
1632 1737                                   * and the pool is 20% full then we should
1633 1738                                   * reduce allocations by 60% on this device.
1634 1739                                   *
1635 1740                                   * mg_bias = (20 - 80) * 512K / 100 = -307K
1636 1741                                   *
1637 1742                                   * This reduces allocations by 307K for this
1638 1743                                   * iteration.
1639 1744                                   */
1640 1745                                  mg->mg_bias = ((cu - vu) *
1641 1746                                      (int64_t)mg->mg_aliquot) / 100;
1642 1747                          }
1643 1748  
                                   /*
                                    * Once this group has received its (biased) aliquot,
                                    * move the rotor to the next group.
                                    */
1644 1749                          if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1645 1750                              mg->mg_aliquot + mg->mg_bias) {
1646 1751                                  mc->mc_rotor = mg->mg_next;
1647 1752                                  mc->mc_aliquot = 0;
1648 1753                          }
1649 1754  
1650 1755                          DVA_SET_VDEV(&dva[d], vd->vdev_id);
1651 1756                          DVA_SET_OFFSET(&dva[d], offset);
1652 1757                          DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1653 1758                          DVA_SET_ASIZE(&dva[d], asize);
1654 1759  
1655 1760                          return (0);
1656 1761                  }
1657 1762  next:
                           /* This group was skipped or failed: advance the rotor. */
1658 1763                  mc->mc_rotor = mg->mg_next;
1659 1764                  mc->mc_aliquot = 0;
1660 1765          } while ((mg = mg->mg_next) != rotor);
1661 1766  
                   /* Some group still had a distance requirement: relax and retry. */
1662 1767          if (!all_zero) {
1663 1768                  dshift++;
1664 1769                  ASSERT(dshift < 64);
1665 1770                  goto top;
1666 1771          }
1667 1772  
                   /*
                    * NOTE(review): allocatable holds the value left by the last group
                    * visited in the loop above, and with the added
                    * metaslab_group_allocatable() test it can be false for a reason
                    * other than vdev_allocatable().  Confirm that a single retry
                    * under SCL_ZIO is the intended reaction in that case as well.
                    */
1668 1773          if (!allocatable && !zio_lock) {
1669 1774                  dshift = 3;
1670 1775                  zio_lock = B_TRUE;
1671 1776                  goto top;
1672 1777          }
1673 1778  
1674 1779          bzero(&dva[d], sizeof (dva_t));
1675 1780  
1676 1781          return (SET_ERROR(ENOSPC));
1677 1782  }
1678 1783  
1679 1784  /*
1680 1785   * Free the block represented by DVA in the context of the specified
1681 1786   * transaction group.
            *
            * If 'now' is set, the range is removed from this txg's allocmap and
            * handed back to ms_map via space_map_free(); otherwise it is added
            * to this txg's freemap.  A free for a txg beyond spa_freeze_txg() is
            * ignored, and a DVA that does not resolve to a metaslab is logged
            * with cmn_err(CE_WARN) and dropped.
1682 1787   */
1683 1788  static void
1684 1789  metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1685 1790  {
1686 1791          uint64_t vdev = DVA_GET_VDEV(dva);
1687 1792          uint64_t offset = DVA_GET_OFFSET(dva);
1688 1793          uint64_t size = DVA_GET_ASIZE(dva);
1689 1794          vdev_t *vd;
1690 1795          metaslab_t *msp;
1691 1796  
1692 1797          ASSERT(DVA_IS_VALID(dva));
1693 1798  
1694 1799          if (txg > spa_freeze_txg(spa))
1695 1800                  return;
1696 1801  
                   /* Reject an unknown vdev or an offset past its last metaslab. */
1697 1802          if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1698 1803              (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
1699 1804                  cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
1700 1805                      (u_longlong_t)vdev, (u_longlong_t)offset);
1701 1806                  ASSERT(0);
1702 1807                  return;
1703 1808          }
1704 1809  
1705 1810          msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1706 1811  
                   /*
                    * For a gang DVA, use the allocated size of a gang block header
                    * instead of the asize stored in the DVA.
                    */
1707 1812          if (DVA_GET_GANG(dva))
1708 1813                  size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1709 1814  
1710 1815          mutex_enter(&msp->ms_lock);
1711 1816  
1712 1817          if (now) {
1713 1818                  space_map_remove(msp->ms_allocmap[txg & TXG_MASK],
1714 1819                      offset, size);
1715 1820                  space_map_free(msp->ms_map, offset, size);
1716 1821          } else {
                           /* First entry in this txg's freemap: mark the vdev dirty. */
1717 1822                  if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0)
1718 1823                          vdev_dirty(vd, VDD_METASLAB, msp, txg);
1719 1824                  space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size);
1720 1825          }
1721 1826  
1722 1827          mutex_exit(&msp->ms_lock);
1723 1828  }
1724 1829  
1725 1830  /*
1726 1831   * Intent log support: upon opening the pool after a crash, notify the SPA
1727 1832   * of blocks that the intent log has allocated for immediate write, but
1728 1833   * which are still considered free by the SPA because the last transaction
1729 1834   * group didn't commit yet.
            *
            * A txg of 0 is a dry run: the DVA is validated but nothing is
            * claimed.  Returns 0 on success, ENXIO if the DVA does not resolve
            * to a metaslab, the error from metaslab_activate(), or ENOENT if
            * space_map_contains() does not find the range in ms_map.
1730 1835   */
1731 1836  static int
1732 1837  metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
1733 1838  {
1734 1839          uint64_t vdev = DVA_GET_VDEV(dva);
1735 1840          uint64_t offset = DVA_GET_OFFSET(dva);
1736 1841          uint64_t size = DVA_GET_ASIZE(dva);
1737 1842          vdev_t *vd;
1738 1843          metaslab_t *msp;
1739 1844          int error = 0;
1740 1845  
1741 1846          ASSERT(DVA_IS_VALID(dva));
1742 1847  
1743 1848          if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1744 1849              (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
1745 1850                  return (SET_ERROR(ENXIO));
1746 1851  
1747 1852          msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1748 1853  
                   /* Gang DVAs are sized as a gang block header, as in the free path. */
1749 1854          if (DVA_GET_GANG(dva))
1750 1855                  size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1751 1856  
1752 1857          mutex_enter(&msp->ms_lock);
1753 1858  
                   /*
                    * Activate for a real claim on a writeable pool, or whenever the
                    * map is not loaded yet.
                    */
1754 1859          if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded)
1755 1860                  error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
1756 1861  
1757 1862          if (error == 0 && !space_map_contains(msp->ms_map, offset, size))
1758 1863                  error = SET_ERROR(ENOENT);
1759 1864  
1760 1865          if (error || txg == 0) {        /* txg == 0 indicates dry run */
1761 1866                  mutex_exit(&msp->ms_lock);
1762 1867                  return (error);
1763 1868          }
1764 1869  
1765 1870          space_map_claim(msp->ms_map, offset, size);
1766 1871  
1767 1872          if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
1768 1873                  if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
1769 1874                          vdev_dirty(vd, VDD_METASLAB, msp, txg);
1770 1875                  space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size);
1771 1876          }
1772 1877  
1773 1878          mutex_exit(&msp->ms_lock);
1774 1879  
1775 1880          return (0);
1776 1881  }
1777 1882  
           /*
            * Allocate ndvas DVAs of psize bytes for bp from class mc in txg.
            * hintbp is optional; when non-NULL its DVAs are passed to
            * metaslab_alloc_dva() as placement hints.
            *
            * Returns 0 and sets bp's birth txg on success.  If the class has no
            * rotor, SET_ERROR(ENOSPC) is returned.  If any DVA cannot be
            * allocated, the DVAs obtained so far are freed and zeroed and the
            * error from metaslab_alloc_dva() is returned.
            */
1778 1883  int
1779 1884  metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
1780 1885      int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
1781 1886  {
1782 1887          dva_t *dva = bp->blk_dva;
                   /*
                    * hintbp may be NULL (see the ASSERT below); forming a member
                    * address through a null pointer is undefined behavior, so only
                    * touch hintbp when it is set.
                    */
1783 1888          dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
1784 1889          int error = 0;
1785 1890  
1786 1891          ASSERT(bp->blk_birth == 0);
1787 1892          ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
1788 1893  
1789 1894          spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1790 1895  
1791 1896          if (mc->mc_rotor == NULL) {     /* no vdevs in this class */
1792 1897                  spa_config_exit(spa, SCL_ALLOC, FTAG);
1793 1898                  return (SET_ERROR(ENOSPC));
1794 1899          }
1795 1900  
1796 1901          ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
1797 1902          ASSERT(BP_GET_NDVAS(bp) == 0);
1798 1903          ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
1799 1904  
1800 1905          for (int d = 0; d < ndvas; d++) {
1801 1906                  error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
1802 1907                      txg, flags);
1803 1908                  if (error) {
                                   /* Unwind: release every DVA allocated before d. */
1804 1909                          for (d--; d >= 0; d--) {
1805 1910                                  metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
1806 1911                                  bzero(&dva[d], sizeof (dva_t));
1807 1912                          }
1808 1913                          spa_config_exit(spa, SCL_ALLOC, FTAG);
1809 1914                          return (error);
1810 1915                  }
1811 1916          }
1812 1917          ASSERT(error == 0);
1813 1918          ASSERT(BP_GET_NDVAS(bp) == ndvas);
1814 1919  
1815 1920          spa_config_exit(spa, SCL_ALLOC, FTAG);
1816 1921  
1817 1922          BP_SET_BIRTH(bp, txg, txg);
1818 1923  
1819 1924          return (0);
1820 1925  }
1821 1926  
           /*
            * Free every DVA of bp in the context of txg by calling
            * metaslab_free_dva() on each one while holding SCL_FREE as reader.
            * 'now' is passed through unchanged.
            */
1822 1927  void
1823 1928  metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
1824 1929  {
1825 1930          const dva_t *dva = bp->blk_dva;
1826 1931          int ndvas = BP_GET_NDVAS(bp);
1827 1932  
1828 1933          ASSERT(!BP_IS_HOLE(bp));
                   /* An immediate free requires a birth txg >= the syncing txg. */
1829 1934          ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
1830 1935  
1831 1936          spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
1832 1937  
1833 1938          for (int d = 0; d < ndvas; d++)
1834 1939                  metaslab_free_dva(spa, &dva[d], txg, now);
1835 1940  
1836 1941          spa_config_exit(spa, SCL_FREE, FTAG);
1837 1942  }
1838 1943  
           /*
            * Claim every DVA of bp in txg via metaslab_claim_dva(), holding
            * SCL_ALLOC as reader.  With txg == 0 this is a dry run.  For a real
            * claim the function first calls itself with txg 0 and returns that
            * error if the dry run fails.  Returns 0 or the first error
            * encountered.
            */
1839 1944  int
1840 1945  metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
1841 1946  {
1842 1947          const dva_t *dva = bp->blk_dva;
1843 1948          int ndvas = BP_GET_NDVAS(bp);
1844 1949          int error = 0;
1845 1950  
1846 1951          ASSERT(!BP_IS_HOLE(bp));
1847 1952  
1848 1953          if (txg != 0) {
1849 1954                  /*
1850 1955                   * First do a dry run to make sure all DVAs are claimable,
1851 1956                   * so we don't have to unwind from partial failures below.
1852 1957                   */
1853 1958                  if ((error = metaslab_claim(spa, bp, 0)) != 0)
1854 1959                          return (error);
1855 1960          }
1856 1961  
1857 1962          spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1858 1963  
1859 1964          for (int d = 0; d < ndvas; d++)
1860 1965                  if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
1861 1966                          break;
1862 1967  
1863 1968          spa_config_exit(spa, SCL_ALLOC, FTAG);
1864 1969  
                   /* After a successful dry run, a real claim is expected to succeed. */
1865 1970          ASSERT(error == 0 || txg == 0);
1866 1971  
1867 1972          return (error);
1868 1973  }
1869 1974  
           /*
            * Debug helper: panic with "freeing free block" if space_map_find()
            * returns a segment of sm for the range (off, size).  sm->sm_lock is
            * held around the lookup.
            */
1870 1975  static void
1871 1976  checkmap(space_map_t *sm, uint64_t off, uint64_t size)
1872 1977  {
1873 1978          space_seg_t *ss;
1874 1979          avl_index_t where;
1875 1980  
1876 1981          mutex_enter(sm->sm_lock);
1877 1982          ss = space_map_find(sm, off, size, &where);
1878 1983          if (ss != NULL)
1879 1984                  panic("freeing free block; ss=%p", (void *)ss);
1880 1985          mutex_exit(sm->sm_lock);
1881 1986  }
1882 1987  
           /*
            * When ZFS_DEBUG_ZIO_FREE is set in zfs_flags, run checkmap() for every
            * DVA of bp against its metaslab's ms_map (only if loaded) and against
            * every freemap and defermap, so that a DVA already present in any of
            * them causes a panic.  Does nothing when the flag is clear.
            */
1883 1988  void
1884 1989  metaslab_check_free(spa_t *spa, const blkptr_t *bp)
1885 1990  {
1886 1991          if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
1887 1992                  return;
1888 1993  
1889 1994          spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1890 1995          for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
1891 1996                  uint64_t vdid = DVA_GET_VDEV(&bp->blk_dva[i]);
1892 1997                  vdev_t *vd = vdev_lookup_top(spa, vdid);
1893 1998                  uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[i]);
1894 1999                  uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
                           /*
                            * NOTE(review): unlike metaslab_free_dva() and
                            * metaslab_claim_dva(), vd is not checked for NULL, the
                            * metaslab index is not range-checked, and size is not
                            * adjusted for gang DVAs -- confirm this is acceptable
                            * for a debug-only path.
                            */
1895 2000                  metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift];
1896 2001  
1897 2002                  if (ms->ms_map->sm_loaded)
1898 2003                          checkmap(ms->ms_map, off, size);
1899 2004  
1900 2005                  for (int j = 0; j < TXG_SIZE; j++)
1901 2006                          checkmap(ms->ms_freemap[j], off, size);
1902 2007                  for (int j = 0; j < TXG_DEFER_SIZE; j++)
1903 2008                          checkmap(ms->ms_defermap[j], off, size);
1904 2009          }
1905 2010          spa_config_exit(spa, SCL_VDEV, FTAG);
1906 2011  }

↓ open down ↓

313 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX