3006 VERIFY[S,U,P] and ASSERT[S,U,P] frequently check if first argument is zero
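This delta applies the idiom introduced by issue 3006: assertions of the form ASSERT3S(x, ==, 0) are rewritten as ASSERT0(x). The real macros live in the illumos debug headers; the standalone userland sketch below is only an assumption about their shape, meant to illustrate the single rewrite this file receives (old line 772 in metaslab_fini()).

	#include <assert.h>
	#include <stdint.h>

	/*
	 * Userland sketch only; not the actual <sys/debug.h> definitions.
	 * ASSERT3S is modeled here as a signed three-way comparison assert,
	 * and ASSERT0 is layered on top of it, mirroring the rewrite below.
	 */
	#define	ASSERT3S(l, op, r)	assert((int64_t)(l) op (int64_t)(r))
	#define	ASSERT0(x)		ASSERT3S((x), ==, 0)

	int
	main(void)
	{
		int64_t ms_deferspace = 0;

		ASSERT3S(ms_deferspace, ==, 0);	/* old idiom */
		ASSERT0(ms_deferspace);		/* new, equivalent idiom */
		return (0);
	}

The two forms are functionally equivalent; ASSERT0() simply states the common "must be zero" check directly. The synopsis also covers VERIFY[S,U,P], for which the same zero-comparison shorthand exists, but only the ASSERT3S case appears in this file.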
--- old/usr/src/uts/common/fs/zfs/metaslab.c
+++ new/usr/src/uts/common/fs/zfs/metaslab.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2012 by Delphix. All rights reserved.
24 24 */
25 25
26 26 #include <sys/zfs_context.h>
27 27 #include <sys/dmu.h>
28 28 #include <sys/dmu_tx.h>
29 29 #include <sys/space_map.h>
30 30 #include <sys/metaslab_impl.h>
31 31 #include <sys/vdev_impl.h>
32 32 #include <sys/zio.h>
33 33
34 34 /*
35 35 * Allow allocations to switch to gang blocks quickly. We do this to
36 36 * avoid having to load lots of space_maps in a given txg. There are,
37 37 * however, some cases where we want to avoid "fast" ganging and instead
38 38 * we want to do an exhaustive search of all metaslabs on this device.
39 39 * Currently we don't allow any gang, zil, or dump device related allocations
40 40 * to "fast" gang.
41 41 */
42 42 #define CAN_FASTGANG(flags) \
43 43 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
44 44 METASLAB_GANG_AVOID)))
45 45
46 46 uint64_t metaslab_aliquot = 512ULL << 10;
47 47 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
48 48
49 49 /*
50 50 * This value defines the number of allowed allocation failures per vdev.
51 51 * If a device reaches this threshold in a given txg then we consider skipping
52 52 * allocations on that device.
53 53 */
54 54 int zfs_mg_alloc_failures;
55 55
56 56 /*
57 57 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
58 58 */
59 59 static int metaslab_debug = 0;
60 60
61 61 /*
62 62 * Minimum size which forces the dynamic allocator to change
63 63 * its allocation strategy. Once the space map cannot satisfy
64 64 * an allocation of this size then it switches to using a more
65 65 * aggressive strategy (i.e. search by size rather than offset).
66 66 */
67 67 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
68 68
69 69 /*
70 70 * The minimum free space, in percent, which must be available
71 71 * in a space map to continue allocations in a first-fit fashion.
72 72 * Once the space_map's free space drops below this level we dynamically
73 73 * switch to using best-fit allocations.
74 74 */
75 75 int metaslab_df_free_pct = 4;
76 76
77 77 /*
78 78 * A metaslab is considered "free" if it contains a contiguous
79 79 * segment which is greater than metaslab_min_alloc_size.
80 80 */
81 81 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
82 82
83 83 /*
84 84 * Max number of space_maps to prefetch.
85 85 */
86 86 int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
87 87
88 88 /*
89 89 * Percentage bonus multiplier for metaslabs that are in the bonus area.
90 90 */
91 91 int metaslab_smo_bonus_pct = 150;
92 92
93 93 /*
94 94 * ==========================================================================
95 95 * Metaslab classes
96 96 * ==========================================================================
97 97 */
98 98 metaslab_class_t *
99 99 metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
100 100 {
101 101 metaslab_class_t *mc;
102 102
103 103 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
104 104
105 105 mc->mc_spa = spa;
106 106 mc->mc_rotor = NULL;
107 107 mc->mc_ops = ops;
108 108
109 109 return (mc);
110 110 }
111 111
112 112 void
113 113 metaslab_class_destroy(metaslab_class_t *mc)
114 114 {
115 115 ASSERT(mc->mc_rotor == NULL);
116 116 ASSERT(mc->mc_alloc == 0);
117 117 ASSERT(mc->mc_deferred == 0);
118 118 ASSERT(mc->mc_space == 0);
119 119 ASSERT(mc->mc_dspace == 0);
120 120
121 121 kmem_free(mc, sizeof (metaslab_class_t));
122 122 }
123 123
124 124 int
125 125 metaslab_class_validate(metaslab_class_t *mc)
126 126 {
127 127 metaslab_group_t *mg;
128 128 vdev_t *vd;
129 129
130 130 /*
131 131 * Must hold one of the spa_config locks.
132 132 */
133 133 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
134 134 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
135 135
136 136 if ((mg = mc->mc_rotor) == NULL)
137 137 return (0);
138 138
139 139 do {
140 140 vd = mg->mg_vd;
141 141 ASSERT(vd->vdev_mg != NULL);
142 142 ASSERT3P(vd->vdev_top, ==, vd);
143 143 ASSERT3P(mg->mg_class, ==, mc);
144 144 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
145 145 } while ((mg = mg->mg_next) != mc->mc_rotor);
146 146
147 147 return (0);
148 148 }
149 149
150 150 void
151 151 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
152 152 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
153 153 {
154 154 atomic_add_64(&mc->mc_alloc, alloc_delta);
155 155 atomic_add_64(&mc->mc_deferred, defer_delta);
156 156 atomic_add_64(&mc->mc_space, space_delta);
157 157 atomic_add_64(&mc->mc_dspace, dspace_delta);
158 158 }
159 159
160 160 uint64_t
161 161 metaslab_class_get_alloc(metaslab_class_t *mc)
162 162 {
163 163 return (mc->mc_alloc);
164 164 }
165 165
166 166 uint64_t
167 167 metaslab_class_get_deferred(metaslab_class_t *mc)
168 168 {
169 169 return (mc->mc_deferred);
170 170 }
171 171
172 172 uint64_t
173 173 metaslab_class_get_space(metaslab_class_t *mc)
174 174 {
175 175 return (mc->mc_space);
176 176 }
177 177
178 178 uint64_t
179 179 metaslab_class_get_dspace(metaslab_class_t *mc)
180 180 {
181 181 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
182 182 }
183 183
184 184 /*
185 185 * ==========================================================================
186 186 * Metaslab groups
187 187 * ==========================================================================
188 188 */
189 189 static int
190 190 metaslab_compare(const void *x1, const void *x2)
191 191 {
192 192 const metaslab_t *m1 = x1;
193 193 const metaslab_t *m2 = x2;
194 194
195 195 if (m1->ms_weight < m2->ms_weight)
196 196 return (1);
197 197 if (m1->ms_weight > m2->ms_weight)
198 198 return (-1);
199 199
200 200 /*
201 201 * If the weights are identical, use the offset to force uniqueness.
202 202 */
203 203 if (m1->ms_map.sm_start < m2->ms_map.sm_start)
204 204 return (-1);
205 205 if (m1->ms_map.sm_start > m2->ms_map.sm_start)
206 206 return (1);
207 207
208 208 ASSERT3P(m1, ==, m2);
209 209
210 210 return (0);
211 211 }
212 212
213 213 metaslab_group_t *
214 214 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
215 215 {
216 216 metaslab_group_t *mg;
217 217
218 218 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
219 219 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
220 220 avl_create(&mg->mg_metaslab_tree, metaslab_compare,
221 221 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
222 222 mg->mg_vd = vd;
223 223 mg->mg_class = mc;
224 224 mg->mg_activation_count = 0;
225 225
226 226 return (mg);
227 227 }
228 228
229 229 void
230 230 metaslab_group_destroy(metaslab_group_t *mg)
231 231 {
232 232 ASSERT(mg->mg_prev == NULL);
233 233 ASSERT(mg->mg_next == NULL);
234 234 /*
235 235 * We may have gone below zero with the activation count
236 236 * either because we never activated in the first place or
237 237 * because we're done, and possibly removing the vdev.
238 238 */
239 239 ASSERT(mg->mg_activation_count <= 0);
240 240
241 241 avl_destroy(&mg->mg_metaslab_tree);
242 242 mutex_destroy(&mg->mg_lock);
243 243 kmem_free(mg, sizeof (metaslab_group_t));
244 244 }
245 245
246 246 void
247 247 metaslab_group_activate(metaslab_group_t *mg)
248 248 {
249 249 metaslab_class_t *mc = mg->mg_class;
250 250 metaslab_group_t *mgprev, *mgnext;
251 251
252 252 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
253 253
254 254 ASSERT(mc->mc_rotor != mg);
255 255 ASSERT(mg->mg_prev == NULL);
256 256 ASSERT(mg->mg_next == NULL);
257 257 ASSERT(mg->mg_activation_count <= 0);
258 258
259 259 if (++mg->mg_activation_count <= 0)
260 260 return;
261 261
262 262 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
263 263
264 264 if ((mgprev = mc->mc_rotor) == NULL) {
265 265 mg->mg_prev = mg;
266 266 mg->mg_next = mg;
267 267 } else {
268 268 mgnext = mgprev->mg_next;
269 269 mg->mg_prev = mgprev;
270 270 mg->mg_next = mgnext;
271 271 mgprev->mg_next = mg;
272 272 mgnext->mg_prev = mg;
273 273 }
274 274 mc->mc_rotor = mg;
275 275 }
276 276
277 277 void
278 278 metaslab_group_passivate(metaslab_group_t *mg)
279 279 {
280 280 metaslab_class_t *mc = mg->mg_class;
281 281 metaslab_group_t *mgprev, *mgnext;
282 282
283 283 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
284 284
285 285 if (--mg->mg_activation_count != 0) {
286 286 ASSERT(mc->mc_rotor != mg);
287 287 ASSERT(mg->mg_prev == NULL);
288 288 ASSERT(mg->mg_next == NULL);
289 289 ASSERT(mg->mg_activation_count < 0);
290 290 return;
291 291 }
292 292
293 293 mgprev = mg->mg_prev;
294 294 mgnext = mg->mg_next;
295 295
296 296 if (mg == mgnext) {
297 297 mc->mc_rotor = NULL;
298 298 } else {
299 299 mc->mc_rotor = mgnext;
300 300 mgprev->mg_next = mgnext;
301 301 mgnext->mg_prev = mgprev;
302 302 }
303 303
304 304 mg->mg_prev = NULL;
305 305 mg->mg_next = NULL;
306 306 }
307 307
308 308 static void
309 309 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
310 310 {
311 311 mutex_enter(&mg->mg_lock);
312 312 ASSERT(msp->ms_group == NULL);
313 313 msp->ms_group = mg;
314 314 msp->ms_weight = 0;
315 315 avl_add(&mg->mg_metaslab_tree, msp);
316 316 mutex_exit(&mg->mg_lock);
317 317 }
318 318
319 319 static void
320 320 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
321 321 {
322 322 mutex_enter(&mg->mg_lock);
323 323 ASSERT(msp->ms_group == mg);
324 324 avl_remove(&mg->mg_metaslab_tree, msp);
325 325 msp->ms_group = NULL;
326 326 mutex_exit(&mg->mg_lock);
327 327 }
328 328
329 329 static void
330 330 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
331 331 {
332 332 /*
333 333 * Although in principle the weight can be any value, in
334 334 * practice we do not use values in the range [1, 510].
335 335 */
336 336 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
337 337 ASSERT(MUTEX_HELD(&msp->ms_lock));
338 338
339 339 mutex_enter(&mg->mg_lock);
340 340 ASSERT(msp->ms_group == mg);
341 341 avl_remove(&mg->mg_metaslab_tree, msp);
342 342 msp->ms_weight = weight;
343 343 avl_add(&mg->mg_metaslab_tree, msp);
344 344 mutex_exit(&mg->mg_lock);
345 345 }
346 346
347 347 /*
348 348 * ==========================================================================
349 349 * Common allocator routines
350 350 * ==========================================================================
351 351 */
352 352 static int
353 353 metaslab_segsize_compare(const void *x1, const void *x2)
354 354 {
355 355 const space_seg_t *s1 = x1;
356 356 const space_seg_t *s2 = x2;
357 357 uint64_t ss_size1 = s1->ss_end - s1->ss_start;
358 358 uint64_t ss_size2 = s2->ss_end - s2->ss_start;
359 359
360 360 if (ss_size1 < ss_size2)
361 361 return (-1);
362 362 if (ss_size1 > ss_size2)
363 363 return (1);
364 364
365 365 if (s1->ss_start < s2->ss_start)
366 366 return (-1);
367 367 if (s1->ss_start > s2->ss_start)
368 368 return (1);
369 369
370 370 return (0);
371 371 }
372 372
373 373 /*
374 374 * This is a helper function that can be used by the allocator to find
375 375 * a suitable block to allocate. This will search the specified AVL
376 376 * tree looking for a block that matches the specified criteria.
377 377 */
378 378 static uint64_t
379 379 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
380 380 uint64_t align)
381 381 {
382 382 space_seg_t *ss, ssearch;
383 383 avl_index_t where;
384 384
385 385 ssearch.ss_start = *cursor;
386 386 ssearch.ss_end = *cursor + size;
387 387
388 388 ss = avl_find(t, &ssearch, &where);
389 389 if (ss == NULL)
390 390 ss = avl_nearest(t, where, AVL_AFTER);
391 391
392 392 while (ss != NULL) {
393 393 uint64_t offset = P2ROUNDUP(ss->ss_start, align);
394 394
395 395 if (offset + size <= ss->ss_end) {
396 396 *cursor = offset + size;
397 397 return (offset);
398 398 }
399 399 ss = AVL_NEXT(t, ss);
400 400 }
401 401
402 402 /*
403 403 * If we know we've searched the whole map (*cursor == 0), give up.
404 404 * Otherwise, reset the cursor to the beginning and try again.
405 405 */
406 406 if (*cursor == 0)
407 407 return (-1ULL);
408 408
409 409 *cursor = 0;
410 410 return (metaslab_block_picker(t, cursor, size, align));
411 411 }
412 412
413 413 static void
414 414 metaslab_pp_load(space_map_t *sm)
415 415 {
416 416 space_seg_t *ss;
417 417
418 418 ASSERT(sm->sm_ppd == NULL);
419 419 sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
420 420
421 421 sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
422 422 avl_create(sm->sm_pp_root, metaslab_segsize_compare,
423 423 sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
424 424
425 425 for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
426 426 avl_add(sm->sm_pp_root, ss);
427 427 }
428 428
429 429 static void
430 430 metaslab_pp_unload(space_map_t *sm)
431 431 {
432 432 void *cookie = NULL;
433 433
434 434 kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
435 435 sm->sm_ppd = NULL;
436 436
437 437 while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
438 438 /* tear down the tree */
439 439 }
440 440
441 441 avl_destroy(sm->sm_pp_root);
442 442 kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
443 443 sm->sm_pp_root = NULL;
444 444 }
445 445
446 446 /* ARGSUSED */
447 447 static void
448 448 metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
449 449 {
450 450 /* No need to update cursor */
451 451 }
452 452
453 453 /* ARGSUSED */
454 454 static void
455 455 metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
456 456 {
457 457 /* No need to update cursor */
458 458 }
459 459
460 460 /*
461 461 * Return the maximum contiguous segment within the metaslab.
462 462 */
463 463 uint64_t
464 464 metaslab_pp_maxsize(space_map_t *sm)
465 465 {
466 466 avl_tree_t *t = sm->sm_pp_root;
467 467 space_seg_t *ss;
468 468
469 469 if (t == NULL || (ss = avl_last(t)) == NULL)
470 470 return (0ULL);
471 471
472 472 return (ss->ss_end - ss->ss_start);
473 473 }
474 474
475 475 /*
476 476 * ==========================================================================
477 477 * The first-fit block allocator
478 478 * ==========================================================================
479 479 */
480 480 static uint64_t
481 481 metaslab_ff_alloc(space_map_t *sm, uint64_t size)
482 482 {
483 483 avl_tree_t *t = &sm->sm_root;
484 484 uint64_t align = size & -size;
485 485 uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
486 486
487 487 return (metaslab_block_picker(t, cursor, size, align));
488 488 }
489 489
490 490 /* ARGSUSED */
491 491 boolean_t
492 492 metaslab_ff_fragmented(space_map_t *sm)
493 493 {
494 494 return (B_TRUE);
495 495 }
496 496
497 497 static space_map_ops_t metaslab_ff_ops = {
498 498 metaslab_pp_load,
499 499 metaslab_pp_unload,
500 500 metaslab_ff_alloc,
501 501 metaslab_pp_claim,
502 502 metaslab_pp_free,
503 503 metaslab_pp_maxsize,
504 504 metaslab_ff_fragmented
505 505 };
506 506
507 507 /*
508 508 * ==========================================================================
509 509 * Dynamic block allocator -
510 510 * Uses the first-fit allocation scheme until space gets low and then
511 511 * adjusts to a best-fit allocation method. Uses metaslab_df_alloc_threshold
512 512 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
513 513 * ==========================================================================
514 514 */
515 515 static uint64_t
516 516 metaslab_df_alloc(space_map_t *sm, uint64_t size)
517 517 {
518 518 avl_tree_t *t = &sm->sm_root;
519 519 uint64_t align = size & -size;
520 520 uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
521 521 uint64_t max_size = metaslab_pp_maxsize(sm);
522 522 int free_pct = sm->sm_space * 100 / sm->sm_size;
523 523
524 524 ASSERT(MUTEX_HELD(sm->sm_lock));
525 525 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
526 526
527 527 if (max_size < size)
528 528 return (-1ULL);
529 529
530 530 /*
531 531 * If we're running low on space switch to using the size
532 532 * sorted AVL tree (best-fit).
533 533 */
534 534 if (max_size < metaslab_df_alloc_threshold ||
535 535 free_pct < metaslab_df_free_pct) {
536 536 t = sm->sm_pp_root;
537 537 *cursor = 0;
538 538 }
539 539
540 540 return (metaslab_block_picker(t, cursor, size, 1ULL));
541 541 }
542 542
543 543 static boolean_t
544 544 metaslab_df_fragmented(space_map_t *sm)
545 545 {
546 546 uint64_t max_size = metaslab_pp_maxsize(sm);
547 547 int free_pct = sm->sm_space * 100 / sm->sm_size;
548 548
549 549 if (max_size >= metaslab_df_alloc_threshold &&
550 550 free_pct >= metaslab_df_free_pct)
551 551 return (B_FALSE);
552 552
553 553 return (B_TRUE);
554 554 }
555 555
556 556 static space_map_ops_t metaslab_df_ops = {
557 557 metaslab_pp_load,
558 558 metaslab_pp_unload,
559 559 metaslab_df_alloc,
560 560 metaslab_pp_claim,
561 561 metaslab_pp_free,
562 562 metaslab_pp_maxsize,
563 563 metaslab_df_fragmented
564 564 };
565 565
566 566 /*
567 567 * ==========================================================================
568 568 * Other experimental allocators
569 569 * ==========================================================================
570 570 */
571 571 static uint64_t
572 572 metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
573 573 {
574 574 avl_tree_t *t = &sm->sm_root;
575 575 uint64_t *cursor = (uint64_t *)sm->sm_ppd;
576 576 uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
577 577 uint64_t max_size = metaslab_pp_maxsize(sm);
578 578 uint64_t rsize = size;
579 579 uint64_t offset = 0;
580 580
581 581 ASSERT(MUTEX_HELD(sm->sm_lock));
582 582 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
583 583
584 584 if (max_size < size)
585 585 return (-1ULL);
586 586
587 587 ASSERT3U(*extent_end, >=, *cursor);
588 588
589 589 /*
590 590 * If we're running low on space switch to using the size
591 591 * sorted AVL tree (best-fit).
592 592 */
593 593 if ((*cursor + size) > *extent_end) {
594 594
595 595 t = sm->sm_pp_root;
596 596 *cursor = *extent_end = 0;
597 597
598 598 if (max_size > 2 * SPA_MAXBLOCKSIZE)
599 599 rsize = MIN(metaslab_min_alloc_size, max_size);
600 600 offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
601 601 if (offset != -1)
602 602 *cursor = offset + size;
603 603 } else {
604 604 offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
605 605 }
606 606 ASSERT3U(*cursor, <=, *extent_end);
607 607 return (offset);
608 608 }
609 609
610 610 static boolean_t
611 611 metaslab_cdf_fragmented(space_map_t *sm)
612 612 {
613 613 uint64_t max_size = metaslab_pp_maxsize(sm);
614 614
615 615 if (max_size > (metaslab_min_alloc_size * 10))
616 616 return (B_FALSE);
617 617 return (B_TRUE);
618 618 }
619 619
620 620 static space_map_ops_t metaslab_cdf_ops = {
621 621 metaslab_pp_load,
622 622 metaslab_pp_unload,
623 623 metaslab_cdf_alloc,
624 624 metaslab_pp_claim,
625 625 metaslab_pp_free,
626 626 metaslab_pp_maxsize,
627 627 metaslab_cdf_fragmented
628 628 };
629 629
630 630 uint64_t metaslab_ndf_clump_shift = 4;
631 631
632 632 static uint64_t
633 633 metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
634 634 {
635 635 avl_tree_t *t = &sm->sm_root;
636 636 avl_index_t where;
637 637 space_seg_t *ss, ssearch;
638 638 uint64_t hbit = highbit(size);
639 639 uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
640 640 uint64_t max_size = metaslab_pp_maxsize(sm);
641 641
642 642 ASSERT(MUTEX_HELD(sm->sm_lock));
643 643 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
644 644
645 645 if (max_size < size)
646 646 return (-1ULL);
647 647
648 648 ssearch.ss_start = *cursor;
649 649 ssearch.ss_end = *cursor + size;
650 650
651 651 ss = avl_find(t, &ssearch, &where);
652 652 if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
653 653 t = sm->sm_pp_root;
654 654
655 655 ssearch.ss_start = 0;
656 656 ssearch.ss_end = MIN(max_size,
657 657 1ULL << (hbit + metaslab_ndf_clump_shift));
658 658 ss = avl_find(t, &ssearch, &where);
659 659 if (ss == NULL)
660 660 ss = avl_nearest(t, where, AVL_AFTER);
661 661 ASSERT(ss != NULL);
662 662 }
663 663
664 664 if (ss != NULL) {
665 665 if (ss->ss_start + size <= ss->ss_end) {
666 666 *cursor = ss->ss_start + size;
667 667 return (ss->ss_start);
668 668 }
669 669 }
670 670 return (-1ULL);
671 671 }
672 672
673 673 static boolean_t
674 674 metaslab_ndf_fragmented(space_map_t *sm)
675 675 {
676 676 uint64_t max_size = metaslab_pp_maxsize(sm);
677 677
678 678 if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
679 679 return (B_FALSE);
680 680 return (B_TRUE);
681 681 }
682 682
683 683
684 684 static space_map_ops_t metaslab_ndf_ops = {
685 685 metaslab_pp_load,
686 686 metaslab_pp_unload,
687 687 metaslab_ndf_alloc,
688 688 metaslab_pp_claim,
689 689 metaslab_pp_free,
690 690 metaslab_pp_maxsize,
691 691 metaslab_ndf_fragmented
692 692 };
693 693
694 694 space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
695 695
696 696 /*
697 697 * ==========================================================================
698 698 * Metaslabs
699 699 * ==========================================================================
700 700 */
701 701 metaslab_t *
702 702 metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
703 703 uint64_t start, uint64_t size, uint64_t txg)
704 704 {
705 705 vdev_t *vd = mg->mg_vd;
706 706 metaslab_t *msp;
707 707
708 708 msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
709 709 mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
710 710
711 711 msp->ms_smo_syncing = *smo;
712 712
713 713 /*
714 714 * We create the main space map here, but we don't create the
715 715 * allocmaps and freemaps until metaslab_sync_done(). This serves
716 716 * two purposes: it allows metaslab_sync_done() to detect the
717 717 * addition of new space; and for debugging, it ensures that we'd
718 718 * data fault on any attempt to use this metaslab before it's ready.
719 719 */
720 720 space_map_create(&msp->ms_map, start, size,
721 721 vd->vdev_ashift, &msp->ms_lock);
722 722
723 723 metaslab_group_add(mg, msp);
724 724
725 725 if (metaslab_debug && smo->smo_object != 0) {
726 726 mutex_enter(&msp->ms_lock);
727 727 VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
728 728 SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
729 729 mutex_exit(&msp->ms_lock);
730 730 }
731 731
732 732 /*
733 733 * If we're opening an existing pool (txg == 0) or creating
734 734 * a new one (txg == TXG_INITIAL), all space is available now.
735 735 * If we're adding space to an existing pool, the new space
736 736 * does not become available until after this txg has synced.
737 737 */
738 738 if (txg <= TXG_INITIAL)
739 739 metaslab_sync_done(msp, 0);
740 740
741 741 if (txg != 0) {
742 742 vdev_dirty(vd, 0, NULL, txg);
743 743 vdev_dirty(vd, VDD_METASLAB, msp, txg);
744 744 }
745 745
746 746 return (msp);
747 747 }
748 748
749 749 void
750 750 metaslab_fini(metaslab_t *msp)
751 751 {
752 752 metaslab_group_t *mg = msp->ms_group;
753 753
754 754 vdev_space_update(mg->mg_vd,
755 755 -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
756 756
757 757 metaslab_group_remove(mg, msp);
758 758
759 759 mutex_enter(&msp->ms_lock);
760 760
761 761 space_map_unload(&msp->ms_map);
762 762 space_map_destroy(&msp->ms_map);
763 763
764 764 for (int t = 0; t < TXG_SIZE; t++) {
765 765 space_map_destroy(&msp->ms_allocmap[t]);
766 766 space_map_destroy(&msp->ms_freemap[t]);
767 767 }
768 768
769 769 for (int t = 0; t < TXG_DEFER_SIZE; t++)
770 770 space_map_destroy(&msp->ms_defermap[t]);
771 771
772 - ASSERT3S(msp->ms_deferspace, ==, 0);
772 + ASSERT0(msp->ms_deferspace);
773 773
774 774 mutex_exit(&msp->ms_lock);
775 775 mutex_destroy(&msp->ms_lock);
776 776
777 777 kmem_free(msp, sizeof (metaslab_t));
778 778 }
779 779
780 780 #define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
781 781 #define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
782 782 #define METASLAB_ACTIVE_MASK \
783 783 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
784 784
785 785 static uint64_t
786 786 metaslab_weight(metaslab_t *msp)
787 787 {
788 788 metaslab_group_t *mg = msp->ms_group;
789 789 space_map_t *sm = &msp->ms_map;
790 790 space_map_obj_t *smo = &msp->ms_smo;
791 791 vdev_t *vd = mg->mg_vd;
792 792 uint64_t weight, space;
793 793
794 794 ASSERT(MUTEX_HELD(&msp->ms_lock));
795 795
796 796 /*
797 797 * The baseline weight is the metaslab's free space.
798 798 */
799 799 space = sm->sm_size - smo->smo_alloc;
800 800 weight = space;
801 801
802 802 /*
803 803 * Modern disks have uniform bit density and constant angular velocity.
804 804 * Therefore, the outer recording zones are faster (higher bandwidth)
805 805 * than the inner zones by the ratio of outer to inner track diameter,
806 806 * which is typically around 2:1. We account for this by assigning
807 807 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
808 808 * In effect, this means that we'll select the metaslab with the most
809 809 * free bandwidth rather than simply the one with the most free space.
810 810 */
811 811 weight = 2 * weight -
812 812 ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
813 813 ASSERT(weight >= space && weight <= 2 * space);
814 814
815 815 /*
816 816 * For locality, assign higher weight to metaslabs which have
817 817 * a lower offset than what we've already activated.
818 818 */
819 819 if (sm->sm_start <= mg->mg_bonus_area)
820 820 weight *= (metaslab_smo_bonus_pct / 100);
821 821 ASSERT(weight >= space &&
822 822 weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
823 823
824 824 if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
825 825 /*
826 826 * If this metaslab is one we're actively using, adjust its
827 827 * weight to make it preferable to any inactive metaslab so
828 828 * we'll polish it off.
829 829 */
830 830 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
831 831 }
832 832 return (weight);
833 833 }
834 834
835 835 static void
836 836 metaslab_prefetch(metaslab_group_t *mg)
837 837 {
838 838 spa_t *spa = mg->mg_vd->vdev_spa;
839 839 metaslab_t *msp;
840 840 avl_tree_t *t = &mg->mg_metaslab_tree;
841 841 int m;
842 842
843 843 mutex_enter(&mg->mg_lock);
844 844
845 845 /*
846 846 * Prefetch the next potential metaslabs
847 847 */
848 848 for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
849 849 space_map_t *sm = &msp->ms_map;
850 850 space_map_obj_t *smo = &msp->ms_smo;
851 851
852 852 /* If we have reached our prefetch limit then we're done */
853 853 if (m >= metaslab_prefetch_limit)
854 854 break;
855 855
856 856 if (!sm->sm_loaded && smo->smo_object != 0) {
857 857 mutex_exit(&mg->mg_lock);
858 858 dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
859 859 0ULL, smo->smo_objsize);
860 860 mutex_enter(&mg->mg_lock);
861 861 }
862 862 }
863 863 mutex_exit(&mg->mg_lock);
864 864 }
865 865
866 866 static int
867 867 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
868 868 {
869 869 metaslab_group_t *mg = msp->ms_group;
870 870 space_map_t *sm = &msp->ms_map;
871 871 space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
872 872
873 873 ASSERT(MUTEX_HELD(&msp->ms_lock));
874 874
875 875 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
876 876 space_map_load_wait(sm);
877 877 if (!sm->sm_loaded) {
878 878 int error = space_map_load(sm, sm_ops, SM_FREE,
879 879 &msp->ms_smo,
880 880 spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
881 881 if (error) {
882 882 metaslab_group_sort(msp->ms_group, msp, 0);
883 883 return (error);
884 884 }
885 885 for (int t = 0; t < TXG_DEFER_SIZE; t++)
886 886 space_map_walk(&msp->ms_defermap[t],
887 887 space_map_claim, sm);
888 888
889 889 }
890 890
891 891 /*
892 892 * Track the bonus area as we activate new metaslabs.
893 893 */
894 894 if (sm->sm_start > mg->mg_bonus_area) {
895 895 mutex_enter(&mg->mg_lock);
896 896 mg->mg_bonus_area = sm->sm_start;
897 897 mutex_exit(&mg->mg_lock);
898 898 }
899 899
900 900 metaslab_group_sort(msp->ms_group, msp,
901 901 msp->ms_weight | activation_weight);
902 902 }
903 903 ASSERT(sm->sm_loaded);
904 904 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
905 905
906 906 return (0);
907 907 }
908 908
909 909 static void
910 910 metaslab_passivate(metaslab_t *msp, uint64_t size)
911 911 {
912 912 /*
913 913 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
914 914 * this metaslab again. In that case, it had better be empty,
915 915 * or we would be leaving space on the table.
916 916 */
917 917 ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
918 918 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
919 919 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
920 920 }
921 921
922 922 /*
923 923 * Write a metaslab to disk in the context of the specified transaction group.
924 924 */
925 925 void
926 926 metaslab_sync(metaslab_t *msp, uint64_t txg)
927 927 {
928 928 vdev_t *vd = msp->ms_group->mg_vd;
929 929 spa_t *spa = vd->vdev_spa;
930 930 objset_t *mos = spa_meta_objset(spa);
931 931 space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
932 932 space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
933 933 space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
934 934 space_map_t *sm = &msp->ms_map;
935 935 space_map_obj_t *smo = &msp->ms_smo_syncing;
936 936 dmu_buf_t *db;
937 937 dmu_tx_t *tx;
938 938
939 939 ASSERT(!vd->vdev_ishole);
940 940
941 941 if (allocmap->sm_space == 0 && freemap->sm_space == 0)
942 942 return;
943 943
944 944 /*
945 945 * The only state that can actually be changing concurrently with
946 946 * metaslab_sync() is the metaslab's ms_map. No other thread can
947 947 * be modifying this txg's allocmap, freemap, freed_map, or smo.
948 948 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
949 949 * We drop it whenever we call into the DMU, because the DMU
950 950 * can call down to us (e.g. via zio_free()) at any time.
951 951 */
952 952
953 953 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
954 954
955 955 if (smo->smo_object == 0) {
956 956 ASSERT(smo->smo_objsize == 0);
957 957 ASSERT(smo->smo_alloc == 0);
958 958 smo->smo_object = dmu_object_alloc(mos,
959 959 DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
960 960 DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
961 961 ASSERT(smo->smo_object != 0);
962 962 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
963 963 (sm->sm_start >> vd->vdev_ms_shift),
964 964 sizeof (uint64_t), &smo->smo_object, tx);
965 965 }
966 966
967 967 mutex_enter(&msp->ms_lock);
968 968
969 969 space_map_walk(freemap, space_map_add, freed_map);
970 970
971 971 if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
972 972 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
973 973 /*
974 974 * The in-core space map representation is twice as compact
975 975 * as the on-disk one, so it's time to condense the latter
976 976 * by generating a pure allocmap from first principles.
977 977 *
978 978 * This metaslab is 100% allocated,
979 979 * minus the content of the in-core map (sm),
980 980 * minus what's been freed this txg (freed_map),
981 981 * minus deferred frees (ms_defermap[]),
982 982 * minus allocations from txgs in the future
983 983 * (because they haven't been committed yet).
984 984 */
985 985 space_map_vacate(allocmap, NULL, NULL);
986 986 space_map_vacate(freemap, NULL, NULL);
987 987
988 988 space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
989 989
990 990 space_map_walk(sm, space_map_remove, allocmap);
991 991 space_map_walk(freed_map, space_map_remove, allocmap);
992 992
993 993 for (int t = 0; t < TXG_DEFER_SIZE; t++)
994 994 space_map_walk(&msp->ms_defermap[t],
995 995 space_map_remove, allocmap);
996 996
997 997 for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
998 998 space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
999 999 space_map_remove, allocmap);
1000 1000
1001 1001 mutex_exit(&msp->ms_lock);
1002 1002 space_map_truncate(smo, mos, tx);
1003 1003 mutex_enter(&msp->ms_lock);
1004 1004 }
1005 1005
1006 1006 space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
1007 1007 space_map_sync(freemap, SM_FREE, smo, mos, tx);
1008 1008
1009 1009 mutex_exit(&msp->ms_lock);
1010 1010
1011 1011 VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
1012 1012 dmu_buf_will_dirty(db, tx);
1013 1013 ASSERT3U(db->db_size, >=, sizeof (*smo));
1014 1014 bcopy(smo, db->db_data, sizeof (*smo));
1015 1015 dmu_buf_rele(db, FTAG);
1016 1016
1017 1017 dmu_tx_commit(tx);
1018 1018 }
1019 1019
1020 1020 /*
1021 1021 * Called after a transaction group has completely synced to mark
1022 1022 * all of the metaslab's free space as usable.
1023 1023 */
1024 1024 void
1025 1025 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1026 1026 {
1027 1027 space_map_obj_t *smo = &msp->ms_smo;
1028 1028 space_map_obj_t *smosync = &msp->ms_smo_syncing;
1029 1029 space_map_t *sm = &msp->ms_map;
1030 1030 space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1031 1031 space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
1032 1032 metaslab_group_t *mg = msp->ms_group;
1033 1033 vdev_t *vd = mg->mg_vd;
1034 1034 int64_t alloc_delta, defer_delta;
1035 1035
1036 1036 ASSERT(!vd->vdev_ishole);
1037 1037
1038 1038 mutex_enter(&msp->ms_lock);
1039 1039
1040 1040 /*
1041 1041 * If this metaslab is just becoming available, initialize its
1042 1042 * allocmaps and freemaps and add its capacity to the vdev.
1043 1043 */
1044 1044 if (freed_map->sm_size == 0) {
1045 1045 for (int t = 0; t < TXG_SIZE; t++) {
1046 1046 space_map_create(&msp->ms_allocmap[t], sm->sm_start,
1047 1047 sm->sm_size, sm->sm_shift, sm->sm_lock);
1048 1048 space_map_create(&msp->ms_freemap[t], sm->sm_start,
1049 1049 sm->sm_size, sm->sm_shift, sm->sm_lock);
1050 1050 }
1051 1051
1052 1052 for (int t = 0; t < TXG_DEFER_SIZE; t++)
1053 1053 space_map_create(&msp->ms_defermap[t], sm->sm_start,
1054 1054 sm->sm_size, sm->sm_shift, sm->sm_lock);
1055 1055
1056 1056 vdev_space_update(vd, 0, 0, sm->sm_size);
1057 1057 }
1058 1058
1059 1059 alloc_delta = smosync->smo_alloc - smo->smo_alloc;
1060 1060 defer_delta = freed_map->sm_space - defer_map->sm_space;
1061 1061
1062 1062 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1063 1063
1064 1064 ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
1065 1065 ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
1066 1066
1067 1067 /*
1068 1068 * If there's a space_map_load() in progress, wait for it to complete
1069 1069 * so that we have a consistent view of the in-core space map.
1070 1070 * Then, add defer_map (oldest deferred frees) to this map and
1071 1071 * transfer freed_map (this txg's frees) to defer_map.
1072 1072 */
1073 1073 space_map_load_wait(sm);
1074 1074 space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
1075 1075 space_map_vacate(freed_map, space_map_add, defer_map);
1076 1076
1077 1077 *smo = *smosync;
1078 1078
1079 1079 msp->ms_deferspace += defer_delta;
1080 1080 ASSERT3S(msp->ms_deferspace, >=, 0);
1081 1081 ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
1082 1082 if (msp->ms_deferspace != 0) {
1083 1083 /*
1084 1084 * Keep syncing this metaslab until all deferred frees
1085 1085 * are back in circulation.
1086 1086 */
1087 1087 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1088 1088 }
1089 1089
1090 1090 /*
1091 1091 * If the map is loaded but no longer active, evict it as soon as all
1092 1092 * future allocations have synced. (If we unloaded it now and then
1093 1093 * loaded a moment later, the map wouldn't reflect those allocations.)
1094 1094 */
1095 1095 if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1096 1096 int evictable = 1;
1097 1097
1098 1098 for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1099 1099 if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
1100 1100 evictable = 0;
1101 1101
1102 1102 if (evictable && !metaslab_debug)
1103 1103 space_map_unload(sm);
1104 1104 }
1105 1105
1106 1106 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1107 1107
1108 1108 mutex_exit(&msp->ms_lock);
1109 1109 }
1110 1110
1111 1111 void
1112 1112 metaslab_sync_reassess(metaslab_group_t *mg)
1113 1113 {
1114 1114 vdev_t *vd = mg->mg_vd;
1115 1115 int64_t failures = mg->mg_alloc_failures;
1116 1116
1117 1117 /*
1118 1118 * Re-evaluate all metaslabs which have lower offsets than the
1119 1119 * bonus area.
1120 1120 */
1121 1121 for (int m = 0; m < vd->vdev_ms_count; m++) {
1122 1122 metaslab_t *msp = vd->vdev_ms[m];
1123 1123
1124 1124 if (msp->ms_map.sm_start > mg->mg_bonus_area)
1125 1125 break;
1126 1126
1127 1127 mutex_enter(&msp->ms_lock);
1128 1128 metaslab_group_sort(mg, msp, metaslab_weight(msp));
1129 1129 mutex_exit(&msp->ms_lock);
1130 1130 }
1131 1131
1132 1132 atomic_add_64(&mg->mg_alloc_failures, -failures);
1133 1133
1134 1134 /*
1135 1135 * Prefetch the next potential metaslabs
1136 1136 */
1137 1137 metaslab_prefetch(mg);
1138 1138 }
1139 1139
1140 1140 static uint64_t
1141 1141 metaslab_distance(metaslab_t *msp, dva_t *dva)
1142 1142 {
1143 1143 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1144 1144 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1145 1145 uint64_t start = msp->ms_map.sm_start >> ms_shift;
1146 1146
1147 1147 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1148 1148 return (1ULL << 63);
1149 1149
1150 1150 if (offset < start)
1151 1151 return ((start - offset) << ms_shift);
1152 1152 if (offset > start)
1153 1153 return ((offset - start) << ms_shift);
1154 1154 return (0);
1155 1155 }
1156 1156
1157 1157 static uint64_t
1158 1158 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1159 1159 uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
1160 1160 {
1161 1161 spa_t *spa = mg->mg_vd->vdev_spa;
1162 1162 metaslab_t *msp = NULL;
1163 1163 uint64_t offset = -1ULL;
1164 1164 avl_tree_t *t = &mg->mg_metaslab_tree;
1165 1165 uint64_t activation_weight;
1166 1166 uint64_t target_distance;
1167 1167 int i;
1168 1168
1169 1169 activation_weight = METASLAB_WEIGHT_PRIMARY;
1170 1170 for (i = 0; i < d; i++) {
1171 1171 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1172 1172 activation_weight = METASLAB_WEIGHT_SECONDARY;
1173 1173 break;
1174 1174 }
1175 1175 }
1176 1176
1177 1177 for (;;) {
1178 1178 boolean_t was_active;
1179 1179
1180 1180 mutex_enter(&mg->mg_lock);
1181 1181 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1182 1182 if (msp->ms_weight < asize) {
1183 1183 spa_dbgmsg(spa, "%s: failed to meet weight "
1184 1184 "requirement: vdev %llu, txg %llu, mg %p, "
1185 1185 "msp %p, psize %llu, asize %llu, "
1186 1186 "failures %llu, weight %llu",
1187 1187 spa_name(spa), mg->mg_vd->vdev_id, txg,
1188 1188 mg, msp, psize, asize,
1189 1189 mg->mg_alloc_failures, msp->ms_weight);
1190 1190 mutex_exit(&mg->mg_lock);
1191 1191 return (-1ULL);
1192 1192 }
1193 1193 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1194 1194 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1195 1195 break;
1196 1196
1197 1197 target_distance = min_distance +
1198 1198 (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
1199 1199
1200 1200 for (i = 0; i < d; i++)
1201 1201 if (metaslab_distance(msp, &dva[i]) <
1202 1202 target_distance)
1203 1203 break;
1204 1204 if (i == d)
1205 1205 break;
1206 1206 }
1207 1207 mutex_exit(&mg->mg_lock);
1208 1208 if (msp == NULL)
1209 1209 return (-1ULL);
1210 1210
1211 1211 /*
1212 1212 * If we've already reached the allowable number of failed
1213 1213 * allocation attempts on this metaslab group then we
1214 1214 * consider skipping it. We skip it only if we're allowed
1215 1215 * to "fast" gang, the physical size is larger than
1216 1216 * a gang block, and we're attempting to allocate from
1217 1217 * the primary metaslab.
1218 1218 */
1219 1219 if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1220 1220 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1221 1221 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1222 1222 spa_dbgmsg(spa, "%s: skipping metaslab group: "
1223 1223 "vdev %llu, txg %llu, mg %p, psize %llu, "
1224 1224 "asize %llu, failures %llu", spa_name(spa),
1225 1225 mg->mg_vd->vdev_id, txg, mg, psize, asize,
1226 1226 mg->mg_alloc_failures);
1227 1227 return (-1ULL);
1228 1228 }
1229 1229
1230 1230 mutex_enter(&msp->ms_lock);
1231 1231
1232 1232 /*
1233 1233 * Ensure that the metaslab we have selected is still
1234 1234 * capable of handling our request. It's possible that
1235 1235 * another thread may have changed the weight while we
1236 1236 * were blocked on the metaslab lock.
1237 1237 */
1238 1238 if (msp->ms_weight < asize || (was_active &&
1239 1239 !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1240 1240 activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1241 1241 mutex_exit(&msp->ms_lock);
1242 1242 continue;
1243 1243 }
1244 1244
1245 1245 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1246 1246 activation_weight == METASLAB_WEIGHT_PRIMARY) {
1247 1247 metaslab_passivate(msp,
1248 1248 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1249 1249 mutex_exit(&msp->ms_lock);
1250 1250 continue;
1251 1251 }
1252 1252
1253 1253 if (metaslab_activate(msp, activation_weight) != 0) {
1254 1254 mutex_exit(&msp->ms_lock);
1255 1255 continue;
1256 1256 }
1257 1257
1258 1258 if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
1259 1259 break;
1260 1260
1261 1261 atomic_inc_64(&mg->mg_alloc_failures);
1262 1262
1263 1263 metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
1264 1264
1265 1265 mutex_exit(&msp->ms_lock);
1266 1266 }
1267 1267
1268 1268 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
1269 1269 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1270 1270
1271 1271 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
1272 1272
1273 1273 mutex_exit(&msp->ms_lock);
1274 1274
1275 1275 return (offset);
1276 1276 }
1277 1277
1278 1278 /*
1279 1279 * Allocate a block for the specified i/o.
1280 1280 */
1281 1281 static int
1282 1282 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1283 1283 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1284 1284 {
1285 1285 metaslab_group_t *mg, *rotor;
1286 1286 vdev_t *vd;
1287 1287 int dshift = 3;
1288 1288 int all_zero;
1289 1289 int zio_lock = B_FALSE;
1290 1290 boolean_t allocatable;
1291 1291 uint64_t offset = -1ULL;
1292 1292 uint64_t asize;
1293 1293 uint64_t distance;
1294 1294
1295 1295 ASSERT(!DVA_IS_VALID(&dva[d]));
1296 1296
1297 1297 /*
1298 1298 * For testing, make some blocks above a certain size be gang blocks.
1299 1299 */
1300 1300 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1301 1301 return (ENOSPC);
1302 1302
1303 1303 /*
1304 1304 * Start at the rotor and loop through all mgs until we find something.
1305 1305 * Note that there's no locking on mc_rotor or mc_aliquot because
1306 1306 * nothing actually breaks if we miss a few updates -- we just won't
1307 1307 * allocate quite as evenly. It all balances out over time.
1308 1308 *
1309 1309 * If we are doing ditto or log blocks, try to spread them across
1310 1310 * consecutive vdevs. If we're forced to reuse a vdev before we've
1311 1311 * allocated all of our ditto blocks, then try and spread them out on
1312 1312 * that vdev as much as possible. If it turns out to not be possible,
1313 1313 * gradually lower our standards until anything becomes acceptable.
1314 1314 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1315 1315 * gives us hope of containing our fault domains to something we're
1316 1316 * able to reason about. Otherwise, any two top-level vdev failures
1317 1317 * will guarantee the loss of data. With consecutive allocation,
1318 1318 * only two adjacent top-level vdev failures will result in data loss.
1319 1319 *
1320 1320 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1321 1321 * ourselves on the same vdev as our gang block header. That
1322 1322 * way, we can hope for locality in vdev_cache, plus it makes our
1323 1323 * fault domains something tractable.
1324 1324 */
1325 1325 if (hintdva) {
1326 1326 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1327 1327
1328 1328 /*
1329 1329 * It's possible the vdev we're using as the hint no
1330 1330 * longer exists (i.e. removed). Consult the rotor when
1331 1331 * all else fails.
1332 1332 */
1333 1333 if (vd != NULL) {
1334 1334 mg = vd->vdev_mg;
1335 1335
1336 1336 if (flags & METASLAB_HINTBP_AVOID &&
1337 1337 mg->mg_next != NULL)
1338 1338 mg = mg->mg_next;
1339 1339 } else {
1340 1340 mg = mc->mc_rotor;
1341 1341 }
1342 1342 } else if (d != 0) {
1343 1343 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1344 1344 mg = vd->vdev_mg->mg_next;
1345 1345 } else {
1346 1346 mg = mc->mc_rotor;
1347 1347 }
1348 1348
1349 1349 /*
1350 1350 * If the hint put us into the wrong metaslab class, or into a
1351 1351 * metaslab group that has been passivated, just follow the rotor.
1352 1352 */
1353 1353 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1354 1354 mg = mc->mc_rotor;
1355 1355
1356 1356 rotor = mg;
1357 1357 top:
1358 1358 all_zero = B_TRUE;
1359 1359 do {
1360 1360 ASSERT(mg->mg_activation_count == 1);
1361 1361
1362 1362 vd = mg->mg_vd;
1363 1363
1364 1364 /*
1365 1365 * Don't allocate from faulted devices.
1366 1366 */
1367 1367 if (zio_lock) {
1368 1368 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1369 1369 allocatable = vdev_allocatable(vd);
1370 1370 spa_config_exit(spa, SCL_ZIO, FTAG);
1371 1371 } else {
1372 1372 allocatable = vdev_allocatable(vd);
1373 1373 }
1374 1374 if (!allocatable)
1375 1375 goto next;
1376 1376
1377 1377 /*
1378 1378 * Avoid writing single-copy data to a failing vdev
1379 1379 */
1380 1380 if ((vd->vdev_stat.vs_write_errors > 0 ||
1381 1381 vd->vdev_state < VDEV_STATE_HEALTHY) &&
1382 1382 d == 0 && dshift == 3) {
1383 1383 all_zero = B_FALSE;
1384 1384 goto next;
1385 1385 }
1386 1386
1387 1387 ASSERT(mg->mg_class == mc);
1388 1388
1389 1389 distance = vd->vdev_asize >> dshift;
1390 1390 if (distance <= (1ULL << vd->vdev_ms_shift))
1391 1391 distance = 0;
1392 1392 else
1393 1393 all_zero = B_FALSE;
1394 1394
1395 1395 asize = vdev_psize_to_asize(vd, psize);
1396 1396 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1397 1397
1398 1398 offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1399 1399 dva, d, flags);
1400 1400 if (offset != -1ULL) {
1401 1401 /*
1402 1402 * If we've just selected this metaslab group,
1403 1403 * figure out whether the corresponding vdev is
1404 1404 * over- or under-used relative to the pool,
1405 1405 * and set an allocation bias to even it out.
1406 1406 */
1407 1407 if (mc->mc_aliquot == 0) {
1408 1408 vdev_stat_t *vs = &vd->vdev_stat;
1409 1409 int64_t vu, cu;
1410 1410
1411 1411 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1412 1412 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1413 1413
1414 1414 /*
1415 1415 * Calculate how much more or less we should
1416 1416 * try to allocate from this device during
1417 1417 * this iteration around the rotor.
1418 1418 * For example, if a device is 80% full
1419 1419 * and the pool is 20% full then we should
1420 1420 * reduce allocations by 60% on this device.
1421 1421 *
1422 1422 * mg_bias = (20 - 80) * 512K / 100 = -307K
1423 1423 *
1424 1424 * This reduces allocations by 307K for this
1425 1425 * iteration.
1426 1426 */
1427 1427 mg->mg_bias = ((cu - vu) *
1428 1428 (int64_t)mg->mg_aliquot) / 100;
1429 1429 }
1430 1430
1431 1431 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1432 1432 mg->mg_aliquot + mg->mg_bias) {
1433 1433 mc->mc_rotor = mg->mg_next;
1434 1434 mc->mc_aliquot = 0;
1435 1435 }
1436 1436
1437 1437 DVA_SET_VDEV(&dva[d], vd->vdev_id);
1438 1438 DVA_SET_OFFSET(&dva[d], offset);
1439 1439 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1440 1440 DVA_SET_ASIZE(&dva[d], asize);
1441 1441
1442 1442 return (0);
1443 1443 }
1444 1444 next:
1445 1445 mc->mc_rotor = mg->mg_next;
1446 1446 mc->mc_aliquot = 0;
1447 1447 } while ((mg = mg->mg_next) != rotor);
1448 1448
1449 1449 if (!all_zero) {
1450 1450 dshift++;
1451 1451 ASSERT(dshift < 64);
1452 1452 goto top;
1453 1453 }
1454 1454
1455 1455 if (!allocatable && !zio_lock) {
1456 1456 dshift = 3;
1457 1457 zio_lock = B_TRUE;
1458 1458 goto top;
1459 1459 }
1460 1460
1461 1461 bzero(&dva[d], sizeof (dva_t));
1462 1462
1463 1463 return (ENOSPC);
1464 1464 }
1465 1465
1466 1466 /*
1467 1467 * Free the block represented by DVA in the context of the specified
1468 1468 * transaction group.
1469 1469 */
1470 1470 static void
1471 1471 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1472 1472 {
1473 1473 uint64_t vdev = DVA_GET_VDEV(dva);
1474 1474 uint64_t offset = DVA_GET_OFFSET(dva);
1475 1475 uint64_t size = DVA_GET_ASIZE(dva);
1476 1476 vdev_t *vd;
1477 1477 metaslab_t *msp;
1478 1478
1479 1479 ASSERT(DVA_IS_VALID(dva));
1480 1480
1481 1481 if (txg > spa_freeze_txg(spa))
1482 1482 return;
1483 1483
1484 1484 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1485 1485 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
1486 1486 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
1487 1487 (u_longlong_t)vdev, (u_longlong_t)offset);
1488 1488 ASSERT(0);
1489 1489 return;
1490 1490 }
1491 1491
1492 1492 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1493 1493
1494 1494 if (DVA_GET_GANG(dva))
1495 1495 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1496 1496
1497 1497 mutex_enter(&msp->ms_lock);
1498 1498
1499 1499 if (now) {
1500 1500 space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
1501 1501 offset, size);
1502 1502 space_map_free(&msp->ms_map, offset, size);
1503 1503 } else {
1504 1504 if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
1505 1505 vdev_dirty(vd, VDD_METASLAB, msp, txg);
1506 1506 space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
1507 1507 }
1508 1508
1509 1509 mutex_exit(&msp->ms_lock);
1510 1510 }
1511 1511
1512 1512 /*
1513 1513 * Intent log support: upon opening the pool after a crash, notify the SPA
1514 1514 * of blocks that the intent log has allocated for immediate write, but
1515 1515 * which are still considered free by the SPA because the last transaction
1516 1516 * group didn't commit yet.
1517 1517 */
1518 1518 static int
1519 1519 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
1520 1520 {
1521 1521 uint64_t vdev = DVA_GET_VDEV(dva);
1522 1522 uint64_t offset = DVA_GET_OFFSET(dva);
1523 1523 uint64_t size = DVA_GET_ASIZE(dva);
1524 1524 vdev_t *vd;
1525 1525 metaslab_t *msp;
1526 1526 int error = 0;
1527 1527
1528 1528 ASSERT(DVA_IS_VALID(dva));
1529 1529
1530 1530 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1531 1531 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
1532 1532 return (ENXIO);
1533 1533
1534 1534 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1535 1535
1536 1536 if (DVA_GET_GANG(dva))
1537 1537 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1538 1538
1539 1539 mutex_enter(&msp->ms_lock);
1540 1540
1541 1541 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
1542 1542 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
1543 1543
1544 1544 if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
1545 1545 error = ENOENT;
1546 1546
1547 1547 if (error || txg == 0) { /* txg == 0 indicates dry run */
1548 1548 mutex_exit(&msp->ms_lock);
1549 1549 return (error);
1550 1550 }
1551 1551
1552 1552 space_map_claim(&msp->ms_map, offset, size);
1553 1553
1554 1554 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
1555 1555 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
1556 1556 vdev_dirty(vd, VDD_METASLAB, msp, txg);
1557 1557 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
1558 1558 }
1559 1559
1560 1560 mutex_exit(&msp->ms_lock);
1561 1561
1562 1562 return (0);
1563 1563 }
1564 1564
1565 1565 int
1566 1566 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
1567 1567 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
1568 1568 {
1569 1569 dva_t *dva = bp->blk_dva;
1570 1570 dva_t *hintdva = hintbp->blk_dva;
1571 1571 int error = 0;
1572 1572
1573 1573 ASSERT(bp->blk_birth == 0);
1574 1574 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
1575 1575
1576 1576 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1577 1577
1578 1578 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
1579 1579 spa_config_exit(spa, SCL_ALLOC, FTAG);
1580 1580 return (ENOSPC);
1581 1581 }
1582 1582
1583 1583 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
1584 1584 ASSERT(BP_GET_NDVAS(bp) == 0);
1585 1585 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
1586 1586
1587 1587 for (int d = 0; d < ndvas; d++) {
1588 1588 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
1589 1589 txg, flags);
1590 1590 if (error) {
1591 1591 for (d--; d >= 0; d--) {
1592 1592 metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
1593 1593 bzero(&dva[d], sizeof (dva_t));
1594 1594 }
1595 1595 spa_config_exit(spa, SCL_ALLOC, FTAG);
1596 1596 return (error);
1597 1597 }
1598 1598 }
1599 1599 ASSERT(error == 0);
1600 1600 ASSERT(BP_GET_NDVAS(bp) == ndvas);
1601 1601
1602 1602 spa_config_exit(spa, SCL_ALLOC, FTAG);
1603 1603
1604 1604 BP_SET_BIRTH(bp, txg, txg);
1605 1605
1606 1606 return (0);
1607 1607 }
1608 1608
1609 1609 void
1610 1610 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
1611 1611 {
1612 1612 const dva_t *dva = bp->blk_dva;
1613 1613 int ndvas = BP_GET_NDVAS(bp);
1614 1614
1615 1615 ASSERT(!BP_IS_HOLE(bp));
1616 1616 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
1617 1617
1618 1618 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
1619 1619
1620 1620 for (int d = 0; d < ndvas; d++)
1621 1621 metaslab_free_dva(spa, &dva[d], txg, now);
1622 1622
1623 1623 spa_config_exit(spa, SCL_FREE, FTAG);
1624 1624 }
1625 1625
1626 1626 int
1627 1627 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
1628 1628 {
1629 1629 const dva_t *dva = bp->blk_dva;
1630 1630 int ndvas = BP_GET_NDVAS(bp);
1631 1631 int error = 0;
1632 1632
1633 1633 ASSERT(!BP_IS_HOLE(bp));
1634 1634
1635 1635 if (txg != 0) {
1636 1636 /*
1637 1637 * First do a dry run to make sure all DVAs are claimable,
1638 1638 * so we don't have to unwind from partial failures below.
1639 1639 */
1640 1640 if ((error = metaslab_claim(spa, bp, 0)) != 0)
1641 1641 return (error);
1642 1642 }
1643 1643
1644 1644 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1645 1645
1646 1646 for (int d = 0; d < ndvas; d++)
1647 1647 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
1648 1648 break;
1649 1649
1650 1650 spa_config_exit(spa, SCL_ALLOC, FTAG);
1651 1651
1652 1652 ASSERT(error == 0 || txg == 0);
1653 1653
1654 1654 return (error);
1655 1655 }