3954 metaslabs continue to load even after hitting zfs_mg_alloc_failure limit
4080 zpool clear fails to clear pool
4081 need zfs_mg_noalloc_threshold
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>

          --- old/usr/src/uts/common/fs/zfs/metaslab.c
          +++ new/usr/src/uts/common/fs/zfs/metaslab.c
(50 lines elided)
  51   51   * The in-core space map representation is more compact than its on-disk form.
  52   52   * The zfs_condense_pct determines how much more compact the in-core
  53   53   * space_map representation must be before we compact it on-disk.
  54   54   * Values should be greater than or equal to 100.
  55   55   */
  56   56  int zfs_condense_pct = 200;
  57   57  
  58   58  /*
  59   59   * This value defines the number of allowed allocation failures per vdev.
  60   60   * If a device reaches this threshold in a given txg then we consider skipping
  61      - * allocations on that device.
       61 + * allocations on that device. The value of zfs_mg_alloc_failures is computed
       62 + * in zio_init() unless it has been overridden in /etc/system.
  62   63   */
  63      -int zfs_mg_alloc_failures;
       64 +int zfs_mg_alloc_failures = 0;
  64   65  
  65   66  /*
       67 + * The zfs_mg_noalloc_threshold defines which metaslab groups should
       68 + * be eligible for allocation. The value is defined as a percentage of
        69 + * free space. Metaslab groups that have more free space than
       70 + * zfs_mg_noalloc_threshold are always eligible for allocations. Once
       71 + * a metaslab group's free space is less than or equal to the
       72 + * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
       73 + * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
       74 + * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
       75 + * groups are allowed to accept allocations. Gang blocks are always
       76 + * eligible to allocate on any metaslab group. The default value of 0 means
       77 + * no metaslab group will be excluded based on this criterion.
       78 + */
       79 +int zfs_mg_noalloc_threshold = 0;
       80 +
       81 +/*
  66   82   * Metaslab debugging: when set, keeps all space maps in core to verify frees.
  67   83   */
  68   84  static int metaslab_debug = 0;
  69   85  
  70   86  /*
  71   87   * Minimum size which forces the dynamic allocator to change
  72   88   * its allocation strategy.  Once the space map cannot satisfy
  73   89   * an allocation of this size then it switches to using a more
  74   90   * aggressive strategy (i.e., search by size rather than offset).
  75   91   */
(141 lines elided)
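As the comments above note, zfs_mg_alloc_failures is normally computed in zio_init() and zfs_mg_noalloc_threshold defaults to 0, and both may be overridden in /etc/system. A hypothetical override would look like the following; the values are purely illustrative, not tuning recommendations:

        * illustrative /etc/system entries for the two tunables declared above
        set zfs:zfs_mg_alloc_failures = 10
        set zfs:zfs_mg_noalloc_threshold = 5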
 217  233          if (m1->ms_map->sm_start < m2->ms_map->sm_start)
 218  234                  return (-1);
 219  235          if (m1->ms_map->sm_start > m2->ms_map->sm_start)
 220  236                  return (1);
 221  237  
 222  238          ASSERT3P(m1, ==, m2);
 223  239  
 224  240          return (0);
 225  241  }
 226  242  
      243 +/*
      244 + * Update the allocatable flag and the metaslab group's capacity.
       245 + * The allocatable flag is set to true if the group's free capacity is
       246 + * above the zfs_mg_noalloc_threshold. If a metaslab group transitions
      247 + * from allocatable to non-allocatable or vice versa then the metaslab
      248 + * group's class is updated to reflect the transition.
      249 + */
      250 +static void
      251 +metaslab_group_alloc_update(metaslab_group_t *mg)
      252 +{
      253 +        vdev_t *vd = mg->mg_vd;
      254 +        metaslab_class_t *mc = mg->mg_class;
      255 +        vdev_stat_t *vs = &vd->vdev_stat;
      256 +        boolean_t was_allocatable;
      257 +
      258 +        ASSERT(vd == vd->vdev_top);
      259 +
      260 +        mutex_enter(&mg->mg_lock);
      261 +        was_allocatable = mg->mg_allocatable;
      262 +
      263 +        mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
      264 +            (vs->vs_space + 1);
      265 +
      266 +        mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
      267 +
      268 +        /*
      269 +         * The mc_alloc_groups maintains a count of the number of
      270 +         * groups in this metaslab class that are still above the
      271 +         * zfs_mg_noalloc_threshold. This is used by the allocating
      272 +         * threads to determine if they should avoid allocations to
      273 +         * a given group. The allocator will avoid allocations to a group
      274 +         * if that group has reached or is below the zfs_mg_noalloc_threshold
      275 +         * and there are still other groups that are above the threshold.
      276 +         * When a group transitions from allocatable to non-allocatable or
      277 +         * vice versa we update the metaslab class to reflect that change.
      278 +         * When the mc_alloc_groups value drops to 0 that means that all
      279 +         * groups have reached the zfs_mg_noalloc_threshold making all groups
      280 +         * eligible for allocations. This effectively means that all devices
      281 +         * are balanced again.
      282 +         */
      283 +        if (was_allocatable && !mg->mg_allocatable)
      284 +                mc->mc_alloc_groups--;
      285 +        else if (!was_allocatable && mg->mg_allocatable)
      286 +                mc->mc_alloc_groups++;
      287 +        mutex_exit(&mg->mg_lock);
      288 +}
      289 +
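The capacity arithmetic in metaslab_group_alloc_update() above is all integer math, with the "+ 1" guarding against a zero-sized vdev, so the percentage rounds down. A minimal standalone sketch of the same computation, using invented vdev figures (illustrative only, not part of the patch):

        #include <stdio.h>
        #include <stdint.h>

        int
        main(void)
        {
                uint64_t vs_space = 1000ULL << 30;      /* 1000 GiB of vdev space */
                uint64_t vs_alloc = 955ULL << 30;       /* 955 GiB already allocated */
                uint64_t threshold = 5;                 /* zfs_mg_noalloc_threshold of 5% */

                /* Same expression as the patch: percentage of free space. */
                uint64_t free_capacity = ((vs_space - vs_alloc) * 100) /
                    (vs_space + 1);

                /* Prints "free capacity = 4%, allocatable = 0". */
                printf("free capacity = %llu%%, allocatable = %d\n",
                    (unsigned long long)free_capacity,
                    free_capacity > threshold);
                return (0);
        }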
 227  290  metaslab_group_t *
 228  291  metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 229  292  {
 230  293          metaslab_group_t *mg;
 231  294  
 232  295          mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 233  296          mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 234  297          avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 235  298              sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
 236  299          mg->mg_vd = vd;
(30 lines elided)
 267  330  
 268  331          ASSERT(mc->mc_rotor != mg);
 269  332          ASSERT(mg->mg_prev == NULL);
 270  333          ASSERT(mg->mg_next == NULL);
 271  334          ASSERT(mg->mg_activation_count <= 0);
 272  335  
 273  336          if (++mg->mg_activation_count <= 0)
 274  337                  return;
 275  338  
 276  339          mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
      340 +        metaslab_group_alloc_update(mg);
 277  341  
 278  342          if ((mgprev = mc->mc_rotor) == NULL) {
 279  343                  mg->mg_prev = mg;
 280  344                  mg->mg_next = mg;
 281  345          } else {
 282  346                  mgnext = mgprev->mg_next;
 283  347                  mg->mg_prev = mgprev;
 284  348                  mg->mg_next = mgnext;
 285  349                  mgprev->mg_next = mg;
 286  350                  mgnext->mg_prev = mg;
(65 lines elided)
 352  416  
 353  417          mutex_enter(&mg->mg_lock);
 354  418          ASSERT(msp->ms_group == mg);
 355  419          avl_remove(&mg->mg_metaslab_tree, msp);
 356  420          msp->ms_weight = weight;
 357  421          avl_add(&mg->mg_metaslab_tree, msp);
 358  422          mutex_exit(&mg->mg_lock);
 359  423  }
 360  424  
 361  425  /*
      426 + * Determine if a given metaslab group should skip allocations. A metaslab
       427 + * group should avoid allocations if its free capacity has dropped to or
       428 + * below the zfs_mg_noalloc_threshold and there is at least one metaslab group
      429 + * that can still handle allocations.
      430 + */
      431 +static boolean_t
      432 +metaslab_group_allocatable(metaslab_group_t *mg)
      433 +{
      434 +        vdev_t *vd = mg->mg_vd;
      435 +        spa_t *spa = vd->vdev_spa;
      436 +        metaslab_class_t *mc = mg->mg_class;
      437 +
      438 +        /*
      439 +         * A metaslab group is considered allocatable if its free capacity
      440 +         * is greater than the set value of zfs_mg_noalloc_threshold, it's
      441 +         * associated with a slog, or there are no other metaslab groups
      442 +         * with free capacity greater than zfs_mg_noalloc_threshold.
      443 +         */
      444 +        return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
      445 +            mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
      446 +}
      447 +
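The subtle part of metaslab_group_allocatable() is the class-wide fallback: a group at or below the threshold is skipped only while some other group in the normal class is still above it, which is what the mc_alloc_groups counter tracks. A toy model of that policy (not ZFS code; the group capacities are invented):

        #include <stdio.h>

        #define NGROUPS 3

        int
        main(void)
        {
                int threshold = 5;                              /* zfs_mg_noalloc_threshold */
                int free_capacity[NGROUPS] = { 12, 4, 9 };      /* percent free per group */
                int alloc_groups = 0;                           /* mc_alloc_groups analogue */

                for (int g = 0; g < NGROUPS; g++)
                        if (free_capacity[g] > threshold)
                                alloc_groups++;

                for (int g = 0; g < NGROUPS; g++) {
                        /* Eligible if above the threshold, or if no group is. */
                        int eligible = (free_capacity[g] > threshold ||
                            alloc_groups == 0);
                        printf("group %d: %d%% free, eligible = %d\n",
                            g, free_capacity[g], eligible);
                }
                return (0);
        }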
      448 +/*
 362  449   * ==========================================================================
 363  450   * Common allocator routines
 364  451   * ==========================================================================
 365  452   */
 366  453  static int
 367  454  metaslab_segsize_compare(const void *x1, const void *x2)
 368  455  {
 369  456          const space_seg_t *s1 = x1;
 370  457          const space_seg_t *s2 = x2;
 371  458          uint64_t ss_size1 = s1->ss_end - s1->ss_start;
(928 lines elided)
1300 1387  
1301 1388          mutex_exit(&msp->ms_lock);
1302 1389  }
1303 1390  
1304 1391  void
1305 1392  metaslab_sync_reassess(metaslab_group_t *mg)
1306 1393  {
1307 1394          vdev_t *vd = mg->mg_vd;
1308 1395          int64_t failures = mg->mg_alloc_failures;
1309 1396  
     1397 +        metaslab_group_alloc_update(mg);
     1398 +
1310 1399          /*
1311 1400           * Re-evaluate all metaslabs which have lower offsets than the
1312 1401           * bonus area.
1313 1402           */
1314 1403          for (int m = 0; m < vd->vdev_ms_count; m++) {
1315 1404                  metaslab_t *msp = vd->vdev_ms[m];
1316 1405  
1317 1406                  if (msp->ms_map->sm_start > mg->mg_bonus_area)
1318 1407                          break;
1319 1408  
(81 lines elided)
1401 1490                                  if (metaslab_distance(msp, &dva[i]) <
1402 1491                                      target_distance)
1403 1492                                          break;
1404 1493                          if (i == d)
1405 1494                                  break;
1406 1495                  }
1407 1496                  mutex_exit(&mg->mg_lock);
1408 1497                  if (msp == NULL)
1409 1498                          return (-1ULL);
1410 1499  
     1500 +                mutex_enter(&msp->ms_lock);
     1501 +
1411 1502                  /*
1412 1503                   * If we've already reached the allowable number of failed
1413 1504                   * allocation attempts on this metaslab group then we
1414 1505                   * consider skipping it. We skip it only if we're allowed
1415 1506                   * to "fast" gang, the physical size is larger than
1416 1507                   * a gang block, and we're attempting to allocate from
1417 1508                   * the primary metaslab.
1418 1509                   */
1419 1510                  if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1420 1511                      CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1421 1512                      activation_weight == METASLAB_WEIGHT_PRIMARY) {
1422 1513                          spa_dbgmsg(spa, "%s: skipping metaslab group: "
1423 1514                              "vdev %llu, txg %llu, mg %p, psize %llu, "
1424 1515                              "asize %llu, failures %llu", spa_name(spa),
1425 1516                              mg->mg_vd->vdev_id, txg, mg, psize, asize,
1426 1517                              mg->mg_alloc_failures);
     1518 +                        mutex_exit(&msp->ms_lock);
1427 1519                          return (-1ULL);
1428 1520                  }
1429 1521  
1430      -                mutex_enter(&msp->ms_lock);
1431      -
1432 1522                  /*
1433 1523                   * Ensure that the metaslab we have selected is still
1434 1524                   * capable of handling our request. It's possible that
1435 1525                   * another thread may have changed the weight while we
1436 1526                   * were blocked on the metaslab lock.
1437 1527                   */
1438 1528                  if (msp->ms_weight < asize || (was_active &&
1439 1529                      !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1440 1530                      activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1441 1531                          mutex_exit(&msp->ms_lock);
(132 lines elided)
1574 1664                  /*
1575 1665                   * Don't allocate from faulted devices.
1576 1666                   */
1577 1667                  if (zio_lock) {
1578 1668                          spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1579 1669                          allocatable = vdev_allocatable(vd);
1580 1670                          spa_config_exit(spa, SCL_ZIO, FTAG);
1581 1671                  } else {
1582 1672                          allocatable = vdev_allocatable(vd);
1583 1673                  }
     1674 +
     1675 +                /*
     1676 +                 * Determine if the selected metaslab group is eligible
     1677 +                 * for allocations. If we're ganging or have requested
     1678 +                 * an allocation for the smallest gang block size
      1679 +                 * then we don't want to avoid allocating to this
     1680 +                 * metaslab group. If we're in this condition we should
     1681 +                 * try to allocate from any device possible so that we
     1682 +                 * don't inadvertently return ENOSPC and suspend the pool
     1683 +                 * even though space is still available.
     1684 +                 */
     1685 +                if (allocatable && CAN_FASTGANG(flags) &&
     1686 +                    psize > SPA_GANGBLOCKSIZE)
     1687 +                        allocatable = metaslab_group_allocatable(mg);
     1688 +
1584 1689                  if (!allocatable)
1585 1690                          goto next;
1586 1691  
1587 1692                  /*
1588 1693                   * Avoid writing single-copy data to a failing vdev
1589 1694                   * unless the user instructs us that it is okay.
1590 1695                   */
1591 1696                  if ((vd->vdev_stat.vs_write_errors > 0 ||
1592 1697                      vd->vdev_state < VDEV_STATE_HEALTHY) &&
1593 1698                      d == 0 && dshift == 3 &&
(313 lines elided)