Print this page
3954 metaslabs continue to load even after hitting zfs_mg_alloc_failure limit
4080 zpool clear fails to clear pool
4081 need zfs_mg_noalloc_threshold
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>

*** 56,70 **** int zfs_condense_pct = 200; /* * This value defines the number of allowed allocation failures per vdev. * If a device reaches this threshold in a given txg then we consider skipping ! * allocations on that device. */ ! int zfs_mg_alloc_failures; /* * Metaslab debugging: when set, keeps all space maps in core to verify frees. */ static int metaslab_debug = 0; /* --- 56,86 ---- int zfs_condense_pct = 200; /* * This value defines the number of allowed allocation failures per vdev. * If a device reaches this threshold in a given txg then we consider skipping ! * allocations on that device. The value of zfs_mg_alloc_failures is computed ! * in zio_init() unless it has been overridden in /etc/system. */ ! int zfs_mg_alloc_failures = 0; /* + * The zfs_mg_noalloc_threshold defines which metaslab groups should + * be eligible for allocation. The value is defined as a percentage of + * free space. Metaslab groups that have more free space than + * zfs_mg_noalloc_threshold are always eligible for allocations. Once + * a metaslab group's free space is less than or equal to the + * zfs_mg_noalloc_threshold the allocator will avoid allocating to that + * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. + * Once all groups in the pool reach zfs_mg_noalloc_threshold then all + * groups are allowed to accept allocations. Gang blocks are always + * eligible to allocate on any metaslab group. The default value of 0 means + * no metaslab group will be excluded based on this criterion. + */ + int zfs_mg_noalloc_threshold = 0; + + /* * Metaslab debugging: when set, keeps all space maps in core to verify frees. */ static int metaslab_debug = 0; /*
*** 222,231 **** --- 238,294 ---- ASSERT3P(m1, ==, m2); return (0); } + /* + * Update the allocatable flag and the metaslab group's free capacity. + * The free capacity is computed as a percentage of vs_space (the + * divisor is vs_space + 1, so it is nonzero even when vs_space is 0). + * The allocatable flag is set to true if the free capacity is above + * the zfs_mg_noalloc_threshold. If a metaslab group transitions + * from allocatable to non-allocatable or vice versa then the metaslab + * group's class is updated to reflect the transition. The update is + * performed while holding mg_lock. + */ + static void + metaslab_group_alloc_update(metaslab_group_t *mg) + { + vdev_t *vd = mg->mg_vd; + metaslab_class_t *mc = mg->mg_class; + vdev_stat_t *vs = &vd->vdev_stat; + boolean_t was_allocatable; + + ASSERT(vd == vd->vdev_top); + + mutex_enter(&mg->mg_lock); + was_allocatable = mg->mg_allocatable; + + mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / + (vs->vs_space + 1); + + mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold); + + /* + * The mc_alloc_groups maintains a count of the number of + * groups in this metaslab class that are still above the + * zfs_mg_noalloc_threshold. This is used by the allocating + * threads to determine if they should avoid allocations to + * a given group. The allocator will avoid allocations to a group + * if that group has reached or is below the zfs_mg_noalloc_threshold + * and there are still other groups that are above the threshold. + * When a group transitions from allocatable to non-allocatable or + * vice versa we update the metaslab class to reflect that change. + * When the mc_alloc_groups value drops to 0 that means that all + * groups have reached the zfs_mg_noalloc_threshold making all groups + * eligible for allocations. This effectively means that all devices + * are balanced again. + * + * NOTE(review): mc_alloc_groups is a per-class counter but is only + * protected here by the per-group mg_lock; confirm that callers are + * otherwise serialized. + */ + if (was_allocatable && !mg->mg_allocatable) + mc->mc_alloc_groups--; + else if (!was_allocatable && mg->mg_allocatable) + mc->mc_alloc_groups++; + mutex_exit(&mg->mg_lock); + } + metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) { metaslab_group_t *mg;
*** 272,281 **** --- 335,345 ---- if (++mg->mg_activation_count <= 0) return; mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); + metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_rotor) == NULL) { mg->mg_prev = mg; mg->mg_next = mg; } else {
*** 357,366 **** --- 421,453 ---- avl_add(&mg->mg_metaslab_tree, msp); mutex_exit(&mg->mg_lock); } /* + * Determine if a given metaslab group is eligible for allocations; returns + * true when it is and false when it should be skipped. A metaslab + * group should avoid allocations if its free capacity has dropped to or + * below the zfs_mg_noalloc_threshold and there is at least one metaslab + * group that can still handle allocations. + */ + static boolean_t + metaslab_group_allocatable(metaslab_group_t *mg) + { + vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; + metaslab_class_t *mc = mg->mg_class; + + /* + * A metaslab group is considered allocatable if its free capacity + * is greater than the set value of zfs_mg_noalloc_threshold, its + * class is not the pool's normal class (e.g. it is associated with + * a slog), or there are no metaslab groups left in its class + * with free capacity greater than zfs_mg_noalloc_threshold. + */ + return (mg->mg_free_capacity > zfs_mg_noalloc_threshold || + mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); + } + + /* * ========================================================================== * Common allocator routines * ========================================================================== */ static int
*** 1305,1314 **** --- 1392,1403 ---- metaslab_sync_reassess(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; int64_t failures = mg->mg_alloc_failures; + metaslab_group_alloc_update(mg); + /* * Re-evaluate all metaslabs which have lower offsets than the * bonus area. */ for (int m = 0; m < vd->vdev_ms_count; m++) {
*** 1406,1415 **** --- 1495,1506 ---- } mutex_exit(&mg->mg_lock); if (msp == NULL) return (-1ULL); + mutex_enter(&msp->ms_lock); + /* * If we've already reached the allowable number of failed * allocation attempts on this metaslab group then we * consider skipping it. We skip it only if we're allowed * to "fast" gang, the physical size is larger than
*** 1422,1436 **** spa_dbgmsg(spa, "%s: skipping metaslab group: " "vdev %llu, txg %llu, mg %p, psize %llu, " "asize %llu, failures %llu", spa_name(spa), mg->mg_vd->vdev_id, txg, mg, psize, asize, mg->mg_alloc_failures); return (-1ULL); } - mutex_enter(&msp->ms_lock); - /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we * were blocked on the metaslab lock. --- 1513,1526 ---- spa_dbgmsg(spa, "%s: skipping metaslab group: " "vdev %llu, txg %llu, mg %p, psize %llu, " "asize %llu, failures %llu", spa_name(spa), mg->mg_vd->vdev_id, txg, mg, psize, asize, mg->mg_alloc_failures); + mutex_exit(&msp->ms_lock); return (-1ULL); } /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we * were blocked on the metaslab lock.
*** 1579,1588 **** --- 1669,1693 ---- allocatable = vdev_allocatable(vd); spa_config_exit(spa, SCL_ZIO, FTAG); } else { allocatable = vdev_allocatable(vd); } + + /* + * Determine if the selected metaslab group is eligible + * for allocations. If we're ganging or have requested + * an allocation for the smallest gang block size + * then we don't want to avoid allocating to this + * metaslab group. If we're in this condition we should + * try to allocate from any device possible so that we + * don't inadvertently return ENOSPC and suspend the pool + * even though space is still available. + */ + if (allocatable && CAN_FASTGANG(flags) && + psize > SPA_GANGBLOCKSIZE) + allocatable = metaslab_group_allocatable(mg); + if (!allocatable) goto next; /* * Avoid writing single-copy data to a failing vdev