3954 metaslabs continue to load even after hitting zfs_mg_alloc_failure limit
4080 zpool clear fails to clear pool
4081 need zfs_mg_noalloc_threshold
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>

@@ -56,15 +56,31 @@
 int zfs_condense_pct = 200;
 
 /*
  * This value defines the number of allowed allocation failures per vdev.
  * If a device reaches this threshold in a given txg then we consider skipping
- * allocations on that device.
+ * allocations on that device. The value of zfs_mg_alloc_failures is computed
+ * in zio_init() unless it has been overridden in /etc/system.
  */
-int zfs_mg_alloc_failures;
+int zfs_mg_alloc_failures = 0;
 
 /*
+ * The zfs_mg_noalloc_threshold defines which metaslab groups should
+ * be eligible for allocation. The value is defined as a percentage of
+ * free space. Metaslab groups that have more free space than
+ * zfs_mg_noalloc_threshold are always eligible for allocations. Once
+ * a metaslab group's free space is less than or equal to the
+ * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
+ * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
+ * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
+ * groups are allowed to accept allocations. Gang blocks are always
+ * eligible to allocate on any metaslab group. The default value of 0 means
+ * no metaslab group will be excluded based on this criterion.
+ */
+int zfs_mg_noalloc_threshold = 0;
+
+/*
  * Metaslab debugging: when set, keeps all space maps in core to verify frees.
  */
 static int metaslab_debug = 0;
 
 /*

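Both tunables introduced above are plain global ints, so on illumos they can be overridden at boot through /etc/system rather than left to their defaults (zfs_mg_alloc_failures is otherwise computed in zio_init(); zfs_mg_noalloc_threshold otherwise stays 0). A minimal sketch, with illustrative values rather than tuning advice:

    * Allow at most 10 allocation failures per vdev per txg, and stop
    * allocating to any top-level vdev whose free capacity is at or
    * below 10% while emptier vdevs remain.
    set zfs:zfs_mg_alloc_failures = 10
    set zfs:zfs_mg_noalloc_threshold = 10
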
@@ -222,10 +238,57 @@
         ASSERT3P(m1, ==, m2);
 
         return (0);
 }
 
+/*
+ * Update the allocatable flag and the metaslab group's capacity.
+ * The allocatable flag is set to true if the group's free capacity
+ * is greater than the zfs_mg_noalloc_threshold. If a metaslab group
+ * transitions from allocatable to non-allocatable or vice versa then
+ * the metaslab group's class is updated to reflect the transition.
+ */
+static void
+metaslab_group_alloc_update(metaslab_group_t *mg)
+{
+        vdev_t *vd = mg->mg_vd;
+        metaslab_class_t *mc = mg->mg_class;
+        vdev_stat_t *vs = &vd->vdev_stat;
+        boolean_t was_allocatable;
+
+        ASSERT(vd == vd->vdev_top);
+
+        mutex_enter(&mg->mg_lock);
+        was_allocatable = mg->mg_allocatable;
+
+        mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
+            (vs->vs_space + 1);
+
+        mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
+
+        /*
+         * The mc_alloc_groups maintains a count of the number of
+         * groups in this metaslab class that are still above the
+         * zfs_mg_noalloc_threshold. This is used by the allocating
+         * threads to determine if they should avoid allocations to
+         * a given group. The allocator will avoid allocations to a group
+         * if that group has reached or is below the zfs_mg_noalloc_threshold
+         * and there are still other groups that are above the threshold.
+         * When a group transitions from allocatable to non-allocatable or
+         * vice versa we update the metaslab class to reflect that change.
+         * When the mc_alloc_groups value drops to 0 that means that all
+         * groups have reached the zfs_mg_noalloc_threshold making all groups
+         * eligible for allocations. This effectively means that all devices
+         * are balanced again.
+         */
+        if (was_allocatable && !mg->mg_allocatable)
+                mc->mc_alloc_groups--;
+        else if (!was_allocatable && mg->mg_allocatable)
+                mc->mc_alloc_groups++;
+        mutex_exit(&mg->mg_lock);
+}
+
 metaslab_group_t *
 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 {
         metaslab_group_t *mg;
 

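The capacity computation in metaslab_group_alloc_update() above is integer arithmetic, and the + 1 in the divisor guards against dividing by zero on a vdev reporting no space. A worked example with hypothetical numbers:

    uint64_t vs_space = 1000, vs_alloc = 960;   /* hypothetical stats */
    uint64_t cap = ((vs_space - vs_alloc) * 100) / (vs_space + 1);
    /*
     * cap == 3 (4000 / 1001, rounded down). With
     * zfs_mg_noalloc_threshold = 5, the test 3 > 5 fails, so
     * mg_allocatable becomes B_FALSE and mc_alloc_groups is
     * decremented if the group was previously allocatable.
     */
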
@@ -272,10 +335,11 @@
 
         if (++mg->mg_activation_count <= 0)
                 return;
 
         mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+        metaslab_group_alloc_update(mg);
 
         if ((mgprev = mc->mc_rotor) == NULL) {
                 mg->mg_prev = mg;
                 mg->mg_next = mg;
         } else {

@@ -357,10 +421,33 @@
         avl_add(&mg->mg_metaslab_tree, msp);
         mutex_exit(&mg->mg_lock);
 }
 
 /*
+ * Determine if a given metaslab group should skip allocations. A metaslab
+ * group should avoid allocations if its free capacity has fallen to or
+ * below the zfs_mg_noalloc_threshold and there is at least one other
+ * metaslab group that can still handle allocations.
+ */
+static boolean_t
+metaslab_group_allocatable(metaslab_group_t *mg)
+{
+        vdev_t *vd = mg->mg_vd;
+        spa_t *spa = vd->vdev_spa;
+        metaslab_class_t *mc = mg->mg_class;
+
+        /*
+         * A metaslab group is considered allocatable if its free capacity
+         * is greater than the set value of zfs_mg_noalloc_threshold, it's
+         * associated with a slog, or there are no other metaslab groups
+         * with free capacity greater than zfs_mg_noalloc_threshold.
+         */
+        return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
+            mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
+}
+
+/*
  * ==========================================================================
  * Common allocator routines
  * ==========================================================================
  */
 static int

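To make the three-way test in metaslab_group_allocatable() concrete, consider a hypothetical normal-class pool with three top-level vdevs and zfs_mg_noalloc_threshold = 10: while two groups hold more than 10% free, mc_alloc_groups is 2 and the third group returns B_FALSE, steering new writes to the emptier devices. A separate log device always returns B_TRUE because its class is not spa_normal_class(). Once the last group also drops to 10% or below, mc_alloc_groups reaches 0 and every group returns B_TRUE again, since at that point skipping any of them would gain nothing.
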
@@ -1305,10 +1392,12 @@
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
         vdev_t *vd = mg->mg_vd;
         int64_t failures = mg->mg_alloc_failures;
 
+        metaslab_group_alloc_update(mg);
+
         /*
          * Re-evaluate all metaslabs which have lower offsets than the
          * bonus area.
          */
         for (int m = 0; m < vd->vdev_ms_count; m++) {

@@ -1406,10 +1495,12 @@
                 }
                 mutex_exit(&mg->mg_lock);
                 if (msp == NULL)
                         return (-1ULL);
 
+                mutex_enter(&msp->ms_lock);
+
                 /*
                  * If we've already reached the allowable number of failed
                  * allocation attempts on this metaslab group then we
                  * consider skipping it. We skip it only if we're allowed
                  * to "fast" gang, the physical size is larger than

@@ -1422,15 +1513,14 @@
                         spa_dbgmsg(spa, "%s: skipping metaslab group: "
                             "vdev %llu, txg %llu, mg %p, psize %llu, "
                             "asize %llu, failures %llu", spa_name(spa),
                             mg->mg_vd->vdev_id, txg, mg, psize, asize,
                             mg->mg_alloc_failures);
+                        mutex_exit(&msp->ms_lock);
                         return (-1ULL);
                 }
 
-                mutex_enter(&msp->ms_lock);
-
                 /*
                  * Ensure that the metaslab we have selected is still
                  * capable of handling our request. It's possible that
                  * another thread may have changed the weight while we
                  * were blocked on the metaslab lock.

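A note on the two hunks above: mutex_enter(&msp->ms_lock) moves up so the failed-allocations check runs with the selected metaslab locked, and the skip path therefore gains a matching mutex_exit() before its -1ULL return. The old lock acquisition after the check is dropped, leaving a single acquisition that covers both this check and the weight re-check that follows.
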
@@ -1579,10 +1669,25 @@
                         allocatable = vdev_allocatable(vd);
                         spa_config_exit(spa, SCL_ZIO, FTAG);
                 } else {
                         allocatable = vdev_allocatable(vd);
                 }
+
+                /*
+                 * Determine if the selected metaslab group is eligible
+                 * for allocations. If we're ganging or have requested
+                 * an allocation for the smallest gang block size
+                 * then we don't want to avoid allocating to this
+                 * metaslab group. If we're in this condition we should
+                 * try to allocate from any device possible so that we
+                 * don't inadvertently return ENOSPC and suspend the pool
+                 * even though space is still available.
+                 */
+                if (allocatable && CAN_FASTGANG(flags) &&
+                    psize > SPA_GANGBLOCKSIZE)
+                        allocatable = metaslab_group_allocatable(mg);
+
                 if (!allocatable)
                         goto next;
 
                 /*
                  * Avoid writing single-copy data to a failing vdev
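To illustrate the gating added above (sizes hypothetical): a 128K allocation with CAN_FASTGANG(flags) true is subject to metaslab_group_allocatable(), so a group at or below zfs_mg_noalloc_threshold is passed over while better-stocked groups remain. An allocation that cannot fast gang, or one already down at SPA_GANGBLOCKSIZE, bypasses the check and may be satisfied by any device with space, which is what prevents a spurious ENOSPC and pool suspension while space is still available.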