Print this page
OS-1566 filesystem limits for ZFS datasets

@@ -43,10 +43,11 @@
 #include <sys/zfs_znode.h>
 #include <sys/zfs_onexit.h>
 #include <sys/zvol.h>
 #include <sys/dsl_scan.h>
 #include <sys/dsl_deadlist.h>
+#include "zfs_prop.h"
 
 static char *dsl_reaper = "the grim reaper";
 
 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;

@@ -346,10 +347,14 @@
                 mt = MT_EXACT;
 
         err = zap_remove_norm(mos, snapobj, name, mt, tx);
         if (err == ENOTSUP && mt == MT_FIRST)
                 err = zap_remove(mos, snapobj, name, tx);
+
+        if (err == 0)
+                dsl_snapcount_adjust(ds->ds_dir, tx, -1, B_TRUE);
+
         return (err);
 }
 
 static int
 dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,

@@ -1134,11 +1139,11 @@
 
                 dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
                 dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
                     dsl_dataset_destroy_sync, &dsda, tag, 0);
                 dsl_sync_task_create(dstg, dsl_dir_destroy_check,
-                    dsl_dir_destroy_sync, dd, FTAG, 0);
+                    dsl_dir_destroy_sync, dd, tag, 0);
                 err = dsl_sync_task_group_wait(dstg);
                 dsl_sync_task_group_destroy(dstg);
 
                 /*
                  * We could be racing against 'zfs release' or 'zfs destroy -d'

@@ -2010,13 +2015,160 @@
                 dsl_dir_willuse_space(ds->ds_dir, asize, tx);
 
         return (0);
 }
 
+/*
+ * Check if adding cnt snapshot(s) under dd would exceed a snapshot limit.
+ * Limits are checked on dd and each parent in turn, stopping at the given
+ * ancestor (if any) or the root. Returns 0 if allowed, EDQUOT if a limit
+ * would be exceeded, else the property lookup error. A count may already
+ * exceed its limit, e.g. after a recursive snapshot by global zone root.
+ */
 int
+dsl_snapcount_check(dsl_dir_t *dd, uint64_t cnt, dsl_dir_t *ancestor)
+{
+        uint64_t limit;
+        int err = 0;
+
+        /*
+         * The limit is never enforced for the admin user in the global zone.
+         * If we're not in the global zone then we need to run this check in
+         * open context, since that's when we know what zone we're in and
+         * syncing is only performed in the global zone.
+         */
+        if (INGLOBALZONE(curproc))
+                return (0);
+
+        /*
+         * If renaming a dataset with no snapshots, count adjustment is 0.
+         */
+        if (cnt == 0)
+                return (0);
+
+        /*
+         * If an ancestor has been provided, stop checking the limit once we
+         * hit that dir. We need this during rename so that we don't overcount
+         * the check once we recurse up to the common ancestor.
+         */
+        if (ancestor == dd)
+                return (0);
+
+        /*
+         * If we hit an uninitialized node (dd_filesystem_count == 0) while
+         * recursing up the tree, we can stop since the counts are not valid
+         * on this node and we know we won't touch this node's counts.
+         */
+        if (dd->dd_phys->dd_filesystem_count == 0)
+                return (0);
+
+        /*
+         * If there's no value for this property (ENOENT), there's no limit
+         * to enforce; note this also ends the walk without checking parents.
+         */
+        err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_SNAPSHOT_LIMIT),
+            8, 1, &limit, NULL, B_FALSE);
+        if (err == ENOENT)
+                return (0);
+        else if (err != 0)
+                return (err);
+
+#ifdef _KERNEL
+        extern void __dtrace_probe_zfs__ss__limit(uint64_t, uint64_t, char *);
+        __dtrace_probe_zfs__ss__limit(
+            (uint64_t)dd->dd_phys->dd_snapshot_count, (uint64_t)limit,
+            dd->dd_myname);
+#endif
+
+        if (limit != MAXLIMIT &&
+            (dd->dd_phys->dd_snapshot_count + cnt) > limit)
+                return (EDQUOT);
+
+        if (dd->dd_parent != NULL)
+                err = dsl_snapcount_check(dd->dd_parent, cnt, ancestor);
+
+        return (err);
+}
+
+/*
+ * Add delta to the snapshot count of the specified dsl_dir_t and its parents
+ * (positive when snapshots are created, negative when destroyed). Callers
+ * pass first == B_TRUE; recursion passes B_FALSE to skip the feature check.
+ */
+void
+dsl_snapcount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
+    boolean_t first)
+{
+        /*
+         * If we hit an uninitialized node while recursing up the tree, we can
+         * stop since we know the counts are not valid on this node and we
+         * know we shouldn't touch this node's counts. An uninitialized count
+         * (dd_filesystem_count == 0) indicates that either the feature has
+         * not yet been activated or there are no limits on this subtree.
+         */
+        if (dd->dd_phys->dd_filesystem_count == 0)
+                return;
+
+        /*
+         * The feature might have previously been active, so there could be
+         * non-0 counts on the nodes, but it might now be inactive.
+         *
+         * On initial entry we need to check if this feature is active, but
+         * we don't want to re-check this on each recursive call. Note: the
+         * feature cannot be active if it's not enabled. If the feature is not
+         * active, don't touch the on-disk count fields.
+         */
+        if (first) {
+                dsl_dataset_t *ds = NULL;
+                spa_t *spa;
+                zfeature_info_t *quota_feat =
+                    &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
+
+                VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
+                    dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
+                spa = dsl_dataset_get_spa(ds);
+                dsl_dataset_rele(ds, FTAG);
+                if (!spa_feature_is_active(spa, quota_feat))
+                        return;
+        }
+
+        /*
+         * As with dsl_dataset_set_reservation_check(), we don't want to
+         * apply this adjustment in open context; only in syncing context.
+         */
+        if (!dmu_tx_is_syncing(tx))
+                return;
+
+        /* if renaming a dataset with no snapshots, count adjustment is 0 */
+        if (delta == 0)
+                return;
+
+        /*
+         * If we hit an uninitialized node while recursing up the tree, we can
+         * stop since the counts are not valid on this node and we shouldn't
+         * touch them. NOTE(review): this repeats the test at function entry.
+         */
+        if (dd->dd_phys->dd_filesystem_count == 0)
+                return;
+
+        /* Declare intent to dirty this dir's buffer before changing count */
+        dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+        mutex_enter(&dd->dd_lock);
+
+        dd->dd_phys->dd_snapshot_count += delta;
+
+        /* Roll up this count into our ancestors (dd_lock is still held) */
+        if (dd->dd_parent != NULL)
+                dsl_snapcount_adjust(dd->dd_parent, tx, delta, B_FALSE);
+
+        mutex_exit(&dd->dd_lock);
+}
+
+int
 dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname,
-    dmu_tx_t *tx)
+    uint64_t cnt, dmu_tx_t *tx)
 {
         int err;
         uint64_t value;
 
         /*

@@ -2040,10 +2192,14 @@
          * of the dataset's length + 1 for the @-sign + snapshot name's length
          */
         if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
                 return (ENAMETOOLONG);
 
+        err = dsl_snapcount_check(ds->ds_dir, cnt, NULL);
+        if (err)
+                return (err);
+
         err = dsl_dataset_snapshot_reserve_space(ds, tx);
         if (err)
                 return (err);
 
         ds->ds_trysnap_txg = tx->tx_txg;

@@ -2061,10 +2217,12 @@
         objset_t *mos = dp->dp_meta_objset;
         int err;
 
         ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
 
+        dsl_snapcount_adjust(ds->ds_dir, tx, 1, B_TRUE);
+
         /*
          * The origin's ds_creation_txg has to be < TXG_INITIAL
          */
         if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
                 crtxg = 1;

@@ -2716,13 +2874,13 @@
                 pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
                 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
                 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
         }
 
-        /* Check that there is enough space here */
+        /* Check that there is enough space and limit headroom here */
         err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
-            pa->used);
+            origin_ds->ds_dir, pa->used, tx);
         if (err)
                 return (err);
 
         /*
          * Compute the amounts of space that will be used by snapshots

@@ -2851,10 +3009,11 @@
                 VERIFY(0 == dsl_dataset_snap_remove(origin_head,
                     ds->ds_snapname, tx));
                 VERIFY(0 == zap_add(dp->dp_meta_objset,
                     hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
                     8, 1, &ds->ds_object, tx));
+                dsl_snapcount_adjust(hds->ds_dir, tx, 1, B_TRUE);
 
                 /* change containing dsl_dir */
                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
                 ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
                 ds->ds_phys->ds_dir_obj = dd->dd_object;