illumos Cdiff usr/src/uts/common/fs/zfs/dsl

Print this page

OS-1566 dataset quota for ZFS datasets


*** 19,28 ****
--- 19,29 ----
   * CDDL HEADER END
   */
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   * Copyright (c) 2012 by Delphix. All rights reserved.
+  * Copyright (c) 2012 Joyent, Inc. All rights reserved.
   */
  
  #include <sys/dmu.h>
  #include <sys/dmu_objset.h>
  #include <sys/dmu_tx.h>
*** 36,51 ****
--- 37,113 ----
  #include <sys/zap.h>
  #include <sys/zio.h>
  #include <sys/arc.h>
  #include <sys/sunddi.h>
  #include <sys/zfs_zone.h>
+ #include <sys/zfeature.h>
  #include "zfs_namecheck.h"
  
+ /*
+  * Dataset and Snapshot Quotas
+  * ---------------------------
+  *
+  * These quotas are used to limit the number of datasets and/or snapshots
+  * that can be created at a given level in the tree or below. A common use-case
+  * is with a delegated dataset where the administrator wants to ensure that
+  * a user within the zone is not creating too many datasets or snapshots, even
+  * though they're not exceeding their space quota.
+  *
+  * The count of datasets and snapshots is stored in the dsl_dir_phys_t which
+  * impacts the on-disk format. As such, this capability is controlled by a
+  * feature flag and must be enabled to be used. Once enabled, the feature is
+  * not active until the first quota is set. At that point, future operations to
+  * create/destroy datasets or snapshots will validate and update the counts.
+  *
+  * Because the on-disk counts will be incorrect (garbage) before the feature is
+  * active, the counts are updated when the quota is first set. Starting at the
+  * dataset with the new quota, the code descends into all sub-datasets and
+  * updates the counts to be accurate. In practice this is lightweight since
+  * a quota is typically set when the dataset is created and thus has no
+  * children. Once set, changing the quota value won't require a traversal since
+  * the counts are already valid. The counts in datasets above the one with the
+  * new quota will still be incorrect, unless a quota is eventually set on one
+  * of those datasets. If a dataset with a quota is encountered during the
+  * descent, the counts are known to be valid and there is no need to descend
+  * into that dataset's children. When a new quota value is set on a dataset
+  * with an existing quota, the new value must not be less than the current
+  * count at that level or an error is returned and the quota is not changed.
+  *
+  * Once the feature is active, then whenever a dataset or snapshot is created,
+  * the code recurses up the tree, validating the new count against the quota
+  * at each level. In practice, most levels will not have a quota set. If there
+  * is a quota at any level up the tree, the check must pass or the creation
+  * will fail. Likewise, when a dataset or snapshot is destroyed, the counts
+  * are recursively adjusted all the way up the tree. Renaming a dataset into
+  * a different point in the tree will first validate, then update the counts on
+  * each branch up to the common ancestor. A receive will also validate the
+  * counts and then update them.
+  *
+  * Recursive snapshots behave a bit differently. The quota is only validated
+  * against the top-level dataset at which the snapshot is being taken. This
+  * is to prevent a denial-of-service in which a lower level dataset could
+  * max out its quota and thus block snapshots from being taken at a higher
+  * level (in addition, the complexity to address this is not worth the cost).
+  * Because of this, it is possible for the snapshot count to be over the quota
+  * and snapshots taken at a high level could cause a lower level dataset to hit
+  * or exceed its quota. The administrator taking the high-level recursive
+  * snapshot should be aware of this side-effect and behave accordingly.
+  *
+  * The dataset quota is validated by dsl_dir_dscount_check() and updated by
+  * dsl_dir_dscount_adjust(). The snapshot quota is validated by
+  * dsl_snapcount_check() and updated by dsl_snapcount_adjust().
+  * A new quota value is validated in dsl_dir_validate_ds_ss_quota() and the
+  * dataset counts are adjusted, if necessary, by dsl_dir_set_ds_ss_count().
+  */
+ 
  static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
  static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd,
      uint64_t value, dmu_tx_t *tx);
  
+ extern dsl_syncfunc_t dsl_prop_set_sync;
+ extern char *tmp_dmu_recv_tag;
+ 
  /* ARGSUSED */
  static void
  dsl_dir_evict(dmu_buf_t *db, void *arg)
  {
          dsl_dir_t *dd = arg;
*** 405,414 ****
--- 467,785 ----
  dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
  {
          return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
  }
  
+ /*
+  * Bring the on-disk dataset/snapshot counts for dd (named nm) and everything
+  * below it up to date. If no quota is set, the counts may be incorrect due to
+  * the use of a pre-existing pool which did not support the dataset/snapshot
+  * quota feature, so recursively descend the dataset tree, update the counts
+  * on each dataset below, then store the cumulative counts on dd. A dataset
+  * which already has a quota is known to have correct counts, as are the
+  * datasets below it, so it is skipped.
+  *
+  * On return *dscnt is the dataset count at dd plus 1 for dd itself, and *sscnt
+  * is the snapshot count at dd.
+  */
+ static void
+ dsl_dir_set_ds_ss_count(const char *nm, dsl_dir_t *dd, dmu_tx_t *tx,
+     uint64_t *dscnt, uint64_t *sscnt)
+ {
+         uint64_t my_ds_cnt = 0;
+         uint64_t my_ss_cnt = 0;
+         objset_t *os = dd->dd_pool->dp_meta_objset;
+         zap_cursor_t *zc;
+         zap_attribute_t *za;
+         char *namebuf;
+         int err;
+         boolean_t quota_set = B_FALSE;
+         uint64_t dsquota, ssquota;
+         dsl_dataset_t *ds;
+ 
+         err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_DATASET_QUOTA),
+             8, 1, &dsquota, NULL, B_FALSE);
+         if (err == 0 && dsquota != 0)
+                 quota_set = B_TRUE;
+ 
+         if (!quota_set) {
+                 err = dsl_prop_get_dd(dd,
+                     zfs_prop_to_name(ZFS_PROP_SNAPSHOT_QUOTA), 8, 1, &ssquota,
+                     NULL, B_FALSE);
+                 if (err == 0 && ssquota != 0)
+                         quota_set = B_TRUE;
+         }
+ 
+         /*
+          * If the dd has a quota, we know its count is already good and we
+          * don't need to recurse down any further.
+          */
+         if (quota_set) {
+                 /* Return dataset count plus 1 for self */
+                 *dscnt = dd->dd_phys->dd_dataset_count + 1;
+                 *sscnt = dd->dd_phys->dd_snapshot_count;
+ 
+                 return;
+         }
+ 
+         zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+         za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+         namebuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ 
+         mutex_enter(&dd->dd_lock);
+ 
+         /* Iterate child datasets; children that fail to open are skipped */
+         for (zap_cursor_init(zc, os, dd->dd_phys->dd_child_dir_zapobj);
+             zap_cursor_retrieve(zc, za) == 0;
+             zap_cursor_advance(zc)) {
+                 dsl_dir_t *chld_dd;
+                 uint64_t chld_ds_cnt = 0;
+                 uint64_t chld_ss_cnt = 0;
+ 
+                 (void) snprintf(namebuf, MAXPATHLEN, "%s/%s", nm, za->za_name);
+ 
+                 if (dsl_dir_open(namebuf, FTAG, &chld_dd, NULL))
+                         continue;
+ 
+                 dsl_dir_set_ds_ss_count(namebuf, chld_dd, tx, &chld_ds_cnt,
+                     &chld_ss_cnt);
+ 
+                 dsl_dir_close(chld_dd, FTAG);
+ 
+                 my_ds_cnt += chld_ds_cnt;
+                 my_ss_cnt += chld_ss_cnt;
+         }
+         zap_cursor_fini(zc);
+ 
+         kmem_free(namebuf, MAXPATHLEN);
+ 
+         /* Count the snapshots of this dataset itself */
+         if (dsl_dataset_hold(nm, FTAG, &ds) == 0) {
+                 for (zap_cursor_init(zc, os, ds->ds_phys->ds_snapnames_zapobj);
+                     zap_cursor_retrieve(zc, za) == 0;
+                     zap_cursor_advance(zc)) {
+                         my_ss_cnt++;
+                 }
+                 zap_cursor_fini(zc);
+                 dsl_dataset_rele(ds, FTAG);
+         }
+ 
+         kmem_free(zc, sizeof (zap_cursor_t));
+         kmem_free(za, sizeof (zap_attribute_t));
+ 
+ #ifdef _KERNEL
+         extern void __dtrace_probe_zfs__ds__fix__count(char *, uint64_t,
+             uint64_t);
+         __dtrace_probe_zfs__ds__fix__count((char *)nm, my_ds_cnt, my_ss_cnt);
+ #endif
+ 
+         /* save updated counts */
+         dmu_buf_will_dirty(dd->dd_dbuf, tx);
+         dd->dd_phys->dd_dataset_count = my_ds_cnt;
+         dd->dd_phys->dd_snapshot_count = my_ss_cnt;
+ 
+         mutex_exit(&dd->dd_lock);
+ 
+         /* Return child dataset count plus 1 for self */
+         *dscnt = my_ds_cnt + 1;
+         *sscnt = my_ss_cnt;
+ }
+ 
+ /*
+  * Validate a new dataset or snapshot quota (per ptype) on ddname. A non-0
+  * quota first brings the on-disk counts up to date. Returns EACCES if the
+  * dataset or dir can't be held, ENOTSUP if the feature isn't enabled, and
+  * ENOSPC if the tx can't be assigned or the quota is less than the existing
+  * count. Otherwise returns -1 to force the zfs_set_prop_nvlist code down the
+  * default path to set the value in the nvlist.
+  */
+ int
+ dsl_dir_validate_ds_ss_quota(const char *ddname, uint64_t quota,
+     zfs_prop_t ptype)
+ {
+         dsl_dir_t *dd;
+         dsl_dataset_t *ds;
+         int err = -1;
+         uint64_t count;
+         dmu_tx_t *tx;
+         uint64_t my_ds_cnt = 0, my_ss_cnt = 0;
+         spa_t *spa;
+         zfeature_info_t *quota_feat =
+             &spa_feature_table[SPA_FEATURE_DS_SS_QUOTA];
+ 
+         if (dsl_dataset_hold(ddname, FTAG, &ds))
+                 return (EACCES);
+ 
+         spa = dsl_dataset_get_spa(ds);
+         if (!spa_feature_is_enabled(spa, quota_feat)) {
+                 dsl_dataset_rele(ds, FTAG);
+                 return (ENOTSUP);
+         }
+ 
+         /* 0 means no quota */
+         if (quota == 0) {
+                 dsl_dataset_rele(ds, FTAG);
+                 return (-1);
+         }
+ 
+         if (dsl_dir_open(ddname, FTAG, &dd, NULL)) {
+                 dsl_dataset_rele(ds, FTAG);
+                 return (EACCES);
+         }
+ 
+         ASSERT(ds->ds_dir == dd);
+ 
+         tx = dmu_tx_create_dd(dd);
+         if (dmu_tx_assign(tx, TXG_WAIT)) {
+                 /* drop the holds taken above; don't leak them on failure */
+                 dmu_tx_abort(tx);
+                 dsl_dir_close(dd, FTAG);
+                 dsl_dataset_rele(ds, FTAG);
+                 return (ENOSPC);
+         }
+ 
+         /* set the feature active flag now */
+         if (!spa_feature_is_active(spa, quota_feat))
+                 spa_feature_incr(spa, quota_feat, tx);
+ 
+         /*
+          * We are setting a non-0 quota, so the counts must be correct. Descend
+          * the tree from this point and make all of the counts accurate.
+          */
+         rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+         dsl_dir_set_ds_ss_count(ddname, dd, tx, &my_ds_cnt, &my_ss_cnt);
+         rw_exit(&dd->dd_pool->dp_config_rwlock);
+ 
+         dmu_tx_commit(tx);
+ 
+         count = (ptype == ZFS_PROP_DATASET_QUOTA) ?
+             dd->dd_phys->dd_dataset_count : dd->dd_phys->dd_snapshot_count;
+         if (quota < count)
+                 err = ENOSPC;
+ 
+         dsl_dir_close(dd, FTAG);
+         dsl_dataset_rele(ds, FTAG);
+ 
+         return (err);
+ }
+ 
+ /*
+  * Check if adding cnt child dataset(s) under dd would exceed any dataset
+  * quota, returning EDQUOT if so. All dataset quotas up to the root dataset
+  * (i.e. the pool itself) or the given ancestor must be satisfied. When
+  * receiving we don't check if the tx is syncing; the tx is passed as NULL.
+  */
+ int
+ dsl_dir_dscount_check(dsl_dir_t *dd, dmu_tx_t *tx, uint64_t cnt,
+     dsl_dir_t *ancestor)
+ {
+         uint64_t quota;
+         int err = 0;
+ 
+         VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+ 
+         /*
+          * As with dsl_dataset_set_reservation_check(), don't run this check in
+          * open context.
+          */
+         if (tx != NULL && !dmu_tx_is_syncing(tx))
+                 return (0);
+ 
+         /*
+          * If an ancestor has been provided, stop checking the quota once we
+          * hit that dir. We need this during rename so that we don't overcount
+          * the check once we recurse up to the common ancestor.
+          */
+         if (ancestor == dd)
+                 return (0);
+ 
+         /*
+          * If there's no value for this property, there's no need to enforce a
+          * dataset quota.
+          */
+         err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_DATASET_QUOTA),
+             8, 1, &quota, NULL, B_FALSE);
+         if (err == ENOENT)
+                 return (0);
+         else if (err != 0)
+                 return (err);
+ 
+ #ifdef _KERNEL
+         extern void __dtrace_probe_zfs__ds__quota(uint64_t, uint64_t, char *);
+         __dtrace_probe_zfs__ds__quota((uint64_t)dd->dd_phys->dd_dataset_count,
+             (uint64_t)quota, dd->dd_myname);
+ #endif
+ 
+         if (quota > 0 && (dd->dd_phys->dd_dataset_count + cnt) > quota)
+                 return (EDQUOT);
+ 
+         if (dd->dd_parent != NULL)
+                 err = dsl_dir_dscount_check(dd->dd_parent, tx, cnt, ancestor);
+ 
+         return (err);
+ }
+ 
+ /*
+  * Adjust the dataset count for the specified dsl_dir_t and all parent datasets
+  * by delta (positive on create, negative on destroy). Pass first as B_TRUE on
+  * the initial call so the feature check runs once; recursion passes B_FALSE.
+  */
+ void
+ dsl_dir_dscount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
+     boolean_t syncing, boolean_t first)
+ {
+         /*
+          * On initial entry we need to check if this feature is active, but
+          * we don't want to re-check this on each recursive call. Note: the
+          * feature cannot be active if it's not enabled. If the feature is not
+          * active, don't touch the on-disk count fields.
+          */
+         if (first) {
+                 dsl_dataset_t *ds = NULL;
+                 spa_t *spa;
+                 zfeature_info_t *quota_feat =
+                     &spa_feature_table[SPA_FEATURE_DS_SS_QUOTA];
+ 
+                 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
+                     dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
+                 spa = dsl_dataset_get_spa(ds);
+                 dsl_dataset_rele(ds, FTAG);
+                 if (!spa_feature_is_active(spa, quota_feat))
+                         return;
+         }
+ 
+         VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+         if (syncing)
+                 VERIFY(dmu_tx_is_syncing(tx));
+ 
+         dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ 
+         mutex_enter(&dd->dd_lock);
+ 
+         /*
+          * Counts may be incorrect if dealing with an existing pool and
+          * there has never been a quota set in the dataset hierarchy.
+          * This is not an error.
+          */
+         if (delta < 0 && dd->dd_phys->dd_dataset_count < (delta * -1)) {
+ #ifdef _KERNEL
+                 extern void __dtrace_probe_zfs__dscnt__adj__neg(char *);
+                 __dtrace_probe_zfs__dscnt__adj__neg(dd->dd_myname);
+ #endif
+                 mutex_exit(&dd->dd_lock);
+                 return;
+         }
+ 
+         dd->dd_phys->dd_dataset_count += delta;
+ 
+         if (dd->dd_parent != NULL)
+                 dsl_dir_dscount_adjust(dd->dd_parent, tx, delta, syncing,
+                     B_FALSE);
+ 
+         mutex_exit(&dd->dd_lock);
+ }
+ 
  uint64_t
  dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
      dmu_tx_t *tx)
  {
          objset_t *mos = dp->dp_meta_objset;
*** 486,495 ****
--- 857,880 ----
  
          ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
          ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
  
          /*
+          * Decrement the dataset count for all parent datasets.
+          *
+          * We have to worry about a special case where we are receiving a
+          * dataset that already exists. In this case a temporary clone name
+          * of %X is created (see dmu_recv_begin). In dmu_recv_existing_end we
+          * destroy this temporary clone which leads to here. We don't want to
+          * decrement the dataset counters in this case, since we never
+          * incremented them. To detect this case we check the tag for
+          * "tmp_dmu_recv_tag" to see if we're in that code path.
+          */
+         if (dd->dd_parent != NULL && strcmp(tag, tmp_dmu_recv_tag) != 0)
+                 dsl_dir_dscount_adjust(dd->dd_parent, tx, -1, B_TRUE, B_TRUE);
+ 
+         /*
           * Remove our reservation. The impl() routine avoids setting the
           * actual property, which would require the (already destroyed) ds.
           */
          dsl_dir_set_reservation_sync_impl(dd, 0, tx);
  
*** 1034,1045 ****
          }
          mutex_exit(&dd->dd_lock);
          return (err);
  }
  
- extern dsl_syncfunc_t dsl_prop_set_sync;
- 
  static void
  dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
  {
          dsl_dataset_t *ds = arg1;
          dsl_dir_t *dd = ds->ds_dir;
--- 1419,1428 ----
*** 1276,1286 ****
                  /* no rename into our descendant */
                  if (closest_common_ancestor(dd, ra->newparent) == dd)
                          return (EINVAL);
  
                  if (err = dsl_dir_transfer_possible(dd->dd_parent,
!                     ra->newparent, myspace))
                          return (err);
          }
  
          return (0);
  }
--- 1659,1669 ----
                  /* no rename into our descendant */
                  if (closest_common_ancestor(dd, ra->newparent) == dd)
                          return (EINVAL);
  
                  if (err = dsl_dir_transfer_possible(dd->dd_parent,
!                     ra->newparent, dd, myspace, tx))
                          return (err);
          }
  
          return (0);
  }
*** 1301,1310 ****
--- 1684,1707 ----
          dsl_dir_name(ra->newparent, namebuf);
          spa_history_log_internal_dd(dd, "rename", tx,
              "-> %s/%s", namebuf, ra->mynewname);
  
          if (ra->newparent != dd->dd_parent) {
+                 int cnt;
+ 
+                 mutex_enter(&dd->dd_lock);
+ 
+                 cnt = dd->dd_phys->dd_dataset_count + 1;
+                 dsl_dir_dscount_adjust(dd->dd_parent, tx, -cnt, B_TRUE, B_TRUE);
+                 dsl_dir_dscount_adjust(ra->newparent, tx, cnt, B_TRUE, B_TRUE);
+ 
+                 cnt = dd->dd_phys->dd_snapshot_count;
+                 dsl_snapcount_adjust(dd->dd_parent, tx, -cnt, B_TRUE);
+                 dsl_snapcount_adjust(ra->newparent, tx, cnt, B_TRUE);
+ 
+                 mutex_exit(&dd->dd_lock);
+ 
                  dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
                      -dd->dd_phys->dd_used_bytes,
                      -dd->dd_phys->dd_compressed_bytes,
                      -dd->dd_phys->dd_uncompressed_bytes, tx);
                  dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD,
*** 1373,1394 ****
          dsl_dir_close(ra.newparent, FTAG);
          return (err);
  }
  
  int
! dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
  {
          dsl_dir_t *ancestor;
          int64_t adelta;
          uint64_t avail;
  
          ancestor = closest_common_ancestor(sdd, tdd);
          adelta = would_change(sdd, -space, ancestor);
          avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
          if (avail < space)
                  return (ENOSPC);
  
          return (0);
  }
  
  timestruc_t
  dsl_dir_snap_cmtime(dsl_dir_t *dd)
--- 1770,1804 ----
          dsl_dir_close(ra.newparent, FTAG);
          return (err);
  }
  
  int
! dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, dsl_dir_t *moving_dd,
!     uint64_t space, dmu_tx_t *tx)
  {
          dsl_dir_t *ancestor;
          int64_t adelta;
          uint64_t avail;
+         int err;
  
          ancestor = closest_common_ancestor(sdd, tdd);
          adelta = would_change(sdd, -space, ancestor);
          avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
          if (avail < space)
                  return (ENOSPC);
  
+         if (sdd != moving_dd) {
+                 err = dsl_dir_dscount_check(tdd, tx,
+                     moving_dd->dd_phys->dd_dataset_count + 1, ancestor);
+                 if (err != 0)
+                         return (err);
+         }
+         err = dsl_snapcount_check(tdd, tx,
+             moving_dd->dd_phys->dd_snapshot_count, ancestor);
+         if (err != 0)
+                 return (err);
+ 
          return (0);
  }
  
  timestruc_t
  dsl_dir_snap_cmtime(dsl_dir_t *dd)