Print this page
OS-1566 filesystem limits for ZFS datasets

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/dsl_dataset.c
          +++ new/usr/src/uts/common/fs/zfs/dsl_dataset.c
↓ open down ↓ 37 lines elided ↑ open up ↑
  38   38  #include <sys/zfeature.h>
  39   39  #include <sys/unique.h>
  40   40  #include <sys/zfs_context.h>
  41   41  #include <sys/zfs_ioctl.h>
  42   42  #include <sys/spa.h>
  43   43  #include <sys/zfs_znode.h>
  44   44  #include <sys/zfs_onexit.h>
  45   45  #include <sys/zvol.h>
  46   46  #include <sys/dsl_scan.h>
  47   47  #include <sys/dsl_deadlist.h>
       48 +#include "zfs_prop.h"
  48   49  
  49   50  static char *dsl_reaper = "the grim reaper";
  50   51  
  51   52  static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
  52   53  static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
  53   54  static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
  54   55  
  55   56  #define SWITCH64(x, y) \
  56   57          { \
  57   58                  uint64_t __tmp = (x); \
↓ open down ↓ 283 lines elided ↑ open up ↑
 341  342          dsl_dir_snap_cmtime_update(ds->ds_dir);
 342  343  
 343  344          if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
 344  345                  mt = MT_FIRST;
 345  346          else
 346  347                  mt = MT_EXACT;
 347  348  
 348  349          err = zap_remove_norm(mos, snapobj, name, mt, tx);
 349  350          if (err == ENOTSUP && mt == MT_FIRST)
 350  351                  err = zap_remove(mos, snapobj, name, tx);
      352 +
      353 +        if (err == 0)
      354 +                dsl_snapcount_adjust(ds->ds_dir, tx, -1, B_TRUE);
      355 +
 351  356          return (err);
 352  357  }
 353  358  
 354  359  static int
 355  360  dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 356  361      dsl_dataset_t **dsp)
 357  362  {
 358  363          objset_t *mos = dp->dp_meta_objset;
 359  364          dmu_buf_t *dbuf;
 360  365          dsl_dataset_t *ds;
↓ open down ↓ 768 lines elided ↑ open up ↑
1129 1134                          if (err) {
1130 1135                                  dsl_dir_close(dd, FTAG);
1131 1136                                  goto out;
1132 1137                          }
1133 1138                  }
1134 1139  
1135 1140                  dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
1136 1141                  dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
1137 1142                      dsl_dataset_destroy_sync, &dsda, tag, 0);
1138 1143                  dsl_sync_task_create(dstg, dsl_dir_destroy_check,
1139      -                    dsl_dir_destroy_sync, dd, FTAG, 0);
     1144 +                    dsl_dir_destroy_sync, dd, tag, 0);
1140 1145                  err = dsl_sync_task_group_wait(dstg);
1141 1146                  dsl_sync_task_group_destroy(dstg);
1142 1147  
1143 1148                  /*
1144 1149                   * We could be racing against 'zfs release' or 'zfs destroy -d'
1145 1150                   * on the origin snap, in which case we can get EBUSY if we
1146 1151                   * needed to destroy the origin snap but were not ready to
1147 1152                   * do so.
1148 1153                   */
1149 1154                  if (dsda.need_prep) {
↓ open down ↓ 855 lines elided ↑ open up ↑
2005 2010          /*
2006 2011           * Propagate any reserved space for this snapshot to other
2007 2012           * snapshot checks in this sync group.
2008 2013           */
2009 2014          if (asize > 0)
2010 2015                  dsl_dir_willuse_space(ds->ds_dir, asize, tx);
2011 2016  
2012 2017          return (0);
2013 2018  }
2014 2019  
     2020 +/*
     2021 + * Check if adding cnt snapshot(s) under dd would exceed any snapshot limit.
     2022 + * All limits from dd up to the root dataset (i.e. the pool itself) or the
     2023 + * given ancestor must be satisfied. Returns 0 if allowed, EDQUOT if a limit
     2024 + * would be exceeded, or the property lookup error. A count may validly
     2025 + * exceed its limit because global zone callers (e.g. root) are not checked.
     2026 + */
2015 2027  int
     2028 +dsl_snapcount_check(dsl_dir_t *dd, uint64_t cnt, dsl_dir_t *ancestor)
     2029 +{
     2030 +        uint64_t limit;
     2031 +        int err = 0;
     2032 +
     2033 +        /*
     2034 +         * The limit is never enforced for the admin user in global zone.
     2035 +         * If we're not in the global zone then we need to run this check in
     2036 +         * open context, since that's when we know what zone we're in and
     2037 +         * syncing is only performed in the global zone.
     2038 +         */
     2039 +        if (INGLOBALZONE(curproc))
     2040 +                return (0);
     2041 +
     2042 +        /*
     2043 +         * If renaming a dataset with no snapshots, count adjustment is 0.
     2044 +         */
     2045 +        if (cnt == 0)
     2046 +                return (0);
     2047 +
     2048 +        /*
     2049 +         * If an ancestor has been provided, stop checking the limit once we
     2050 +         * hit that dir. We need this during rename so that we don't overcount
     2051 +         * the check once we recurse up to the common ancestor.
     2052 +         */
     2053 +        if (ancestor == dd)
     2054 +                return (0);
     2055 +
     2056 +        /*
     2057 +         * If we hit an uninitialized node while recursing up the tree, we can
     2058 +         * stop since we know the counts are not valid on this node and we
     2059 +         * know we won't touch this node's counts.
     2060 +         */
     2061 +        if (dd->dd_phys->dd_filesystem_count == 0)
     2062 +                return (0);
     2063 +
     2064 +        /*
     2065 +         * If there's no value for this property, there's no need to enforce a
     2066 +         * snapshot limit. Any other lookup error is returned to the caller.
     2067 +         */
     2068 +        err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_SNAPSHOT_LIMIT),
     2069 +            8, 1, &limit, NULL, B_FALSE);
     2070 +        if (err == ENOENT)
     2071 +                return (0);
     2072 +        else if (err != 0)
     2073 +                return (err);
     2074 +
     2075 +#ifdef _KERNEL
     2076 +        extern void __dtrace_probe_zfs__ss__limit(uint64_t, uint64_t, char *);
     2077 +        __dtrace_probe_zfs__ss__limit(
     2078 +            (uint64_t)dd->dd_phys->dd_snapshot_count, (uint64_t)limit,
     2079 +            dd->dd_myname);
     2080 +#endif
     2081 +
     2082 +        if (limit != MAXLIMIT &&
     2083 +            (dd->dd_phys->dd_snapshot_count + cnt) > limit)
     2084 +                return (EDQUOT);
     2085 +
     2086 +        if (dd->dd_parent != NULL)
     2087 +                err = dsl_snapcount_check(dd->dd_parent, cnt, ancestor);
     2088 +
     2089 +        return (err);
     2090 +}
     2091 +
     2092 +/*
     2093 + * Adjust the snapshot count for the specified dsl_dir_t and all parents by
     2094 + * delta: positive when snapshots are added, negative when they are removed.
     2095 + * Pass first == B_TRUE on entry; the recursion passes B_FALSE.
     2096 + */
     2097 +void
     2098 +dsl_snapcount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
     2099 +    boolean_t first)
     2100 +{
     2101 +        /*
     2102 +         * If we hit an uninitialized node while recursing up the tree, we can
     2103 +         * stop since we know the counts are not valid on this node and we
     2104 +         * know we shouldn't touch this node's counts. An uninitialized count
     2105 +         * on the node indicates that either the feature has not yet been
     2106 +         * activated or there are no limits on this part of the tree.
     2107 +         */
     2108 +        if (dd->dd_phys->dd_filesystem_count == 0)
     2109 +                return;
     2110 +
     2111 +        /*
     2112 +         * The feature might have previously been active, so there could be
     2113 +         * non-0 counts on the nodes, but it might now be inactive.
     2114 +         *
     2115 +         * On initial entry we need to check if this feature is active, but
     2116 +         * we don't want to re-check this on each recursive call. Note: the
     2117 +         * feature cannot be active if it's not enabled. If the feature is not
     2118 +         * active, don't touch the on-disk count fields.
     2119 +         */
     2120 +        if (first) {
     2121 +                dsl_dataset_t *ds = NULL;
     2122 +                spa_t *spa;
     2123 +                zfeature_info_t *quota_feat =
     2124 +                    &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
     2125 +
     2126 +                VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
     2127 +                    dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
     2128 +                spa = dsl_dataset_get_spa(ds);
     2129 +                dsl_dataset_rele(ds, FTAG);
     2130 +                if (!spa_feature_is_active(spa, quota_feat))
     2131 +                        return;
     2132 +        }
     2133 +
     2134 +        /*
     2135 +         * As with dsl_dataset_set_reservation_check(), we don't want to run
     2136 +         * this in open context; only adjust the counts when syncing.
     2137 +         */
     2138 +        if (!dmu_tx_is_syncing(tx))
     2139 +                return;
     2140 +
     2141 +        /* if renaming a dataset with no snapshots, count adjustment is 0 */
     2142 +        if (delta == 0)
     2143 +                return;
     2144 +
     2145 +        /*
     2146 +         * NOTE(review): this repeats the dd_filesystem_count check made at
     2147 +         * the top of the function; nothing visible in between changes the
     2148 +         * field, so it appears redundant -- confirm before removing.
     2149 +         */
     2150 +        if (dd->dd_phys->dd_filesystem_count == 0)
     2151 +                return;
     2152 +
     2153 +        /* Apply the delta to this dir's snapshot count */
     2154 +        dmu_buf_will_dirty(dd->dd_dbuf, tx);
     2155 +
     2156 +        mutex_enter(&dd->dd_lock);
     2157 +
     2158 +        dd->dd_phys->dd_snapshot_count += delta;
     2159 +
     2160 +        /* Roll up this delta into our ancestors; our dd_lock is still held */
     2161 +        if (dd->dd_parent != NULL)
     2162 +                dsl_snapcount_adjust(dd->dd_parent, tx, delta, B_FALSE);
     2163 +
     2164 +        mutex_exit(&dd->dd_lock);
     2165 +}
     2166 +
     2167 +int
2016 2168  dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname,
2017      -    dmu_tx_t *tx)
     2169 +    uint64_t cnt, dmu_tx_t *tx)
2018 2170  {
2019 2171          int err;
2020 2172          uint64_t value;
2021 2173  
2022 2174          /*
2023 2175           * We don't allow multiple snapshots of the same txg.  If there
2024 2176           * is already one, try again.
2025 2177           */
2026 2178          if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
2027 2179                  return (EAGAIN);
↓ open down ↓ 7 lines elided ↑ open up ↑
2035 2187          if (err != ENOENT)
2036 2188                  return (err);
2037 2189  
2038 2190          /*
2039 2191           * Check that the dataset's name is not too long.  Name consists
2040 2192           * of the dataset's length + 1 for the @-sign + snapshot name's length
2041 2193           */
2042 2194          if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
2043 2195                  return (ENAMETOOLONG);
2044 2196  
     2197 +        err = dsl_snapcount_check(ds->ds_dir, cnt, NULL);
     2198 +        if (err)
     2199 +                return (err);
     2200 +
2045 2201          err = dsl_dataset_snapshot_reserve_space(ds, tx);
2046 2202          if (err)
2047 2203                  return (err);
2048 2204  
2049 2205          ds->ds_trysnap_txg = tx->tx_txg;
2050 2206          return (0);
2051 2207  }
2052 2208  
2053 2209  void
2054 2210  dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
↓ open down ↓ 1 lines elided ↑ open up ↑
2056 2212  {
2057 2213          dsl_pool_t *dp = ds->ds_dir->dd_pool;
2058 2214          dmu_buf_t *dbuf;
2059 2215          dsl_dataset_phys_t *dsphys;
2060 2216          uint64_t dsobj, crtxg;
2061 2217          objset_t *mos = dp->dp_meta_objset;
2062 2218          int err;
2063 2219  
2064 2220          ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
2065 2221  
     2222 +        dsl_snapcount_adjust(ds->ds_dir, tx, 1, B_TRUE);
     2223 +
2066 2224          /*
2067 2225           * The origin's ds_creation_txg has to be < TXG_INITIAL
2068 2226           */
2069 2227          if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
2070 2228                  crtxg = 1;
2071 2229          else
2072 2230                  crtxg = tx->tx_txg;
2073 2231  
2074 2232          dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
2075 2233              DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
↓ open down ↓ 635 lines elided ↑ open up ↑
2711 2869          /*
2712 2870           * If we are a clone of a clone then we never reached ORIGIN,
2713 2871           * so we need to subtract out the clone origin's used space.
2714 2872           */
2715 2873          if (pa->origin_origin) {
2716 2874                  pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
2717 2875                  pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
2718 2876                  pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
2719 2877          }
2720 2878  
2721      -        /* Check that there is enough space here */
     2879 +        /* Check that there is enough space and limit headroom here */
2722 2880          err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
2723      -            pa->used);
     2881 +            origin_ds->ds_dir, pa->used, tx);
2724 2882          if (err)
2725 2883                  return (err);
2726 2884  
2727 2885          /*
2728 2886           * Compute the amounts of space that will be used by snapshots
2729 2887           * after the promotion (for both origin and clone).  For each,
2730 2888           * it is the amount of space that will be on all of their
2731 2889           * deadlists (that was not born before their new origin).
2732 2890           */
2733 2891          if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
↓ open down ↓ 112 lines elided ↑ open up ↑
2846 3004                          dmu_objset_evict(ds->ds_objset);
2847 3005                          ds->ds_objset = NULL;
2848 3006                  }
2849 3007                  /* move snap name entry */
2850 3008                  VERIFY(0 == dsl_dataset_get_snapname(ds));
2851 3009                  VERIFY(0 == dsl_dataset_snap_remove(origin_head,
2852 3010                      ds->ds_snapname, tx));
2853 3011                  VERIFY(0 == zap_add(dp->dp_meta_objset,
2854 3012                      hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
2855 3013                      8, 1, &ds->ds_object, tx));
     3014 +                dsl_snapcount_adjust(hds->ds_dir, tx, 1, B_TRUE);
2856 3015  
2857 3016                  /* change containing dsl_dir */
2858 3017                  dmu_buf_will_dirty(ds->ds_dbuf, tx);
2859 3018                  ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
2860 3019                  ds->ds_phys->ds_dir_obj = dd->dd_object;
2861 3020                  ASSERT3P(ds->ds_dir, ==, odd);
2862 3021                  dsl_dir_close(ds->ds_dir, ds);
2863 3022                  VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
2864 3023                      NULL, ds, &ds->ds_dir));
2865 3024  
↓ open down ↓ 1428 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX