Print this page
OS-1566 filesystem limits for ZFS datasets

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/dsl_dir.c
          +++ new/usr/src/uts/common/fs/zfs/dsl_dir.c
↓ open down ↓ 13 lines elided ↑ open up ↑
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012 by Delphix. All rights reserved.
       24 + * Copyright (c) 2012 Joyent, Inc. All rights reserved.
  24   25   */
  25   26  
  26   27  #include <sys/dmu.h>
  27   28  #include <sys/dmu_objset.h>
  28   29  #include <sys/dmu_tx.h>
  29   30  #include <sys/dsl_dataset.h>
  30   31  #include <sys/dsl_dir.h>
  31   32  #include <sys/dsl_prop.h>
  32   33  #include <sys/dsl_synctask.h>
  33   34  #include <sys/dsl_deleg.h>
  34   35  #include <sys/spa.h>
  35   36  #include <sys/metaslab.h>
  36   37  #include <sys/zap.h>
  37   38  #include <sys/zio.h>
  38   39  #include <sys/arc.h>
  39   40  #include <sys/sunddi.h>
  40   41  #include <sys/zfs_zone.h>
       42 +#include <sys/zfeature.h>
  41   43  #include "zfs_namecheck.h"
       44 +#include "zfs_prop.h"
  42   45  
       46 +/*
       47 + * Filesystem and Snapshot Limits
       48 + * ------------------------------
       49 + *
       50 + * These limits are used to restrict the number of filesystems and/or snapshots
       51 + * that can be created at a given level in the tree or below. The standard
       52 + * use-case is with a delegated dataset where the administrator wants to ensure
       53 + * that a user within the zone is not creating too many additional filesystems
       54 + * or snapshots, even though they're not exceeding their space quota.
       55 + *
       56 + * The count of filesystems and snapshots is stored in the dsl_dir_phys_t which
       57 + * impacts the on-disk format. As such, this capability is controlled by a
       58 + * feature flag and must be enabled to be used. Once enabled, the feature is
       59 + * not active until the first limit is set. At that point, future operations to
       60 + * create/destroy filesystems or snapshots will validate and update the counts.
       61 + *
       62 + * Because the on-disk counts will be uninitialized (0) before the feature is
       63 + * active, the counts are updated when a limit is first set on an uninitialized
       64 + * node. (The filesystem/snapshot counts on a node include all of the nested
       65 + * filesystems/snapshots, plus the node itself. Thus, a new leaf node has a
       66 + * filesystem count of 1 and a snapshot count of 0. A filesystem count of 0 on
       67 + * a node indicates uninitialized counts on that node.) When setting a limit on
       68 + * an uninitialized node, the code starts at the filesystem with the new limit
       69 + * and descends into all sub-filesystems and updates the counts to be accurate.
       70 + * In practice this is lightweight since a limit is typically set when the
       71 + * filesystem is created and thus has no children. Once valid, changing the
       72 + * limit value won't require a re-traversal since the counts are already valid.
       73 + * When recursively fixing the counts, if a node with a limit is encountered
       74 + * during the descent, the counts are known to be valid and there is no need to
       75 + * descend into that filesystem's children. The counts on filesystems above the
       76 + * one with the new limit will still be uninitialized (0), unless a limit is
       77 + * eventually set on one of those filesystems. It is possible for the counts
       78 + * to appear initialized, but be invalid, if the feature was previously active
       79 + * but then deactivated. For this reason, the counts are always recursively
       80 + * updated when a limit is set on a dataset, unless there is already a limit.
       81 + * When a new limit value is set on a filesystem with an existing limit, the
       82 + * new limit must be greater than or equal to the current count at that level
       83 + * or an error is returned and the limit is not changed.
       84 + *
       85 + * Once the feature is active, then whenever a filesystem or snapshot is
       86 + * created, the code recurses up the tree, validating the new count against the
       87 + * limit at each initialized level. In practice, most levels will not have a
       88 + * limit set. If there is a limit at any initialized level up the tree, the
       89 + * check must pass or the creation will fail. Likewise, when a filesystem or
       90 + * snapshot is destroyed, the counts are recursively adjusted all the way up
       91 + * the initialized nodes in the tree. Renaming a filesystem into a different
       92 + * point in the tree will first validate, then update the counts on each branch
       93 + * up to the common ancestor. A receive will also validate the counts and then
       94 + * update them.
       95 + *
       96 + * An exception to the above behavior is that the limits are never enforced
       97 + * for the administrative user in the global zone. This is primarily so that
       98 + * recursive snapshots in the global zone always work. We want to prevent a
       99 + * denial-of-service in which a lower level delegated dataset could max out its
      100 + * limit and thus block recursive snapshots from being taken in the global zone.
      101 + * Because of this, it is possible for the snapshot count to be over the limit
      102 + * and snapshots taken in the global zone could cause a lower level dataset to
      103 + * hit or exceed its limit. The administrator taking the global zone recursive
      104 + * snapshot should be aware of this side-effect and behave accordingly.
      105 + * For consistency, the filesystem limit is also not enforced for the admin
      106 + * user in the global zone.
      107 + *
      108 + * The filesystem limit is validated by dsl_dir_fscount_check() and updated by
      109 + * dsl_dir_fscount_adjust(). The snapshot limit is validated by
      110 + * dsl_snapcount_check() and updated by dsl_snapcount_adjust().
      111 + * A new limit value is validated in dsl_dir_validate_fs_ss_limit() and the
      112 + * filesystem counts are adjusted, if necessary, by dsl_dir_set_fs_ss_count().
      113 + */
      114 +
  43  115  static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
  44  116  static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd,
  45  117      uint64_t value, dmu_tx_t *tx);
  46  118  
      119 +extern dsl_syncfunc_t dsl_prop_set_sync;
      120 +
  47  121  /* ARGSUSED */
  48  122  static void
  49  123  dsl_dir_evict(dmu_buf_t *db, void *arg)
  50  124  {
  51  125          dsl_dir_t *dd = arg;
  52  126          dsl_pool_t *dp = dd->dd_pool;
  53  127          int t;
  54  128  
  55  129          for (t = 0; t < TXG_SIZE; t++) {
  56  130                  ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
↓ open down ↓ 343 lines elided ↑ open up ↑
 400  474   * be found in *tail.  Return NULL if the path is bogus, or if
 401  475   * tail==NULL and we couldn't parse the whole name.  (*tail)[0] == '@'
 402  476   * means that the last component is a snapshot.
 403  477   */
 404  478  int
 405  479  dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
 406  480  {
                   /* Same as dsl_dir_open_spa(), called with a NULL first argument. */
 407  481          return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
 408  482  }
 409  483  
      484 +/*
      485 + * Check if the counts are already valid for this filesystem and its
      486 + * descendants. The counts on this filesystem, and those below, may be
      487 + * uninitialized due to either the use of a pre-existing pool which did not
      488 + * support the filesystem/snapshot limit feature, or one in which the feature
      489 + * had not yet been enabled. The counts can also be invalid if the feature was
      490 + * previously active but then deactivated.
      491 + *
      492 + * Recursively descend the filesystem tree and update the filesystem/snapshot
      493 + * counts on each filesystem below, then update the cumulative count on the
      494 + * current filesystem. If the filesystem already has a limit set on it,
      495 + * then we know that its counts, and the counts on the filesystems below it,
      496 + * have been updated to be correct, so we can skip this filesystem.
            *
            * nm is the dataset name that corresponds to dd; it is used to build the
            * names of the child dirs and to hold the dataset whose snapshots are
            * counted. tx is the transaction under which dd's dbuf is dirtied. On
            * return *fscnt and *sscnt hold the filesystem and snapshot counts for dd;
            * the filesystem count includes dd itself.
      497 + */
      498 +static void
      499 +dsl_dir_set_fs_ss_count(const char *nm, dsl_dir_t *dd, dmu_tx_t *tx,
      500 +    uint64_t *fscnt, uint64_t *sscnt)
      501 +{
      502 +        uint64_t my_fs_cnt = 0;
      503 +        uint64_t my_ss_cnt = 0;
      504 +        objset_t *os = dd->dd_pool->dp_meta_objset;
      505 +        zap_cursor_t *zc;
      506 +        zap_attribute_t *za;
      507 +        char *namebuf;
      508 +        int err;
      509 +        boolean_t limit_set = B_FALSE;
      510 +        uint64_t fslimit, sslimit;
      511 +        dsl_dataset_t *ds;
      512 +
                   /*
                    * A limit counts as set only when the property lookup succeeds and
                    * the value is not MAXLIMIT. The snapshot limit is consulted only
                    * if no filesystem limit was found.
                    */
      513 +        err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_FILESYSTEM_LIMIT),
      514 +            8, 1, &fslimit, NULL, B_FALSE);
      515 +        if (err == 0 && fslimit != MAXLIMIT)
      516 +                limit_set = B_TRUE;
      517 +
      518 +        if (!limit_set) {
      519 +                err = dsl_prop_get_dd(dd,
      520 +                    zfs_prop_to_name(ZFS_PROP_SNAPSHOT_LIMIT), 8, 1, &sslimit,
      521 +                    NULL, B_FALSE);
      522 +                if (err == 0 && sslimit != MAXLIMIT)
      523 +                        limit_set = B_TRUE;
      524 +        }
      525 +
      526 +        /*
      527 +         * If the dd has a limit, we know its count is already good and we
      528 +         * don't need to recurse down any further.
      529 +         *
      530 +         * We can't check for an initialized (non-0) count since the feature
      531 +         * might have been previously active, then deactivated and is now
      532 +         * being activated again.
      533 +         */
      534 +        if (limit_set) {
      535 +                *fscnt = dd->dd_phys->dd_filesystem_count;
      536 +                *sscnt = dd->dd_phys->dd_snapshot_count;
      537 +                return;
      538 +        }
      539 +
                   /*
                    * The cursor, attribute and name buffer are heap-allocated rather
                    * than placed on the stack -- presumably to bound stack use, since
                    * this function recurses once per level of the tree.
                    */
      540 +        zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
      541 +        za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
      542 +        namebuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
      543 +
                   /*
                    * NOTE(review): dd_lock is taken here and not dropped until the
                    * counts have been stored below, so it is held across the
                    * recursive calls and the dsl_dir_open()/dsl_dataset_hold() calls
                    * made in between -- confirm that this is intended.
                    */
      544 +        mutex_enter(&dd->dd_lock);
      545 +
      546 +        /* Iterate datasets */
      547 +        for (zap_cursor_init(zc, os, dd->dd_phys->dd_child_dir_zapobj);
      548 +            zap_cursor_retrieve(zc, za) == 0;
      549 +            zap_cursor_advance(zc)) {
      550 +                dsl_dir_t *chld_dd;
      551 +                uint64_t chld_fs_cnt = 0;
      552 +                uint64_t chld_ss_cnt = 0;
      553 +
      554 +                (void) snprintf(namebuf, MAXPATHLEN, "%s/%s", nm, za->za_name);
      555 +
                           /* A child that cannot be opened is skipped and not counted. */
      556 +                if (dsl_dir_open(namebuf, FTAG, &chld_dd, NULL))
      557 +                        continue;
      558 +
      559 +                dsl_dir_set_fs_ss_count(namebuf, chld_dd, tx, &chld_fs_cnt,
      560 +                    &chld_ss_cnt);
      561 +
      562 +                dsl_dir_close(chld_dd, FTAG);
      563 +
      564 +                my_fs_cnt += chld_fs_cnt;
      565 +                my_ss_cnt += chld_ss_cnt;
      566 +        }
      567 +        zap_cursor_fini(zc);
      568 +
      569 +        kmem_free(namebuf, MAXPATHLEN);
      570 +
      571 +        /* Iterate snapshots */
                   /* If the hold on nm fails, no snapshots of this dataset are counted. */
      572 +        if (dsl_dataset_hold(nm, FTAG, &ds) == 0) {
      573 +                for (zap_cursor_init(zc, os, ds->ds_phys->ds_snapnames_zapobj);
      574 +                    zap_cursor_retrieve(zc, za) == 0;
      575 +                    zap_cursor_advance(zc)) {
      576 +                        my_ss_cnt++;
      577 +                }
      578 +                zap_cursor_fini(zc);
      579 +                dsl_dataset_rele(ds, FTAG);
      580 +        }
      581 +
      582 +        kmem_free(zc, sizeof (zap_cursor_t));
      583 +        kmem_free(za, sizeof (zap_attribute_t));
      584 +
      585 +        /* Add 1 for self */
      586 +        my_fs_cnt++;
      587 +
      588 +#ifdef _KERNEL
      589 +        extern void __dtrace_probe_zfs__fs__fix__count(char *, uint64_t,
      590 +            uint64_t);
      591 +        __dtrace_probe_zfs__fs__fix__count((char *)nm, my_fs_cnt, my_ss_cnt);
      592 +#endif
      593 +
      594 +        /* save updated counts */
      595 +        dmu_buf_will_dirty(dd->dd_dbuf, tx);
      596 +        dd->dd_phys->dd_filesystem_count = my_fs_cnt;
      597 +        dd->dd_phys->dd_snapshot_count = my_ss_cnt;
      598 +
      599 +        mutex_exit(&dd->dd_lock);
      600 +
      601 +        /* Return child dataset count plus self */
      602 +        *fscnt = my_fs_cnt;
      603 +        *sscnt = my_ss_cnt;
      604 +}
      605 +
      606 +/*
      607 + * Return ENOSPC if new limit is less than the existing count, otherwise return
      608 + * -1 to force the zfs_set_prop_nvlist code down the default path to set the
      609 + * value in the nvlist.
            *
            * Other return values visible here: EACCES if the dataset cannot be held or
            * its dsl_dir cannot be opened, ENOTSUP if the limit feature is not enabled
            * on the pool, and ENOSPC if the transaction cannot be assigned.
            *
            * ddname is the dataset name, limit is the proposed new value (MAXLIMIT
            * means the limit is being removed) and ptype selects the property; any
            * ptype other than ZFS_PROP_FILESYSTEM_LIMIT is checked against the
            * snapshot count.
            *
            * Besides validating, this function adjusts the feature's active count via
            * spa_feature_incr()/spa_feature_decr() and may rewrite the stored counts
            * through dsl_dir_set_fs_ss_count(). It does not itself store the property
            * value.
      610 + */
      611 +int
      612 +dsl_dir_validate_fs_ss_limit(const char *ddname, uint64_t limit,
      613 +    zfs_prop_t ptype)
      614 +{
      615 +        dsl_dir_t *dd;
      616 +        dsl_dataset_t *ds;
      617 +        int err = -1;
      618 +        uint64_t count;
      619 +        dmu_tx_t *tx;
      620 +        uint64_t my_fs_cnt = 0;
      621 +        uint64_t my_ss_cnt = 0;
      622 +        uint64_t curr_limit;
      623 +        spa_t *spa;
      624 +        zfeature_info_t *limit_feat =
      625 +            &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
      626 +
                   /* The dataset hold is used to reach the spa and is kept until return. */
      627 +        if (dsl_dataset_hold(ddname, FTAG, &ds))
      628 +                return (EACCES);
      629 +
      630 +        spa = dsl_dataset_get_spa(ds);
      631 +        if (!spa_feature_is_enabled(spa,
      632 +            &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT])) {
      633 +                dsl_dataset_rele(ds, FTAG);
      634 +                return (ENOTSUP);
      635 +        }
      636 +
      637 +        if (dsl_dir_open(ddname, FTAG, &dd, NULL)) {
      638 +                dsl_dataset_rele(ds, FTAG);
      639 +                return (EACCES);
      640 +        }
      641 +
      642 +        ASSERT(ds->ds_dir == dd);
      643 +
                   /* If the current limit cannot be read, treat it as MAXLIMIT. */
      644 +        if (dsl_prop_get_dd(dd, zfs_prop_to_name(ptype), 8, 1, &curr_limit,
      645 +            NULL, B_FALSE) != 0)
      646 +                curr_limit = MAXLIMIT;
      647 +
                   /* NOTE(review): any dmu_tx_assign() failure is reported as ENOSPC. */
      648 +        tx = dmu_tx_create_dd(dd);
      649 +        if (dmu_tx_assign(tx, TXG_WAIT)) {
      650 +                dmu_tx_abort(tx);
      651 +                dsl_dir_close(dd, FTAG);
      652 +                dsl_dataset_rele(ds, FTAG);
      653 +                return (ENOSPC);
      654 +        }
      655 +
      656 +        if (limit == MAXLIMIT) {
      657 +                /*
      658 +                 * If we had a limit, since we're now removing that limit,
      659 +                 * decrement the feature-active counter so that the feature
      660 +                 * becomes inactive (only enabled) if we remove the last limit.
      661 +                 */
      662 +                if (curr_limit != MAXLIMIT)
      663 +                        spa_feature_decr(spa, limit_feat, tx);
      664 +
      665 +                dmu_tx_commit(tx);
      666 +                dsl_dir_close(dd, FTAG);
      667 +                dsl_dataset_rele(ds, FTAG);
      668 +                return (-1);
      669 +        }
      670 +
      671 +        /*
      672 +         * Since we are now setting a non-MAXLIMIT on the filesystem, we need
      673 +         * to ensure the counts are correct. Descend down the tree from this
      674 +         * point and update all of the counts to be accurate.
      675 +         */
      676 +        rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
      677 +        dsl_dir_set_fs_ss_count(ddname, dd, tx, &my_fs_cnt, &my_ss_cnt);
      678 +        rw_exit(&dd->dd_pool->dp_config_rwlock);
      679 +
                   /* Compare the new limit against the count that matches ptype. */
      680 +        if (ptype == ZFS_PROP_FILESYSTEM_LIMIT)
      681 +                count = dd->dd_phys->dd_filesystem_count;
      682 +        else
      683 +                count = dd->dd_phys->dd_snapshot_count;
      684 +
                   /* A limit equal to the current count is accepted. */
      685 +        if (limit < count) {
      686 +                err = ENOSPC;
      687 +        } else {
      688 +                /*
      689 +                 * If we had no limit, since we're now setting a limit
      690 +                 * increment the feature-active counter so that the feature
      691 +                 * either becomes active for the first time, or the count
      692 +                 * simply increases so that we can decrement it when we remove
      693 +                 * the limit.
      694 +                 */
      695 +                if (curr_limit == MAXLIMIT)
      696 +                        spa_feature_incr(spa, limit_feat, tx);
      697 +        }
      698 +
                   /* The tx is committed whether or not the new limit was accepted. */
      699 +        dmu_tx_commit(tx);
      700 +
      701 +        dsl_dir_close(dd, FTAG);
      702 +        dsl_dataset_rele(ds, FTAG);
      703 +
      704 +        return (err);
      705 +}
      706 +
      707 +/*
      708 + * Check if adding additional child filesystem(s) would exceed any filesystem
      709 + * limits. Note that all filesystem limits up to the root (or the highest
      710 + * initialized) filesystem or the given ancestor must be satisfied.
            *
            * cnt is the number of filesystems about to be added under dd. Returns 0
            * if the addition is allowed (or the check does not apply), EDQUOT if a
            * limit would be exceeded, or the error returned by dsl_prop_get_dd().
            * The caller must hold dp_config_rwlock.
      711 + */
      712 +int
      713 +dsl_dir_fscount_check(dsl_dir_t *dd, uint64_t cnt, dsl_dir_t *ancestor)
      714 +{
      715 +        uint64_t limit;
      716 +        int err = 0;
      717 +
      718 +        VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
      719 +
      720 +        /*
      721 +         * The limit is never enforced for the admin user in global zone.
      722 +         * If we're not in the global zone then we need to run this check in
      723 +         * open context, since that's when we know what zone we're in and
      724 +         * syncing is only performed in the global zone.
      725 +         */
      726 +        if (INGLOBALZONE(curproc))
      727 +                return (0);
      728 +
      729 +        /*
      730 +         * If an ancestor has been provided, stop checking the limit once we
      731 +         * hit that dir. We need this during rename so that we don't overcount
      732 +         * the check once we recurse up to the common ancestor.
      733 +         */
      734 +        if (ancestor == dd)
      735 +                return (0);
      736 +
      737 +        /*
      738 +         * If we hit an uninitialized node while recursing up the tree, we can
      739 +         * stop since we know the counts are not valid on this node and we
      740 +         * know we won't touch this node's counts.
      741 +         */
      742 +        if (dd->dd_phys->dd_filesystem_count == 0)
      743 +                return (0);
      744 +
      745 +        /*
      746 +         * If there's no value for this property, there's no need to enforce a
      747 +         * filesystem limit.
      748 +         */
      749 +        err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_FILESYSTEM_LIMIT),
      750 +            8, 1, &limit, NULL, B_FALSE);
      751 +        if (err == ENOENT)
      752 +                return (0);
      753 +        else if (err != 0)
      754 +                return (err);
      755 +
      756 +#ifdef _KERNEL
      757 +        extern void __dtrace_probe_zfs__fs__limit(uint64_t, uint64_t, char *);
      758 +        __dtrace_probe_zfs__fs__limit(
      759 +            (uint64_t)dd->dd_phys->dd_filesystem_count, (uint64_t)limit,
      760 +            dd->dd_myname);
      761 +#endif
      762 +
                   /* A limit of MAXLIMIT is not enforced at this level. */
      763 +        if (limit != MAXLIMIT &&
      764 +            (dd->dd_phys->dd_filesystem_count + cnt) > limit)
      765 +                return (EDQUOT);
      766 +
                   /* This level passed; apply the same check to the parent, if any. */
      767 +        if (dd->dd_parent != NULL)
      768 +                err = dsl_dir_fscount_check(dd->dd_parent, cnt, ancestor);
      769 +
      770 +        return (err);
      771 +}
      772 +
      773 +/*
      774 + * Adjust the filesystem count for the specified dsl_dir_t and all parent
      775 + * filesystems. When a new filesystem is created, increment the count on all
      776 + * parents, and when a filesystem is destroyed, decrement the count.
            *
            * delta is the signed change to apply. If syncing is set, tx must be a
            * syncing-context transaction. first must be B_TRUE on the initial call;
            * the recursive calls pass B_FALSE so that the feature-active check is only
            * made once. The walk up the tree stops early at a hidden ('%') dir or at
            * a dir whose filesystem count is 0. The caller must hold
            * dp_config_rwlock.
      777 + */
      778 +void
      779 +dsl_dir_fscount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
      780 +    boolean_t syncing, boolean_t first)
      781 +{
      782 +        VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
      783 +        if (syncing)
      784 +                VERIFY(dmu_tx_is_syncing(tx));
      785 +
      786 +        /*
      787 +         * There is a special case where we are receiving a filesystem that
      788 +         * already exists. In this case a temporary clone name of %X is created
      789 +         * (see dmu_recv_begin). In dmu_recv_existing_end we destroy this
      790 +         * temporary clone. We never update the filesystem counts for temporary
      791 +         * clones. To detect this case we check the filesystem name to see if
      792 +         * it's a hidden filesystem (%X).
      793 +         */
      794 +        if (dd->dd_myname[0] == '%')
      795 +                return;
      796 +
      797 +        /*
      798 +         * If we hit an uninitialized node while recursing up the tree, we can
      799 +         * stop since we know the counts are not valid on this node and we
      800 +         * know we shouldn't touch this node's counts. An uninitialized count
      801 +         * on the node indicates that either the feature has not yet been
      802 +         * activated or there are no limits on this part of the tree.
      803 +         */
      804 +        if (dd->dd_phys->dd_filesystem_count == 0)
      805 +                return;
      806 +
      807 +        /*
      808 +         * The feature might have previously been active, so there could be
      809 +         * non-0 counts on the nodes, but it might now be inactive.
      810 +         *
      811 +         * On initial entry we need to check if this feature is active, but
      812 +         * we don't want to re-check this on each recursive call. Note: the
      813 +         * feature cannot be active if it's not enabled. If the feature is not
      814 +         * active, don't touch the on-disk count fields.
      815 +         */
      816 +        if (first) {
      817 +                dsl_dataset_t *ds = NULL;
      818 +                spa_t *spa;
      819 +                zfeature_info_t *quota_feat =
      820 +                    &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
      821 +
                           /* The head dataset is held only long enough to obtain the spa. */
      822 +                VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
      823 +                    dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
      824 +                spa = dsl_dataset_get_spa(ds);
      825 +                dsl_dataset_rele(ds, FTAG);
      826 +                if (!spa_feature_is_active(spa, quota_feat))
      827 +                        return;
      828 +        }
      829 +
      830 +        dmu_buf_will_dirty(dd->dd_dbuf, tx);
      831 +
                   /*
                    * NOTE(review): dd_lock stays held while recursing into the
                    * parent, so the locks are acquired child-first and released in
                    * the reverse order -- confirm this matches the expected dd_lock
                    * ordering.
                    */
      832 +        mutex_enter(&dd->dd_lock);
      833 +
      834 +        dd->dd_phys->dd_filesystem_count += delta;
      835 +
      836 +        if (dd->dd_parent != NULL)
      837 +                dsl_dir_fscount_adjust(dd->dd_parent, tx, delta, syncing,
      838 +                    B_FALSE);
      839 +
      840 +        mutex_exit(&dd->dd_lock);
      841 +}
      842 +
 410  843  uint64_t
 411  844  dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
 412  845      dmu_tx_t *tx)
 413  846  {
 414  847          objset_t *mos = dp->dp_meta_objset;
 415  848          uint64_t ddobj;
 416  849          dsl_dir_phys_t *ddphys;
 417  850          dmu_buf_t *dbuf;
      851 +        zfeature_info_t *limit_feat =
      852 +            &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
 418  853  
      854 +
 419  855          ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
 420  856              DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
 421  857          if (pds) {
 422  858                  VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
 423  859                      name, sizeof (uint64_t), 1, &ddobj, tx));
 424  860          } else {
 425  861                  /* it's the root dir */
 426  862                  VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 427  863                      DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
 428  864          }
 429  865          VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
 430  866          dmu_buf_will_dirty(dbuf, tx);
 431  867          ddphys = dbuf->db_data;
 432  868  
 433  869          ddphys->dd_creation_time = gethrestime_sec();
      870 +        /* Only initialize the count if the limit feature is active */
      871 +        if (spa_feature_is_active(dp->dp_spa, limit_feat))
      872 +                ddphys->dd_filesystem_count = 1;
 434  873          if (pds)
 435  874                  ddphys->dd_parent_obj = pds->dd_object;
 436  875          ddphys->dd_props_zapobj = zap_create(mos,
 437  876              DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
 438  877          ddphys->dd_child_dir_zapobj = zap_create(mos,
 439  878              DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
 440  879          if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
 441  880                  ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
 442  881          dmu_buf_rele(dbuf, FTAG);
 443  882  
↓ open down ↓ 36 lines elided ↑ open up ↑
 480  919  dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 481  920  {
 482  921          dsl_dir_t *dd = arg1;
 483  922          objset_t *mos = dd->dd_pool->dp_meta_objset;
 484  923          uint64_t obj;
 485  924          dd_used_t t;
 486  925  
 487  926          ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
 488  927          ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
 489  928  
      929 +        /* Decrement the filesystem count for all parent filesystems. */
      930 +        if (dd->dd_parent != NULL)
      931 +                dsl_dir_fscount_adjust(dd->dd_parent, tx, -1, B_TRUE, B_TRUE);
      932 +
 490  933          /*
 491  934           * Remove our reservation. The impl() routine avoids setting the
 492  935           * actual property, which would require the (already destroyed) ds.
 493  936           */
 494  937          dsl_dir_set_reservation_sync_impl(dd, 0, tx);
 495  938  
 496  939          ASSERT0(dd->dd_phys->dd_used_bytes);
 497  940          ASSERT0(dd->dd_phys->dd_reserved);
 498  941          for (t = 0; t < DD_USED_NUM; t++)
 499  942                  ASSERT0(dd->dd_phys->dd_used_breakdown[t]);
↓ open down ↓ 529 lines elided ↑ open up ↑
1029 1472          towrite = dsl_dir_space_towrite(dd);
1030 1473          if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
1031 1474              (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
1032 1475              psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
1033 1476                  err = ENOSPC;
1034 1477          }
1035 1478          mutex_exit(&dd->dd_lock);
1036 1479          return (err);
1037 1480  }
1038 1481  
1039      -extern dsl_syncfunc_t dsl_prop_set_sync;
1040      -
1041 1482  static void
1042 1483  dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1043 1484  {
1044 1485          dsl_dataset_t *ds = arg1;
1045 1486          dsl_dir_t *dd = ds->ds_dir;
1046 1487          dsl_prop_setarg_t *psa = arg2;
1047 1488          uint64_t effective_value = psa->psa_effective_value;
1048 1489  
1049 1490          dsl_prop_set_sync(ds, psa, tx);
1050 1491          DSL_PROP_CHECK_PREDICTION(dd, psa);
↓ open down ↓ 220 lines elided ↑ open up ↑
1271 1712          if (ra->newparent != dd->dd_parent) {
1272 1713                  /* is there enough space? */
1273 1714                  uint64_t myspace =
1274 1715                      MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
1275 1716  
1276 1717                  /* no rename into our descendant */
1277 1718                  if (closest_common_ancestor(dd, ra->newparent) == dd)
1278 1719                          return (EINVAL);
1279 1720  
1280 1721                  if (err = dsl_dir_transfer_possible(dd->dd_parent,
1281      -                    ra->newparent, myspace))
     1722 +                    ra->newparent, dd, myspace, tx))
1282 1723                          return (err);
1283 1724          }
1284 1725  
1285 1726          return (0);
1286 1727  }
1287 1728  
1288 1729  static void
1289 1730  dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1290 1731  {
1291 1732          dsl_dir_t *dd = arg1;
↓ open down ↓ 4 lines elided ↑ open up ↑
1296 1737          char namebuf[MAXNAMELEN];
1297 1738  
1298 1739          ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
1299 1740  
1300 1741          /* Log this before we change the name. */
1301 1742          dsl_dir_name(ra->newparent, namebuf);
1302 1743          spa_history_log_internal_dd(dd, "rename", tx,
1303 1744              "-> %s/%s", namebuf, ra->mynewname);
1304 1745  
1305 1746          if (ra->newparent != dd->dd_parent) {
     1747 +                int cnt;
     1748 +
     1749 +                mutex_enter(&dd->dd_lock);
     1750 +
     1751 +                cnt = dd->dd_phys->dd_filesystem_count;
     1752 +                dsl_dir_fscount_adjust(dd->dd_parent, tx, -cnt, B_TRUE, B_TRUE);
     1753 +                dsl_dir_fscount_adjust(ra->newparent, tx, cnt, B_TRUE, B_TRUE);
     1754 +
     1755 +                cnt = dd->dd_phys->dd_snapshot_count;
     1756 +                dsl_snapcount_adjust(dd->dd_parent, tx, -cnt, B_TRUE);
     1757 +                dsl_snapcount_adjust(ra->newparent, tx, cnt, B_TRUE);
     1758 +
     1759 +                mutex_exit(&dd->dd_lock);
     1760 +
1306 1761                  dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1307 1762                      -dd->dd_phys->dd_used_bytes,
1308 1763                      -dd->dd_phys->dd_compressed_bytes,
1309 1764                      -dd->dd_phys->dd_uncompressed_bytes, tx);
1310 1765                  dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD,
1311 1766                      dd->dd_phys->dd_used_bytes,
1312 1767                      dd->dd_phys->dd_compressed_bytes,
1313 1768                      dd->dd_phys->dd_uncompressed_bytes, tx);
1314 1769  
1315 1770                  if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
↓ open down ↓ 52 lines elided ↑ open up ↑
1368 1823  
1369 1824          err = dsl_sync_task_do(dd->dd_pool,
1370 1825              dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
1371 1826  
1372 1827  out:
1373 1828          dsl_dir_close(ra.newparent, FTAG);
1374 1829          return (err);
1375 1830  }
1376 1831  
1377 1832  int
1378      -dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
     1833 +dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, dsl_dir_t *moving_dd,
     1834 +    uint64_t space, dmu_tx_t *tx)
1379 1835  {
                   /*
                    * Decide whether a dir can move from under sdd to under tdd: tdd
                    * must have at least `space` available, measured up to the closest
                    * common ancestor of sdd and tdd, and the filesystem and snapshot
                    * counts of moving_dd must pass the limit checks starting at tdd
                    * and stopping at that ancestor. Returns 0 if the transfer is
                    * possible, ENOSPC if there is not enough space, or the error from
                    * the count checks.
                    *
                    * NOTE(review): tx is not referenced in this function.
                    */
1380 1836          dsl_dir_t *ancestor;
1381 1837          int64_t adelta;
1382 1838          uint64_t avail;
     1839 +        int err;
1383 1840  
1384 1841          ancestor = closest_common_ancestor(sdd, tdd);
1385 1842          adelta = would_change(sdd, -space, ancestor);
1386 1843          avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
1387 1844          if (avail < space)
1388 1845                  return (ENOSPC);
1389 1846  
                   /* The filesystem-count check is skipped when sdd is moving_dd. */
     1847 +        if (sdd != moving_dd) {
     1848 +                err = dsl_dir_fscount_check(tdd,
     1849 +                    moving_dd->dd_phys->dd_filesystem_count, ancestor);
     1850 +                if (err != 0)
     1851 +                        return (err);
     1852 +        }
     1853 +        err = dsl_snapcount_check(tdd, moving_dd->dd_phys->dd_snapshot_count,
     1854 +            ancestor);
     1855 +        if (err != 0)
     1856 +                return (err);
     1857 +
1390 1858          return (0);
1391 1859  }
1392 1860  
1393 1861  timestruc_t
1394 1862  dsl_dir_snap_cmtime(dsl_dir_t *dd)
1395 1863  {
1396 1864          timestruc_t t;
1397 1865  
1398 1866          mutex_enter(&dd->dd_lock);
1399 1867          t = dd->dd_snap_cmtime;
↓ open down ↓ 15 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX