Print this page
OS-1566 filesystem limits for ZFS datasets

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/dsl_dir.c
          +++ new/usr/src/uts/common/fs/zfs/dsl_dir.c
↓ open down ↓ 13 lines elided ↑ open up ↑
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012 by Delphix. All rights reserved.
       24 + * Copyright (c) 2012 Joyent, Inc. All rights reserved.
  24   25   */
  25   26  
  26   27  #include <sys/dmu.h>
  27   28  #include <sys/dmu_objset.h>
  28   29  #include <sys/dmu_tx.h>
  29   30  #include <sys/dsl_dataset.h>
  30   31  #include <sys/dsl_dir.h>
  31   32  #include <sys/dsl_prop.h>
  32   33  #include <sys/dsl_synctask.h>
  33   34  #include <sys/dsl_deleg.h>
  34   35  #include <sys/spa.h>
  35   36  #include <sys/metaslab.h>
  36   37  #include <sys/zap.h>
  37   38  #include <sys/zio.h>
  38   39  #include <sys/arc.h>
  39   40  #include <sys/sunddi.h>
  40   41  #include <sys/zfs_zone.h>
       42 +#include <sys/zfeature.h>
       43 +#include <sys/policy.h>
       44 +#include <sys/zfs_znode.h>
  41   45  #include "zfs_namecheck.h"
       46 +#include "zfs_prop.h"
  42   47  
       48 +/*
       49 + * Filesystem and Snapshot Limits
       50 + * ------------------------------
       51 + *
       52 + * These limits are used to restrict the number of filesystems and/or snapshots
       53 + * that can be created at a given level in the tree or below. A typical
       54 + * use-case is with a delegated dataset where the administrator wants to ensure
       55 + * that a user within the zone is not creating too many additional filesystems
       56 + * or snapshots, even though they're not exceeding their space quota.
       57 + *
       58 + * The count of filesystems and snapshots is stored in the dsl_dir_phys_t which
       59 + * impacts the on-disk format. As such, this capability is controlled by a
       60 + * feature flag and must be enabled to be used. Once enabled, the feature is
       61 + * not active until the first limit is set. At that point, future operations to
       62 + * create/destroy filesystems or snapshots will validate and update the counts.
       63 + *
       64 + * Because the on-disk counts will be uninitialized (0) before the feature is
       65 + * active, the counts are updated when a limit is first set on an uninitialized
       66 + * node (The filesystem/snapshot counts on a node includes all of the nested
       67 + * filesystems/snapshots, plus the node itself. Thus, a new leaf node has a
       68 + * filesystem count of 1 and a snapshot count of 0. A filesystem count of 0 on
       69 + * a node indicates uninitialized counts on that node.) When setting a limit on
       70 + * an uninitialized node, the code starts at the filesystem with the new limit
       71 + * and descends into all sub-filesystems and updates the counts to be accurate.
       72 + * In practice this is lightweight since a limit is typically set when the
       73 + * filesystem is created and thus has no children. Once valid, changing the
       74 + * limit value won't require a re-traversal since the counts are already valid.
       75 + * When recursively fixing the counts, if a node with a limit is encountered
       76 + * during the descent, the counts are known to be valid and there is no need to
       77 + * descend into that filesystem's children. The counts on filesystems above the
       78 + * one with the new limit will still be uninitialized (0), unless a limit is
       79 + * eventually set on one of those filesystems. The counts are always recursively
       80 + * updated when a limit is set on a dataset, unless there is already a limit.
       81 + * When a new limit value is set on a filesystem with an existing limit, it is
       82 + * possible for the new limit to be less than the current count at that level
       83 + * since a user who can change the limit is also allowed to exceed the limit.
       84 + *
       85 + * Once the feature is active, then whenever a filesystem or snapshot is
       86 + * created, the code recurses up the tree, validating the new count against the
       87 + * limit at each initialized level. In practice, most levels will not have a
       88 + * limit set. If there is a limit at any initialized level up the tree, the
       89 + * check must pass or the creation will fail. Likewise, when a filesystem or
       90 + * snapshot is destroyed, the counts are recursively adjusted all the way up
        91 + * the initialized nodes in the tree. Renaming a filesystem into a different
        92 + * point in the tree will first validate, then update the counts on each branch
        93 + * up to the common ancestor. A receive will also validate the counts and then
        94 + * update them.
       95 + *
       96 + * An exception to the above behavior is that the limit is not enforced if the
       97 + * user has permission to modify the limit. This is primarily so that
       98 + * recursive snapshots in the global zone always work. We want to prevent a
       99 + * denial-of-service in which a lower level delegated dataset could max out its
      100 + * limit and thus block recursive snapshots from being taken in the global zone.
      101 + * Because of this, it is possible for the snapshot count to be over the limit
      102 + * and snapshots taken in the global zone could cause a lower level dataset to
      103 + * hit or exceed its limit. The administrator taking the global zone recursive
      104 + * snapshot should be aware of this side-effect and behave accordingly.
      105 + * For consistency, the filesystem limit is also not enforced if the user can
      106 + * modify the limit.
      107 + *
      108 + * The filesystem limit is validated by dsl_dir_fscount_check() and updated by
      109 + * dsl_dir_fscount_adjust(). The snapshot limit is validated by
      110 + * dsl_snapcount_check() and updated by dsl_snapcount_adjust().
      111 + * A new limit value is validated in dsl_dir_validate_fs_ss_limit() and the
      112 + * filesystem counts are adjusted, if necessary, by dsl_dir_set_fs_ss_count().
      113 + *
      114 + * There is a special case when we receive a filesystem that already exists. In
      115 + * this case a temporary clone name of %X is created (see dmu_recv_begin). We
      116 + * never update the filesystem counts for temporary clones.
      117 + */
      118 +
  43  119  static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
  44  120  static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd,
  45  121      uint64_t value, dmu_tx_t *tx);
  46  122  
      123 +extern dsl_syncfunc_t dsl_prop_set_sync;
      124 +
  47  125  /* ARGSUSED */
  48  126  static void
  49  127  dsl_dir_evict(dmu_buf_t *db, void *arg)
  50  128  {
  51  129          dsl_dir_t *dd = arg;
  52  130          dsl_pool_t *dp = dd->dd_pool;
  53  131          int t;
  54  132  
  55  133          for (t = 0; t < TXG_SIZE; t++) {
  56  134                  ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
↓ open down ↓ 343 lines elided ↑ open up ↑
 400  478   * be found in *tail.  Return NULL if the path is bogus, or if
 401  479   * tail==NULL and we couldn't parse the whole name.  (*tail)[0] == '@'
 402  480   * means that the last component is a snapshot.
 403  481   */
  404  482  int
  405  483  dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
  406  484  {       /* same as dsl_dir_open_spa() with a NULL first argument */
  407  485          return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
  408  486  }
 409  487  
      488 +/*
      489 + * Check if the counts are already valid for this filesystem and its
      490 + * descendants. The counts on this filesystem, and those below, may be
      491 + * uninitialized due to either the use of a pre-existing pool which did not
      492 + * support the filesystem/snapshot limit feature, or one in which the feature
      493 + * had not yet been enabled.
      494 + *
      495 + * Recursively descend the filesystem tree and update the filesystem/snapshot
      496 + * counts on each filesystem below, then update the cumulative count on the
      497 + * current filesystem. If the filesystem already has a limit set on it,
      498 + * then we know that its counts, and the counts on the filesystems below it,
      499 + * have been updated to be correct, so we can skip this filesystem.
      500 + */
      501 +static int
      502 +dsl_dir_set_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx, uint64_t *fscnt,
      503 +    uint64_t *sscnt)
      504 +{
      505 +        uint64_t my_fs_cnt = 0;
      506 +        uint64_t my_ss_cnt = 0;
      507 +        uint64_t curr_ss_cnt;
      508 +        objset_t *os = dd->dd_pool->dp_meta_objset;
      509 +        zap_cursor_t *zc;
      510 +        zap_attribute_t *za;
      511 +        int err;
      512 +        int ret = 0;
      513 +        boolean_t limit_set = B_FALSE;
      514 +        uint64_t fslimit, sslimit;
      515 +        dsl_dataset_t *ds;
      516 +
      517 +        ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
      518 +
      519 +        err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_FILESYSTEM_LIMIT),
      520 +            8, 1, &fslimit, NULL, B_FALSE);
      521 +        if (err == 0 && fslimit != UINT64_MAX)
      522 +                limit_set = B_TRUE;
      523 +
      524 +        if (!limit_set) {
      525 +                err = dsl_prop_get_dd(dd,
      526 +                    zfs_prop_to_name(ZFS_PROP_SNAPSHOT_LIMIT), 8, 1, &sslimit,
      527 +                    NULL, B_FALSE);
      528 +                if (err == 0 && sslimit != UINT64_MAX)
      529 +                        limit_set = B_TRUE;
      530 +        }
      531 +
      532 +        /*
      533 +         * If the dd has a limit, we know its count is already good and we
      534 +         * don't need to recurse down any further.
      535 +         */
      536 +        if (limit_set) {
      537 +                *fscnt = dd->dd_phys->dd_filesystem_count;
      538 +                *sscnt = dd->dd_phys->dd_snapshot_count;
      539 +                return (ret);
      540 +        }
      541 +
      542 +        zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
      543 +        za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
      544 +
      545 +        mutex_enter(&dd->dd_lock);
      546 +
      547 +        /* Iterate datasets */
      548 +        for (zap_cursor_init(zc, os, dd->dd_phys->dd_child_dir_zapobj);
      549 +            zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
      550 +                dsl_dir_t *chld_dd;
      551 +                uint64_t chld_fs_cnt = 0;
      552 +                uint64_t chld_ss_cnt = 0;
      553 +
      554 +                if (dsl_dir_open_obj(dd->dd_pool,
      555 +                    ZFS_DIRENT_OBJ(za->za_first_integer), NULL, FTAG,
      556 +                    &chld_dd)) {
      557 +                        ret = 1;
      558 +                        break;
      559 +                }
      560 +
      561 +                if (dsl_dir_set_fs_ss_count(chld_dd, tx, &chld_fs_cnt,
      562 +                    &chld_ss_cnt)) {
      563 +                        ret = 1;
      564 +                        break;
      565 +                }
      566 +
      567 +                dsl_dir_close(chld_dd, FTAG);
      568 +
      569 +                my_fs_cnt += chld_fs_cnt;
      570 +                my_ss_cnt += chld_ss_cnt;
      571 +        }
      572 +        zap_cursor_fini(zc);
      573 +        kmem_free(zc, sizeof (zap_cursor_t));
      574 +        kmem_free(za, sizeof (zap_attribute_t));
      575 +
      576 +        /* Count snapshots */
      577 +        if (dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
      578 +            FTAG, &ds) == 0) {
      579 +                if (zap_count(os, ds->ds_phys->ds_snapnames_zapobj,
      580 +                    &curr_ss_cnt) == 0)
      581 +                        my_ss_cnt += curr_ss_cnt;
      582 +                else
      583 +                        ret = 1;
      584 +                dsl_dataset_rele(ds, FTAG);
      585 +        } else {
      586 +                ret = 1;
      587 +        }
      588 +
      589 +        /* Add 1 for self */
      590 +        my_fs_cnt++;
      591 +
      592 +        /* save updated counts */
      593 +        dmu_buf_will_dirty(dd->dd_dbuf, tx);
      594 +        dd->dd_phys->dd_filesystem_count = my_fs_cnt;
      595 +        dd->dd_phys->dd_snapshot_count = my_ss_cnt;
      596 +
      597 +        mutex_exit(&dd->dd_lock);
      598 +
      599 +        /* Return child dataset count plus self */
      600 +        *fscnt = my_fs_cnt;
      601 +        *sscnt = my_ss_cnt;
      602 +        return (ret);
      603 +}
      604 +
       605 +/* ARGSUSED */
       606 +static int
       607 +fs_ss_limit_feat_check(void *arg1, void *arg2, dmu_tx_t *tx)
       608 +{
       609 +        return (0);     /* no preconditions; always succeeds */
       610 +}
      611 +
       612 +/* ARGSUSED */
       613 +static void
       614 +fs_ss_limit_feat_sync(void *arg1, void *arg2, dmu_tx_t *tx)
       615 +{
       616 +        spa_t *spa = arg1;      /* arg1 is the spa_t; arg2 is unused */
       617 +        zfeature_info_t *limit_feat =
       618 +            &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
       619 +
       620 +        spa_feature_incr(spa, limit_feat, tx);
       621 +}
      622 +
      623 +/*
      624 + * Make sure the feature is enabled and activate it if necessary.
      625 + * If setting a limit, ensure the on-disk counts are valid.
      626 + *
      627 + * We do not validate the new limit, since users who can change the limit are
      628 + * also allowed to exceed the limit.
      629 + *
      630 + * Return -1 to force the zfs_set_prop_nvlist code down the default path to set
      631 + * the value in the nvlist.
      632 + */
      633 +int
      634 +dsl_dir_validate_fs_ss_limit(const char *ddname, uint64_t limit,
      635 +    zfs_prop_t ptype)
      636 +{
      637 +        dsl_dir_t *dd;
      638 +        dsl_dataset_t *ds;
      639 +        int err;
      640 +        dmu_tx_t *tx;
      641 +        uint64_t my_fs_cnt = 0;
      642 +        uint64_t my_ss_cnt = 0;
      643 +        uint64_t curr_limit;
      644 +        spa_t *spa;
      645 +        zfeature_info_t *limit_feat =
      646 +            &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
      647 +
      648 +        if ((err = dsl_dataset_hold(ddname, FTAG, &ds)) != 0)
      649 +                return (err);
      650 +
      651 +        spa = dsl_dataset_get_spa(ds);
      652 +        if (!spa_feature_is_enabled(spa,
      653 +            &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT])) {
      654 +                dsl_dataset_rele(ds, FTAG);
      655 +                return (ENOTSUP);
      656 +        }
      657 +
      658 +        dd = ds->ds_dir;
      659 +
      660 +        if ((err = dsl_prop_get_dd(dd, zfs_prop_to_name(ptype), 8, 1,
      661 +            &curr_limit, NULL, B_FALSE)) != 0) {
      662 +                dsl_dataset_rele(ds, FTAG);
      663 +                return (err);
      664 +        }
      665 +
      666 +        if (limit == UINT64_MAX) {
      667 +                /*
      668 +                 * If we had a limit, since we're now removing that limit, this
      669 +                 * is where we could decrement the feature-active counter so
      670 +                 * that the feature becomes inactive (only enabled) if we
      671 +                 * remove the last limit. However, we do not currently support
      672 +                 * deactivating the feature.
      673 +                 */
      674 +                dsl_dataset_rele(ds, FTAG);
      675 +                return (-1);
      676 +        }
      677 +
      678 +        if (!spa_feature_is_active(spa, limit_feat)) {
      679 +                /*
      680 +                 * Since the feature was not active and we're now setting a
      681 +                 * limit, increment the feature-active counter so that the
      682 +                 * feature becomes active for the first time.
      683 +                 *
      684 +                 * We can't update the MOS in open context, so create a sync
      685 +                 * task.
      686 +                 */
      687 +                err = dsl_sync_task_do(dd->dd_pool, fs_ss_limit_feat_check,
      688 +                    fs_ss_limit_feat_sync, spa, (void *)1, 0);
      689 +                if (err != 0)
      690 +                        return (err);
      691 +        }
      692 +
      693 +        tx = dmu_tx_create_dd(dd);
      694 +        if (dmu_tx_assign(tx, TXG_WAIT)) {
      695 +                dmu_tx_abort(tx);
      696 +                dsl_dataset_rele(ds, FTAG);
      697 +                return (ENOSPC);
      698 +        }
      699 +
      700 +        /*
      701 +         * Since we are now setting a non-UINT64_MAX on the filesystem, we need
      702 +         * to ensure the counts are correct. Descend down the tree from this
      703 +         * point and update all of the counts to be accurate.
      704 +         */
      705 +        err = -1;
      706 +        rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
      707 +        if (dsl_dir_set_fs_ss_count(dd, tx, &my_fs_cnt, &my_ss_cnt))
      708 +                err = ENOSPC;
      709 +        rw_exit(&dd->dd_pool->dp_config_rwlock);
      710 +
      711 +        dmu_tx_commit(tx);
      712 +        dsl_dataset_rele(ds, FTAG);
      713 +
      714 +        return (err);
      715 +}
      716 +
      717 +/*
      718 + * Used to determine if the filesystem_limit or snapshot_limit should be
      719 + * enforced. We allow the limit to be exceeded if the user has permission to
      720 + * write the property value. We pass in the creds that we got in the open
      721 + * context since we will always be the GZ root in syncing context.
      722 + *
      723 + * We can never modify these two properties within a non-global zone. In
      724 + * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
      725 + * can't use that function since we are already holding the dp_config_rwlock.
      726 + * In addition, we already have the dd and dealing with snapshots is simplified.
      727 + */
      728 +int
      729 +dsl_secpolicy_write_prop(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
      730 +{
      731 +        int err = 0;
      732 +        uint64_t obj;
      733 +        dsl_dataset_t *ds;
      734 +        uint64_t zoned;
      735 +
      736 +#ifdef _KERNEL
      737 +        if (crgetzoneid(cr) != GLOBAL_ZONEID)
      738 +                return (EPERM);
      739 +
      740 +        if (secpolicy_zfs(cr) == 0)
      741 +                return (0);
      742 +#endif
      743 +
      744 +        if ((obj = dd->dd_phys->dd_head_dataset_obj) == NULL)
      745 +                return (ENOENT);
      746 +
      747 +        ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
      748 +
      749 +        if ((err = dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds)) != 0)
      750 +                return (err);
      751 +
      752 +        if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
      753 +                /* Only root can access zoned fs's from the GZ */
      754 +                err = EPERM;
      755 +        } else {
      756 +                err = dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr,
      757 +                    B_FALSE);
      758 +        }
      759 +
      760 +        dsl_dataset_rele(ds, FTAG);
      761 +        return (err);
      762 +}
      763 +
       764 +/*
       765 + * Check whether adding cnt child filesystem(s) under dd would exceed the
       766 + * filesystem limit on dd or any ancestor, stopping at the given ancestor or at
       767 + * the first uninitialized dir. Returns 0, EDQUOT, or a property lookup error.
       768 + */
       769 +int
       770 +dsl_dir_fscount_check(dsl_dir_t *dd, uint64_t cnt, dsl_dir_t *ancestor,
       771 +    cred_t *cr)
       772 +{
       773 +        uint64_t limit;
       774 +        int err = 0;
       775 +
       776 +        VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
       777 +
       778 +        /* If we're allowed to change the limit, don't enforce the limit. */
       779 +        if (dsl_secpolicy_write_prop(dd, ZFS_PROP_FILESYSTEM_LIMIT, cr) == 0)
       780 +                return (0);
       781 +
       782 +        /*
       783 +         * If an ancestor has been provided, stop checking the limit once we
       784 +         * hit that dir. We need this during rename so that we don't overcount
       785 +         * the check once we recurse up to the common ancestor.
       786 +         */
       787 +        if (ancestor == dd)
       788 +                return (0);
       789 +
       790 +        /*
       791 +         * If we hit an uninitialized node while recursing up the tree, we can
       792 +         * stop since we know the counts are not valid on this node and we
       793 +         * know we won't touch this node's counts.
       794 +         */
       795 +        if (dd->dd_phys->dd_filesystem_count == 0)
       796 +                return (0);
       797 +
       798 +        err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_FILESYSTEM_LIMIT),
       799 +            8, 1, &limit, NULL, B_FALSE);
       800 +        if (err != 0)
       801 +                return (err);
       802 +
       803 +        /* Is there a fs limit which we've hit? */
       804 +        if ((dd->dd_phys->dd_filesystem_count + cnt) > limit)
       805 +                return (EDQUOT);
       806 +
       807 +        if (dd->dd_parent != NULL)
       808 +                err = dsl_dir_fscount_check(dd->dd_parent, cnt, ancestor, cr);
       809 +
       810 +        return (err);
       811 +}
      812 +
      813 +/*
      814 + * Adjust the filesystem count for the specified dsl_dir_t and all parent
      815 + * filesystems. When a new filesystem is created, increment the count on all
      816 + * parents, and when a filesystem is destroyed, decrement the count.
      817 + */
      818 +void
      819 +dsl_dir_fscount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
      820 +    boolean_t first)
      821 +{
      822 +        if (first) {
      823 +                VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
      824 +                VERIFY(dmu_tx_is_syncing(tx));
      825 +        }
      826 +
      827 +        /*
      828 +         * When we receive an incremental stream into a filesystem that already
      829 +         * exists, a temporary clone is created.  We don't count this temporary
      830 +         * clone, whose name begins with a '%'.
      831 +         */
      832 +        if (dd->dd_myname[0] == '%')
      833 +                return;
      834 +
      835 +        /*
      836 +         * If we hit an uninitialized node while recursing up the tree, we can
      837 +         * stop since we know the counts are not valid on this node and we
      838 +         * know we shouldn't touch this node's counts. An uninitialized count
      839 +         * on the node indicates that either the feature has not yet been
      840 +         * activated or there are no limits on this part of the tree.
      841 +         */
      842 +        if (dd->dd_phys->dd_filesystem_count == 0)
      843 +                return;
      844 +
      845 +        /*
      846 +         * On initial entry we need to check if this feature is active, but
      847 +         * we don't want to re-check this on each recursive call. Note: the
      848 +         * feature cannot be active if its not enabled. If the feature is not
      849 +         * active, don't touch the on-disk count fields.
      850 +         */
      851 +        if (first) {
      852 +                zfeature_info_t *quota_feat =
      853 +                    &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
      854 +
      855 +                if (!spa_feature_is_active(dd->dd_pool->dp_spa, quota_feat))
      856 +                        return;
      857 +        }
      858 +
      859 +        dmu_buf_will_dirty(dd->dd_dbuf, tx);
      860 +
      861 +        mutex_enter(&dd->dd_lock);
      862 +
      863 +        dd->dd_phys->dd_filesystem_count += delta;
      864 +        VERIFY(dd->dd_phys->dd_filesystem_count >= 1);  /* ourself is 1 */
      865 +
      866 +        /* Roll up this additional count into our ancestors */
      867 +        if (dd->dd_parent != NULL)
      868 +                dsl_dir_fscount_adjust(dd->dd_parent, tx, delta, B_FALSE);
      869 +
      870 +        mutex_exit(&dd->dd_lock);
      871 +}
      872 +
 410  873  uint64_t
 411  874  dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
 412  875      dmu_tx_t *tx)
 413  876  {
 414  877          objset_t *mos = dp->dp_meta_objset;
 415  878          uint64_t ddobj;
 416  879          dsl_dir_phys_t *ddphys;
 417  880          dmu_buf_t *dbuf;
      881 +        zfeature_info_t *limit_feat =
      882 +            &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
 418  883  
      884 +
 419  885          ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
 420  886              DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
 421  887          if (pds) {
 422  888                  VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
 423  889                      name, sizeof (uint64_t), 1, &ddobj, tx));
 424  890          } else {
 425  891                  /* it's the root dir */
 426  892                  VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 427  893                      DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
 428  894          }
 429  895          VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
 430  896          dmu_buf_will_dirty(dbuf, tx);
 431  897          ddphys = dbuf->db_data;
 432  898  
 433  899          ddphys->dd_creation_time = gethrestime_sec();
      900 +        /* Only initialize the count if the limit feature is active */
      901 +        if (spa_feature_is_active(dp->dp_spa, limit_feat))
      902 +                ddphys->dd_filesystem_count = 1;
 434  903          if (pds)
 435  904                  ddphys->dd_parent_obj = pds->dd_object;
 436  905          ddphys->dd_props_zapobj = zap_create(mos,
 437  906              DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
 438  907          ddphys->dd_child_dir_zapobj = zap_create(mos,
 439  908              DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
 440  909          if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
 441  910                  ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
 442  911          dmu_buf_rele(dbuf, FTAG);
 443  912  
↓ open down ↓ 37 lines elided ↑ open up ↑
 481  950  {
 482  951          dsl_dir_t *dd = arg1;
 483  952          objset_t *mos = dd->dd_pool->dp_meta_objset;
 484  953          uint64_t obj;
 485  954          dd_used_t t;
 486  955  
 487  956          ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
 488  957          ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
 489  958  
 490  959          /*
      960 +         * Decrement the filesystem count for all parent filesystems.
      961 +         *
      962 +         * When we receive an incremental stream into a filesystem that already
      963 +         * exists, a temporary clone is created.  We never count this temporary
      964 +         * clone, whose name begins with a '%'.
      965 +         */
      966 +        if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
      967 +                dsl_dir_fscount_adjust(dd->dd_parent, tx, -1, B_TRUE);
      968 +
      969 +        /*
 491  970           * Remove our reservation. The impl() routine avoids setting the
 492  971           * actual property, which would require the (already destroyed) ds.
 493  972           */
 494  973          dsl_dir_set_reservation_sync_impl(dd, 0, tx);
 495  974  
 496  975          ASSERT0(dd->dd_phys->dd_used_bytes);
 497  976          ASSERT0(dd->dd_phys->dd_reserved);
 498  977          for (t = 0; t < DD_USED_NUM; t++)
 499  978                  ASSERT0(dd->dd_phys->dd_used_breakdown[t]);
 500  979  
↓ open down ↓ 528 lines elided ↑ open up ↑
1029 1508          towrite = dsl_dir_space_towrite(dd);
1030 1509          if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
1031 1510              (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
1032 1511              psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
1033 1512                  err = ENOSPC;
1034 1513          }
1035 1514          mutex_exit(&dd->dd_lock);
1036 1515          return (err);
1037 1516  }
1038 1517  
1039      -extern dsl_syncfunc_t dsl_prop_set_sync;
1040      -
1041 1518  static void
1042 1519  dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1043 1520  {
1044 1521          dsl_dataset_t *ds = arg1;
1045 1522          dsl_dir_t *dd = ds->ds_dir;
1046 1523          dsl_prop_setarg_t *psa = arg2;
1047 1524          uint64_t effective_value = psa->psa_effective_value;
1048 1525  
1049 1526          dsl_prop_set_sync(ds, psa, tx);
1050 1527          DSL_PROP_CHECK_PREDICTION(dd, psa);
↓ open down ↓ 180 lines elided ↑ open up ↑
1231 1708  
1232 1709          mutex_enter(&dd->dd_lock);
1233 1710          delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
1234 1711          mutex_exit(&dd->dd_lock);
1235 1712          return (would_change(dd->dd_parent, delta, ancestor));
1236 1713  }
1237 1714  
 1238 1715  struct renamearg {
 1239 1716          dsl_dir_t *newparent;   /* dir the renamed dir moves under */
 1240 1717          const char *mynewname;  /* new last name component */
      1718 +        cred_t *cr;     /* passed to dsl_dir_transfer_possible() */
 1241 1719  };
1242 1720  
1243 1721  static int
1244 1722  dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
1245 1723  {
1246 1724          dsl_dir_t *dd = arg1;
1247 1725          struct renamearg *ra = arg2;
1248 1726          dsl_pool_t *dp = dd->dd_pool;
1249 1727          objset_t *mos = dp->dp_meta_objset;
1250 1728          int err;
↓ open down ↓ 20 lines elided ↑ open up ↑
1271 1749          if (ra->newparent != dd->dd_parent) {
1272 1750                  /* is there enough space? */
1273 1751                  uint64_t myspace =
1274 1752                      MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
1275 1753  
1276 1754                  /* no rename into our descendant */
1277 1755                  if (closest_common_ancestor(dd, ra->newparent) == dd)
1278 1756                          return (EINVAL);
1279 1757  
1280 1758                  if (err = dsl_dir_transfer_possible(dd->dd_parent,
1281      -                    ra->newparent, myspace))
     1759 +                    ra->newparent, dd, myspace, ra->cr))
1282 1760                          return (err);
     1761 +
     1762 +                if (dd->dd_phys->dd_filesystem_count == 0 &&
     1763 +                    dmu_tx_is_syncing(tx)) {
     1764 +                        uint64_t fs_cnt = 0;
     1765 +                        uint64_t ss_cnt = 0;
     1766 +
     1767 +                        /*
     1768 +                         * Ensure this portion of the tree's counts have been
     1769 +                         * initialized in case the new parent has limits set.
     1770 +                         */
     1771 +                        err = dsl_dir_set_fs_ss_count(dd, tx, &fs_cnt, &ss_cnt);
     1772 +                        if (err)
     1773 +                                return (EIO);
     1774 +                }
1283 1775          }
1284 1776  
1285 1777          return (0);
1286 1778  }
1287 1779  
1288 1780  static void
1289 1781  dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1290 1782  {
1291 1783          dsl_dir_t *dd = arg1;
1292 1784          struct renamearg *ra = arg2;
↓ open down ↓ 3 lines elided ↑ open up ↑
1296 1788          char namebuf[MAXNAMELEN];
1297 1789  
1298 1790          ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
1299 1791  
1300 1792          /* Log this before we change the name. */
1301 1793          dsl_dir_name(ra->newparent, namebuf);
1302 1794          spa_history_log_internal_dd(dd, "rename", tx,
1303 1795              "-> %s/%s", namebuf, ra->mynewname);
1304 1796  
1305 1797          if (ra->newparent != dd->dd_parent) {
     1798 +                int cnt;
     1799 +
     1800 +                mutex_enter(&dd->dd_lock);
     1801 +
     1802 +                cnt = dd->dd_phys->dd_filesystem_count;
     1803 +                dsl_dir_fscount_adjust(dd->dd_parent, tx, -cnt, B_TRUE);
     1804 +                dsl_dir_fscount_adjust(ra->newparent, tx, cnt, B_TRUE);
     1805 +
     1806 +                cnt = dd->dd_phys->dd_snapshot_count;
     1807 +                dsl_snapcount_adjust(dd->dd_parent, tx, -cnt, B_TRUE);
     1808 +                dsl_snapcount_adjust(ra->newparent, tx, cnt, B_TRUE);
     1809 +
     1810 +                mutex_exit(&dd->dd_lock);
     1811 +
1306 1812                  dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1307 1813                      -dd->dd_phys->dd_used_bytes,
1308 1814                      -dd->dd_phys->dd_compressed_bytes,
1309 1815                      -dd->dd_phys->dd_uncompressed_bytes, tx);
1310 1816                  dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD,
1311 1817                      dd->dd_phys->dd_used_bytes,
1312 1818                      dd->dd_phys->dd_compressed_bytes,
1313 1819                      dd->dd_phys->dd_uncompressed_bytes, tx);
1314 1820  
1315 1821                  if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
↓ open down ↓ 43 lines elided ↑ open up ↑
1359 1865                  err = ENXIO;
1360 1866                  goto out;
1361 1867          }
1362 1868  
1363 1869          /* new name should not already exist */
1364 1870          if (ra.mynewname == NULL) {
1365 1871                  err = EEXIST;
1366 1872                  goto out;
1367 1873          }
1368 1874  
     1875 +        ra.cr = CRED();
     1876 +
1369 1877          err = dsl_sync_task_do(dd->dd_pool,
1370 1878              dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
1371 1879  
1372 1880  out:
1373 1881          dsl_dir_close(ra.newparent, FTAG);
1374 1882          return (err);
1375 1883  }
1376 1884  
1377 1885  int
1378      -dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
     1886 +dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, dsl_dir_t *moving_dd,
     1887 +    uint64_t space, cred_t *cr)
1379 1888  {
1380 1889          dsl_dir_t *ancestor;
1381 1890          int64_t adelta;
1382 1891          uint64_t avail;
     1892 +        int err;
1383 1893  
1384 1894          ancestor = closest_common_ancestor(sdd, tdd);
1385 1895          adelta = would_change(sdd, -space, ancestor);
1386 1896          avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
1387 1897          if (avail < space)
1388 1898                  return (ENOSPC);
1389 1899  
     1900 +        if (sdd != moving_dd) {
     1901 +                err = dsl_dir_fscount_check(tdd,
     1902 +                    moving_dd->dd_phys->dd_filesystem_count, ancestor, cr);
     1903 +                if (err != 0)
     1904 +                        return (err);
     1905 +        }
     1906 +        err = dsl_snapcount_check(tdd, moving_dd->dd_phys->dd_snapshot_count,
     1907 +            ancestor, cr);
     1908 +        if (err != 0)
     1909 +                return (err);
     1910 +
1390 1911          return (0);
1391 1912  }
1392 1913  
1393 1914  timestruc_t
1394 1915  dsl_dir_snap_cmtime(dsl_dir_t *dd)
1395 1916  {
1396 1917          timestruc_t t;
1397 1918  
1398 1919          mutex_enter(&dd->dd_lock);
1399 1920          t = dd->dd_snap_cmtime;
↓ open down ↓ 15 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX