OS-1566 filesystem limits for ZFS datasets
    
      
    
    
          --- old/usr/src/uts/common/fs/zfs/dsl_dir.c
          +++ new/usr/src/uts/common/fs/zfs/dsl_dir.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012 by Delphix. All rights reserved.
       24 + * Copyright (c) 2012 Joyent, Inc. All rights reserved.
  24   25   */
  25   26  
  26   27  #include <sys/dmu.h>
  27   28  #include <sys/dmu_objset.h>
  28   29  #include <sys/dmu_tx.h>
  29   30  #include <sys/dsl_dataset.h>
  30   31  #include <sys/dsl_dir.h>
  31   32  #include <sys/dsl_prop.h>
  32   33  #include <sys/dsl_synctask.h>
  33   34  #include <sys/dsl_deleg.h>
  34   35  #include <sys/spa.h>
  35   36  #include <sys/metaslab.h>
  36   37  #include <sys/zap.h>
  37   38  #include <sys/zio.h>
  38   39  #include <sys/arc.h>
  39   40  #include <sys/sunddi.h>
  40   41  #include <sys/zfs_zone.h>
       42 +#include <sys/zfeature.h>
       43 +#include <sys/policy.h>
       44 +#include <sys/zfs_znode.h>
  41   45  #include "zfs_namecheck.h"
       46 +#include "zfs_prop.h"
  42   47  
       48 +/*
       49 + * Filesystem and Snapshot Limits
       50 + * ------------------------------
       51 + *
       52 + * These limits are used to restrict the number of filesystems and/or snapshots
       53 + * that can be created at a given level in the tree or below. A typical
       54 + * use-case is with a delegated dataset where the administrator wants to ensure
       55 + * that a user within the zone is not creating too many additional filesystems
       56 + * or snapshots, even though they're not exceeding their space quota.
       57 + *
       58 + * The count of filesystems and snapshots is stored in the dsl_dir_phys_t which
       59 + * impacts the on-disk format. As such, this capability is controlled by a
       60 + * feature flag and must be enabled to be used. Once enabled, the feature is
       61 + * not active until the first limit is set. At that point, future operations to
       62 + * create/destroy filesystems or snapshots will validate and update the counts.
       63 + *
       64 + * Because the on-disk counts will be uninitialized (0) before the feature is
       65 + * active, the counts are updated when a limit is first set on an uninitialized
       66 + * node (The filesystem/snapshot counts on a node include all of the nested
       67 + * filesystems/snapshots, plus the node itself. Thus, a new leaf node has a
       68 + * filesystem count of 1 and a snapshot count of 0. A filesystem count of 0 on
       69 + * a node indicates uninitialized counts on that node.) When setting a limit on
       70 + * an uninitialized node, the code starts at the filesystem with the new limit
       71 + * and descends into all sub-filesystems and updates the counts to be accurate.
       72 + * In practice this is lightweight since a limit is typically set when the
       73 + * filesystem is created and thus has no children. Once valid, changing the
       74 + * limit value won't require a re-traversal since the counts are already valid.
       75 + * When recursively fixing the counts, if a node with a limit is encountered
       76 + * during the descent, the counts are known to be valid and there is no need to
       77 + * descend into that filesystem's children. The counts on filesystems above the
       78 + * one with the new limit will still be uninitialized (0), unless a limit is
       79 + * eventually set on one of those filesystems. The counts are always recursively
       80 + * updated when a limit is set on a dataset, unless there is already a limit.
       81 + * When a new limit value is set on a filesystem with an existing limit, it is
       82 + * possible for the new limit to be less than the current count at that level
       83 + * since a user who can change the limit is also allowed to exceed the limit.
       84 + *
       85 + * Once the feature is active, then whenever a filesystem or snapshot is
       86 + * created, the code recurses up the tree, validating the new count against the
       87 + * limit at each initialized level. In practice, most levels will not have a
       88 + * limit set. If there is a limit at any initialized level up the tree, the
       89 + * check must pass or the creation will fail. Likewise, when a filesystem or
       90 + * snapshot is destroyed, the counts are recursively adjusted all the way up
       91 + * the initialized nodes in the tree. Renaming a filesystem to a different point
       92 + * in the tree will first validate, then update the counts on each branch up to
       93 + * the common ancestor. A receive will also validate the counts and then update
       94 + * them.
       95 + *
       96 + * An exception to the above behavior is that the limit is not enforced if the
       97 + * user has permission to modify the limit. This is primarily so that
       98 + * recursive snapshots in the global zone always work. We want to prevent a
       99 + * denial-of-service in which a lower level delegated dataset could max out its
      100 + * limit and thus block recursive snapshots from being taken in the global zone.
      101 + * Because of this, it is possible for the snapshot count to be over the limit
      102 + * and snapshots taken in the global zone could cause a lower level dataset to
      103 + * hit or exceed its limit. The administrator taking the global zone recursive
      104 + * snapshot should be aware of this side-effect and behave accordingly.
      105 + * For consistency, the filesystem limit is also not enforced if the user can
      106 + * modify the limit.
      107 + *
      108 + * The filesystem limit is validated by dsl_dir_fscount_check() and updated by
      109 + * dsl_dir_fscount_adjust(). The snapshot limit is validated by
      110 + * dsl_snapcount_check() and updated by dsl_snapcount_adjust().
      111 + * A new limit value is validated in dsl_dir_validate_fs_ss_limit() and the
      112 + * filesystem counts are adjusted, if necessary, by dsl_dir_set_fs_ss_count().
      113 + *
      114 + * There is a special case when we receive a filesystem that already exists. In
      115 + * this case a temporary clone name of %X is created (see dmu_recv_begin). We
      116 + * never update the filesystem counts for temporary clones.
      117 + */
      118 +
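
To make the bookkeeping above concrete, the following is a minimal userland sketch of the check/adjust walk this comment describes. The toy_dir_t type and the helper names are invented for illustration (they are not the ZFS structures), and the permission bypass and the snapshot counts are left out: it only models how a create is validated against every initialized ancestor that has a limit, and how the new count is then rolled up until the first uninitialized node is reached.

/*
 * Toy model of the filesystem-count bookkeeping described above; a count of 0
 * marks an uninitialized node and stops both walks.
 */
#include <stdint.h>
#include <stdio.h>

#define NO_LIMIT        UINT64_MAX
#define TOY_EDQUOT      122             /* stand-in for EDQUOT */

typedef struct toy_dir {
        struct toy_dir *parent;
        uint64_t fs_count;              /* 0 means uninitialized */
        uint64_t fs_limit;              /* NO_LIMIT means no limit set */
        const char *name;
} toy_dir_t;

/* Same shape as dsl_dir_fscount_check(): walk up, test each initialized limit. */
static int
toy_fscount_check(toy_dir_t *d, uint64_t cnt)
{
        for (; d != NULL; d = d->parent) {
                if (d->fs_count == 0)           /* uninitialized: stop */
                        return (0);
                if (d->fs_limit != NO_LIMIT && d->fs_count + cnt > d->fs_limit)
                        return (TOY_EDQUOT);
        }
        return (0);
}

/* Same shape as dsl_dir_fscount_adjust(): walk up initialized nodes, apply delta. */
static void
toy_fscount_adjust(toy_dir_t *d, int64_t delta)
{
        for (; d != NULL && d->fs_count != 0; d = d->parent)
                d->fs_count += delta;
}

int
main(void)
{
        toy_dir_t root = { NULL, 0, NO_LIMIT, "tank" };         /* uninitialized */
        toy_dir_t deleg = { &root, 3, 4, "tank/deleg" };        /* limit of 4 */
        toy_dir_t child = { &deleg, 1, NO_LIMIT, "tank/deleg/a" };

        /* One more filesystem under tank/deleg/a still fits under the limit. */
        if (toy_fscount_check(&child, 1) == 0)
                toy_fscount_adjust(&child, 1);
        (void) printf("%s count is now %llu\n", deleg.name,
            (unsigned long long)deleg.fs_count);                /* 4 */

        /* A second create would push tank/deleg past its limit of 4. */
        (void) printf("next create: %s\n",
            toy_fscount_check(&child, 1) == 0 ? "allowed" : "EDQUOT");
        return (0);
}

The points mirrored from the comment are that a filesystem count of 0 marks an uninitialized node and stops both walks, and that a limit is only consulted on initialized nodes.
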
  43  119  static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
  44  120  static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd,
  45  121      uint64_t value, dmu_tx_t *tx);
  46  122  
      123 +extern dsl_syncfunc_t dsl_prop_set_sync;
      124 +
  47  125  /* ARGSUSED */
  48  126  static void
  49  127  dsl_dir_evict(dmu_buf_t *db, void *arg)
  50  128  {
  51  129          dsl_dir_t *dd = arg;
  52  130          dsl_pool_t *dp = dd->dd_pool;
  53  131          int t;
  54  132  
  55  133          for (t = 0; t < TXG_SIZE; t++) {
  56  134                  ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
  57  135                  ASSERT(dd->dd_tempreserved[t] == 0);
  58  136                  ASSERT(dd->dd_space_towrite[t] == 0);
  59  137          }
  60  138  
  61  139          if (dd->dd_parent)
  62  140                  dsl_dir_close(dd->dd_parent, dd);
  63  141  
  64  142          spa_close(dd->dd_pool->dp_spa, dd);
  65  143  
  66  144          /*
  67  145           * The props callback list should have been cleaned up by
  68  146           * objset_evict().
  69  147           */
  70  148          list_destroy(&dd->dd_prop_cbs);
  71  149          mutex_destroy(&dd->dd_lock);
  72  150          kmem_free(dd, sizeof (dsl_dir_t));
  73  151  }
  74  152  
  75  153  int
  76  154  dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
  77  155      const char *tail, void *tag, dsl_dir_t **ddp)
  78  156  {
  79  157          dmu_buf_t *dbuf;
  80  158          dsl_dir_t *dd;
  81  159          int err;
  82  160  
  83  161          ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
  84  162              dsl_pool_sync_context(dp));
  85  163  
  86  164          err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
  87  165          if (err)
  88  166                  return (err);
  89  167          dd = dmu_buf_get_user(dbuf);
  90  168  #ifdef ZFS_DEBUG
  91  169          {
  92  170                  dmu_object_info_t doi;
  93  171                  dmu_object_info_from_db(dbuf, &doi);
  94  172                  ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
  95  173                  ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
  96  174          }
  97  175  #endif
  98  176          if (dd == NULL) {
  99  177                  dsl_dir_t *winner;
 100  178  
 101  179                  dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
 102  180                  dd->dd_object = ddobj;
 103  181                  dd->dd_dbuf = dbuf;
 104  182                  dd->dd_pool = dp;
 105  183                  dd->dd_phys = dbuf->db_data;
 106  184                  mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
 107  185  
 108  186                  list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
 109  187                      offsetof(dsl_prop_cb_record_t, cbr_node));
 110  188  
 111  189                  dsl_dir_snap_cmtime_update(dd);
 112  190  
 113  191                  if (dd->dd_phys->dd_parent_obj) {
 114  192                          err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
 115  193                              NULL, dd, &dd->dd_parent);
 116  194                          if (err)
 117  195                                  goto errout;
 118  196                          if (tail) {
 119  197  #ifdef ZFS_DEBUG
 120  198                                  uint64_t foundobj;
 121  199  
 122  200                                  err = zap_lookup(dp->dp_meta_objset,
 123  201                                      dd->dd_parent->dd_phys->dd_child_dir_zapobj,
 124  202                                      tail, sizeof (foundobj), 1, &foundobj);
 125  203                                  ASSERT(err || foundobj == ddobj);
 126  204  #endif
 127  205                                  (void) strcpy(dd->dd_myname, tail);
 128  206                          } else {
 129  207                                  err = zap_value_search(dp->dp_meta_objset,
 130  208                                      dd->dd_parent->dd_phys->dd_child_dir_zapobj,
 131  209                                      ddobj, 0, dd->dd_myname);
 132  210                          }
 133  211                          if (err)
 134  212                                  goto errout;
 135  213                  } else {
 136  214                          (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
 137  215                  }
 138  216  
 139  217                  if (dsl_dir_is_clone(dd)) {
 140  218                          dmu_buf_t *origin_bonus;
 141  219                          dsl_dataset_phys_t *origin_phys;
 142  220  
 143  221                          /*
 144  222                           * We can't open the origin dataset, because
 145  223                           * that would require opening this dsl_dir.
 146  224                           * Just look at its phys directly instead.
 147  225                           */
 148  226                          err = dmu_bonus_hold(dp->dp_meta_objset,
 149  227                              dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
 150  228                          if (err)
 151  229                                  goto errout;
 152  230                          origin_phys = origin_bonus->db_data;
 153  231                          dd->dd_origin_txg =
 154  232                              origin_phys->ds_creation_txg;
 155  233                          dmu_buf_rele(origin_bonus, FTAG);
 156  234                  }
 157  235  
 158  236                  winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
 159  237                      dsl_dir_evict);
 160  238                  if (winner) {
 161  239                          if (dd->dd_parent)
 162  240                                  dsl_dir_close(dd->dd_parent, dd);
 163  241                          mutex_destroy(&dd->dd_lock);
 164  242                          kmem_free(dd, sizeof (dsl_dir_t));
 165  243                          dd = winner;
 166  244                  } else {
 167  245                          spa_open_ref(dp->dp_spa, dd);
 168  246                  }
 169  247          }
 170  248  
 171  249          /*
 172  250           * The dsl_dir_t has both open-to-close and instantiate-to-evict
 173  251           * holds on the spa.  We need the open-to-close holds because
 174  252           * otherwise the spa_refcnt wouldn't change when we open a
 175  253           * dir which the spa also has open, so we could incorrectly
 176  254           * think it was OK to unload/export/destroy the pool.  We need
 177  255           * the instantiate-to-evict hold because the dsl_dir_t has a
 178  256           * pointer to the dd_pool, which has a pointer to the spa_t.
 179  257           */
 180  258          spa_open_ref(dp->dp_spa, tag);
 181  259          ASSERT3P(dd->dd_pool, ==, dp);
 182  260          ASSERT3U(dd->dd_object, ==, ddobj);
 183  261          ASSERT3P(dd->dd_dbuf, ==, dbuf);
 184  262          *ddp = dd;
 185  263          return (0);
 186  264  
 187  265  errout:
 188  266          if (dd->dd_parent)
 189  267                  dsl_dir_close(dd->dd_parent, dd);
 190  268          mutex_destroy(&dd->dd_lock);
 191  269          kmem_free(dd, sizeof (dsl_dir_t));
 192  270          dmu_buf_rele(dbuf, tag);
 193  271          return (err);
 194  272  }
 195  273  
 196  274  void
 197  275  dsl_dir_close(dsl_dir_t *dd, void *tag)
 198  276  {
 199  277          dprintf_dd(dd, "%s\n", "");
 200  278          spa_close(dd->dd_pool->dp_spa, tag);
 201  279          dmu_buf_rele(dd->dd_dbuf, tag);
 202  280  }
 203  281  
 204  282  /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
 205  283  void
 206  284  dsl_dir_name(dsl_dir_t *dd, char *buf)
 207  285  {
 208  286          if (dd->dd_parent) {
 209  287                  dsl_dir_name(dd->dd_parent, buf);
 210  288                  (void) strcat(buf, "/");
 211  289          } else {
 212  290                  buf[0] = '\0';
 213  291          }
 214  292          if (!MUTEX_HELD(&dd->dd_lock)) {
 215  293                  /*
 216  294                   * recursive mutex so that we can use
 217  295                   * dprintf_dd() with dd_lock held
 218  296                   */
 219  297                  mutex_enter(&dd->dd_lock);
 220  298                  (void) strcat(buf, dd->dd_myname);
 221  299                  mutex_exit(&dd->dd_lock);
 222  300          } else {
 223  301                  (void) strcat(buf, dd->dd_myname);
 224  302          }
 225  303  }
 226  304  
 227  305  /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
 228  306  int
 229  307  dsl_dir_namelen(dsl_dir_t *dd)
 230  308  {
 231  309          int result = 0;
 232  310  
 233  311          if (dd->dd_parent) {
 234  312                  /* parent's name + 1 for the "/" */
 235  313                  result = dsl_dir_namelen(dd->dd_parent) + 1;
 236  314          }
 237  315  
 238  316          if (!MUTEX_HELD(&dd->dd_lock)) {
 239  317                  /* see dsl_dir_name */
 240  318                  mutex_enter(&dd->dd_lock);
 241  319                  result += strlen(dd->dd_myname);
 242  320                  mutex_exit(&dd->dd_lock);
 243  321          } else {
 244  322                  result += strlen(dd->dd_myname);
 245  323          }
 246  324  
 247  325          return (result);
 248  326  }
 249  327  
 250  328  static int
 251  329  getcomponent(const char *path, char *component, const char **nextp)
 252  330  {
 253  331          char *p;
 254  332          if ((path == NULL) || (path[0] == '\0'))
 255  333                  return (ENOENT);
 256  334          /* This would be a good place to reserve some namespace... */
 257  335          p = strpbrk(path, "/@");
 258  336          if (p && (p[1] == '/' || p[1] == '@')) {
 259  337                  /* two separators in a row */
 260  338                  return (EINVAL);
 261  339          }
 262  340          if (p == NULL || p == path) {
 263  341                  /*
 264  342                   * if the first thing is an @ or /, it had better be an
 265  343                   * @ and it had better not have any more ats or slashes,
 266  344                   * and it had better have something after the @.
 267  345                   */
 268  346                  if (p != NULL &&
 269  347                      (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
 270  348                          return (EINVAL);
 271  349                  if (strlen(path) >= MAXNAMELEN)
 272  350                          return (ENAMETOOLONG);
 273  351                  (void) strcpy(component, path);
 274  352                  p = NULL;
 275  353          } else if (p[0] == '/') {
 276  354                  if (p-path >= MAXNAMELEN)
 277  355                          return (ENAMETOOLONG);
 278  356                  (void) strncpy(component, path, p - path);
 279  357                  component[p-path] = '\0';
 280  358                  p++;
 281  359          } else if (p[0] == '@') {
 282  360                  /*
 283  361                   * if the next separator is an @, there better not be
 284  362                   * any more slashes.
 285  363                   */
 286  364                  if (strchr(path, '/'))
 287  365                          return (EINVAL);
 288  366                  if (p-path >= MAXNAMELEN)
 289  367                          return (ENAMETOOLONG);
 290  368                  (void) strncpy(component, path, p - path);
 291  369                  component[p-path] = '\0';
 292  370          } else {
 293  371                  ASSERT(!"invalid p");
 294  372          }
 295  373          *nextp = p;
 296  374          return (0);
 297  375  }
 298  376  
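
getcomponent() above peels off one name component at a time, treating '/' as a filesystem separator and '@' as the start of a snapshot name, which is how dsl_dir_open_spa() below walks a full dataset name. A userland adaptation (with MAXNAMELEN and assert() standing in for the kernel definitions) shows the resulting sequence for a sample name:

#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

#define MAXNAMELEN      256

static int
getcomponent(const char *path, char *component, const char **nextp)
{
        char *p;

        if ((path == NULL) || (path[0] == '\0'))
                return (ENOENT);
        p = strpbrk(path, "/@");
        if (p && (p[1] == '/' || p[1] == '@'))
                return (EINVAL);        /* two separators in a row */
        if (p == NULL || p == path) {
                if (p != NULL &&
                    (p[0] != '@' || strpbrk(path + 1, "/@") || p[1] == '\0'))
                        return (EINVAL);
                if (strlen(path) >= MAXNAMELEN)
                        return (ENAMETOOLONG);
                (void) strcpy(component, path);
                p = NULL;
        } else if (p[0] == '/') {
                if (p - path >= MAXNAMELEN)
                        return (ENAMETOOLONG);
                (void) strncpy(component, path, p - path);
                component[p - path] = '\0';
                p++;
        } else if (p[0] == '@') {
                if (strchr(path, '/'))
                        return (EINVAL);
                if (p - path >= MAXNAMELEN)
                        return (ENAMETOOLONG);
                (void) strncpy(component, path, p - path);
                component[p - path] = '\0';
        } else {
                assert(!"invalid p");
        }
        *nextp = p;
        return (0);
}

int
main(void)
{
        const char *next = "tank/home/user@snap";
        char buf[MAXNAMELEN];

        /* Consume components the way dsl_dir_open_spa() does, stopping at '@'. */
        while (next != NULL && next[0] != '@') {
                if (getcomponent(next, buf, &next) != 0)
                        break;
                (void) printf("component: %s\n", buf);  /* tank, home, user */
        }
        if (next != NULL)
                (void) printf("tail: %s\n", next);      /* @snap */
        return (0);
}
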
 299  377  /*
 300  378   * same as dsl_open_dir, ignore the first component of name and use the
 301  379   * spa instead
 302  380   */
 303  381  int
 304  382  dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
 305  383      dsl_dir_t **ddp, const char **tailp)
 306  384  {
 307  385          char buf[MAXNAMELEN];
 308  386          const char *next, *nextnext = NULL;
 309  387          int err;
 310  388          dsl_dir_t *dd;
 311  389          dsl_pool_t *dp;
 312  390          uint64_t ddobj;
 313  391          int openedspa = FALSE;
 314  392  
 315  393          dprintf("%s\n", name);
 316  394  
 317  395          err = getcomponent(name, buf, &next);
 318  396          if (err)
 319  397                  return (err);
 320  398          if (spa == NULL) {
 321  399                  err = spa_open(buf, &spa, FTAG);
 322  400                  if (err) {
 323  401                          dprintf("spa_open(%s) failed\n", buf);
 324  402                          return (err);
 325  403                  }
 326  404                  openedspa = TRUE;
 327  405  
 328  406                  /* XXX this assertion belongs in spa_open */
 329  407                  ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
 330  408          }
 331  409  
 332  410          dp = spa_get_dsl(spa);
 333  411  
 334  412          rw_enter(&dp->dp_config_rwlock, RW_READER);
 335  413          err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
 336  414          if (err) {
 337  415                  rw_exit(&dp->dp_config_rwlock);
 338  416                  if (openedspa)
 339  417                          spa_close(spa, FTAG);
 340  418                  return (err);
 341  419          }
 342  420  
 343  421          while (next != NULL) {
 344  422                  dsl_dir_t *child_ds;
 345  423                  err = getcomponent(next, buf, &nextnext);
 346  424                  if (err)
 347  425                          break;
 348  426                  ASSERT(next[0] != '\0');
 349  427                  if (next[0] == '@')
 350  428                          break;
 351  429                  dprintf("looking up %s in obj%lld\n",
 352  430                      buf, dd->dd_phys->dd_child_dir_zapobj);
 353  431  
 354  432                  err = zap_lookup(dp->dp_meta_objset,
 355  433                      dd->dd_phys->dd_child_dir_zapobj,
 356  434                      buf, sizeof (ddobj), 1, &ddobj);
 357  435                  if (err) {
 358  436                          if (err == ENOENT)
 359  437                                  err = 0;
 360  438                          break;
 361  439                  }
 362  440  
 363  441                  err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds);
 364  442                  if (err)
 365  443                          break;
 366  444                  dsl_dir_close(dd, tag);
 367  445                  dd = child_ds;
 368  446                  next = nextnext;
 369  447          }
 370  448          rw_exit(&dp->dp_config_rwlock);
 371  449  
 372  450          if (err) {
 373  451                  dsl_dir_close(dd, tag);
 374  452                  if (openedspa)
 375  453                          spa_close(spa, FTAG);
 376  454                  return (err);
 377  455          }
 378  456  
 379  457          /*
 380  458           * It's an error if there's more than one component left, or
 381  459           * tailp==NULL and there's any component left.
 382  460           */
 383  461          if (next != NULL &&
 384  462              (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
 385  463                  /* bad path name */
 386  464                  dsl_dir_close(dd, tag);
 387  465                  dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
 388  466                  err = ENOENT;
 389  467          }
 390  468          if (tailp)
 391  469                  *tailp = next;
 392  470          if (openedspa)
 393  471                  spa_close(spa, FTAG);
 394  472          *ddp = dd;
 395  473          return (err);
 396  474  }
 397  475  
 398  476  /*
 399  477   * Return the dsl_dir_t, and possibly the last component which couldn't
  
 400  478   * be found in *tail.  Return NULL if the path is bogus, or if
 401  479   * tail==NULL and we couldn't parse the whole name.  (*tail)[0] == '@'
 402  480   * means that the last component is a snapshot.
 403  481   */
 404  482  int
 405  483  dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
 406  484  {
 407  485          return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
 408  486  }
 409  487  
      488 +/*
      489 + * Check if the counts are already valid for this filesystem and its
      490 + * descendants. The counts on this filesystem, and those below, may be
      491 + * uninitialized due to either the use of a pre-existing pool which did not
      492 + * support the filesystem/snapshot limit feature, or one in which the feature
      493 + * had not yet been enabled.
      494 + *
      495 + * Recursively descend the filesystem tree and update the filesystem/snapshot
      496 + * counts on each filesystem below, then update the cumulative count on the
      497 + * current filesystem. If the filesystem already has a limit set on it,
      498 + * then we know that its counts, and the counts on the filesystems below it,
      499 + * have been updated to be correct, so we can skip this filesystem.
      500 + */
      501 +static int
      502 +dsl_dir_set_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx, uint64_t *fscnt,
      503 +    uint64_t *sscnt)
      504 +{
      505 +        uint64_t my_fs_cnt = 0;
      506 +        uint64_t my_ss_cnt = 0;
      507 +        uint64_t curr_ss_cnt;
      508 +        objset_t *os = dd->dd_pool->dp_meta_objset;
      509 +        zap_cursor_t *zc;
      510 +        zap_attribute_t *za;
      511 +        int err;
      512 +        int ret = 0;
      513 +        boolean_t limit_set = B_FALSE;
      514 +        uint64_t fslimit, sslimit;
      515 +        dsl_dataset_t *ds;
      516 +
      517 +        ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
      518 +
      519 +        err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_FILESYSTEM_LIMIT),
      520 +            8, 1, &fslimit, NULL, B_FALSE);
      521 +        if (err == 0 && fslimit != UINT64_MAX)
      522 +                limit_set = B_TRUE;
      523 +
      524 +        if (!limit_set) {
      525 +                err = dsl_prop_get_dd(dd,
      526 +                    zfs_prop_to_name(ZFS_PROP_SNAPSHOT_LIMIT), 8, 1, &sslimit,
      527 +                    NULL, B_FALSE);
      528 +                if (err == 0 && sslimit != UINT64_MAX)
      529 +                        limit_set = B_TRUE;
      530 +        }
      531 +
      532 +        /*
      533 +         * If the dd has a limit, we know its count is already good and we
      534 +         * don't need to recurse down any further.
      535 +         */
      536 +        if (limit_set) {
      537 +                *fscnt = dd->dd_phys->dd_filesystem_count;
      538 +                *sscnt = dd->dd_phys->dd_snapshot_count;
      539 +                return (ret);
      540 +        }
      541 +
      542 +        zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
      543 +        za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
      544 +
      545 +        mutex_enter(&dd->dd_lock);
      546 +
      547 +        /* Iterate datasets */
      548 +        for (zap_cursor_init(zc, os, dd->dd_phys->dd_child_dir_zapobj);
      549 +            zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
      550 +                dsl_dir_t *chld_dd;
      551 +                uint64_t chld_fs_cnt = 0;
      552 +                uint64_t chld_ss_cnt = 0;
      553 +
      554 +                if (dsl_dir_open_obj(dd->dd_pool,
      555 +                    ZFS_DIRENT_OBJ(za->za_first_integer), NULL, FTAG,
      556 +                    &chld_dd)) {
      557 +                        ret = 1;
      558 +                        break;
      559 +                }
      560 +
      561 +                if (dsl_dir_set_fs_ss_count(chld_dd, tx, &chld_fs_cnt,
      562 +                    &chld_ss_cnt)) {
      563 +                        ret = 1;
      564 +                        break;
      565 +                }
      566 +
      567 +                dsl_dir_close(chld_dd, FTAG);
      568 +
      569 +                my_fs_cnt += chld_fs_cnt;
      570 +                my_ss_cnt += chld_ss_cnt;
      571 +        }
      572 +        zap_cursor_fini(zc);
      573 +        kmem_free(zc, sizeof (zap_cursor_t));
      574 +        kmem_free(za, sizeof (zap_attribute_t));
      575 +
      576 +        /* Count snapshots */
      577 +        if (dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
      578 +            FTAG, &ds) == 0) {
      579 +                if (zap_count(os, ds->ds_phys->ds_snapnames_zapobj,
      580 +                    &curr_ss_cnt) == 0)
      581 +                        my_ss_cnt += curr_ss_cnt;
      582 +                else
      583 +                        ret = 1;
      584 +                dsl_dataset_rele(ds, FTAG);
      585 +        } else {
      586 +                ret = 1;
      587 +        }
      588 +
      589 +        /* Add 1 for self */
      590 +        my_fs_cnt++;
      591 +
      592 +        /* save updated counts */
      593 +        dmu_buf_will_dirty(dd->dd_dbuf, tx);
      594 +        dd->dd_phys->dd_filesystem_count = my_fs_cnt;
      595 +        dd->dd_phys->dd_snapshot_count = my_ss_cnt;
      596 +
      597 +        mutex_exit(&dd->dd_lock);
      598 +
      599 +        /* Return child dataset count plus self */
      600 +        *fscnt = my_fs_cnt;
      601 +        *sscnt = my_ss_cnt;
      602 +        return (ret);
      603 +}
      604 +
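
The traversal in dsl_dir_set_fs_ss_count() can be sketched on its own: a post-order walk over the children that short-circuits any subtree which already has a limit (its counts are known to be valid), then records the children's totals plus one for the filesystem itself and its own snapshots. The toy_node_t type below is invented for illustration and leaves out the ZAP iteration, the dataset holds, the locking, and the transaction dirtying.

#include <stdint.h>
#include <stdio.h>

#define NO_LIMIT        UINT64_MAX

typedef struct toy_node {
        struct toy_node *children[8];
        int nchildren;
        uint64_t fs_limit;      /* NO_LIMIT if no limit is set here */
        uint64_t nsnaps;        /* snapshots of this filesystem itself */
        uint64_t fs_count;      /* filled in by the traversal */
        uint64_t ss_count;      /* filled in by the traversal */
} toy_node_t;

static void
toy_set_counts(toy_node_t *n, uint64_t *fscnt, uint64_t *sscnt)
{
        uint64_t fs = 0, ss = 0;
        int i;

        if (n->fs_limit != NO_LIMIT) {
                /* A limit is set, so the counts below here are already valid. */
                *fscnt = n->fs_count;
                *sscnt = n->ss_count;
                return;
        }

        /* Post-order: total up the children first. */
        for (i = 0; i < n->nchildren; i++) {
                uint64_t cfs, css;

                toy_set_counts(n->children[i], &cfs, &css);
                fs += cfs;
                ss += css;
        }

        n->fs_count = fs + 1;           /* +1 for this filesystem itself */
        n->ss_count = ss + n->nsnaps;
        *fscnt = n->fs_count;
        *sscnt = n->ss_count;
}

int
main(void)
{
        /* leaf1 already has a limit, so its counts (1 fs, 2 snaps) are valid. */
        toy_node_t leaf1 = { { NULL }, 0, 10, 2, 1, 2 };
        toy_node_t leaf2 = { { NULL }, 0, NO_LIMIT, 0, 0, 0 };
        toy_node_t top = { { &leaf1, &leaf2 }, 2, NO_LIMIT, 1, 0, 0 };
        uint64_t fs, ss;

        toy_set_counts(&top, &fs, &ss);
        (void) printf("fs=%llu ss=%llu\n", (unsigned long long)fs,
            (unsigned long long)ss);    /* fs=3 ss=3 */
        return (0);
}
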
      605 +/* ARGSUSED */
      606 +static int
      607 +fs_ss_limit_feat_check(void *arg1, void *arg2, dmu_tx_t *tx)
      608 +{
      609 +        return (0);
      610 +}
      611 +
      612 +/* ARGSUSED */
      613 +static void
      614 +fs_ss_limit_feat_sync(void *arg1, void *arg2, dmu_tx_t *tx)
      615 +{
      616 +        spa_t *spa = arg1;
      617 +        zfeature_info_t *limit_feat =
      618 +            &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
      619 +
      620 +        spa_feature_incr(spa, limit_feat, tx);
      621 +}
      622 +
      623 +/*
      624 + * Make sure the feature is enabled and activate it if necessary.
      625 + * If setting a limit, ensure the on-disk counts are valid.
      626 + *
      627 + * We do not validate the new limit, since users who can change the limit are
      628 + * also allowed to exceed the limit.
      629 + *
      630 + * Return -1 to force the zfs_set_prop_nvlist code down the default path to set
      631 + * the value in the nvlist.
      632 + */
      633 +int
      634 +dsl_dir_validate_fs_ss_limit(const char *ddname, uint64_t limit,
      635 +    zfs_prop_t ptype)
      636 +{
      637 +        dsl_dir_t *dd;
      638 +        dsl_dataset_t *ds;
      639 +        int err;
      640 +        dmu_tx_t *tx;
      641 +        uint64_t my_fs_cnt = 0;
      642 +        uint64_t my_ss_cnt = 0;
      643 +        uint64_t curr_limit;
      644 +        spa_t *spa;
      645 +        zfeature_info_t *limit_feat =
      646 +            &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
      647 +
      648 +        if ((err = dsl_dataset_hold(ddname, FTAG, &ds)) != 0)
      649 +                return (err);
      650 +
      651 +        spa = dsl_dataset_get_spa(ds);
      652 +        if (!spa_feature_is_enabled(spa,
      653 +            &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT])) {
      654 +                dsl_dataset_rele(ds, FTAG);
      655 +                return (ENOTSUP);
      656 +        }
      657 +
      658 +        dd = ds->ds_dir;
      659 +
      660 +        if ((err = dsl_prop_get_dd(dd, zfs_prop_to_name(ptype), 8, 1,
      661 +            &curr_limit, NULL, B_FALSE)) != 0) {
      662 +                dsl_dataset_rele(ds, FTAG);
      663 +                return (err);
      664 +        }
      665 +
      666 +        if (limit == UINT64_MAX) {
      667 +                /*
      668 +                 * If we had a limit, since we're now removing that limit, this
      669 +                 * is where we could decrement the feature-active counter so
      670 +                 * that the feature becomes inactive (only enabled) if we
      671 +                 * remove the last limit. However, we do not currently support
      672 +                 * deactivating the feature.
      673 +                 */
      674 +                dsl_dataset_rele(ds, FTAG);
      675 +                return (-1);
      676 +        }
      677 +
      678 +        if (!spa_feature_is_active(spa, limit_feat)) {
      679 +                /*
      680 +                 * Since the feature was not active and we're now setting a
      681 +                 * limit, increment the feature-active counter so that the
      682 +                 * feature becomes active for the first time.
      683 +                 *
      684 +                 * We can't update the MOS in open context, so create a sync
      685 +                 * task.
      686 +                 */
      687 +                err = dsl_sync_task_do(dd->dd_pool, fs_ss_limit_feat_check,
      688 +                    fs_ss_limit_feat_sync, spa, (void *)1, 0);
      689 +                if (err != 0)
      690 +                        return (err);
      691 +        }
      692 +
      693 +        tx = dmu_tx_create_dd(dd);
      694 +        if (dmu_tx_assign(tx, TXG_WAIT)) {
      695 +                dmu_tx_abort(tx);
      696 +                dsl_dataset_rele(ds, FTAG);
      697 +                return (ENOSPC);
      698 +        }
      699 +
      700 +        /*
      701 +         * Since we are now setting a non-UINT64_MAX on the filesystem, we need
      702 +         * to ensure the counts are correct. Descend down the tree from this
      703 +         * point and update all of the counts to be accurate.
      704 +         */
      705 +        err = -1;
      706 +        rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
      707 +        if (dsl_dir_set_fs_ss_count(dd, tx, &my_fs_cnt, &my_ss_cnt))
      708 +                err = ENOSPC;
      709 +        rw_exit(&dd->dd_pool->dp_config_rwlock);
      710 +
      711 +        dmu_tx_commit(tx);
      712 +        dsl_dataset_rele(ds, FTAG);
      713 +
      714 +        return (err);
      715 +}
      716 +
      717 +/*
      718 + * Used to determine if the filesystem_limit or snapshot_limit should be
      719 + * enforced. We allow the limit to be exceeded if the user has permission to
      720 + * write the property value. We pass in the creds that we got in the open
      721 + * context since we will always be the GZ root in syncing context.
      722 + *
      723 + * We can never modify these two properties within a non-global zone. In
      724 + * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
      725 + * can't use that function since we are already holding the dp_config_rwlock.
      726 + * In addition, we already have the dd and dealing with snapshots is simplified.
      727 + */
      728 +int
      729 +dsl_secpolicy_write_prop(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
      730 +{
      731 +        int err = 0;
      732 +        uint64_t obj;
      733 +        dsl_dataset_t *ds;
      734 +        uint64_t zoned;
      735 +
      736 +#ifdef _KERNEL
      737 +        if (crgetzoneid(cr) != GLOBAL_ZONEID)
      738 +                return (EPERM);
      739 +
      740 +        if (secpolicy_zfs(cr) == 0)
      741 +                return (0);
      742 +#endif
      743 +
      744 +        if ((obj = dd->dd_phys->dd_head_dataset_obj) == 0)
      745 +                return (ENOENT);
      746 +
      747 +        ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
      748 +
      749 +        if ((err = dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds)) != 0)
      750 +                return (err);
      751 +
      752 +        if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
      753 +                /* Only root can access zoned fs's from the GZ */
      754 +                err = EPERM;
      755 +        } else {
      756 +                err = dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr,
      757 +                    B_FALSE);
      758 +        }
      759 +
      760 +        dsl_dataset_rele(ds, FTAG);
      761 +        return (err);
      762 +}
      763 +
      764 +/*
      765 + * Check if adding additional child filesystem(s) would exceed any filesystem
      766 + * limits. Note that all filesystem limits up to the root (or the highest
      767 + * initialized filesystem) or the given ancestor must be satisfied.
      768 + */
      769 +int
      770 +dsl_dir_fscount_check(dsl_dir_t *dd, uint64_t cnt, dsl_dir_t *ancestor,
      771 +    cred_t *cr)
      772 +{
      773 +        uint64_t limit;
      774 +        int err = 0;
      775 +
      776 +        VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
      777 +
      778 +        /* If we're allowed to change the limit, don't enforce the limit. */
      779 +        if (dsl_secpolicy_write_prop(dd, ZFS_PROP_FILESYSTEM_LIMIT, cr) == 0)
      780 +                return (0);
      781 +
      782 +        /*
      783 +         * If an ancestor has been provided, stop checking the limit once we
      784 +         * hit that dir. We need this during rename so that we don't overcount
      785 +         * the check once we recurse up to the common ancestor.
      786 +         */
      787 +        if (ancestor == dd)
      788 +                return (0);
      789 +
      790 +        /*
      791 +         * If we hit an uninitialized node while recursing up the tree, we can
      792 +         * stop since we know the counts are not valid on this node and we
      793 +         * know we won't touch this node's counts.
      794 +         */
      795 +        if (dd->dd_phys->dd_filesystem_count == 0)
      796 +                return (0);
      797 +
      798 +        err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_FILESYSTEM_LIMIT),
      799 +            8, 1, &limit, NULL, B_FALSE);
      800 +        if (err != 0)
      801 +                return (err);
      802 +
      803 +        /* Is there a fs limit which we've hit? */
      804 +        if ((dd->dd_phys->dd_filesystem_count + cnt) > limit)
      805 +                return (EDQUOT);
      806 +
      807 +        if (dd->dd_parent != NULL)
      808 +                err = dsl_dir_fscount_check(dd->dd_parent, cnt, ancestor, cr);
      809 +
      810 +        return (err);
      811 +}
      812 +
      813 +/*
      814 + * Adjust the filesystem count for the specified dsl_dir_t and all parent
      815 + * filesystems. When a new filesystem is created, increment the count on all
      816 + * parents, and when a filesystem is destroyed, decrement the count.
      817 + */
      818 +void
      819 +dsl_dir_fscount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
      820 +    boolean_t first)
      821 +{
      822 +        if (first) {
      823 +                VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
      824 +                VERIFY(dmu_tx_is_syncing(tx));
      825 +        }
      826 +
      827 +        /*
      828 +         * When we receive an incremental stream into a filesystem that already
      829 +         * exists, a temporary clone is created.  We don't count this temporary
      830 +         * clone, whose name begins with a '%'.
      831 +         */
      832 +        if (dd->dd_myname[0] == '%')
      833 +                return;
      834 +
      835 +        /*
      836 +         * If we hit an uninitialized node while recursing up the tree, we can
      837 +         * stop since we know the counts are not valid on this node and we
      838 +         * know we shouldn't touch this node's counts. An uninitialized count
      839 +         * on the node indicates that either the feature has not yet been
      840 +         * activated or there are no limits on this part of the tree.
      841 +         */
      842 +        if (dd->dd_phys->dd_filesystem_count == 0)
      843 +                return;
      844 +
      845 +        /*
      846 +         * On initial entry we need to check if this feature is active, but
      847 +         * we don't want to re-check this on each recursive call. Note: the
      848 + * feature cannot be active if it's not enabled. If the feature is not
      849 +         * active, don't touch the on-disk count fields.
      850 +         */
      851 +        if (first) {
      852 +                zfeature_info_t *quota_feat =
      853 +                    &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
      854 +
      855 +                if (!spa_feature_is_active(dd->dd_pool->dp_spa, quota_feat))
      856 +                        return;
      857 +        }
      858 +
      859 +        dmu_buf_will_dirty(dd->dd_dbuf, tx);
      860 +
      861 +        mutex_enter(&dd->dd_lock);
      862 +
      863 +        dd->dd_phys->dd_filesystem_count += delta;
      864 +        VERIFY(dd->dd_phys->dd_filesystem_count >= 1);  /* ourself is 1 */
      865 +
      866 +        /* Roll up this additional count into our ancestors */
      867 +        if (dd->dd_parent != NULL)
      868 +                dsl_dir_fscount_adjust(dd->dd_parent, tx, delta, B_FALSE);
      869 +
      870 +        mutex_exit(&dd->dd_lock);
      871 +}
      872 +
 410  873  uint64_t
 411  874  dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
 412  875      dmu_tx_t *tx)
 413  876  {
 414  877          objset_t *mos = dp->dp_meta_objset;
 415  878          uint64_t ddobj;
 416  879          dsl_dir_phys_t *ddphys;
 417  880          dmu_buf_t *dbuf;
      881 +        zfeature_info_t *limit_feat =
      882 +            &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
 418  883  
      884 +
 419  885          ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
 420  886              DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
 421  887          if (pds) {
 422  888                  VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
 423  889                      name, sizeof (uint64_t), 1, &ddobj, tx));
 424  890          } else {
 425  891                  /* it's the root dir */
 426  892                  VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 427  893                      DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
 428  894          }
 429  895          VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
 430  896          dmu_buf_will_dirty(dbuf, tx);
 431  897          ddphys = dbuf->db_data;
 432  898  
 433  899          ddphys->dd_creation_time = gethrestime_sec();
      900 +        /* Only initialize the count if the limit feature is active */
      901 +        if (spa_feature_is_active(dp->dp_spa, limit_feat))
      902 +                ddphys->dd_filesystem_count = 1;
 434  903          if (pds)
 435  904                  ddphys->dd_parent_obj = pds->dd_object;
 436  905          ddphys->dd_props_zapobj = zap_create(mos,
 437  906              DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
 438  907          ddphys->dd_child_dir_zapobj = zap_create(mos,
 439  908              DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
 440  909          if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
 441  910                  ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
 442  911          dmu_buf_rele(dbuf, FTAG);
 443  912  
 444  913          return (ddobj);
 445  914  }
 446  915  
 447  916  /* ARGSUSED */
 448  917  int
 449  918  dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
 450  919  {
 451  920          dsl_dir_t *dd = arg1;
 452  921          dsl_pool_t *dp = dd->dd_pool;
 453  922          objset_t *mos = dp->dp_meta_objset;
 454  923          int err;
 455  924          uint64_t count;
 456  925  
 457  926          /*
 458  927           * There should be exactly two holds, both from
 459  928           * dsl_dataset_destroy: one on the dd directory, and one on its
 460  929           * head ds.  If there are more holds, then a concurrent thread is
 461  930           * performing a lookup inside this dir while we're trying to destroy
 462  931           * it.  To minimize this possibility, we perform this check only
 463  932           * in syncing context and fail the operation if we encounter
 464  933           * additional holds.  The dp_config_rwlock ensures that nobody else
 465  934           * opens it after we check.
 466  935           */
 467  936          if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 2)
 468  937                  return (EBUSY);
 469  938  
 470  939          err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count);
 471  940          if (err)
 472  941                  return (err);
 473  942          if (count != 0)
 474  943                  return (EEXIST);
 475  944  
 476  945          return (0);
 477  946  }
 478  947  
 479  948  void
 480  949  dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
  
 481  950  {
 482  951          dsl_dir_t *dd = arg1;
 483  952          objset_t *mos = dd->dd_pool->dp_meta_objset;
 484  953          uint64_t obj;
 485  954          dd_used_t t;
 486  955  
 487  956          ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
 488  957          ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
 489  958  
 490  959          /*
      960 +         * Decrement the filesystem count for all parent filesystems.
      961 +         *
      962 +         * When we receive an incremental stream into a filesystem that already
      963 +         * exists, a temporary clone is created.  We never count this temporary
      964 +         * clone, whose name begins with a '%'.
      965 +         */
      966 +        if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
      967 +                dsl_dir_fscount_adjust(dd->dd_parent, tx, -1, B_TRUE);
      968 +
      969 +        /*
 491  970           * Remove our reservation. The impl() routine avoids setting the
 492  971           * actual property, which would require the (already destroyed) ds.
 493  972           */
 494  973          dsl_dir_set_reservation_sync_impl(dd, 0, tx);
 495  974  
 496  975          ASSERT0(dd->dd_phys->dd_used_bytes);
 497  976          ASSERT0(dd->dd_phys->dd_reserved);
 498  977          for (t = 0; t < DD_USED_NUM; t++)
 499  978                  ASSERT0(dd->dd_phys->dd_used_breakdown[t]);
 500  979  
 501  980          VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
 502  981          VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
 503  982          VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
 504  983          VERIFY(0 == zap_remove(mos,
 505  984              dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
 506  985  
 507  986          obj = dd->dd_object;
 508  987          dsl_dir_close(dd, tag);
 509  988          VERIFY(0 == dmu_object_free(mos, obj, tx));
 510  989  }
 511  990  
 512  991  boolean_t
 513  992  dsl_dir_is_clone(dsl_dir_t *dd)
 514  993  {
 515  994          return (dd->dd_phys->dd_origin_obj &&
 516  995              (dd->dd_pool->dp_origin_snap == NULL ||
 517  996              dd->dd_phys->dd_origin_obj !=
 518  997              dd->dd_pool->dp_origin_snap->ds_object));
 519  998  }
 520  999  
 521 1000  void
 522 1001  dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
 523 1002  {
 524 1003          mutex_enter(&dd->dd_lock);
 525 1004          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
 526 1005              dd->dd_phys->dd_used_bytes);
 527 1006          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
 528 1007          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
 529 1008              dd->dd_phys->dd_reserved);
 530 1009          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
 531 1010              dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
 532 1011              (dd->dd_phys->dd_uncompressed_bytes * 100 /
 533 1012              dd->dd_phys->dd_compressed_bytes));
 534 1013          if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 535 1014                  dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
 536 1015                      dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
 537 1016                  dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
 538 1017                      dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
 539 1018                  dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
 540 1019                      dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
 541 1020                  dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
 542 1021                      dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
 543 1022                      dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
 544 1023          }
 545 1024          mutex_exit(&dd->dd_lock);
 546 1025  
 547 1026          rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
 548 1027          if (dsl_dir_is_clone(dd)) {
 549 1028                  dsl_dataset_t *ds;
 550 1029                  char buf[MAXNAMELEN];
 551 1030  
 552 1031                  VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
 553 1032                      dd->dd_phys->dd_origin_obj, FTAG, &ds));
 554 1033                  dsl_dataset_name(ds, buf);
 555 1034                  dsl_dataset_rele(ds, FTAG);
 556 1035                  dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
 557 1036          }
 558 1037          rw_exit(&dd->dd_pool->dp_config_rwlock);
 559 1038  }
 560 1039  
 561 1040  void
 562 1041  dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
 563 1042  {
 564 1043          dsl_pool_t *dp = dd->dd_pool;
 565 1044  
 566 1045          ASSERT(dd->dd_phys);
 567 1046  
 568 1047          if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) {
 569 1048                  /* up the hold count until we can be written out */
 570 1049                  dmu_buf_add_ref(dd->dd_dbuf, dd);
 571 1050          }
 572 1051  }
 573 1052  
 574 1053  static int64_t
 575 1054  parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
 576 1055  {
 577 1056          uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
 578 1057          uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
 579 1058          return (new_accounted - old_accounted);
 580 1059  }
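
parent_delta() reports only the change in space that the parent has to account for: usage below dd_reserved is already charged to the parent by the reservation, so only growth above MAX(used, reserved) is passed up. A small standalone check of the same arithmetic; the toy_parent_delta() name, the reservation parameter, and the numbers are made up for illustration:

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b)       ((a) > (b) ? (a) : (b))

static int64_t
toy_parent_delta(uint64_t reserved, uint64_t used, int64_t delta)
{
        uint64_t old_accounted = MAX(used, reserved);
        uint64_t new_accounted = MAX(used + delta, reserved);

        return (new_accounted - old_accounted);
}

int
main(void)
{
        /* reserved=10, used=7: writing 5 only charges the parent for 2. */
        (void) printf("%lld\n", (long long)toy_parent_delta(10, 7, 5)); /* 2 */
        /* Staying under the reservation charges the parent nothing. */
        (void) printf("%lld\n", (long long)toy_parent_delta(10, 7, 2)); /* 0 */
        return (0);
}
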
 581 1060  
 582 1061  void
 583 1062  dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
 584 1063  {
 585 1064          ASSERT(dmu_tx_is_syncing(tx));
 586 1065  
 587 1066          mutex_enter(&dd->dd_lock);
 588 1067          ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
 589 1068          dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
 590 1069              dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
 591 1070          dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
 592 1071          mutex_exit(&dd->dd_lock);
 593 1072  
 594 1073          /* release the hold from dsl_dir_dirty */
 595 1074          dmu_buf_rele(dd->dd_dbuf, dd);
 596 1075  }
 597 1076  
 598 1077  static uint64_t
 599 1078  dsl_dir_space_towrite(dsl_dir_t *dd)
 600 1079  {
 601 1080          uint64_t space = 0;
 602 1081          int i;
 603 1082  
 604 1083          ASSERT(MUTEX_HELD(&dd->dd_lock));
 605 1084  
 606 1085          for (i = 0; i < TXG_SIZE; i++) {
 607 1086                  space += dd->dd_space_towrite[i&TXG_MASK];
 608 1087                  ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
 609 1088          }
 610 1089          return (space);
 611 1090  }
 612 1091  
 613 1092  /*
 614 1093   * How much space would dd have available if ancestor had delta applied
 615 1094   * to it?  If ondiskonly is set, we're only interested in what's
 616 1095   * on-disk, not estimated pending changes.
 617 1096   */
 618 1097  uint64_t
 619 1098  dsl_dir_space_available(dsl_dir_t *dd,
 620 1099      dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
 621 1100  {
 622 1101          uint64_t parentspace, myspace, quota, used;
 623 1102  
 624 1103          /*
 625 1104           * If there are no restrictions otherwise, assume we have
 626 1105           * unlimited space available.
 627 1106           */
 628 1107          quota = UINT64_MAX;
 629 1108          parentspace = UINT64_MAX;
 630 1109  
 631 1110          if (dd->dd_parent != NULL) {
 632 1111                  parentspace = dsl_dir_space_available(dd->dd_parent,
 633 1112                      ancestor, delta, ondiskonly);
 634 1113          }
 635 1114  
 636 1115          mutex_enter(&dd->dd_lock);
 637 1116          if (dd->dd_phys->dd_quota != 0)
 638 1117                  quota = dd->dd_phys->dd_quota;
 639 1118          used = dd->dd_phys->dd_used_bytes;
 640 1119          if (!ondiskonly)
 641 1120                  used += dsl_dir_space_towrite(dd);
 642 1121  
 643 1122          if (dd->dd_parent == NULL) {
 644 1123                  uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
 645 1124                  quota = MIN(quota, poolsize);
 646 1125          }
 647 1126  
 648 1127          if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
 649 1128                  /*
 650 1129                   * We have some space reserved, in addition to what our
 651 1130                   * parent gave us.
 652 1131                   */
 653 1132                  parentspace += dd->dd_phys->dd_reserved - used;
 654 1133          }
 655 1134  
 656 1135          if (dd == ancestor) {
 657 1136                  ASSERT(delta <= 0);
 658 1137                  ASSERT(used >= -delta);
 659 1138                  used += delta;
 660 1139                  if (parentspace != UINT64_MAX)
 661 1140                          parentspace -= delta;
 662 1141          }
 663 1142  
 664 1143          if (used > quota) {
 665 1144                  /* over quota */
 666 1145                  myspace = 0;
 667 1146          } else {
 668 1147                  /*
 669 1148                   * the lesser of the space provided by our parent and
 670 1149                   * the space left in our quota
 671 1150                   */
 672 1151                  myspace = MIN(parentspace, quota - used);
 673 1152          }
 674 1153  
 675 1154          mutex_exit(&dd->dd_lock);
 676 1155  
 677 1156          return (myspace);
 678 1157  }
 679 1158  
 680 1159  struct tempreserve {
 681 1160          list_node_t tr_node;
 682 1161          dsl_pool_t *tr_dp;
 683 1162          dsl_dir_t *tr_ds;
 684 1163          uint64_t tr_size;
 685 1164  };
 686 1165  
 687 1166  static int
 688 1167  dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
 689 1168      boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
 690 1169      dmu_tx_t *tx, boolean_t first)
 691 1170  {
 692 1171          uint64_t txg = tx->tx_txg;
 693 1172          uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
 694 1173          uint64_t deferred = 0;
 695 1174          struct tempreserve *tr;
 696 1175          int retval = EDQUOT;
 697 1176          int txgidx = txg & TXG_MASK;
 698 1177          int i;
 699 1178          uint64_t ref_rsrv = 0;
 700 1179  
 701 1180          ASSERT3U(txg, !=, 0);
 702 1181          ASSERT3S(asize, >, 0);
 703 1182  
 704 1183          mutex_enter(&dd->dd_lock);
 705 1184  
 706 1185          /*
 707 1186           * Check against the dsl_dir's quota.  We don't add in the delta
 708 1187           * when checking for over-quota because they get one free hit.
 709 1188           */
 710 1189          est_inflight = dsl_dir_space_towrite(dd);
 711 1190          for (i = 0; i < TXG_SIZE; i++)
 712 1191                  est_inflight += dd->dd_tempreserved[i];
 713 1192          used_on_disk = dd->dd_phys->dd_used_bytes;
 714 1193  
 715 1194          /*
 716 1195           * On the first iteration, fetch the dataset's used-on-disk and
 717 1196           * refreservation values. Also, if checkrefquota is set, test if
 718 1197           * allocating this space would exceed the dataset's refquota.
 719 1198           */
 720 1199          if (first && tx->tx_objset) {
 721 1200                  int error;
 722 1201                  dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
 723 1202  
 724 1203                  error = dsl_dataset_check_quota(ds, checkrefquota,
 725 1204                      asize, est_inflight, &used_on_disk, &ref_rsrv);
 726 1205                  if (error) {
 727 1206                          mutex_exit(&dd->dd_lock);
 728 1207                          return (error);
 729 1208                  }
 730 1209          }
 731 1210  
 732 1211          /*
 733 1212           * If this transaction will result in a net free of space,
 734 1213           * we want to let it through.
 735 1214           */
 736 1215          if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
 737 1216                  quota = UINT64_MAX;
 738 1217          else
 739 1218                  quota = dd->dd_phys->dd_quota;
 740 1219  
 741 1220          /*
 742 1221           * Adjust the quota against the actual pool size at the root
 743 1222           * minus any outstanding deferred frees.
 744 1223           * To ensure that it's possible to remove files from a full
 745 1224           * pool without inducing transient overcommits, we throttle
 746 1225           * netfree transactions against a quota that is slightly larger,
 747 1226           * but still within the pool's allocation slop.  In cases where
 748 1227           * we're very close to full, this will allow a steady trickle of
 749 1228           * removes to get through.
 750 1229           */
 751 1230          if (dd->dd_parent == NULL) {
 752 1231                  spa_t *spa = dd->dd_pool->dp_spa;
 753 1232                  uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
 754 1233                  deferred = metaslab_class_get_deferred(spa_normal_class(spa));
 755 1234                  if (poolsize - deferred < quota) {
 756 1235                          quota = poolsize - deferred;
 757 1236                          retval = ENOSPC;
 758 1237                  }
 759 1238          }
 760 1239  
 761 1240          /*
 762 1241           * If they are requesting more space, and our current estimate
 763 1242           * is over quota, they get to try again unless the actual
 764 1243           * on-disk is over quota and there are no pending changes (which
 765 1244           * may free up space for us).
 766 1245           */
 767 1246          if (used_on_disk + est_inflight >= quota) {
 768 1247                  if (est_inflight > 0 || used_on_disk < quota ||
 769 1248                      (retval == ENOSPC && used_on_disk < quota + deferred))
 770 1249                          retval = ERESTART;
 771 1250                  dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
 772 1251                      "quota=%lluK tr=%lluK err=%d\n",
 773 1252                      used_on_disk>>10, est_inflight>>10,
 774 1253                      quota>>10, asize>>10, retval);
 775 1254                  mutex_exit(&dd->dd_lock);
 776 1255                  return (retval);
 777 1256          }
 778 1257  
 779 1258          /* We need to up our estimated delta before dropping dd_lock */
 780 1259          dd->dd_tempreserved[txgidx] += asize;
 781 1260  
 782 1261          parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
 783 1262              asize - ref_rsrv);
 784 1263          mutex_exit(&dd->dd_lock);
 785 1264  
 786 1265          tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 787 1266          tr->tr_ds = dd;
 788 1267          tr->tr_size = asize;
 789 1268          list_insert_tail(tr_list, tr);
 790 1269  
 791 1270          /* see if it's OK with our parent */
 792 1271          if (dd->dd_parent && parent_rsrv) {
 793 1272                  boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
 794 1273  
 795 1274                  return (dsl_dir_tempreserve_impl(dd->dd_parent,
 796 1275                      parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
 797 1276          } else {
 798 1277                  return (0);
 799 1278          }
 800 1279  }
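
The decision in the over-quota branch above is compact enough to restate as a tiny model. This is a hedged userland sketch assuming only standard headers; reserve_verdict() is an invented name and the ERESTART fallback value is only for platforms whose errno.h hides it.

    #include <errno.h>
    #include <stdint.h>

    #ifndef ERESTART
    #define ERESTART        85      /* fallback definition for the sketch */
    #endif

    /*
     * 0: the reservation fits.  ERESTART: retry; pending frees or other
     * in-flight changes may make room.  Otherwise the hard error the
     * caller started with (EDQUOT, or ENOSPC at the pool root).
     */
    static int
    reserve_verdict(uint64_t used_on_disk, uint64_t est_inflight,
        uint64_t quota, uint64_t deferred, int hard_err)
    {
            if (used_on_disk + est_inflight < quota)
                    return (0);
            if (est_inflight > 0 || used_on_disk < quota ||
                (hard_err == ENOSPC && used_on_disk < quota + deferred))
                    return (ERESTART);
            return (hard_err);
    }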
 801 1280  
 802 1281  /*
 803 1282   * Reserve space in this dsl_dir, to be used in this tx's txg.
 804 1283   * After the space has been dirtied (and dsl_dir_willuse_space()
 805 1284   * has been called), the reservation should be canceled, using
 806 1285   * dsl_dir_tempreserve_clear().
 807 1286   */
 808 1287  int
 809 1288  dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
 810 1289      uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
 811 1290  {
 812 1291          int err;
 813 1292          list_t *tr_list;
 814 1293  
 815 1294          if (asize == 0) {
 816 1295                  *tr_cookiep = NULL;
 817 1296                  return (0);
 818 1297          }
 819 1298  
 820 1299          tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
 821 1300          list_create(tr_list, sizeof (struct tempreserve),
 822 1301              offsetof(struct tempreserve, tr_node));
 823 1302          ASSERT3S(asize, >, 0);
 824 1303          ASSERT3S(fsize, >=, 0);
 825 1304  
 826 1305          err = arc_tempreserve_space(lsize, tx->tx_txg);
 827 1306          if (err == 0) {
 828 1307                  struct tempreserve *tr;
 829 1308  
 830 1309                  tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 831 1310                  tr->tr_size = lsize;
 832 1311                  list_insert_tail(tr_list, tr);
 833 1312  
 834 1313                  err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
 835 1314          } else {
 836 1315                  if (err == EAGAIN) {
 837 1316                          txg_delay(dd->dd_pool, tx->tx_txg,
 838 1317                              zfs_zone_txg_delay());
 839 1318                          err = ERESTART;
 840 1319                  }
 841 1320                  dsl_pool_memory_pressure(dd->dd_pool);
 842 1321          }
 843 1322  
 844 1323          if (err == 0) {
 845 1324                  struct tempreserve *tr;
 846 1325  
 847 1326                  tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 848 1327                  tr->tr_dp = dd->dd_pool;
 849 1328                  tr->tr_size = asize;
 850 1329                  list_insert_tail(tr_list, tr);
 851 1330  
 852 1331                  err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
 853 1332                      FALSE, asize > usize, tr_list, tx, TRUE);
 854 1333          }
 855 1334  
 856 1335          if (err)
 857 1336                  dsl_dir_tempreserve_clear(tr_list, tx);
 858 1337          else
 859 1338                  *tr_cookiep = tr_list;
 860 1339  
 861 1340          return (err);
 862 1341  }
 863 1342  
 864 1343  /*
 865 1344   * Clear a temporary reservation that we previously made with
 866 1345   * dsl_dir_tempreserve_space().
 867 1346   */
 868 1347  void
 869 1348  dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
 870 1349  {
 871 1350          int txgidx = tx->tx_txg & TXG_MASK;
 872 1351          list_t *tr_list = tr_cookie;
 873 1352          struct tempreserve *tr;
 874 1353  
 875 1354          ASSERT3U(tx->tx_txg, !=, 0);
 876 1355  
 877 1356          if (tr_cookie == NULL)
 878 1357                  return;
 879 1358  
 880 1359          while (tr = list_head(tr_list)) {
 881 1360                  if (tr->tr_dp) {
 882 1361                          dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
 883 1362                  } else if (tr->tr_ds) {
 884 1363                          mutex_enter(&tr->tr_ds->dd_lock);
 885 1364                          ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
 886 1365                              tr->tr_size);
 887 1366                          tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
 888 1367                          mutex_exit(&tr->tr_ds->dd_lock);
 889 1368                  } else {
 890 1369                          arc_tempreserve_clear(tr->tr_size);
 891 1370                  }
 892 1371                  list_remove(tr_list, tr);
 893 1372                  kmem_free(tr, sizeof (struct tempreserve));
 894 1373          }
 895 1374  
 896 1375          kmem_free(tr_list, sizeof (list_t));
 897 1376  }
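
dsl_dir_tempreserve_space() and dsl_dir_tempreserve_clear() follow an undo-list pattern: each successful sub-reservation (ARC, pool, then each dsl_dir walking up the tree) appends a record to tr_list, and the list is drained to release everything on failure or when the reservation is no longer needed. A minimal sketch of that pattern, with invented names and a function-pointer release hook standing in for the three record types:

    #include <stdint.h>
    #include <stdlib.h>

    /* One undo record per successful sub-reservation. */
    struct undo {
            struct undo     *u_next;
            void            (*u_release)(uint64_t);
            uint64_t        u_size;
    };

    static int
    undo_push(struct undo **head, void (*release)(uint64_t), uint64_t size)
    {
            struct undo *u = calloc(1, sizeof (*u));

            if (u == NULL)
                    return (-1);
            u->u_release = release;
            u->u_size = size;
            u->u_next = *head;
            *head = u;
            return (0);
    }

    /* On error, or once the reservation is done with, release everything. */
    static void
    undo_clear(struct undo **head)
    {
            struct undo *u;

            while ((u = *head) != NULL) {
                    *head = u->u_next;
                    u->u_release(u->u_size);
                    free(u);
            }
    }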
 898 1377  
 899 1378  static void
 900 1379  dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 901 1380  {
 902 1381          int64_t parent_space;
 903 1382          uint64_t est_used;
 904 1383  
 905 1384          mutex_enter(&dd->dd_lock);
 906 1385          if (space > 0)
 907 1386                  dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
 908 1387  
 909 1388          est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
 910 1389          parent_space = parent_delta(dd, est_used, space);
 911 1390          mutex_exit(&dd->dd_lock);
 912 1391  
 913 1392          /* Make sure that we clean up dd_space_to* */
 914 1393          dsl_dir_dirty(dd, tx);
 915 1394  
 916 1395          /* XXX this is potentially expensive and unnecessary... */
 917 1396          if (parent_space && dd->dd_parent)
 918 1397                  dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
 919 1398  }
 920 1399  
 921 1400  /*
 922 1401   * Call in open context when we think we're going to write/free space,
 923 1402   * eg. when dirtying data.  Be conservative (ie. OK to write less than
 924 1403   * this or free more than this, but don't write more or free less).
 925 1404   */
 926 1405  void
 927 1406  dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 928 1407  {
 929 1408          dsl_pool_willuse_space(dd->dd_pool, space, tx);
 930 1409          dsl_dir_willuse_space_impl(dd, space, tx);
 931 1410  }
 932 1411  
 933 1412  /* call from syncing context when we actually write/free space for this dd */
 934 1413  void
 935 1414  dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
 936 1415      int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
 937 1416  {
 938 1417          int64_t accounted_delta;
 939 1418          boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
 940 1419  
 941 1420          ASSERT(dmu_tx_is_syncing(tx));
 942 1421          ASSERT(type < DD_USED_NUM);
 943 1422  
 944 1423          if (needlock)
 945 1424                  mutex_enter(&dd->dd_lock);
 946 1425          accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
 947 1426          ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
 948 1427          ASSERT(compressed >= 0 ||
 949 1428              dd->dd_phys->dd_compressed_bytes >= -compressed);
 950 1429          ASSERT(uncompressed >= 0 ||
 951 1430              dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
 952 1431          dmu_buf_will_dirty(dd->dd_dbuf, tx);
 953 1432          dd->dd_phys->dd_used_bytes += used;
 954 1433          dd->dd_phys->dd_uncompressed_bytes += uncompressed;
 955 1434          dd->dd_phys->dd_compressed_bytes += compressed;
 956 1435  
 957 1436          if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 958 1437                  ASSERT(used > 0 ||
 959 1438                      dd->dd_phys->dd_used_breakdown[type] >= -used);
 960 1439                  dd->dd_phys->dd_used_breakdown[type] += used;
 961 1440  #ifdef DEBUG
 962 1441                  dd_used_t t;
 963 1442                  uint64_t u = 0;
 964 1443                  for (t = 0; t < DD_USED_NUM; t++)
 965 1444                          u += dd->dd_phys->dd_used_breakdown[t];
 966 1445                  ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
 967 1446  #endif
 968 1447          }
 969 1448          if (needlock)
 970 1449                  mutex_exit(&dd->dd_lock);
 971 1450  
 972 1451          if (dd->dd_parent != NULL) {
 973 1452                  dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
 974 1453                      accounted_delta, compressed, uncompressed, tx);
 975 1454                  dsl_dir_transfer_space(dd->dd_parent,
 976 1455                      used - accounted_delta,
 977 1456                      DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
 978 1457          }
 979 1458  }
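
The DEBUG block above asserts the accounting invariant that the per-type breakdown always sums to dd_used_bytes. A standalone sketch of that invariant, with invented stand-ins for the dd_used_t types:

    #include <assert.h>
    #include <stdint.h>

    enum { USED_HEAD, USED_SNAP, USED_CHILD, USED_CHILD_RSRV, USED_REFRSRV,
        USED_NUM };

    struct dir_accounting {
            uint64_t        used_bytes;
            uint64_t        used_breakdown[USED_NUM];
    };

    /* Apply a signed delta of one type, then re-check the sum invariant. */
    static void
    account_use(struct dir_accounting *da, int type, int64_t delta)
    {
            uint64_t sum = 0;

            da->used_bytes += delta;
            da->used_breakdown[type] += delta;
            for (int t = 0; t < USED_NUM; t++)
                    sum += da->used_breakdown[t];
            assert(sum == da->used_bytes);
    }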
 980 1459  
 981 1460  void
 982 1461  dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
 983 1462      dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
 984 1463  {
 985 1464          boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
 986 1465  
 987 1466          ASSERT(dmu_tx_is_syncing(tx));
 988 1467          ASSERT(oldtype < DD_USED_NUM);
 989 1468          ASSERT(newtype < DD_USED_NUM);
 990 1469  
 991 1470          if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
 992 1471                  return;
 993 1472  
 994 1473          if (needlock)
 995 1474                  mutex_enter(&dd->dd_lock);
 996 1475          ASSERT(delta > 0 ?
 997 1476              dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
 998 1477              dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
 999 1478          ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
1000 1479          dmu_buf_will_dirty(dd->dd_dbuf, tx);
1001 1480          dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
1002 1481          dd->dd_phys->dd_used_breakdown[newtype] += delta;
1003 1482          if (needlock)
1004 1483                  mutex_exit(&dd->dd_lock);
1005 1484  }
1006 1485  
1007 1486  static int
1008 1487  dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
1009 1488  {
1010 1489          dsl_dataset_t *ds = arg1;
1011 1490          dsl_dir_t *dd = ds->ds_dir;
1012 1491          dsl_prop_setarg_t *psa = arg2;
1013 1492          int err;
1014 1493          uint64_t towrite;
1015 1494  
1016 1495          if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
1017 1496                  return (err);
1018 1497  
1019 1498          if (psa->psa_effective_value == 0)
1020 1499                  return (0);
1021 1500  
1022 1501          mutex_enter(&dd->dd_lock);
1023 1502          /*
1024 1503           * If we are doing the preliminary check in open context, and
1025 1504           * there are pending changes, then don't fail it, since the
1026 1505           * pending changes could under-estimate the amount of space to be
1027 1506           * freed up.
1028 1507           */
  
1029 1508          towrite = dsl_dir_space_towrite(dd);
1030 1509          if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
1031 1510              (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
1032 1511              psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
1033 1512                  err = ENOSPC;
1034 1513          }
1035 1514          mutex_exit(&dd->dd_lock);
1036 1515          return (err);
1037 1516  }
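
Put another way: a proposed quota is rejected only when the numbers can be trusted (syncing context, or nothing pending to write) and the new value would fall below either the reservation or the current usage plus pending writes. A small sketch of that predicate, with an invented helper name:

    #include <errno.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* newquota == 0 means "no quota" and is always accepted. */
    static int
    quota_check(bool syncing, uint64_t newquota, uint64_t reserved,
        uint64_t used, uint64_t towrite)
    {
            if (newquota == 0)
                    return (0);
            if ((syncing || towrite == 0) &&
                (newquota < reserved || newquota < used + towrite))
                    return (ENOSPC);
            return (0);
    }
    /* e.g. quota_check(true, 5ULL << 30, 1ULL << 30, 6ULL << 30, 0) == ENOSPC */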
1038 1517  
1039      -extern dsl_syncfunc_t dsl_prop_set_sync;
1040      -
1041 1518  static void
1042 1519  dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1043 1520  {
1044 1521          dsl_dataset_t *ds = arg1;
1045 1522          dsl_dir_t *dd = ds->ds_dir;
1046 1523          dsl_prop_setarg_t *psa = arg2;
1047 1524          uint64_t effective_value = psa->psa_effective_value;
1048 1525  
1049 1526          dsl_prop_set_sync(ds, psa, tx);
1050 1527          DSL_PROP_CHECK_PREDICTION(dd, psa);
1051 1528  
1052 1529          dmu_buf_will_dirty(dd->dd_dbuf, tx);
1053 1530  
1054 1531          mutex_enter(&dd->dd_lock);
1055 1532          dd->dd_phys->dd_quota = effective_value;
1056 1533          mutex_exit(&dd->dd_lock);
1057 1534  }
1058 1535  
1059 1536  int
1060 1537  dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
1061 1538  {
1062 1539          dsl_dir_t *dd;
1063 1540          dsl_dataset_t *ds;
1064 1541          dsl_prop_setarg_t psa;
1065 1542          int err;
1066 1543  
 1067 1544          dsl_prop_setarg_init_uint64(&psa, "quota", source, &quota);
1068 1545  
1069 1546          err = dsl_dataset_hold(ddname, FTAG, &ds);
1070 1547          if (err)
1071 1548                  return (err);
1072 1549  
1073 1550          err = dsl_dir_open(ddname, FTAG, &dd, NULL);
1074 1551          if (err) {
1075 1552                  dsl_dataset_rele(ds, FTAG);
1076 1553                  return (err);
1077 1554          }
1078 1555  
1079 1556          ASSERT(ds->ds_dir == dd);
1080 1557  
1081 1558          /*
1082 1559           * If someone removes a file, then tries to set the quota, we want to
1083 1560           * make sure the file freeing takes effect.
1084 1561           */
1085 1562          txg_wait_open(dd->dd_pool, 0);
1086 1563  
1087 1564          err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
1088 1565              dsl_dir_set_quota_sync, ds, &psa, 0);
1089 1566  
1090 1567          dsl_dir_close(dd, FTAG);
1091 1568          dsl_dataset_rele(ds, FTAG);
1092 1569          return (err);
1093 1570  }
1094 1571  
1095 1572  int
1096 1573  dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
1097 1574  {
1098 1575          dsl_dataset_t *ds = arg1;
1099 1576          dsl_dir_t *dd = ds->ds_dir;
1100 1577          dsl_prop_setarg_t *psa = arg2;
1101 1578          uint64_t effective_value;
1102 1579          uint64_t used, avail;
1103 1580          int err;
1104 1581  
1105 1582          if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
1106 1583                  return (err);
1107 1584  
1108 1585          effective_value = psa->psa_effective_value;
1109 1586  
1110 1587          /*
1111 1588           * If we are doing the preliminary check in open context, the
1112 1589           * space estimates may be inaccurate.
1113 1590           */
1114 1591          if (!dmu_tx_is_syncing(tx))
1115 1592                  return (0);
1116 1593  
1117 1594          mutex_enter(&dd->dd_lock);
1118 1595          used = dd->dd_phys->dd_used_bytes;
1119 1596          mutex_exit(&dd->dd_lock);
1120 1597  
1121 1598          if (dd->dd_parent) {
1122 1599                  avail = dsl_dir_space_available(dd->dd_parent,
1123 1600                      NULL, 0, FALSE);
1124 1601          } else {
1125 1602                  avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
1126 1603          }
1127 1604  
1128 1605          if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) {
1129 1606                  uint64_t delta = MAX(used, effective_value) -
1130 1607                      MAX(used, dd->dd_phys->dd_reserved);
1131 1608  
1132 1609                  if (delta > avail)
1133 1610                          return (ENOSPC);
1134 1611                  if (dd->dd_phys->dd_quota > 0 &&
1135 1612                      effective_value > dd->dd_phys->dd_quota)
1136 1613                          return (ENOSPC);
1137 1614          }
1138 1615  
1139 1616          return (0);
1140 1617  }
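
The cost of a reservation change is delta = MAX(used, new) - MAX(used, old): only the part of a reservation not already backed by real usage consumes space from the parent. A worked example under that formula (names invented, not part of the diff):

    #include <stdint.h>
    #include <stdio.h>

    #define MAX(a, b)       ((a) > (b) ? (a) : (b))

    static int64_t
    rsrv_delta(uint64_t used, uint64_t old_rsrv, uint64_t new_rsrv)
    {
            return ((int64_t)(MAX(used, new_rsrv) - MAX(used, old_rsrv)));
    }

    int
    main(void)
    {
            /* 3 GiB used; raising the reservation 2 GiB -> 5 GiB costs 2 GiB */
            printf("%lld\n", (long long)rsrv_delta(3ULL << 30,
                2ULL << 30, 5ULL << 30));
            return (0);
    }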
1141 1618  
1142 1619  static void
1143 1620  dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
1144 1621  {
1145 1622          uint64_t used;
1146 1623          int64_t delta;
1147 1624  
1148 1625          dmu_buf_will_dirty(dd->dd_dbuf, tx);
1149 1626  
1150 1627          mutex_enter(&dd->dd_lock);
1151 1628          used = dd->dd_phys->dd_used_bytes;
1152 1629          delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved);
1153 1630          dd->dd_phys->dd_reserved = value;
1154 1631  
1155 1632          if (dd->dd_parent != NULL) {
1156 1633                  /* Roll up this additional usage into our ancestors */
1157 1634                  dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1158 1635                      delta, 0, 0, tx);
1159 1636          }
1160 1637          mutex_exit(&dd->dd_lock);
1161 1638  }
1162 1639  
1163 1640  
1164 1641  static void
1165 1642  dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1166 1643  {
1167 1644          dsl_dataset_t *ds = arg1;
1168 1645          dsl_dir_t *dd = ds->ds_dir;
1169 1646          dsl_prop_setarg_t *psa = arg2;
1170 1647          uint64_t value = psa->psa_effective_value;
1171 1648  
1172 1649          dsl_prop_set_sync(ds, psa, tx);
1173 1650          DSL_PROP_CHECK_PREDICTION(dd, psa);
1174 1651  
1175 1652          dsl_dir_set_reservation_sync_impl(dd, value, tx);
1176 1653  }
1177 1654  
1178 1655  int
1179 1656  dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
1180 1657      uint64_t reservation)
1181 1658  {
1182 1659          dsl_dir_t *dd;
1183 1660          dsl_dataset_t *ds;
1184 1661          dsl_prop_setarg_t psa;
1185 1662          int err;
1186 1663  
1187 1664          dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation);
1188 1665  
1189 1666          err = dsl_dataset_hold(ddname, FTAG, &ds);
1190 1667          if (err)
1191 1668                  return (err);
1192 1669  
1193 1670          err = dsl_dir_open(ddname, FTAG, &dd, NULL);
1194 1671          if (err) {
1195 1672                  dsl_dataset_rele(ds, FTAG);
1196 1673                  return (err);
1197 1674          }
1198 1675  
1199 1676          ASSERT(ds->ds_dir == dd);
1200 1677  
1201 1678          err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
1202 1679              dsl_dir_set_reservation_sync, ds, &psa, 0);
1203 1680  
1204 1681          dsl_dir_close(dd, FTAG);
1205 1682          dsl_dataset_rele(ds, FTAG);
1206 1683          return (err);
1207 1684  }
1208 1685  
1209 1686  static dsl_dir_t *
1210 1687  closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
1211 1688  {
1212 1689          for (; ds1; ds1 = ds1->dd_parent) {
1213 1690                  dsl_dir_t *dd;
1214 1691                  for (dd = ds2; dd; dd = dd->dd_parent) {
1215 1692                          if (ds1 == dd)
1216 1693                                  return (dd);
1217 1694                  }
1218 1695          }
1219 1696          return (NULL);
1220 1697  }
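
closest_common_ancestor() is the straightforward quadratic walk: for each ancestor of the first directory (starting with itself), scan the second directory's ancestor chain for a match. The same idea over a toy parent-pointer tree:

    #include <stddef.h>

    struct node {
            struct node     *parent;
    };

    static struct node *
    common_ancestor(struct node *a, struct node *b)
    {
            for (; a != NULL; a = a->parent) {
                    for (struct node *n = b; n != NULL; n = n->parent) {
                            if (n == a)
                                    return (a);
                    }
            }
            return (NULL);          /* different trees */
    }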
1221 1698  
1222 1699  /*
1223 1700   * If delta is applied to dd, how much of that delta would be applied to
1224 1701   * ancestor?  Syncing context only.
1225 1702   */
1226 1703  static int64_t
1227 1704  would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
1228 1705  {
1229 1706          if (dd == ancestor)
1230 1707                  return (delta);
  
1231 1708  
1232 1709          mutex_enter(&dd->dd_lock);
1233 1710          delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
1234 1711          mutex_exit(&dd->dd_lock);
1235 1712          return (would_change(dd->dd_parent, delta, ancestor));
1236 1713  }
1237 1714  
1238 1715  struct renamearg {
1239 1716          dsl_dir_t *newparent;
1240 1717          const char *mynewname;
     1718 +        cred_t *cr;
1241 1719  };
1242 1720  
1243 1721  static int
1244 1722  dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
1245 1723  {
1246 1724          dsl_dir_t *dd = arg1;
1247 1725          struct renamearg *ra = arg2;
1248 1726          dsl_pool_t *dp = dd->dd_pool;
1249 1727          objset_t *mos = dp->dp_meta_objset;
1250 1728          int err;
1251 1729          uint64_t val;
1252 1730  
1253 1731          /*
1254 1732           * There should only be one reference, from dmu_objset_rename().
1255 1733           * Fleeting holds are also possible (eg, from "zfs list" getting
1256 1734           * stats), but any that are present in open context will likely
1257 1735           * be gone by syncing context, so only fail from syncing
1258 1736           * context.
1259 1737           */
1260 1738          if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1)
1261 1739                  return (EBUSY);
1262 1740  
1263 1741          /* check for existing name */
1264 1742          err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
1265 1743              ra->mynewname, 8, 1, &val);
1266 1744          if (err == 0)
1267 1745                  return (EEXIST);
1268 1746          if (err != ENOENT)
1269 1747                  return (err);
1270 1748  
  
1271 1749          if (ra->newparent != dd->dd_parent) {
1272 1750                  /* is there enough space? */
1273 1751                  uint64_t myspace =
1274 1752                      MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
1275 1753  
1276 1754                  /* no rename into our descendant */
1277 1755                  if (closest_common_ancestor(dd, ra->newparent) == dd)
1278 1756                          return (EINVAL);
1279 1757  
1280 1758                  if (err = dsl_dir_transfer_possible(dd->dd_parent,
1281      -                    ra->newparent, myspace))
     1759 +                    ra->newparent, dd, myspace, ra->cr))
1282 1760                          return (err);
     1761 +
     1762 +                if (dd->dd_phys->dd_filesystem_count == 0 &&
     1763 +                    dmu_tx_is_syncing(tx)) {
     1764 +                        uint64_t fs_cnt = 0;
     1765 +                        uint64_t ss_cnt = 0;
     1766 +
     1767 +                        /*
     1768 +                         * Ensure this portion of the tree's counts have been
     1769 +                         * initialized in case the new parent has limits set.
     1770 +                         */
     1771 +                        err = dsl_dir_set_fs_ss_count(dd, tx, &fs_cnt, &ss_cnt);
     1772 +                        if (err)
     1773 +                                return (EIO);
     1774 +                }
1283 1775          }
1284 1776  
1285 1777          return (0);
1286 1778  }
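
The new check above lazily initializes this subtree's counts (only when still zero, and only in syncing context) so that any limits on the new parent can be enforced against real numbers. A hedged model of the lazy-initialization idea, assuming for the sketch that a directory's count covers itself plus its descendants; the recursive counter and its layout are invented, not taken from the diff:

    #include <stdint.h>

    struct dir {
            struct dir      *child;         /* first child */
            struct dir      *sibling;       /* next sibling */
            uint64_t        fs_count;       /* 0 == not yet initialized */
    };

    /* Count this dir plus all descendants, caching the result on the way up. */
    static uint64_t
    init_fs_count(struct dir *d)
    {
            if (d->fs_count == 0) {
                    uint64_t cnt = 1;       /* count ourselves */

                    for (struct dir *c = d->child; c != NULL; c = c->sibling)
                            cnt += init_fs_count(c);
                    d->fs_count = cnt;
            }
            return (d->fs_count);
    }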
1287 1779  
1288 1780  static void
1289 1781  dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1290 1782  {
1291 1783          dsl_dir_t *dd = arg1;
1292 1784          struct renamearg *ra = arg2;
1293 1785          dsl_pool_t *dp = dd->dd_pool;
1294 1786          objset_t *mos = dp->dp_meta_objset;
1295 1787          int err;
  
1296 1788          char namebuf[MAXNAMELEN];
1297 1789  
1298 1790          ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
1299 1791  
1300 1792          /* Log this before we change the name. */
1301 1793          dsl_dir_name(ra->newparent, namebuf);
1302 1794          spa_history_log_internal_dd(dd, "rename", tx,
1303 1795              "-> %s/%s", namebuf, ra->mynewname);
1304 1796  
1305 1797          if (ra->newparent != dd->dd_parent) {
     1798 +                int cnt;
     1799 +
     1800 +                mutex_enter(&dd->dd_lock);
     1801 +
     1802 +                cnt = dd->dd_phys->dd_filesystem_count;
     1803 +                dsl_dir_fscount_adjust(dd->dd_parent, tx, -cnt, B_TRUE);
     1804 +                dsl_dir_fscount_adjust(ra->newparent, tx, cnt, B_TRUE);
     1805 +
     1806 +                cnt = dd->dd_phys->dd_snapshot_count;
     1807 +                dsl_snapcount_adjust(dd->dd_parent, tx, -cnt, B_TRUE);
     1808 +                dsl_snapcount_adjust(ra->newparent, tx, cnt, B_TRUE);
     1809 +
     1810 +                mutex_exit(&dd->dd_lock);
     1811 +
1306 1812                  dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1307 1813                      -dd->dd_phys->dd_used_bytes,
1308 1814                      -dd->dd_phys->dd_compressed_bytes,
1309 1815                      -dd->dd_phys->dd_uncompressed_bytes, tx);
1310 1816                  dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD,
1311 1817                      dd->dd_phys->dd_used_bytes,
1312 1818                      dd->dd_phys->dd_compressed_bytes,
1313 1819                      dd->dd_phys->dd_uncompressed_bytes, tx);
1314 1820  
1315 1821                  if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
1316 1822                          uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
1317 1823                              dd->dd_phys->dd_used_bytes;
1318 1824  
1319 1825                          dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1320 1826                              -unused_rsrv, 0, 0, tx);
1321 1827                          dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV,
1322 1828                              unused_rsrv, 0, 0, tx);
1323 1829                  }
1324 1830          }
1325 1831  
1326 1832          dmu_buf_will_dirty(dd->dd_dbuf, tx);
1327 1833  
1328 1834          /* remove from old parent zapobj */
1329 1835          err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
1330 1836              dd->dd_myname, tx);
1331 1837          ASSERT0(err);
1332 1838  
1333 1839          (void) strcpy(dd->dd_myname, ra->mynewname);
1334 1840          dsl_dir_close(dd->dd_parent, dd);
1335 1841          dd->dd_phys->dd_parent_obj = ra->newparent->dd_object;
1336 1842          VERIFY(0 == dsl_dir_open_obj(dd->dd_pool,
1337 1843              ra->newparent->dd_object, NULL, dd, &dd->dd_parent));
1338 1844  
1339 1845          /* add to new parent zapobj */
1340 1846          err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
1341 1847              dd->dd_myname, 8, 1, &dd->dd_object, tx);
1342 1848          ASSERT0(err);
1343 1849  
1344 1850  }
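
When the rename moves the directory under a different parent, the new hunk above transfers its filesystem and snapshot counts from the old parent to the new one before the space accounting moves. A hedged model of that bookkeeping, assuming the adjust helpers apply a signed delta up the ancestor chain; the structure and names below are invented:

    #include <stdint.h>

    struct dir {
            struct dir      *parent;
            uint64_t        fs_count;
            uint64_t        ss_count;
    };

    /* Assumed behavior: apply a signed delta to a dir and all its ancestors. */
    static void
    count_adjust(struct dir *d, int64_t fs_delta, int64_t ss_delta)
    {
            for (; d != NULL; d = d->parent) {
                    d->fs_count += fs_delta;
                    d->ss_count += ss_delta;
            }
    }

    /* Moving 'child' from oldp to newp: debit one chain, credit the other. */
    static void
    move_counts(struct dir *child, struct dir *oldp, struct dir *newp)
    {
            count_adjust(oldp, -(int64_t)child->fs_count,
                -(int64_t)child->ss_count);
            count_adjust(newp, (int64_t)child->fs_count,
                (int64_t)child->ss_count);
    }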
1345 1851  
1346 1852  int
1347 1853  dsl_dir_rename(dsl_dir_t *dd, const char *newname)
1348 1854  {
1349 1855          struct renamearg ra;
1350 1856          int err;
1351 1857  
1352 1858          /* new parent should exist */
1353 1859          err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname);
1354 1860          if (err)
1355 1861                  return (err);
1356 1862  
1357 1863          /* can't rename to different pool */
1358 1864          if (dd->dd_pool != ra.newparent->dd_pool) {
  
1359 1865                  err = ENXIO;
1360 1866                  goto out;
1361 1867          }
1362 1868  
1363 1869          /* new name should not already exist */
1364 1870          if (ra.mynewname == NULL) {
1365 1871                  err = EEXIST;
1366 1872                  goto out;
1367 1873          }
1368 1874  
     1875 +        ra.cr = CRED();
     1876 +
1369 1877          err = dsl_sync_task_do(dd->dd_pool,
1370 1878              dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
1371 1879  
1372 1880  out:
1373 1881          dsl_dir_close(ra.newparent, FTAG);
1374 1882          return (err);
1375 1883  }
1376 1884  
1377 1885  int
1378      -dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
     1886 +dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, dsl_dir_t *moving_dd,
     1887 +    uint64_t space, cred_t *cr)
1379 1888  {
1380 1889          dsl_dir_t *ancestor;
1381 1890          int64_t adelta;
1382 1891          uint64_t avail;
     1892 +        int err;
1383 1893  
1384 1894          ancestor = closest_common_ancestor(sdd, tdd);
1385 1895          adelta = would_change(sdd, -space, ancestor);
1386 1896          avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
1387 1897          if (avail < space)
1388 1898                  return (ENOSPC);
1389 1899  
     1900 +        if (sdd != moving_dd) {
     1901 +                err = dsl_dir_fscount_check(tdd,
     1902 +                    moving_dd->dd_phys->dd_filesystem_count, ancestor, cr);
     1903 +                if (err != 0)
     1904 +                        return (err);
     1905 +        }
     1906 +        err = dsl_snapcount_check(tdd, moving_dd->dd_phys->dd_snapshot_count,
     1907 +            ancestor, cr);
     1908 +        if (err != 0)
     1909 +                return (err);
     1910 +
1390 1911          return (0);
1391 1912  }
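
With the new count checks, a transfer is allowed only if the destination side has room for the space and, when limits are in effect, for the moved directory's filesystem and snapshot counts; the space test is still evaluated relative to the closest common ancestor so that a move within one subtree is not double-counted. A hedged composition sketch with stub predicates standing in for the real checks (dsl_dir_fscount_check() and dsl_snapcount_check() are defined elsewhere in this change):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Stubs for illustration only. */
    static bool space_fits(uint64_t avail, uint64_t sz) { return (avail >= sz); }
    static bool fs_limit_ok(uint64_t cnt) { (void)cnt; return (true); }
    static bool ss_limit_ok(uint64_t cnt) { (void)cnt; return (true); }

    static int
    transfer_possible(uint64_t avail, uint64_t space, uint64_t fs_cnt,
        uint64_t ss_cnt)
    {
            if (!space_fits(avail, space))
                    return (ENOSPC);
            if (!fs_limit_ok(fs_cnt) || !ss_limit_ok(ss_cnt))
                    return (EDQUOT);        /* a count limit would be exceeded */
            return (0);
    }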
1392 1913  
1393 1914  timestruc_t
1394 1915  dsl_dir_snap_cmtime(dsl_dir_t *dd)
1395 1916  {
1396 1917          timestruc_t t;
1397 1918  
1398 1919          mutex_enter(&dd->dd_lock);
1399 1920          t = dd->dd_snap_cmtime;
1400 1921          mutex_exit(&dd->dd_lock);
1401 1922  
1402 1923          return (t);
1403 1924  }
1404 1925  
1405 1926  void
1406 1927  dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
1407 1928  {
1408 1929          timestruc_t t;
1409 1930  
1410 1931          gethrestime(&t);
1411 1932          mutex_enter(&dd->dd_lock);
1412 1933          dd->dd_snap_cmtime = t;
1413 1934          mutex_exit(&dd->dd_lock);
1414 1935  }
  