illumos Wdiff usr/src/uts/common/fs/zfs/dsl_dir.c

Print this page

OS-1566 dataset quota for ZFS datasets

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/dsl_dir.c
          +++ new/usr/src/uts/common/fs/zfs/dsl_dir.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each

↓ open down ↓

13 lines elided

↑ open up ↑

  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012 by Delphix. All rights reserved.
       24 + * Copyright (c) 2012 Joyent, Inc. All rights reserved.
  24   25   */
  25   26  
  26   27  #include <sys/dmu.h>
  27   28  #include <sys/dmu_objset.h>
  28   29  #include <sys/dmu_tx.h>
  29   30  #include <sys/dsl_dataset.h>
  30   31  #include <sys/dsl_dir.h>
  31   32  #include <sys/dsl_prop.h>
  32   33  #include <sys/dsl_synctask.h>
  33   34  #include <sys/dsl_deleg.h>
  34   35  #include <sys/spa.h>
  35   36  #include <sys/metaslab.h>
  36   37  #include <sys/zap.h>
  37   38  #include <sys/zio.h>
  38   39  #include <sys/arc.h>
  39   40  #include <sys/sunddi.h>
  40   41  #include <sys/zfs_zone.h>
       42 +#include <sys/zfeature.h>
  41   43  #include "zfs_namecheck.h"
  42   44  
       45 +/*
       46 + * Dataset and Snapshot Quotas
       47 + * ---------------------------
       48 + *
       49 + * These quotas are used to limit the number of datasets and/or snapshots
       50 + * that can be created at a given level in the tree or below. A common use-case
       51 + * is with a delegated dataset where the administrator wants to ensure that
       52 + * a user within the zone is not creating too many datasets or snapshots, even
       53 + * though they're not exceeding their space quota.
       54 + *
       55 + * The count of datasets and snapshots is stored in the dsl_dir_phys_t which
       56 + * impacts the on-disk format. As such, this capability is controlled by a
       57 + * feature flag and must be enabled to be used. Once enabled, the feature is
       58 + * not active until the first quota is set. At that point, future operations to
       59 + * create/destroy datasets or snapshots will validate and update the counts.
       60 + *
       61 + * Because the on-disk counts will be incorrect (garbage) before the feature is
       62 + * active, the counts are updated when the quota is first set. Starting at the
       63 + * dataset with the new quota, the code descends into all sub-datasets and
       64 + * updates the counts to be accurate. In practice this is lightweight since
       65 + * a quota is typically set when the dataset is created and thus has no
       66 + * children. Once set, changing the quota value won't require a traversal since
       67 + * the counts are already valid. The counts in datasets above the one with the
       68 + * new quota will still be incorrect, unless a quota is eventually set on one
       69 + * of those datasets. If a dataset with a quota is encountered during the
       70 + * descent, the counts are known to be valid and there is no need to descend
       71 + * into that dataset's children. When a new quota value is set on a dataset
       72 + * with an existing quota, the new value must not be less than the current
       73 + * count at that level or an error is returned and the quota is not changed.
       74 + *
       75 + * Once the feature is active, then whenever a dataset or snapshot is created,
       76 + * the code recurses up the tree, validating the new count against the quota
       77 + * at each level. In practice, most levels will not have a quota set. If there
       78 + * is a quota at any level up the tree, the check must pass or the creation
       79 + * will fail. Likewise, when a dataset or snapshot is destroyed, the counts
       80 + * are recursively adjusted all the way up the tree. Renaming a dataset into
       81 + * a different point in the tree will first validate, then update the counts on
       82 + * each branch up to the common ancestor. A receive will also validate the
       83 + * counts and then update them.
       84 + *
       85 + * Recursive snapshots behave a bit differently. The quota is only validated
       86 + * against the top-level dataset at which the snapshot is being taken. This
       87 + * is to prevent a denial-of-service in which a lower level dataset could
       88 + * max out its quota and thus block snapshots from being taken at a higher
       89 + * level (in addition, the complexity to address this is not worth the cost).
       90 + * Because of this, it is possible for the snapshot count to be over the quota
       91 + * and snapshots taken at a high level could cause a lower level dataset to hit
       92 + * or exceed its quota. The administrator taking the high-level recursive
       93 + * snapshot should be aware of this side-effect and behave accordingly.
       94 + *
       95 + * The dataset quota is validated by dsl_dir_dscount_check() and updated by
       96 + * dsl_dir_dscount_adjust(). The snapshot quota is validated by
       97 + * dsl_snapcount_check() and updated by dsl_snapcount_adjust().
       98 + * A new quota value is validated in dsl_dir_validate_ds_ss_quota() and the
       99 + * dataset counts are adjusted, if necessary, by dsl_dir_set_ds_ss_count().
      100 + */
      101 +
  43  102  static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
  44  103  static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd,
  45  104      uint64_t value, dmu_tx_t *tx);
  46  105  
      106 +extern dsl_syncfunc_t dsl_prop_set_sync;
      107 +extern char *tmp_dmu_recv_tag;
      108 +
  47  109  /* ARGSUSED */
  48  110  static void
  49  111  dsl_dir_evict(dmu_buf_t *db, void *arg)
  50  112  {
  51  113          dsl_dir_t *dd = arg;
  52  114          dsl_pool_t *dp = dd->dd_pool;
  53  115          int t;
  54  116  
  55  117          for (t = 0; t < TXG_SIZE; t++) {
  56  118                  ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));

  57  119                  ASSERT(dd->dd_tempreserved[t] == 0);
  58  120                  ASSERT(dd->dd_space_towrite[t] == 0);
  59  121          }
  60  122  
  61  123          if (dd->dd_parent)
  62  124                  dsl_dir_close(dd->dd_parent, dd);
  63  125  
  64  126          spa_close(dd->dd_pool->dp_spa, dd);
  65  127  
  66  128          /*
  67  129           * The props callback list should have been cleaned up by
  68  130           * objset_evict().
  69  131           */
  70  132          list_destroy(&dd->dd_prop_cbs);
  71  133          mutex_destroy(&dd->dd_lock);
  72  134          kmem_free(dd, sizeof (dsl_dir_t));
  73  135  }
  74  136  
  75  137  int
  76  138  dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
  77  139      const char *tail, void *tag, dsl_dir_t **ddp)
  78  140  {
  79  141          dmu_buf_t *dbuf;
  80  142          dsl_dir_t *dd;
  81  143          int err;
  82  144  
  83  145          ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
  84  146              dsl_pool_sync_context(dp));
  85  147  
  86  148          err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
  87  149          if (err)
  88  150                  return (err);
  89  151          dd = dmu_buf_get_user(dbuf);
  90  152  #ifdef ZFS_DEBUG
  91  153          {
  92  154                  dmu_object_info_t doi;
  93  155                  dmu_object_info_from_db(dbuf, &doi);
  94  156                  ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
  95  157                  ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
  96  158          }
  97  159  #endif
  98  160          if (dd == NULL) {
  99  161                  dsl_dir_t *winner;
 100  162  
 101  163                  dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
 102  164                  dd->dd_object = ddobj;
 103  165                  dd->dd_dbuf = dbuf;
 104  166                  dd->dd_pool = dp;
 105  167                  dd->dd_phys = dbuf->db_data;
 106  168                  mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
 107  169  
 108  170                  list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
 109  171                      offsetof(dsl_prop_cb_record_t, cbr_node));
 110  172  
 111  173                  dsl_dir_snap_cmtime_update(dd);
 112  174  
 113  175                  if (dd->dd_phys->dd_parent_obj) {
 114  176                          err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
 115  177                              NULL, dd, &dd->dd_parent);
 116  178                          if (err)
 117  179                                  goto errout;
 118  180                          if (tail) {
 119  181  #ifdef ZFS_DEBUG
 120  182                                  uint64_t foundobj;
 121  183  
 122  184                                  err = zap_lookup(dp->dp_meta_objset,
 123  185                                      dd->dd_parent->dd_phys->dd_child_dir_zapobj,
 124  186                                      tail, sizeof (foundobj), 1, &foundobj);
 125  187                                  ASSERT(err || foundobj == ddobj);
 126  188  #endif
 127  189                                  (void) strcpy(dd->dd_myname, tail);
 128  190                          } else {
 129  191                                  err = zap_value_search(dp->dp_meta_objset,
 130  192                                      dd->dd_parent->dd_phys->dd_child_dir_zapobj,
 131  193                                      ddobj, 0, dd->dd_myname);
 132  194                          }
 133  195                          if (err)
 134  196                                  goto errout;
 135  197                  } else {
 136  198                          (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
 137  199                  }
 138  200  
 139  201                  if (dsl_dir_is_clone(dd)) {
 140  202                          dmu_buf_t *origin_bonus;
 141  203                          dsl_dataset_phys_t *origin_phys;
 142  204  
 143  205                          /*
 144  206                           * We can't open the origin dataset, because
 145  207                           * that would require opening this dsl_dir.
 146  208                           * Just look at its phys directly instead.
 147  209                           */
 148  210                          err = dmu_bonus_hold(dp->dp_meta_objset,
 149  211                              dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
 150  212                          if (err)
 151  213                                  goto errout;
 152  214                          origin_phys = origin_bonus->db_data;
 153  215                          dd->dd_origin_txg =
 154  216                              origin_phys->ds_creation_txg;
 155  217                          dmu_buf_rele(origin_bonus, FTAG);
 156  218                  }
 157  219  
 158  220                  winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
 159  221                      dsl_dir_evict);
 160  222                  if (winner) {
 161  223                          if (dd->dd_parent)
 162  224                                  dsl_dir_close(dd->dd_parent, dd);
 163  225                          mutex_destroy(&dd->dd_lock);
 164  226                          kmem_free(dd, sizeof (dsl_dir_t));
 165  227                          dd = winner;
 166  228                  } else {
 167  229                          spa_open_ref(dp->dp_spa, dd);
 168  230                  }
 169  231          }
 170  232  
 171  233          /*
 172  234           * The dsl_dir_t has both open-to-close and instantiate-to-evict
 173  235           * holds on the spa.  We need the open-to-close holds because
 174  236           * otherwise the spa_refcnt wouldn't change when we open a
 175  237           * dir which the spa also has open, so we could incorrectly
 176  238           * think it was OK to unload/export/destroy the pool.  We need
 177  239           * the instantiate-to-evict hold because the dsl_dir_t has a
 178  240           * pointer to the dd_pool, which has a pointer to the spa_t.
 179  241           */
 180  242          spa_open_ref(dp->dp_spa, tag);
 181  243          ASSERT3P(dd->dd_pool, ==, dp);
 182  244          ASSERT3U(dd->dd_object, ==, ddobj);
 183  245          ASSERT3P(dd->dd_dbuf, ==, dbuf);
 184  246          *ddp = dd;
 185  247          return (0);
 186  248  
 187  249  errout:
 188  250          if (dd->dd_parent)
 189  251                  dsl_dir_close(dd->dd_parent, dd);
 190  252          mutex_destroy(&dd->dd_lock);
 191  253          kmem_free(dd, sizeof (dsl_dir_t));
 192  254          dmu_buf_rele(dbuf, tag);
 193  255          return (err);
 194  256  }
 195  257  
 196  258  void
 197  259  dsl_dir_close(dsl_dir_t *dd, void *tag)
 198  260  {
 199  261          dprintf_dd(dd, "%s\n", "");
 200  262          spa_close(dd->dd_pool->dp_spa, tag);
 201  263          dmu_buf_rele(dd->dd_dbuf, tag);
 202  264  }
 203  265  
 204  266  /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
 205  267  void
 206  268  dsl_dir_name(dsl_dir_t *dd, char *buf)
 207  269  {
 208  270          if (dd->dd_parent) {
 209  271                  dsl_dir_name(dd->dd_parent, buf);
 210  272                  (void) strcat(buf, "/");
 211  273          } else {
 212  274                  buf[0] = '\0';
 213  275          }
 214  276          if (!MUTEX_HELD(&dd->dd_lock)) {
 215  277                  /*
 216  278                   * recursive mutex so that we can use
 217  279                   * dprintf_dd() with dd_lock held
 218  280                   */
 219  281                  mutex_enter(&dd->dd_lock);
 220  282                  (void) strcat(buf, dd->dd_myname);
 221  283                  mutex_exit(&dd->dd_lock);
 222  284          } else {
 223  285                  (void) strcat(buf, dd->dd_myname);
 224  286          }
 225  287  }
 226  288  
 227  289  /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
 228  290  int
 229  291  dsl_dir_namelen(dsl_dir_t *dd)
 230  292  {
 231  293          int result = 0;
 232  294  
 233  295          if (dd->dd_parent) {
 234  296                  /* parent's name + 1 for the "/" */
 235  297                  result = dsl_dir_namelen(dd->dd_parent) + 1;
 236  298          }
 237  299  
 238  300          if (!MUTEX_HELD(&dd->dd_lock)) {
 239  301                  /* see dsl_dir_name */
 240  302                  mutex_enter(&dd->dd_lock);
 241  303                  result += strlen(dd->dd_myname);
 242  304                  mutex_exit(&dd->dd_lock);
 243  305          } else {
 244  306                  result += strlen(dd->dd_myname);
 245  307          }
 246  308  
 247  309          return (result);
 248  310  }
 249  311  
 250  312  static int
 251  313  getcomponent(const char *path, char *component, const char **nextp)
 252  314  {
 253  315          char *p;
 254  316          if ((path == NULL) || (path[0] == '\0'))
 255  317                  return (ENOENT);
 256  318          /* This would be a good place to reserve some namespace... */
 257  319          p = strpbrk(path, "/@");
 258  320          if (p && (p[1] == '/' || p[1] == '@')) {
 259  321                  /* two separators in a row */
 260  322                  return (EINVAL);
 261  323          }
 262  324          if (p == NULL || p == path) {
 263  325                  /*
 264  326                   * if the first thing is an @ or /, it had better be an
 265  327                   * @ and it had better not have any more ats or slashes,
 266  328                   * and it had better have something after the @.
 267  329                   */
 268  330                  if (p != NULL &&
 269  331                      (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
 270  332                          return (EINVAL);
 271  333                  if (strlen(path) >= MAXNAMELEN)
 272  334                          return (ENAMETOOLONG);
 273  335                  (void) strcpy(component, path);
 274  336                  p = NULL;
 275  337          } else if (p[0] == '/') {
 276  338                  if (p-path >= MAXNAMELEN)
 277  339                          return (ENAMETOOLONG);
 278  340                  (void) strncpy(component, path, p - path);
 279  341                  component[p-path] = '\0';
 280  342                  p++;
 281  343          } else if (p[0] == '@') {
 282  344                  /*
 283  345                   * if the next separator is an @, there better not be
 284  346                   * any more slashes.
 285  347                   */
 286  348                  if (strchr(path, '/'))
 287  349                          return (EINVAL);
 288  350                  if (p-path >= MAXNAMELEN)
 289  351                          return (ENAMETOOLONG);
 290  352                  (void) strncpy(component, path, p - path);
 291  353                  component[p-path] = '\0';
 292  354          } else {
 293  355                  ASSERT(!"invalid p");
 294  356          }
 295  357          *nextp = p;
 296  358          return (0);
 297  359  }
 298  360  
 299  361  /*
 300  362   * same as dsl_open_dir, ignore the first component of name and use the
 301  363   * spa instead
 302  364   */
 303  365  int
 304  366  dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
 305  367      dsl_dir_t **ddp, const char **tailp)
 306  368  {
 307  369          char buf[MAXNAMELEN];
 308  370          const char *next, *nextnext = NULL;
 309  371          int err;
 310  372          dsl_dir_t *dd;
 311  373          dsl_pool_t *dp;
 312  374          uint64_t ddobj;
 313  375          int openedspa = FALSE;
 314  376  
 315  377          dprintf("%s\n", name);
 316  378  
 317  379          err = getcomponent(name, buf, &next);
 318  380          if (err)
 319  381                  return (err);
 320  382          if (spa == NULL) {
 321  383                  err = spa_open(buf, &spa, FTAG);
 322  384                  if (err) {
 323  385                          dprintf("spa_open(%s) failed\n", buf);
 324  386                          return (err);
 325  387                  }
 326  388                  openedspa = TRUE;
 327  389  
 328  390                  /* XXX this assertion belongs in spa_open */
 329  391                  ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
 330  392          }
 331  393  
 332  394          dp = spa_get_dsl(spa);
 333  395  
 334  396          rw_enter(&dp->dp_config_rwlock, RW_READER);
 335  397          err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
 336  398          if (err) {
 337  399                  rw_exit(&dp->dp_config_rwlock);
 338  400                  if (openedspa)
 339  401                          spa_close(spa, FTAG);
 340  402                  return (err);
 341  403          }
 342  404  
 343  405          while (next != NULL) {
 344  406                  dsl_dir_t *child_ds;
 345  407                  err = getcomponent(next, buf, &nextnext);
 346  408                  if (err)
 347  409                          break;
 348  410                  ASSERT(next[0] != '\0');
 349  411                  if (next[0] == '@')
 350  412                          break;
 351  413                  dprintf("looking up %s in obj%lld\n",
 352  414                      buf, dd->dd_phys->dd_child_dir_zapobj);
 353  415  
 354  416                  err = zap_lookup(dp->dp_meta_objset,
 355  417                      dd->dd_phys->dd_child_dir_zapobj,
 356  418                      buf, sizeof (ddobj), 1, &ddobj);
 357  419                  if (err) {
 358  420                          if (err == ENOENT)
 359  421                                  err = 0;
 360  422                          break;
 361  423                  }
 362  424  
 363  425                  err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds);
 364  426                  if (err)
 365  427                          break;
 366  428                  dsl_dir_close(dd, tag);
 367  429                  dd = child_ds;
 368  430                  next = nextnext;
 369  431          }
 370  432          rw_exit(&dp->dp_config_rwlock);
 371  433  
 372  434          if (err) {
 373  435                  dsl_dir_close(dd, tag);
 374  436                  if (openedspa)
 375  437                          spa_close(spa, FTAG);
 376  438                  return (err);
 377  439          }
 378  440  
 379  441          /*
 380  442           * It's an error if there's more than one component left, or
 381  443           * tailp==NULL and there's any component left.
 382  444           */
 383  445          if (next != NULL &&
 384  446              (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
 385  447                  /* bad path name */
 386  448                  dsl_dir_close(dd, tag);
 387  449                  dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
 388  450                  err = ENOENT;
 389  451          }
 390  452          if (tailp)
 391  453                  *tailp = next;
 392  454          if (openedspa)
 393  455                  spa_close(spa, FTAG);
 394  456          *ddp = dd;
 395  457          return (err);
 396  458  }
 397  459  
 398  460  /*
 399  461   * Return the dsl_dir_t, and possibly the last component which couldn't

↓ open down ↓

343 lines elided

↑ open up ↑

 400  462   * be found in *tailp.  Return an error if the path is bogus, or if
 401  463   * tailp==NULL and we couldn't parse the whole name.  (*tailp)[0] == '@'
 402  464   * means that the last component is a snapshot.
 403  465   */
 404  466  int
 405  467  dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
 406  468  {
 407  469          return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
 408  470  }
 409  471  
      472 +/*
      473 + * Check if there is already a dataset/snapshot quota set for the dataset. If
      474 + * not, then the counts on this dataset, and those below, may be incorrect due
      475 + * to the use of a pre-existing pool which did not support the dataset/snapshot
      476 + * quota feature.
      477 + *
      478 + * Recursively descend the dataset tree and update the dataset/snapshot counts
      479 + * on each dataset below, then update the cumulative count on the current
      480 + * dataset. If the dataset already has a quota set on it, then we know that
      481 + * its counts, and the counts on the datasets below it, have been updated to
      482 + * be correct, so we can skip that dataset.
      483 + */
      484 +static void
      485 +dsl_dir_set_ds_ss_count(const char *nm, dsl_dir_t *dd, dmu_tx_t *tx,
      486 +    uint64_t *dscnt, uint64_t *sscnt)
      487 +{
      488 +        uint64_t my_ds_cnt = 0;
      489 +        uint64_t my_ss_cnt = 0;
      490 +        objset_t *os = dd->dd_pool->dp_meta_objset;
      491 +        zap_cursor_t *zc;
      492 +        zap_attribute_t *za;
      493 +        char *namebuf;
      494 +        int err;
      495 +        boolean_t quota_set = B_FALSE;
      496 +        uint64_t dsquota, ssquota;
      497 +        dsl_dataset_t *ds;
      498 +
      499 +        err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_DATASET_QUOTA),
      500 +            8, 1, &dsquota, NULL, B_FALSE);
      501 +        if (err == 0 && dsquota != 0)
      502 +                quota_set = B_TRUE;
      503 +
      504 +        if (!quota_set) {
      505 +                err = dsl_prop_get_dd(dd,
      506 +                    zfs_prop_to_name(ZFS_PROP_SNAPSHOT_QUOTA), 8, 1, &ssquota,
      507 +                    NULL, B_FALSE);
      508 +                if (err == 0 && ssquota != 0)
      509 +                        quota_set = B_TRUE;
      510 +        }
      511 +
      512 +        /*
      513 +         * If the dd has a quota, we know its count is already good and we
      514 +         * don't need to recurse down any further.
      515 +         */
      516 +        if (quota_set) {
      517 +                /* Return dataset count plus 1 for self */
      518 +                *dscnt = dd->dd_phys->dd_dataset_count + 1;
      519 +                *sscnt = dd->dd_phys->dd_snapshot_count;
      520 +
      521 +                return;
      522 +        }
      523 +
      524 +        zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
      525 +        za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
      526 +        namebuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
      527 +
      528 +        mutex_enter(&dd->dd_lock);
      529 +
      530 +        /* Iterate datasets */
      531 +        for (zap_cursor_init(zc, os, dd->dd_phys->dd_child_dir_zapobj);
      532 +            zap_cursor_retrieve(zc, za) == 0;
      533 +            zap_cursor_advance(zc)) {
      534 +                dsl_dir_t *chld_dd;
      535 +                uint64_t chld_ds_cnt = 0;
      536 +                uint64_t chld_ss_cnt = 0;
      537 +
      538 +                (void) snprintf(namebuf, MAXPATHLEN, "%s/%s", nm, za->za_name);
      539 +
      540 +                if (dsl_dir_open(namebuf, FTAG, &chld_dd, NULL))
      541 +                        continue;
      542 +
      543 +                dsl_dir_set_ds_ss_count(namebuf, chld_dd, tx, &chld_ds_cnt,
      544 +                    &chld_ss_cnt);
      545 +
      546 +                dsl_dir_close(chld_dd, FTAG);
      547 +
      548 +                my_ds_cnt += chld_ds_cnt;
      549 +                my_ss_cnt += chld_ss_cnt;
      550 +        }
      551 +        zap_cursor_fini(zc);
      552 +
      553 +        kmem_free(namebuf, MAXPATHLEN);
      554 +
      555 +        /* Iterate snapshots */
      556 +        if (dsl_dataset_hold(nm, FTAG, &ds) == 0) {
      557 +                for (zap_cursor_init(zc, os, ds->ds_phys->ds_snapnames_zapobj);
      558 +                    zap_cursor_retrieve(zc, za) == 0;
      559 +                    zap_cursor_advance(zc)) {
      560 +                        my_ss_cnt++;
      561 +                }
      562 +                zap_cursor_fini(zc);
      563 +                dsl_dataset_rele(ds, FTAG);
      564 +        }
      565 +
      566 +        kmem_free(zc, sizeof (zap_cursor_t));
      567 +        kmem_free(za, sizeof (zap_attribute_t));
      568 +
      569 +#ifdef _KERNEL
      570 +        extern void __dtrace_probe_zfs__ds__fix__count(char *, uint64_t,
      571 +            uint64_t);
      572 +        __dtrace_probe_zfs__ds__fix__count((char *)nm, my_ds_cnt, my_ss_cnt);
      573 +#endif
      574 +
      575 +        /* save updated counts */
      576 +        dmu_buf_will_dirty(dd->dd_dbuf, tx);
      577 +        dd->dd_phys->dd_dataset_count = my_ds_cnt;
      578 +        dd->dd_phys->dd_snapshot_count = my_ss_cnt;
      579 +
      580 +        mutex_exit(&dd->dd_lock);
      581 +
      582 +        /* Return child dataset count plus 1 for self */
      583 +        *dscnt = my_ds_cnt + 1;
      584 +        *sscnt = my_ss_cnt;
      585 +}
      586 +
      587 +/*
      588 + * Return ENOSPC if new quota is less than the existing count, otherwise return
      589 + * -1 to force the zfs_set_prop_nvlist code down the default path to set the
      590 + * value in the nvlist.
      591 + */
      592 +int
      593 +dsl_dir_validate_ds_ss_quota(const char *ddname, uint64_t quota,
      594 +    zfs_prop_t ptype)
      595 +{
      596 +        dsl_dir_t *dd;
      597 +        dsl_dataset_t *ds;
      598 +        int err = -1;
      599 +        uint64_t count;
      600 +        dmu_tx_t *tx;
      601 +        uint64_t my_ds_cnt = 0;
      602 +        uint64_t my_ss_cnt = 0;
      603 +        spa_t *spa;
      604 +        zfeature_info_t *quota_feat =
      605 +            &spa_feature_table[SPA_FEATURE_DS_SS_QUOTA];
      606 +
      607 +        if (dsl_dataset_hold(ddname, FTAG, &ds))
      608 +                return (EACCES);
      609 +
      610 +        spa = dsl_dataset_get_spa(ds);
      611 +        if (!spa_feature_is_enabled(spa,
      612 +            &spa_feature_table[SPA_FEATURE_DS_SS_QUOTA])) {
      613 +                dsl_dataset_rele(ds, FTAG);
      614 +                return (ENOTSUP);
      615 +        }
      616 +
      617 +        /* 0 means no quota */
      618 +        if (quota == 0) {
      619 +                dsl_dataset_rele(ds, FTAG);
      620 +                return (-1);
      621 +        }
      622 +
      623 +        if (dsl_dir_open(ddname, FTAG, &dd, NULL)) {
      624 +                dsl_dataset_rele(ds, FTAG);
      625 +                return (EACCES);
      626 +        }
      627 +
      628 +        ASSERT(ds->ds_dir == dd);
      629 +
      630 +        tx = dmu_tx_create_dd(dd);
      631 +        if (dmu_tx_assign(tx, TXG_WAIT)) {
      632 +                dmu_tx_abort(tx);
      633 +                return (ENOSPC);
      634 +        }
      635 +
      636 +        /* set the feature active flag now */
      637 +        if (!spa_feature_is_active(spa, quota_feat))
      638 +                spa_feature_incr(spa, quota_feat, tx);
      639 +
      640 +        /*
      641 +         * Since we are now setting a non-0 quota on the dataset, we need to
      642 +         * ensure the counts are correct. Descend down the tree from this
      643 +         * point and update all of the counts to be accurate.
      644 +         */
      645 +        rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
      646 +        dsl_dir_set_ds_ss_count(ddname, dd, tx, &my_ds_cnt, &my_ss_cnt);
      647 +        rw_exit(&dd->dd_pool->dp_config_rwlock);
      648 +
      649 +        dmu_tx_commit(tx);
      650 +
      651 +        if (ptype == ZFS_PROP_DATASET_QUOTA)
      652 +                count = dd->dd_phys->dd_dataset_count;
      653 +        else
      654 +                count = dd->dd_phys->dd_snapshot_count;
      655 +
      656 +        if (quota < count)
      657 +                err = ENOSPC;
      658 +
      659 +        dsl_dir_close(dd, FTAG);
      660 +        dsl_dataset_rele(ds, FTAG);
      661 +
      662 +        return (err);
      663 +}
      664 +
      665 +/*
      666 + * Check if adding additional child dataset(s) would exceed any dataset
      667 + * quotas.  Note that all dataset quotas up to the root dataset (i.e. the pool
      668 + * itself) or the given ancestor must be satisfied. When receiving we don't
      669 + * check if the tx is syncing. In this case, the tx is passed as NULL.
      670 + */
      671 +int
      672 +dsl_dir_dscount_check(dsl_dir_t *dd, dmu_tx_t *tx, uint64_t cnt,
      673 +    dsl_dir_t *ancestor)
      674 +{
      675 +        uint64_t quota;
      676 +        int err = 0;
      677 +
      678 +        VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
      679 +
      680 +        /*
      681 +         * As with dsl_dataset_set_reservation_check(), don't run this check in
      682 +         * open context.
      683 +         */
      684 +        if (tx != NULL && !dmu_tx_is_syncing(tx))
      685 +                return (0);
      686 +
      687 +        /*
      688 +         * If an ancestor has been provided, stop checking the quota once we
      689 +         * hit that dir. We need this during rename so that we don't overcount
      690 +         * the check once we recurse up to the common ancestor.
      691 +         */
      692 +        if (ancestor == dd)
      693 +                return (0);
      694 +
      695 +        /*
      696 +         * If there's no value for this property, there's no need to enforce a
      697 +         * dataset quota.
      698 +         */
      699 +        err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_DATASET_QUOTA),
      700 +            8, 1, &quota, NULL, B_FALSE);
      701 +        if (err == ENOENT)
      702 +                return (0);
      703 +        else if (err != 0)
      704 +                return (err);
      705 +
      706 +#ifdef _KERNEL
      707 +        extern void __dtrace_probe_zfs__ds__quota(uint64_t, uint64_t, char *);
      708 +        __dtrace_probe_zfs__ds__quota((uint64_t)dd->dd_phys->dd_dataset_count,
      709 +            (uint64_t)quota, dd->dd_myname);
      710 +#endif
      711 +
      712 +        if (quota > 0 && (dd->dd_phys->dd_dataset_count + cnt) > quota)
      713 +                return (EDQUOT);
      714 +
      715 +        if (dd->dd_parent != NULL)
      716 +                err = dsl_dir_dscount_check(dd->dd_parent, tx, cnt, ancestor);
      717 +
      718 +        return (err);
      719 +}
      720 +
      721 +/*
      722 + * Adjust the dataset count for the specified dsl_dir_t and all parent datasets.
      723 + * When a new dataset is created, increment the count on all parents, and when a
      724 + * dataset is destroyed, decrement the count.
      725 + */
      726 +void
      727 +dsl_dir_dscount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
      728 +    boolean_t syncing, boolean_t first)
      729 +{
      730 +        /*
      731 +         * On initial entry we need to check if this feature is active, but
      732 +         * we don't want to re-check this on each recursive call. Note: the
      733 +         * feature cannot be active if its not enabled. If the feature is not
      734 +         * active, don't touch the on-disk count fields.
      735 +         */
      736 +        if (first) {
      737 +                dsl_dataset_t *ds = NULL;
      738 +                spa_t *spa;
      739 +                zfeature_info_t *quota_feat =
      740 +                    &spa_feature_table[SPA_FEATURE_DS_SS_QUOTA];
      741 +
      742 +                VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
      743 +                    dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
      744 +                spa = dsl_dataset_get_spa(ds);
      745 +                dsl_dataset_rele(ds, FTAG);
      746 +                if (!spa_feature_is_active(spa, quota_feat))
      747 +                        return;
      748 +        }
      749 +
      750 +        VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
      751 +        if (syncing)
      752 +                VERIFY(dmu_tx_is_syncing(tx));
      753 +
      754 +        dmu_buf_will_dirty(dd->dd_dbuf, tx);
      755 +
      756 +        mutex_enter(&dd->dd_lock);
      757 +
      758 +        /*
      759 +         * Counts may be incorrect if dealing with an existing pool and
      760 +         * there has never been a quota set in the dataset hierarchy.
      761 +         * This is not an error.
      762 +         */
      763 +        if (delta < 0 && dd->dd_phys->dd_dataset_count < (delta * -1)) {
      764 +#ifdef _KERNEL
      765 +                extern void __dtrace_probe_zfs__dscnt__adj__neg(char *);
      766 +                __dtrace_probe_zfs__dscnt__adj__neg(dd->dd_myname);
      767 +#endif
      768 +                mutex_exit(&dd->dd_lock);
      769 +                return;
      770 +        }
      771 +
      772 +        dd->dd_phys->dd_dataset_count += delta;
      773 +
      774 +        if (dd->dd_parent != NULL)
      775 +                dsl_dir_dscount_adjust(dd->dd_parent, tx, delta, syncing,
      776 +                    B_FALSE);
      777 +
      778 +        mutex_exit(&dd->dd_lock);
      779 +}
      780 +
 410  781  uint64_t
 411  782  dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
 412  783      dmu_tx_t *tx)
 413  784  {
 414  785          objset_t *mos = dp->dp_meta_objset;
 415  786          uint64_t ddobj;
 416  787          dsl_dir_phys_t *ddphys;
 417  788          dmu_buf_t *dbuf;
 418  789  
 419  790          ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,

 420  791              DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
 421  792          if (pds) {
 422  793                  VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
 423  794                      name, sizeof (uint64_t), 1, &ddobj, tx));
 424  795          } else {
 425  796                  /* it's the root dir */
 426  797                  VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 427  798                      DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
 428  799          }
 429  800          VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
 430  801          dmu_buf_will_dirty(dbuf, tx);
 431  802          ddphys = dbuf->db_data;
 432  803  
 433  804          ddphys->dd_creation_time = gethrestime_sec();
 434  805          if (pds)
 435  806                  ddphys->dd_parent_obj = pds->dd_object;
 436  807          ddphys->dd_props_zapobj = zap_create(mos,
 437  808              DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
 438  809          ddphys->dd_child_dir_zapobj = zap_create(mos,
 439  810              DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
 440  811          if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
 441  812                  ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
 442  813          dmu_buf_rele(dbuf, FTAG);
 443  814  
 444  815          return (ddobj);
 445  816  }
 446  817  
 447  818  /* ARGSUSED */
 448  819  int
 449  820  dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
 450  821  {
 451  822          dsl_dir_t *dd = arg1;
 452  823          dsl_pool_t *dp = dd->dd_pool;
 453  824          objset_t *mos = dp->dp_meta_objset;
 454  825          int err;
 455  826          uint64_t count;
 456  827  
 457  828          /*
 458  829           * There should be exactly two holds, both from
 459  830           * dsl_dataset_destroy: one on the dd directory, and one on its
 460  831           * head ds.  If there are more holds, then a concurrent thread is
 461  832           * performing a lookup inside this dir while we're trying to destroy
 462  833           * it.  To minimize this possibility, we perform this check only
 463  834           * in syncing context and fail the operation if we encounter
 464  835           * additional holds.  The dp_config_rwlock ensures that nobody else
 465  836           * opens it after we check.
 466  837           */
 467  838          if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 2)
 468  839                  return (EBUSY);
 469  840  
 470  841          err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count);
 471  842          if (err)
 472  843                  return (err);
 473  844          if (count != 0)
 474  845                  return (EEXIST);
 475  846  
 476  847          return (0);
 477  848  }
 478  849  
 479  850  void
 480  851  dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)

↓ open down ↓

61 lines elided

↑ open up ↑

 481  852  {
 482  853          dsl_dir_t *dd = arg1;
 483  854          objset_t *mos = dd->dd_pool->dp_meta_objset;
 484  855          uint64_t obj;
 485  856          dd_used_t t;
 486  857  
 487  858          ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
 488  859          ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
 489  860  
 490  861          /*
      862 +         * Decrement the dataset count for all parent datasets.
      863 +         *
      864 +         * We have to worry about a special case where we are receiving a
      865 +         * dataset that already exists. In this case a temporary clone name
      866 +         * of %X is created (see dmu_recv_begin). In dmu_recv_existing_end we
      867 +         * destroy this temporary clone which leads to here. We don't want to
      868 +         * decrement the dataset counters in this case, since we never
      869 +         * incremented them. To detect this case we check the tag for
      870 +         * "tmp_dmu_recv_tag" to see if we're in that code path.
      871 +         */
      872 +        if (dd->dd_parent != NULL && strcmp(tag, tmp_dmu_recv_tag) != 0)
      873 +                dsl_dir_dscount_adjust(dd->dd_parent, tx, -1, B_TRUE, B_TRUE);
      874 +
      875 +        /*
 491  876           * Remove our reservation. The impl() routine avoids setting the
 492  877           * actual property, which would require the (already destroyed) ds.
 493  878           */
 494  879          dsl_dir_set_reservation_sync_impl(dd, 0, tx);
 495  880  
 496  881          ASSERT0(dd->dd_phys->dd_used_bytes);
 497  882          ASSERT0(dd->dd_phys->dd_reserved);
 498  883          for (t = 0; t < DD_USED_NUM; t++)
 499  884                  ASSERT0(dd->dd_phys->dd_used_breakdown[t]);
 500  885

 501  886          VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
 502  887          VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
 503  888          VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
 504  889          VERIFY(0 == zap_remove(mos,
 505  890              dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
 506  891  
 507  892          obj = dd->dd_object;
 508  893          dsl_dir_close(dd, tag);
 509  894          VERIFY(0 == dmu_object_free(mos, obj, tx));
 510  895  }
 511  896  
 512  897  boolean_t
 513  898  dsl_dir_is_clone(dsl_dir_t *dd)
 514  899  {
 515  900          return (dd->dd_phys->dd_origin_obj &&
 516  901              (dd->dd_pool->dp_origin_snap == NULL ||
 517  902              dd->dd_phys->dd_origin_obj !=
 518  903              dd->dd_pool->dp_origin_snap->ds_object));
 519  904  }
 520  905  
 521  906  void
 522  907  dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
 523  908  {
 524  909          mutex_enter(&dd->dd_lock);
 525  910          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
 526  911              dd->dd_phys->dd_used_bytes);
 527  912          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
 528  913          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
 529  914              dd->dd_phys->dd_reserved);
 530  915          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
 531  916              dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
 532  917              (dd->dd_phys->dd_uncompressed_bytes * 100 /
 533  918              dd->dd_phys->dd_compressed_bytes));
 534  919          if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 535  920                  dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
 536  921                      dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
 537  922                  dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
 538  923                      dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
 539  924                  dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
 540  925                      dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
 541  926                  dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
 542  927                      dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
 543  928                      dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
 544  929          }
 545  930          mutex_exit(&dd->dd_lock);
 546  931  
 547  932          rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
 548  933          if (dsl_dir_is_clone(dd)) {
 549  934                  dsl_dataset_t *ds;
 550  935                  char buf[MAXNAMELEN];
 551  936  
 552  937                  VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
 553  938                      dd->dd_phys->dd_origin_obj, FTAG, &ds));
 554  939                  dsl_dataset_name(ds, buf);
 555  940                  dsl_dataset_rele(ds, FTAG);
 556  941                  dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
 557  942          }
 558  943          rw_exit(&dd->dd_pool->dp_config_rwlock);
 559  944  }
 560  945  
 561  946  void
 562  947  dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
 563  948  {
 564  949          dsl_pool_t *dp = dd->dd_pool;
 565  950  
 566  951          ASSERT(dd->dd_phys);
 567  952  
 568  953          if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) {
 569  954                  /* up the hold count until we can be written out */
 570  955                  dmu_buf_add_ref(dd->dd_dbuf, dd);
 571  956          }
 572  957  }
 573  958  
 574  959  static int64_t
 575  960  parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
 576  961  {
 577  962          uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
 578  963          uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
 579  964          return (new_accounted - old_accounted);
 580  965  }
 581  966  
 582  967  void
 583  968  dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
 584  969  {
 585  970          ASSERT(dmu_tx_is_syncing(tx));
 586  971  
 587  972          mutex_enter(&dd->dd_lock);
 588  973          ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
 589  974          dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
 590  975              dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
 591  976          dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
 592  977          mutex_exit(&dd->dd_lock);
 593  978  
 594  979          /* release the hold from dsl_dir_dirty */
 595  980          dmu_buf_rele(dd->dd_dbuf, dd);
 596  981  }
 597  982  
 598  983  static uint64_t
 599  984  dsl_dir_space_towrite(dsl_dir_t *dd)
 600  985  {
 601  986          uint64_t space = 0;
 602  987          int i;
 603  988  
 604  989          ASSERT(MUTEX_HELD(&dd->dd_lock));
 605  990  
 606  991          for (i = 0; i < TXG_SIZE; i++) {
 607  992                  space += dd->dd_space_towrite[i&TXG_MASK];
 608  993                  ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
 609  994          }
 610  995          return (space);
 611  996  }
 612  997  
 613  998  /*
 614  999   * How much space would dd have available if ancestor had delta applied
 615 1000   * to it?  If ondiskonly is set, we're only interested in what's
 616 1001   * on-disk, not estimated pending changes.
 617 1002   */
 618 1003  uint64_t
 619 1004  dsl_dir_space_available(dsl_dir_t *dd,
 620 1005      dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
 621 1006  {
 622 1007          uint64_t parentspace, myspace, quota, used;
 623 1008  
 624 1009          /*
 625 1010           * If there are no restrictions otherwise, assume we have
 626 1011           * unlimited space available.
 627 1012           */
 628 1013          quota = UINT64_MAX;
 629 1014          parentspace = UINT64_MAX;
 630 1015  
 631 1016          if (dd->dd_parent != NULL) {
 632 1017                  parentspace = dsl_dir_space_available(dd->dd_parent,
 633 1018                      ancestor, delta, ondiskonly);
 634 1019          }
 635 1020  
 636 1021          mutex_enter(&dd->dd_lock);
 637 1022          if (dd->dd_phys->dd_quota != 0)
 638 1023                  quota = dd->dd_phys->dd_quota;
 639 1024          used = dd->dd_phys->dd_used_bytes;
 640 1025          if (!ondiskonly)
 641 1026                  used += dsl_dir_space_towrite(dd);
 642 1027  
 643 1028          if (dd->dd_parent == NULL) {
 644 1029                  uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
 645 1030                  quota = MIN(quota, poolsize);
 646 1031          }
 647 1032  
 648 1033          if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
 649 1034                  /*
 650 1035                   * We have some space reserved, in addition to what our
 651 1036                   * parent gave us.
 652 1037                   */
 653 1038                  parentspace += dd->dd_phys->dd_reserved - used;
 654 1039          }
 655 1040  
 656 1041          if (dd == ancestor) {
 657 1042                  ASSERT(delta <= 0);
 658 1043                  ASSERT(used >= -delta);
 659 1044                  used += delta;
 660 1045                  if (parentspace != UINT64_MAX)
 661 1046                          parentspace -= delta;
 662 1047          }
 663 1048  
 664 1049          if (used > quota) {
 665 1050                  /* over quota */
 666 1051                  myspace = 0;
 667 1052          } else {
 668 1053                  /*
 669 1054                   * the lesser of the space provided by our parent and
 670 1055                   * the space left in our quota
 671 1056                   */
 672 1057                  myspace = MIN(parentspace, quota - used);
 673 1058          }
 674 1059  
 675 1060          mutex_exit(&dd->dd_lock);
 676 1061  
 677 1062          return (myspace);
 678 1063  }
 679 1064  
           /*
            * One entry on the list built by dsl_dir_tempreserve_space().
            * dsl_dir_tempreserve_clear() decides how to undo an entry from
            * which pointer is set: tr_dp (pool), else tr_ds (dsl_dir), else
            * neither (ARC).
            */
 680 1065  struct tempreserve {
 681 1066          list_node_t tr_node;	/* linkage on the reservation list */
 682 1067          dsl_pool_t *tr_dp;	/* set for a pool-level reservation */
 683 1068          dsl_dir_t *tr_ds;	/* set for a per-dsl_dir reservation */
 684 1069          uint64_t tr_size;	/* amount reserved by this entry */
 685 1070  };
 686 1071  
 687 1072  static int
 688 1073  dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
 689 1074      boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
 690 1075      dmu_tx_t *tx, boolean_t first)
 691 1076  {
 692 1077          uint64_t txg = tx->tx_txg;
 693 1078          uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
 694 1079          uint64_t deferred = 0;
 695 1080          struct tempreserve *tr;
 696 1081          int retval = EDQUOT;
 697 1082          int txgidx = txg & TXG_MASK;
 698 1083          int i;
 699 1084          uint64_t ref_rsrv = 0;
 700 1085  
 701 1086          ASSERT3U(txg, !=, 0);
 702 1087          ASSERT3S(asize, >, 0);
 703 1088  
 704 1089          mutex_enter(&dd->dd_lock);
 705 1090  
 706 1091          /*
 707 1092           * Check against the dsl_dir's quota.  We don't add in the delta
 708 1093           * when checking for over-quota because they get one free hit.
 709 1094           */
 710 1095          est_inflight = dsl_dir_space_towrite(dd);
 711 1096          for (i = 0; i < TXG_SIZE; i++)
 712 1097                  est_inflight += dd->dd_tempreserved[i];
 713 1098          used_on_disk = dd->dd_phys->dd_used_bytes;
 714 1099  
 715 1100          /*
 716 1101           * On the first iteration, fetch the dataset's used-on-disk and
 717 1102           * refreservation values. Also, if checkrefquota is set, test if
 718 1103           * allocating this space would exceed the dataset's refquota.
 719 1104           */
 720 1105          if (first && tx->tx_objset) {
 721 1106                  int error;
 722 1107                  dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
 723 1108  
 724 1109                  error = dsl_dataset_check_quota(ds, checkrefquota,
 725 1110                      asize, est_inflight, &used_on_disk, &ref_rsrv);
 726 1111                  if (error) {
 727 1112                          mutex_exit(&dd->dd_lock);
 728 1113                          return (error);
 729 1114                  }
 730 1115          }
 731 1116  
 732 1117          /*
 733 1118           * If this transaction will result in a net free of space,
 734 1119           * we want to let it through.
 735 1120           */
 736 1121          if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
 737 1122                  quota = UINT64_MAX;
 738 1123          else
 739 1124                  quota = dd->dd_phys->dd_quota;
 740 1125  
 741 1126          /*
 742 1127           * Adjust the quota against the actual pool size at the root
 743 1128           * minus any outstanding deferred frees.
 744 1129           * To ensure that it's possible to remove files from a full
 745 1130           * pool without inducing transient overcommits, we throttle
 746 1131           * netfree transactions against a quota that is slightly larger,
 747 1132           * but still within the pool's allocation slop.  In cases where
 748 1133           * we're very close to full, this will allow a steady trickle of
 749 1134           * removes to get through.
 750 1135           */
 751 1136          if (dd->dd_parent == NULL) {
 752 1137                  spa_t *spa = dd->dd_pool->dp_spa;
 753 1138                  uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
 754 1139                  deferred = metaslab_class_get_deferred(spa_normal_class(spa));
 755 1140                  if (poolsize - deferred < quota) {
 756 1141                          quota = poolsize - deferred;
 757 1142                          retval = ENOSPC;
 758 1143                  }
 759 1144          }
 760 1145  
 761 1146          /*
 762 1147           * If they are requesting more space, and our current estimate
 763 1148           * is over quota, they get to try again unless the actual
 764 1149           * on-disk is over quota and there are no pending changes (which
 765 1150           * may free up space for us).
 766 1151           */
 767 1152          if (used_on_disk + est_inflight >= quota) {
 768 1153                  if (est_inflight > 0 || used_on_disk < quota ||
 769 1154                      (retval == ENOSPC && used_on_disk < quota + deferred))
 770 1155                          retval = ERESTART;
 771 1156                  dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
 772 1157                      "quota=%lluK tr=%lluK err=%d\n",
 773 1158                      used_on_disk>>10, est_inflight>>10,
 774 1159                      quota>>10, asize>>10, retval);
 775 1160                  mutex_exit(&dd->dd_lock);
 776 1161                  return (retval);
 777 1162          }
 778 1163  
 779 1164          /* We need to up our estimated delta before dropping dd_lock */
 780 1165          dd->dd_tempreserved[txgidx] += asize;
 781 1166  
 782 1167          parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
 783 1168              asize - ref_rsrv);
 784 1169          mutex_exit(&dd->dd_lock);
 785 1170  
 786 1171          tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 787 1172          tr->tr_ds = dd;
 788 1173          tr->tr_size = asize;
 789 1174          list_insert_tail(tr_list, tr);
 790 1175  
 791 1176          /* see if it's OK with our parent */
 792 1177          if (dd->dd_parent && parent_rsrv) {
 793 1178                  boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
 794 1179  
 795 1180                  return (dsl_dir_tempreserve_impl(dd->dd_parent,
 796 1181                      parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
 797 1182          } else {
 798 1183                  return (0);
 799 1184          }
 800 1185  }
 801 1186  
 802 1187  /*
 803 1188   * Reserve space in this dsl_dir, to be used in this tx's txg.
 804 1189   * After the space has been dirtied (and dsl_dir_willuse_space()
 805 1190   * has been called), the reservation should be canceled, using
 806 1191   * dsl_dir_tempreserve_clear().
 807 1192   */
 808 1193  int
 809 1194  dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
 810 1195      uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
 811 1196  {
 812 1197          int err;
 813 1198          list_t *tr_list;
 814 1199  
 815 1200          if (asize == 0) {
 816 1201                  *tr_cookiep = NULL;
 817 1202                  return (0);
 818 1203          }
 819 1204  
 820 1205          tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
 821 1206          list_create(tr_list, sizeof (struct tempreserve),
 822 1207              offsetof(struct tempreserve, tr_node));
 823 1208          ASSERT3S(asize, >, 0);
 824 1209          ASSERT3S(fsize, >=, 0);
 825 1210  
 826 1211          err = arc_tempreserve_space(lsize, tx->tx_txg);
 827 1212          if (err == 0) {
 828 1213                  struct tempreserve *tr;
 829 1214  
 830 1215                  tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 831 1216                  tr->tr_size = lsize;
 832 1217                  list_insert_tail(tr_list, tr);
 833 1218  
 834 1219                  err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
 835 1220          } else {
 836 1221                  if (err == EAGAIN) {
 837 1222                          txg_delay(dd->dd_pool, tx->tx_txg,
 838 1223                              zfs_zone_txg_delay());
 839 1224                          err = ERESTART;
 840 1225                  }
 841 1226                  dsl_pool_memory_pressure(dd->dd_pool);
 842 1227          }
 843 1228  
 844 1229          if (err == 0) {
 845 1230                  struct tempreserve *tr;
 846 1231  
 847 1232                  tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 848 1233                  tr->tr_dp = dd->dd_pool;
 849 1234                  tr->tr_size = asize;
 850 1235                  list_insert_tail(tr_list, tr);
 851 1236  
 852 1237                  err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
 853 1238                      FALSE, asize > usize, tr_list, tx, TRUE);
 854 1239          }
 855 1240  
 856 1241          if (err)
 857 1242                  dsl_dir_tempreserve_clear(tr_list, tx);
 858 1243          else
 859 1244                  *tr_cookiep = tr_list;
 860 1245  
 861 1246          return (err);
 862 1247  }
 863 1248  
 864 1249  /*
 865 1250   * Clear a temporary reservation that we previously made with
 866 1251   * dsl_dir_tempreserve_space().
 867 1252   */
 868 1253  void
 869 1254  dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
 870 1255  {
 871 1256          int txgidx = tx->tx_txg & TXG_MASK;
 872 1257          list_t *tr_list = tr_cookie;
 873 1258          struct tempreserve *tr;
 874 1259  
 875 1260          ASSERT3U(tx->tx_txg, !=, 0);
 876 1261  
 877 1262          if (tr_cookie == NULL)
 878 1263                  return;
 879 1264  
 880 1265          while (tr = list_head(tr_list)) {
 881 1266                  if (tr->tr_dp) {
 882 1267                          dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
 883 1268                  } else if (tr->tr_ds) {
 884 1269                          mutex_enter(&tr->tr_ds->dd_lock);
 885 1270                          ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
 886 1271                              tr->tr_size);
 887 1272                          tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
 888 1273                          mutex_exit(&tr->tr_ds->dd_lock);
 889 1274                  } else {
 890 1275                          arc_tempreserve_clear(tr->tr_size);
 891 1276                  }
 892 1277                  list_remove(tr_list, tr);
 893 1278                  kmem_free(tr, sizeof (struct tempreserve));
 894 1279          }
 895 1280  
 896 1281          kmem_free(tr_list, sizeof (list_t));
 897 1282  }
 898 1283  
 899 1284  static void
 900 1285  dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 901 1286  {
 902 1287          int64_t parent_space;
 903 1288          uint64_t est_used;
 904 1289  
 905 1290          mutex_enter(&dd->dd_lock);
 906 1291          if (space > 0)
 907 1292                  dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
 908 1293  
 909 1294          est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
 910 1295          parent_space = parent_delta(dd, est_used, space);
 911 1296          mutex_exit(&dd->dd_lock);
 912 1297  
 913 1298          /* Make sure that we clean up dd_space_to* */
 914 1299          dsl_dir_dirty(dd, tx);
 915 1300  
 916 1301          /* XXX this is potentially expensive and unnecessary... */
 917 1302          if (parent_space && dd->dd_parent)
 918 1303                  dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
 919 1304  }
 920 1305  
 921 1306  /*
 922 1307   * Call in open context when we think we're going to write/free space,
 923 1308   * eg. when dirtying data.  Be conservative (ie. OK to write less than
 924 1309   * this or free more than this, but don't write more or free less).
 925 1310   */
 926 1311  void
 927 1312  dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 928 1313  {
 929 1314          dsl_pool_willuse_space(dd->dd_pool, space, tx);
 930 1315          dsl_dir_willuse_space_impl(dd, space, tx);
 931 1316  }
 932 1317  
 933 1318  /* call from syncing context when we actually write/free space for this dd */
 934 1319  void
 935 1320  dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
 936 1321      int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
 937 1322  {
 938 1323          int64_t accounted_delta;
 939 1324          boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
 940 1325  
 941 1326          ASSERT(dmu_tx_is_syncing(tx));
 942 1327          ASSERT(type < DD_USED_NUM);
 943 1328  
 944 1329          if (needlock)
 945 1330                  mutex_enter(&dd->dd_lock);
 946 1331          accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
 947 1332          ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
 948 1333          ASSERT(compressed >= 0 ||
 949 1334              dd->dd_phys->dd_compressed_bytes >= -compressed);
 950 1335          ASSERT(uncompressed >= 0 ||
 951 1336              dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
 952 1337          dmu_buf_will_dirty(dd->dd_dbuf, tx);
 953 1338          dd->dd_phys->dd_used_bytes += used;
 954 1339          dd->dd_phys->dd_uncompressed_bytes += uncompressed;
 955 1340          dd->dd_phys->dd_compressed_bytes += compressed;
 956 1341  
 957 1342          if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 958 1343                  ASSERT(used > 0 ||
 959 1344                      dd->dd_phys->dd_used_breakdown[type] >= -used);
 960 1345                  dd->dd_phys->dd_used_breakdown[type] += used;
 961 1346  #ifdef DEBUG
 962 1347                  dd_used_t t;
 963 1348                  uint64_t u = 0;
 964 1349                  for (t = 0; t < DD_USED_NUM; t++)
 965 1350                          u += dd->dd_phys->dd_used_breakdown[t];
 966 1351                  ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
 967 1352  #endif
 968 1353          }
 969 1354          if (needlock)
 970 1355                  mutex_exit(&dd->dd_lock);
 971 1356  
 972 1357          if (dd->dd_parent != NULL) {
 973 1358                  dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
 974 1359                      accounted_delta, compressed, uncompressed, tx);
 975 1360                  dsl_dir_transfer_space(dd->dd_parent,
 976 1361                      used - accounted_delta,
 977 1362                      DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
 978 1363          }
 979 1364  }
 980 1365  
 981 1366  void
 982 1367  dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
 983 1368      dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
 984 1369  {
 985 1370          boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
 986 1371  
 987 1372          ASSERT(dmu_tx_is_syncing(tx));
 988 1373          ASSERT(oldtype < DD_USED_NUM);
 989 1374          ASSERT(newtype < DD_USED_NUM);
 990 1375  
 991 1376          if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
 992 1377                  return;
 993 1378  
 994 1379          if (needlock)
 995 1380                  mutex_enter(&dd->dd_lock);
 996 1381          ASSERT(delta > 0 ?
 997 1382              dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
 998 1383              dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
 999 1384          ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
1000 1385          dmu_buf_will_dirty(dd->dd_dbuf, tx);
1001 1386          dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
1002 1387          dd->dd_phys->dd_used_breakdown[newtype] += delta;
1003 1388          if (needlock)
1004 1389                  mutex_exit(&dd->dd_lock);
1005 1390  }
1006 1391  
1007 1392  static int
1008 1393  dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
1009 1394  {
1010 1395          dsl_dataset_t *ds = arg1;
1011 1396          dsl_dir_t *dd = ds->ds_dir;
1012 1397          dsl_prop_setarg_t *psa = arg2;
1013 1398          int err;
1014 1399          uint64_t towrite;
1015 1400  
1016 1401          if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
1017 1402                  return (err);
1018 1403  
1019 1404          if (psa->psa_effective_value == 0)
1020 1405                  return (0);
1021 1406  
1022 1407          mutex_enter(&dd->dd_lock);
1023 1408          /*
1024 1409           * If we are doing the preliminary check in open context, and
1025 1410           * there are pending changes, then don't fail it, since the
1026 1411           * pending changes could under-estimate the amount of space to be
1027 1412           * freed up.
1028 1413           */

↓ open down ↓

528 lines elided

↑ open up ↑

1029 1414          towrite = dsl_dir_space_towrite(dd);
1030 1415          if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
1031 1416              (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
1032 1417              psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
1033 1418                  err = ENOSPC;
1034 1419          }
1035 1420          mutex_exit(&dd->dd_lock);
1036 1421          return (err);
1037 1422  }
1038 1423  
1039      -extern dsl_syncfunc_t dsl_prop_set_sync;
1040      -
1041 1424  static void
1042 1425  dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1043 1426  {
1044 1427          dsl_dataset_t *ds = arg1;
1045 1428          dsl_dir_t *dd = ds->ds_dir;
1046 1429          dsl_prop_setarg_t *psa = arg2;
1047 1430          uint64_t effective_value = psa->psa_effective_value;
1048 1431  
1049 1432          dsl_prop_set_sync(ds, psa, tx);
1050 1433          DSL_PROP_CHECK_PREDICTION(dd, psa);

1051 1434  
1052 1435          dmu_buf_will_dirty(dd->dd_dbuf, tx);
1053 1436  
1054 1437          mutex_enter(&dd->dd_lock);
1055 1438          dd->dd_phys->dd_quota = effective_value;
1056 1439          mutex_exit(&dd->dd_lock);
1057 1440  }
1058 1441  
1059 1442  int
1060 1443  dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
1061 1444  {
1062 1445          dsl_dir_t *dd;
1063 1446          dsl_dataset_t *ds;
1064 1447          dsl_prop_setarg_t psa;
1065 1448          int err;
1066 1449  
1067 1450          dsl_prop_setarg_init_uint64(&psa, "quota", source, &quota);
1068 1451  
1069 1452          err = dsl_dataset_hold(ddname, FTAG, &ds);
1070 1453          if (err)
1071 1454                  return (err);
1072 1455  
1073 1456          err = dsl_dir_open(ddname, FTAG, &dd, NULL);
1074 1457          if (err) {
1075 1458                  dsl_dataset_rele(ds, FTAG);
1076 1459                  return (err);
1077 1460          }
1078 1461  
1079 1462          ASSERT(ds->ds_dir == dd);
1080 1463  
1081 1464          /*
1082 1465           * If someone removes a file, then tries to set the quota, we want to
1083 1466           * make sure the file freeing takes effect.
1084 1467           */
1085 1468          txg_wait_open(dd->dd_pool, 0);
1086 1469  
1087 1470          err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
1088 1471              dsl_dir_set_quota_sync, ds, &psa, 0);
1089 1472  
1090 1473          dsl_dir_close(dd, FTAG);
1091 1474          dsl_dataset_rele(ds, FTAG);
1092 1475          return (err);
1093 1476  }
1094 1477  
1095 1478  int
1096 1479  dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
1097 1480  {
1098 1481          dsl_dataset_t *ds = arg1;
1099 1482          dsl_dir_t *dd = ds->ds_dir;
1100 1483          dsl_prop_setarg_t *psa = arg2;
1101 1484          uint64_t effective_value;
1102 1485          uint64_t used, avail;
1103 1486          int err;
1104 1487  
1105 1488          if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
1106 1489                  return (err);
1107 1490  
1108 1491          effective_value = psa->psa_effective_value;
1109 1492  
1110 1493          /*
1111 1494           * If we are doing the preliminary check in open context, the
1112 1495           * space estimates may be inaccurate.
1113 1496           */
1114 1497          if (!dmu_tx_is_syncing(tx))
1115 1498                  return (0);
1116 1499  
1117 1500          mutex_enter(&dd->dd_lock);
1118 1501          used = dd->dd_phys->dd_used_bytes;
1119 1502          mutex_exit(&dd->dd_lock);
1120 1503  
1121 1504          if (dd->dd_parent) {
1122 1505                  avail = dsl_dir_space_available(dd->dd_parent,
1123 1506                      NULL, 0, FALSE);
1124 1507          } else {
1125 1508                  avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
1126 1509          }
1127 1510  
1128 1511          if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) {
1129 1512                  uint64_t delta = MAX(used, effective_value) -
1130 1513                      MAX(used, dd->dd_phys->dd_reserved);
1131 1514  
1132 1515                  if (delta > avail)
1133 1516                          return (ENOSPC);
1134 1517                  if (dd->dd_phys->dd_quota > 0 &&
1135 1518                      effective_value > dd->dd_phys->dd_quota)
1136 1519                          return (ENOSPC);
1137 1520          }
1138 1521  
1139 1522          return (0);
1140 1523  }
1141 1524  
1142 1525  static void
1143 1526  dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
1144 1527  {
1145 1528          uint64_t used;
1146 1529          int64_t delta;
1147 1530  
1148 1531          dmu_buf_will_dirty(dd->dd_dbuf, tx);
1149 1532  
1150 1533          mutex_enter(&dd->dd_lock);
1151 1534          used = dd->dd_phys->dd_used_bytes;
1152 1535          delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved);
1153 1536          dd->dd_phys->dd_reserved = value;
1154 1537  
1155 1538          if (dd->dd_parent != NULL) {
1156 1539                  /* Roll up this additional usage into our ancestors */
1157 1540                  dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1158 1541                      delta, 0, 0, tx);
1159 1542          }
1160 1543          mutex_exit(&dd->dd_lock);
1161 1544  }
1162 1545  
1163 1546  
1164 1547  static void
1165 1548  dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1166 1549  {
1167 1550          dsl_dataset_t *ds = arg1;
1168 1551          dsl_dir_t *dd = ds->ds_dir;
1169 1552          dsl_prop_setarg_t *psa = arg2;
1170 1553          uint64_t value = psa->psa_effective_value;
1171 1554  
1172 1555          dsl_prop_set_sync(ds, psa, tx);
1173 1556          DSL_PROP_CHECK_PREDICTION(dd, psa);
1174 1557  
1175 1558          dsl_dir_set_reservation_sync_impl(dd, value, tx);
1176 1559  }
1177 1560  
1178 1561  int
1179 1562  dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
1180 1563      uint64_t reservation)
1181 1564  {
1182 1565          dsl_dir_t *dd;
1183 1566          dsl_dataset_t *ds;
1184 1567          dsl_prop_setarg_t psa;
1185 1568          int err;
1186 1569  
1187 1570          dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation);
1188 1571  
1189 1572          err = dsl_dataset_hold(ddname, FTAG, &ds);
1190 1573          if (err)
1191 1574                  return (err);
1192 1575  
1193 1576          err = dsl_dir_open(ddname, FTAG, &dd, NULL);
1194 1577          if (err) {
1195 1578                  dsl_dataset_rele(ds, FTAG);
1196 1579                  return (err);
1197 1580          }
1198 1581  
1199 1582          ASSERT(ds->ds_dir == dd);
1200 1583  
1201 1584          err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
1202 1585              dsl_dir_set_reservation_sync, ds, &psa, 0);
1203 1586  
1204 1587          dsl_dir_close(dd, FTAG);
1205 1588          dsl_dataset_rele(ds, FTAG);
1206 1589          return (err);
1207 1590  }
1208 1591  
1209 1592  static dsl_dir_t *
1210 1593  closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
1211 1594  {
1212 1595          for (; ds1; ds1 = ds1->dd_parent) {
1213 1596                  dsl_dir_t *dd;
1214 1597                  for (dd = ds2; dd; dd = dd->dd_parent) {
1215 1598                          if (ds1 == dd)
1216 1599                                  return (dd);
1217 1600                  }
1218 1601          }
1219 1602          return (NULL);
1220 1603  }
1221 1604  
1222 1605  /*
1223 1606   * If delta is applied to dd, how much of that delta would be applied to
1224 1607   * ancestor?  Syncing context only.
1225 1608   */
1226 1609  static int64_t
1227 1610  would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
1228 1611  {
1229 1612          if (dd == ancestor)
1230 1613                  return (delta);
1231 1614  
1232 1615          mutex_enter(&dd->dd_lock);
1233 1616          delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
1234 1617          mutex_exit(&dd->dd_lock);
1235 1618          return (would_change(dd->dd_parent, delta, ancestor));
1236 1619  }
1237 1620  
1238 1621  struct renamearg {
1239 1622          dsl_dir_t *newparent;
1240 1623          const char *mynewname;
1241 1624  };
1242 1625  
1243 1626  static int
1244 1627  dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
1245 1628  {
1246 1629          dsl_dir_t *dd = arg1;
1247 1630          struct renamearg *ra = arg2;
1248 1631          dsl_pool_t *dp = dd->dd_pool;
1249 1632          objset_t *mos = dp->dp_meta_objset;
1250 1633          int err;
1251 1634          uint64_t val;
1252 1635  
1253 1636          /*
1254 1637           * There should only be one reference, from dmu_objset_rename().
1255 1638           * Fleeting holds are also possible (eg, from "zfs list" getting
1256 1639           * stats), but any that are present in open context will likely
1257 1640           * be gone by syncing context, so only fail from syncing
1258 1641           * context.
1259 1642           */
1260 1643          if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1)
1261 1644                  return (EBUSY);
1262 1645  
1263 1646          /* check for existing name */
1264 1647          err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
1265 1648              ra->mynewname, 8, 1, &val);
1266 1649          if (err == 0)
1267 1650                  return (EEXIST);
1268 1651          if (err != ENOENT)
1269 1652                  return (err);
1270 1653

↓ open down ↓

220 lines elided

↑ open up ↑

1271 1654          if (ra->newparent != dd->dd_parent) {
1272 1655                  /* is there enough space? */
1273 1656                  uint64_t myspace =
1274 1657                      MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
1275 1658  
1276 1659                  /* no rename into our descendant */
1277 1660                  if (closest_common_ancestor(dd, ra->newparent) == dd)
1278 1661                          return (EINVAL);
1279 1662  
1280 1663                  if (err = dsl_dir_transfer_possible(dd->dd_parent,
1281      -                    ra->newparent, myspace))
     1664 +                    ra->newparent, dd, myspace, tx))
1282 1665                          return (err);
1283 1666          }
1284 1667  
1285 1668          return (0);
1286 1669  }
1287 1670  
1288 1671  static void
1289 1672  dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1290 1673  {
1291 1674          dsl_dir_t *dd = arg1;

1292 1675          struct renamearg *ra = arg2;
1293 1676          dsl_pool_t *dp = dd->dd_pool;
1294 1677          objset_t *mos = dp->dp_meta_objset;
1295 1678          int err;

↓ open down ↓

4 lines elided

↑ open up ↑

1296 1679          char namebuf[MAXNAMELEN];
1297 1680  
1298 1681          ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
1299 1682  
1300 1683          /* Log this before we change the name. */
1301 1684          dsl_dir_name(ra->newparent, namebuf);
1302 1685          spa_history_log_internal_dd(dd, "rename", tx,
1303 1686              "-> %s/%s", namebuf, ra->mynewname);
1304 1687  
1305 1688          if (ra->newparent != dd->dd_parent) {
     1689 +                int cnt;
     1690 +
     1691 +                mutex_enter(&dd->dd_lock);
     1692 +
     1693 +                cnt = dd->dd_phys->dd_dataset_count + 1;
     1694 +                dsl_dir_dscount_adjust(dd->dd_parent, tx, -cnt, B_TRUE, B_TRUE);
     1695 +                dsl_dir_dscount_adjust(ra->newparent, tx, cnt, B_TRUE, B_TRUE);
     1696 +
     1697 +                cnt = dd->dd_phys->dd_snapshot_count;
     1698 +                dsl_snapcount_adjust(dd->dd_parent, tx, -cnt, B_TRUE);
     1699 +                dsl_snapcount_adjust(ra->newparent, tx, cnt, B_TRUE);
     1700 +
     1701 +                mutex_exit(&dd->dd_lock);
     1702 +
1306 1703                  dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1307 1704                      -dd->dd_phys->dd_used_bytes,
1308 1705                      -dd->dd_phys->dd_compressed_bytes,
1309 1706                      -dd->dd_phys->dd_uncompressed_bytes, tx);
1310 1707                  dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD,
1311 1708                      dd->dd_phys->dd_used_bytes,
1312 1709                      dd->dd_phys->dd_compressed_bytes,
1313 1710                      dd->dd_phys->dd_uncompressed_bytes, tx);
1314 1711  
1315 1712                  if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {

1316 1713                          uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
1317 1714                              dd->dd_phys->dd_used_bytes;
1318 1715  
1319 1716                          dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1320 1717                              -unused_rsrv, 0, 0, tx);
1321 1718                          dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV,
1322 1719                              unused_rsrv, 0, 0, tx);
1323 1720                  }
1324 1721          }
1325 1722  
1326 1723          dmu_buf_will_dirty(dd->dd_dbuf, tx);
1327 1724  
1328 1725          /* remove from old parent zapobj */
1329 1726          err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
1330 1727              dd->dd_myname, tx);
1331 1728          ASSERT0(err);
1332 1729  
1333 1730          (void) strcpy(dd->dd_myname, ra->mynewname);
1334 1731          dsl_dir_close(dd->dd_parent, dd);
1335 1732          dd->dd_phys->dd_parent_obj = ra->newparent->dd_object;
1336 1733          VERIFY(0 == dsl_dir_open_obj(dd->dd_pool,
1337 1734              ra->newparent->dd_object, NULL, dd, &dd->dd_parent));
1338 1735  
1339 1736          /* add to new parent zapobj */
1340 1737          err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
1341 1738              dd->dd_myname, 8, 1, &dd->dd_object, tx);
1342 1739          ASSERT0(err);
1343 1740  
1344 1741  }
1345 1742  
1346 1743  int
1347 1744  dsl_dir_rename(dsl_dir_t *dd, const char *newname)
1348 1745  {
1349 1746          struct renamearg ra;
1350 1747          int err;
1351 1748  
1352 1749          /* new parent should exist */
1353 1750          err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname);
1354 1751          if (err)
1355 1752                  return (err);
1356 1753  
1357 1754          /* can't rename to different pool */
1358 1755          if (dd->dd_pool != ra.newparent->dd_pool) {
1359 1756                  err = ENXIO;
1360 1757                  goto out;
1361 1758          }
1362 1759  
1363 1760          /* new name should not already exist */
1364 1761          if (ra.mynewname == NULL) {
1365 1762                  err = EEXIST;
1366 1763                  goto out;
1367 1764          }

↓ open down ↓

52 lines elided

↑ open up ↑

1368 1765  
1369 1766          err = dsl_sync_task_do(dd->dd_pool,
1370 1767              dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
1371 1768  
1372 1769  out:
1373 1770          dsl_dir_close(ra.newparent, FTAG);
1374 1771          return (err);
1375 1772  }
1376 1773  
1377 1774  int
1378      -dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
     1775 +dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, dsl_dir_t *moving_dd,
     1776 +    uint64_t space, dmu_tx_t *tx)
1379 1777  {
1380 1778          dsl_dir_t *ancestor;
1381 1779          int64_t adelta;
1382 1780          uint64_t avail;
     1781 +        int err;
1383 1782  
1384 1783          ancestor = closest_common_ancestor(sdd, tdd);
1385 1784          adelta = would_change(sdd, -space, ancestor);
1386 1785          avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
1387 1786          if (avail < space)
1388 1787                  return (ENOSPC);
1389 1788  
     1789 +        if (sdd != moving_dd) {
     1790 +                err = dsl_dir_dscount_check(tdd, tx,
     1791 +                    moving_dd->dd_phys->dd_dataset_count + 1, ancestor);
     1792 +                if (err != 0)
     1793 +                        return (err);
     1794 +        }
     1795 +        err = dsl_snapcount_check(tdd, tx,
     1796 +            moving_dd->dd_phys->dd_snapshot_count, ancestor);
     1797 +        if (err != 0)
     1798 +                return (err);
     1799 +
1390 1800          return (0);
1391 1801  }
1392 1802  
1393 1803  timestruc_t
1394 1804  dsl_dir_snap_cmtime(dsl_dir_t *dd)
1395 1805  {
1396 1806          timestruc_t t;
1397 1807  
1398 1808          mutex_enter(&dd->dd_lock);
1399 1809          t = dd->dd_snap_cmtime;

1400 1810          mutex_exit(&dd->dd_lock);
1401 1811  
1402 1812          return (t);
1403 1813  }
1404 1814  
1405 1815  void
1406 1816  dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
1407 1817  {
1408 1818          timestruc_t t;
1409 1819  
1410 1820          gethrestime(&t);
1411 1821          mutex_enter(&dd->dd_lock);
1412 1822          dd->dd_snap_cmtime = t;
1413 1823          mutex_exit(&dd->dd_lock);
1414 1824  }

↓ open down ↓

15 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX