Print this page
OS-1566 dataset quota for ZFS datasets


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.

  24  */
  25 
  26 #include <sys/dmu.h>
  27 #include <sys/dmu_objset.h>
  28 #include <sys/dmu_tx.h>
  29 #include <sys/dsl_dataset.h>
  30 #include <sys/dsl_dir.h>
  31 #include <sys/dsl_prop.h>
  32 #include <sys/dsl_synctask.h>
  33 #include <sys/dsl_deleg.h>
  34 #include <sys/spa.h>
  35 #include <sys/metaslab.h>
  36 #include <sys/zap.h>
  37 #include <sys/zio.h>
  38 #include <sys/arc.h>
  39 #include <sys/sunddi.h>
  40 #include <sys/zfs_zone.h>

  41 #include "zfs_namecheck.h"
  42 

























































  43 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
  44 static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd,
  45     uint64_t value, dmu_tx_t *tx);
  46 



  47 /* ARGSUSED */
  48 static void
  49 dsl_dir_evict(dmu_buf_t *db, void *arg)
  50 {
  51         dsl_dir_t *dd = arg;
  52         dsl_pool_t *dp = dd->dd_pool;
  53         int t;
  54 
  55         for (t = 0; t < TXG_SIZE; t++) {
  56                 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
  57                 ASSERT(dd->dd_tempreserved[t] == 0);
  58                 ASSERT(dd->dd_space_towrite[t] == 0);
  59         }
  60 
  61         if (dd->dd_parent)
  62                 dsl_dir_close(dd->dd_parent, dd);
  63 
  64         spa_close(dd->dd_pool->dp_spa, dd);
  65 
  66         /*


 390         if (tailp)
 391                 *tailp = next;
 392         if (openedspa)
 393                 spa_close(spa, FTAG);
 394         *ddp = dd;
 395         return (err);
 396 }
 397 
 398 /*
 399  * Return the dsl_dir_t, and possibly the last component which couldn't
 400  * be found in *tail.  Return NULL if the path is bogus, or if
 401  * tail==NULL and we couldn't parse the whole name.  (*tail)[0] == '@'
 402  * means that the last component is a snapshot.
 403  */
int
dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
{
	/*
	 * Convenience wrapper around dsl_dir_open_spa(): passing a NULL spa
	 * makes that routine derive (and open) the spa from "name" itself.
	 */
	return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
}
 409 





















































































































































































































































































































 410 uint64_t
 411 dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
 412     dmu_tx_t *tx)
 413 {
 414         objset_t *mos = dp->dp_meta_objset;
 415         uint64_t ddobj;
 416         dsl_dir_phys_t *ddphys;
 417         dmu_buf_t *dbuf;
 418 
 419         ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
 420             DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
 421         if (pds) {
 422                 VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
 423                     name, sizeof (uint64_t), 1, &ddobj, tx));
 424         } else {
 425                 /* it's the root dir */
 426                 VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 427                     DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
 428         }
 429         VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));


 471         if (err)
 472                 return (err);
 473         if (count != 0)
 474                 return (EEXIST);
 475 
 476         return (0);
 477 }
 478 
/*
 * Sync task: destroy an empty dsl_dir.  Removes all of the dir's on-disk
 * state (child-dir ZAP, props ZAP, delegation ZAP, the entry in the parent's
 * child-dir ZAP, and finally the dir object itself) and drops the caller's
 * hold.  Caller must hold the pool config rwlock as writer, and the dir's
 * head dataset must already have been destroyed.
 */
void
dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	uint64_t obj;
	dd_used_t t;

	ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

	/*
	 * Remove our reservation. The impl() routine avoids setting the
	 * actual property, which would require the (already destroyed) ds.
	 */
	dsl_dir_set_reservation_sync_impl(dd, 0, tx);

	/* By now all space accounting for this dir must have drained to 0. */
	ASSERT0(dd->dd_phys->dd_used_bytes);
	ASSERT0(dd->dd_phys->dd_reserved);
	for (t = 0; t < DD_USED_NUM; t++)
		ASSERT0(dd->dd_phys->dd_used_breakdown[t]);

	VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
	VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
	VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
	VERIFY(0 == zap_remove(mos,
	    dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));

	/* Snapshot the object number before dsl_dir_close() frees dd. */
	obj = dd->dd_object;
	dsl_dir_close(dd, tag);
	VERIFY(0 == dmu_object_free(mos, obj, tx));
}


1019         if (psa->psa_effective_value == 0)
1020                 return (0);
1021 
1022         mutex_enter(&dd->dd_lock);
1023         /*
1024          * If we are doing the preliminary check in open context, and
1025          * there are pending changes, then don't fail it, since the
1026          * pending changes could under-estimate the amount of space to be
1027          * freed up.
1028          */
1029         towrite = dsl_dir_space_towrite(dd);
1030         if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
1031             (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
1032             psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
1033                 err = ENOSPC;
1034         }
1035         mutex_exit(&dd->dd_lock);
1036         return (err);
1037 }
1038 
1039 extern dsl_syncfunc_t dsl_prop_set_sync;
1040 
/*
 * Sync task: set the "quota" property on ds and mirror the effective value
 * into the dir's phys structure (dd_quota), which is what the space
 * accounting code actually consults.  The property itself is written first
 * via dsl_prop_set_sync() so that psa_effective_value is authoritative.
 */
static void
dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_dir_t *dd = ds->ds_dir;
	dsl_prop_setarg_t *psa = arg2;
	uint64_t effective_value = psa->psa_effective_value;

	dsl_prop_set_sync(ds, psa, tx);
	DSL_PROP_CHECK_PREDICTION(dd, psa);

	/* Dirty the dir's buffer before mutating dd_phys in this txg. */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	/* dd_lock protects readers of dd_quota (e.g. space checks). */
	mutex_enter(&dd->dd_lock);
	dd->dd_phys->dd_quota = effective_value;
	mutex_exit(&dd->dd_lock);
}
1058 
1059 int
1060 dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)


1261                 return (EBUSY);
1262 
1263         /* check for existing name */
1264         err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
1265             ra->mynewname, 8, 1, &val);
1266         if (err == 0)
1267                 return (EEXIST);
1268         if (err != ENOENT)
1269                 return (err);
1270 
1271         if (ra->newparent != dd->dd_parent) {
1272                 /* is there enough space? */
1273                 uint64_t myspace =
1274                     MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
1275 
1276                 /* no rename into our descendant */
1277                 if (closest_common_ancestor(dd, ra->newparent) == dd)
1278                         return (EINVAL);
1279 
1280                 if (err = dsl_dir_transfer_possible(dd->dd_parent,
1281                     ra->newparent, myspace))
1282                         return (err);
1283         }
1284 
1285         return (0);
1286 }
1287 
1288 static void
1289 dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1290 {
1291         dsl_dir_t *dd = arg1;
1292         struct renamearg *ra = arg2;
1293         dsl_pool_t *dp = dd->dd_pool;
1294         objset_t *mos = dp->dp_meta_objset;
1295         int err;
1296         char namebuf[MAXNAMELEN];
1297 
1298         ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
1299 
1300         /* Log this before we change the name. */
1301         dsl_dir_name(ra->newparent, namebuf);
1302         spa_history_log_internal_dd(dd, "rename", tx,
1303             "-> %s/%s", namebuf, ra->mynewname);
1304 
1305         if (ra->newparent != dd->dd_parent) {














1306                 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1307                     -dd->dd_phys->dd_used_bytes,
1308                     -dd->dd_phys->dd_compressed_bytes,
1309                     -dd->dd_phys->dd_uncompressed_bytes, tx);
1310                 dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD,
1311                     dd->dd_phys->dd_used_bytes,
1312                     dd->dd_phys->dd_compressed_bytes,
1313                     dd->dd_phys->dd_uncompressed_bytes, tx);
1314 
1315                 if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
1316                         uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
1317                             dd->dd_phys->dd_used_bytes;
1318 
1319                         dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1320                             -unused_rsrv, 0, 0, tx);
1321                         dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV,
1322                             unused_rsrv, 0, 0, tx);
1323                 }
1324         }
1325 


1358         if (dd->dd_pool != ra.newparent->dd_pool) {
1359                 err = ENXIO;
1360                 goto out;
1361         }
1362 
1363         /* new name should not already exist */
1364         if (ra.mynewname == NULL) {
1365                 err = EEXIST;
1366                 goto out;
1367         }
1368 
1369         err = dsl_sync_task_do(dd->dd_pool,
1370             dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
1371 
1372 out:
1373         dsl_dir_close(ra.newparent, FTAG);
1374         return (err);
1375 }
1376 
1377 int
1378 dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)

1379 {
1380         dsl_dir_t *ancestor;
1381         int64_t adelta;
1382         uint64_t avail;

1383 
1384         ancestor = closest_common_ancestor(sdd, tdd);
1385         adelta = would_change(sdd, -space, ancestor);
1386         avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
1387         if (avail < space)
1388                 return (ENOSPC);
1389 











1390         return (0);
1391 }
1392 
1393 timestruc_t
1394 dsl_dir_snap_cmtime(dsl_dir_t *dd)
1395 {
1396         timestruc_t t;
1397 
1398         mutex_enter(&dd->dd_lock);
1399         t = dd->dd_snap_cmtime;
1400         mutex_exit(&dd->dd_lock);
1401 
1402         return (t);
1403 }
1404 
1405 void
1406 dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
1407 {
1408         timestruc_t t;
1409 


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  * Copyright (c) 2012 Joyent, Inc. All rights reserved.
  25  */
  26 
  27 #include <sys/dmu.h>
  28 #include <sys/dmu_objset.h>
  29 #include <sys/dmu_tx.h>
  30 #include <sys/dsl_dataset.h>
  31 #include <sys/dsl_dir.h>
  32 #include <sys/dsl_prop.h>
  33 #include <sys/dsl_synctask.h>
  34 #include <sys/dsl_deleg.h>
  35 #include <sys/spa.h>
  36 #include <sys/metaslab.h>
  37 #include <sys/zap.h>
  38 #include <sys/zio.h>
  39 #include <sys/arc.h>
  40 #include <sys/sunddi.h>
  41 #include <sys/zfs_zone.h>
  42 #include <sys/zfeature.h>
  43 #include "zfs_namecheck.h"
  44 
  45 /*
  46  * Dataset and Snapshot Quotas
  47  * ---------------------------
  48  *
  49  * These quotas are used to limit the number of datasets and/or snapshots
  50  * that can be created at a given level in the tree or below. A common use-case
  51  * is with a delegated dataset where the administrator wants to ensure that
  52  * a user within the zone is not creating too many datasets or snapshots, even
  53  * though they're not exceeding their space quota.
  54  *
  55  * The count of datasets and snapshots is stored in the dsl_dir_phys_t which
  56  * impacts the on-disk format. As such, this capability is controlled by a
  57  * feature flag and must be enabled to be used. Once enabled, the feature is
  58  * not active until the first quota is set. At that point, future operations to
  59  * create/destroy datasets or snapshots will validate and update the counts.
  60  *
  61  * Because the on-disk counts will be incorrect (garbage) before the feature is
  62  * active, the counts are updated when the quota is first set. Starting at the
  63  * dataset with the new quota, the code descends into all sub-datasets and
  64  * updates the counts to be accurate. In practice this is lightweight since
  65  * a quota is typically set when the dataset is created and thus has no
  66  * children. Once set, changing the quota value won't require a traversal since
  67  * the counts are already valid. The counts in datasets above the one with the
  68  * new quota will still be incorrect, unless a quota is eventually set on one
  69  * of those datasets. If a dataset with a quota is encountered during the
  70  * descent, the counts are known to be valid and there is no need to descend
  71  * into that dataset's children. When a new quota value is set on a dataset
  72  * with an existing quota, the new value must not be less than the current
  73  * count at that level or an error is returned and the quota is not changed.
  74  *
  75  * Once the feature is active, then whenever a dataset or snapshot is created,
  76  * the code recurses up the tree, validating the new count against the quota
  77  * at each level. In practice, most levels will not have a quota set. If there
  78  * is a quota at any level up the tree, the check must pass or the creation
  79  * will fail. Likewise, when a dataset or snapshot is destroyed, the counts
  80  * are recursively adjusted all the way up the tree. Renaming a dataset into
  81  * a different point in the tree will first validate, then update the counts on
  82  * each branch up to the common ancestor. A receive will also validate the
  83  * counts and then update them.
  84  *
  85  * Recursive snapshots behave a bit differently. The quota is only validated
  86  * against the top-level dataset at which the snapshot is being taken. This
  87  * is to prevent a denial-of-service in which a lower level dataset could
  88  * max out its quota and thus block snapshots from being taken at a higher
  89  * level (in addition, the complexity to address this is not worth the cost).
  90  * Because of this, it is possible for the snapshot count to be over the quota
  91  * and snapshots taken at a high level could cause a lower level dataset to hit
  92  * or exceed its quota. The administrator taking the high-level recursive
  93  * snapshot should be aware of this side-effect and behave accordingly.
  94  *
  95  * The dataset quota is validated by dsl_dir_dscount_check() and updated by
  96  * dsl_dir_dscount_adjust(). The snapshot quota is validated by
  97  * dsl_snapcount_check() and updated by dsl_snapcount_adjust().
  98  * A new quota value is validated in dsl_dir_validate_ds_ss_quota() and the
  99  * dataset counts are adjusted, if necessary, by dsl_dir_set_ds_ss_count().
 100  */
 101 
 102 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
 103 static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd,
 104     uint64_t value, dmu_tx_t *tx);
 105 
 106 extern dsl_syncfunc_t dsl_prop_set_sync;
 107 extern char *tmp_dmu_recv_tag;
 108 
 109 /* ARGSUSED */
 110 static void
 111 dsl_dir_evict(dmu_buf_t *db, void *arg)
 112 {
 113         dsl_dir_t *dd = arg;
 114         dsl_pool_t *dp = dd->dd_pool;
 115         int t;
 116 
 117         for (t = 0; t < TXG_SIZE; t++) {
 118                 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
 119                 ASSERT(dd->dd_tempreserved[t] == 0);
 120                 ASSERT(dd->dd_space_towrite[t] == 0);
 121         }
 122 
 123         if (dd->dd_parent)
 124                 dsl_dir_close(dd->dd_parent, dd);
 125 
 126         spa_close(dd->dd_pool->dp_spa, dd);
 127 
 128         /*


 452         if (tailp)
 453                 *tailp = next;
 454         if (openedspa)
 455                 spa_close(spa, FTAG);
 456         *ddp = dd;
 457         return (err);
 458 }
 459 
 460 /*
 461  * Return the dsl_dir_t, and possibly the last component which couldn't
 462  * be found in *tail.  Return NULL if the path is bogus, or if
 463  * tail==NULL and we couldn't parse the whole name.  (*tail)[0] == '@'
 464  * means that the last component is a snapshot.
 465  */
int
dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
{
	/*
	 * Convenience wrapper around dsl_dir_open_spa(): passing a NULL spa
	 * makes that routine derive (and open) the spa from "name" itself.
	 */
	return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
}
 471 
 472 /*
 473  * Check if there is already a dataset/snapshot quota set for the dataset. If
 474  * not, then the counts on this dataset, and those below, may be incorrect due
 475  * to the use of a pre-existing pool which did not support the dataset/snapshot
 476  * quota feature.
 477  *
 478  * Recursively descend the dataset tree and update the dataset/snapshot counts
 479  * on each dataset below, then update the cumulative count on the current
 480  * dataset. If the dataset already has a quota set on it, then we know that
 481  * its counts, and the counts on the datasets below it, have been updated to
 482  * be correct, so we can skip that dataset.
 483  */
/*
 * Recursively recompute the dataset and snapshot counts for dd and all of
 * its descendants, persisting them into dd_phys.  Returns (via dscnt/sscnt)
 * this subtree's totals: dataset count includes dd itself (+1), snapshot
 * count does not add anything for self.  A subtree that already has a
 * dataset or snapshot quota set is trusted to have valid counts and is not
 * descended into.  Called in syncing context with a held tx.
 */
static void
dsl_dir_set_ds_ss_count(const char *nm, dsl_dir_t *dd, dmu_tx_t *tx,
    uint64_t *dscnt, uint64_t *sscnt)
{
	uint64_t my_ds_cnt = 0;
	uint64_t my_ss_cnt = 0;
	objset_t *os = dd->dd_pool->dp_meta_objset;
	zap_cursor_t *zc;
	zap_attribute_t *za;
	char *namebuf;
	int err;
	boolean_t quota_set = B_FALSE;
	uint64_t dsquota, ssquota;
	dsl_dataset_t *ds;

	/* A non-zero dataset quota implies the counts here are valid. */
	err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_DATASET_QUOTA),
	    8, 1, &dsquota, NULL, B_FALSE);
	if (err == 0 && dsquota != 0)
		quota_set = B_TRUE;

	/* Likewise for a non-zero snapshot quota. */
	if (!quota_set) {
		err = dsl_prop_get_dd(dd,
		    zfs_prop_to_name(ZFS_PROP_SNAPSHOT_QUOTA), 8, 1, &ssquota,
		    NULL, B_FALSE);
		if (err == 0 && ssquota != 0)
			quota_set = B_TRUE;
	}

	/*
	 * If the dd has a quota, we know its count is already good and we
	 * don't need to recurse down any further.
	 */
	if (quota_set) {
		/* Return dataset count plus 1 for self */
		*dscnt = dd->dd_phys->dd_dataset_count + 1;
		*sscnt = dd->dd_phys->dd_snapshot_count;

		return;
	}

	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
	namebuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);

	/*
	 * NOTE(review): dsl_dir_open() and dsl_dataset_hold() below are
	 * invoked while this dir's dd_lock is held; recursion then takes the
	 * child's dd_lock (parent -> child order).  Confirm no other path
	 * acquires these locks in the opposite order.
	 */
	mutex_enter(&dd->dd_lock);

	/* Iterate datasets */
	for (zap_cursor_init(zc, os, dd->dd_phys->dd_child_dir_zapobj);
	    zap_cursor_retrieve(zc, za) == 0;
	    zap_cursor_advance(zc)) {
		dsl_dir_t *chld_dd;
		uint64_t chld_ds_cnt = 0;
		uint64_t chld_ss_cnt = 0;

		(void) snprintf(namebuf, MAXPATHLEN, "%s/%s", nm, za->za_name);

		/* Skip children we cannot open (best effort fixup). */
		if (dsl_dir_open(namebuf, FTAG, &chld_dd, NULL))
			continue;

		dsl_dir_set_ds_ss_count(namebuf, chld_dd, tx, &chld_ds_cnt,
		    &chld_ss_cnt);

		dsl_dir_close(chld_dd, FTAG);

		my_ds_cnt += chld_ds_cnt;
		my_ss_cnt += chld_ss_cnt;
	}
	zap_cursor_fini(zc);

	kmem_free(namebuf, MAXPATHLEN);

	/* Iterate snapshots */
	if (dsl_dataset_hold(nm, FTAG, &ds) == 0) {
		for (zap_cursor_init(zc, os, ds->ds_phys->ds_snapnames_zapobj);
		    zap_cursor_retrieve(zc, za) == 0;
		    zap_cursor_advance(zc)) {
			my_ss_cnt++;
		}
		zap_cursor_fini(zc);
		dsl_dataset_rele(ds, FTAG);
	}

	kmem_free(zc, sizeof (zap_cursor_t));
	kmem_free(za, sizeof (zap_attribute_t));

#ifdef _KERNEL
	extern void __dtrace_probe_zfs__ds__fix__count(char *, uint64_t,
	    uint64_t);
	__dtrace_probe_zfs__ds__fix__count((char *)nm, my_ds_cnt, my_ss_cnt);
#endif

	/* save updated counts */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	dd->dd_phys->dd_dataset_count = my_ds_cnt;
	dd->dd_phys->dd_snapshot_count = my_ss_cnt;

	mutex_exit(&dd->dd_lock);

	/* Return child dataset count plus 1 for self */
	*dscnt = my_ds_cnt + 1;
	*sscnt = my_ss_cnt;
}
 586 
 587 /*
 588  * Return ENOSPC if new quota is less than the existing count, otherwise return
 589  * -1 to force the zfs_set_prop_nvlist code down the default path to set the
 590  * value in the nvlist.
 591  */
 592 int
 593 dsl_dir_validate_ds_ss_quota(const char *ddname, uint64_t quota,
 594     zfs_prop_t ptype)
 595 {
 596         dsl_dir_t *dd;
 597         dsl_dataset_t *ds;
 598         int err = -1;
 599         uint64_t count;
 600         dmu_tx_t *tx;
 601         uint64_t my_ds_cnt = 0;
 602         uint64_t my_ss_cnt = 0;
 603         spa_t *spa;
 604         zfeature_info_t *quota_feat =
 605             &spa_feature_table[SPA_FEATURE_DS_SS_QUOTA];
 606 
 607         if (dsl_dataset_hold(ddname, FTAG, &ds))
 608                 return (EACCES);
 609 
 610         spa = dsl_dataset_get_spa(ds);
 611         if (!spa_feature_is_enabled(spa,
 612             &spa_feature_table[SPA_FEATURE_DS_SS_QUOTA])) {
 613                 dsl_dataset_rele(ds, FTAG);
 614                 return (ENOTSUP);
 615         }
 616 
 617         /* 0 means no quota */
 618         if (quota == 0) {
 619                 dsl_dataset_rele(ds, FTAG);
 620                 return (-1);
 621         }
 622 
 623         if (dsl_dir_open(ddname, FTAG, &dd, NULL)) {
 624                 dsl_dataset_rele(ds, FTAG);
 625                 return (EACCES);
 626         }
 627 
 628         ASSERT(ds->ds_dir == dd);
 629 
 630         tx = dmu_tx_create_dd(dd);
 631         if (dmu_tx_assign(tx, TXG_WAIT)) {
 632                 dmu_tx_abort(tx);
 633                 return (ENOSPC);
 634         }
 635 
 636         /* set the feature active flag now */
 637         if (!spa_feature_is_active(spa, quota_feat))
 638                 spa_feature_incr(spa, quota_feat, tx);
 639 
 640         /*
 641          * Since we are now setting a non-0 quota on the dataset, we need to
 642          * ensure the counts are correct. Descend down the tree from this
 643          * point and update all of the counts to be accurate.
 644          */
 645         rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
 646         dsl_dir_set_ds_ss_count(ddname, dd, tx, &my_ds_cnt, &my_ss_cnt);
 647         rw_exit(&dd->dd_pool->dp_config_rwlock);
 648 
 649         dmu_tx_commit(tx);
 650 
 651         if (ptype == ZFS_PROP_DATASET_QUOTA)
 652                 count = dd->dd_phys->dd_dataset_count;
 653         else
 654                 count = dd->dd_phys->dd_snapshot_count;
 655 
 656         if (quota < count)
 657                 err = ENOSPC;
 658 
 659         dsl_dir_close(dd, FTAG);
 660         dsl_dataset_rele(ds, FTAG);
 661 
 662         return (err);
 663 }
 664 
 665 /*
 666  * Check if adding additional child dataset(s) would exceed any dataset
 667  * quotas.  Note that all dataset quotas up to the root dataset (i.e. the pool
 668  * itself) or the given ancestor must be satisfied. When receiving we don't
 669  * check if the tx is syncing. In this case, the tx is passed as NULL.
 670  */
 671 int
 672 dsl_dir_dscount_check(dsl_dir_t *dd, dmu_tx_t *tx, uint64_t cnt,
 673     dsl_dir_t *ancestor)
 674 {
 675         uint64_t quota;
 676         int err = 0;
 677 
 678         VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
 679 
 680         /*
 681          * As with dsl_dataset_set_reservation_check(), don't run this check in
 682          * open context.
 683          */
 684         if (tx != NULL && !dmu_tx_is_syncing(tx))
 685                 return (0);
 686 
 687         /*
 688          * If an ancestor has been provided, stop checking the quota once we
 689          * hit that dir. We need this during rename so that we don't overcount
 690          * the check once we recurse up to the common ancestor.
 691          */
 692         if (ancestor == dd)
 693                 return (0);
 694 
 695         /*
 696          * If there's no value for this property, there's no need to enforce a
 697          * dataset quota.
 698          */
 699         err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_DATASET_QUOTA),
 700             8, 1, &quota, NULL, B_FALSE);
 701         if (err == ENOENT)
 702                 return (0);
 703         else if (err != 0)
 704                 return (err);
 705 
 706 #ifdef _KERNEL
 707         extern void __dtrace_probe_zfs__ds__quota(uint64_t, uint64_t, char *);
 708         __dtrace_probe_zfs__ds__quota((uint64_t)dd->dd_phys->dd_dataset_count,
 709             (uint64_t)quota, dd->dd_myname);
 710 #endif
 711 
 712         if (quota > 0 && (dd->dd_phys->dd_dataset_count + cnt) > quota)
 713                 return (EDQUOT);
 714 
 715         if (dd->dd_parent != NULL)
 716                 err = dsl_dir_dscount_check(dd->dd_parent, tx, cnt, ancestor);
 717 
 718         return (err);
 719 }
 720 
 721 /*
 722  * Adjust the dataset count for the specified dsl_dir_t and all parent datasets.
 723  * When a new dataset is created, increment the count on all parents, and when a
 724  * dataset is destroyed, decrement the count.
 725  */
/*
 * Adjust the on-disk dataset count on dd and every parent by "delta"
 * (positive on create, negative on destroy).  Does nothing unless the
 * DS_SS_QUOTA feature is active.  "first" is B_TRUE only on the initial
 * (non-recursive) call; "syncing" asserts that tx is a syncing tx.
 */
void
dsl_dir_dscount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
    boolean_t syncing, boolean_t first)
{
	/*
	 * On initial entry we need to check if this feature is active, but
	 * we don't want to re-check this on each recursive call. Note: the
	 * feature cannot be active if its not enabled. If the feature is not
	 * active, don't touch the on-disk count fields.
	 */
	if (first) {
		dsl_dataset_t *ds = NULL;
		spa_t *spa;
		zfeature_info_t *quota_feat =
		    &spa_feature_table[SPA_FEATURE_DS_SS_QUOTA];

		/* Hold the head dataset just long enough to find the spa. */
		VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
		    dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
		spa = dsl_dataset_get_spa(ds);
		dsl_dataset_rele(ds, FTAG);
		if (!spa_feature_is_active(spa, quota_feat))
			return;
	}

	VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
	if (syncing)
		VERIFY(dmu_tx_is_syncing(tx));

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	/*
	 * NOTE(review): dd_lock is held across the recursive call to the
	 * parent below, so locks are acquired child -> parent up the tree;
	 * confirm no other code path takes them in the opposite order.
	 */
	mutex_enter(&dd->dd_lock);

	/*
	 * Counts may be incorrect if dealing with an existing pool and
	 * there has never been a quota set in the dataset hierarchy.
	 * This is not an error.
	 */
	if (delta < 0 && dd->dd_phys->dd_dataset_count < (delta * -1)) {
#ifdef _KERNEL
		extern void __dtrace_probe_zfs__dscnt__adj__neg(char *);
		__dtrace_probe_zfs__dscnt__adj__neg(dd->dd_myname);
#endif
		mutex_exit(&dd->dd_lock);
		return;
	}

	dd->dd_phys->dd_dataset_count += delta;

	/* Propagate the same delta all the way to the pool root. */
	if (dd->dd_parent != NULL)
		dsl_dir_dscount_adjust(dd->dd_parent, tx, delta, syncing,
		    B_FALSE);

	mutex_exit(&dd->dd_lock);
}
 780 
 781 uint64_t
 782 dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
 783     dmu_tx_t *tx)
 784 {
 785         objset_t *mos = dp->dp_meta_objset;
 786         uint64_t ddobj;
 787         dsl_dir_phys_t *ddphys;
 788         dmu_buf_t *dbuf;
 789 
 790         ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
 791             DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
 792         if (pds) {
 793                 VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
 794                     name, sizeof (uint64_t), 1, &ddobj, tx));
 795         } else {
 796                 /* it's the root dir */
 797                 VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 798                     DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
 799         }
 800         VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));


 842         if (err)
 843                 return (err);
 844         if (count != 0)
 845                 return (EEXIST);
 846 
 847         return (0);
 848 }
 849 
/*
 * Sync task: destroy an empty dsl_dir.  Adjusts the ancestors' dataset
 * counts, then removes all of the dir's on-disk state (child-dir ZAP,
 * props ZAP, delegation ZAP, the entry in the parent's child-dir ZAP, and
 * the dir object itself) and drops the caller's hold.  Caller must hold
 * the pool config rwlock as writer, and the dir's head dataset must
 * already have been destroyed.
 */
void
dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	uint64_t obj;
	dd_used_t t;

	ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

	/*
	 * Decrement the dataset count for all parent datasets.
	 *
	 * We have to worry about a special case where we are receiving a
	 * dataset that already exists. In this case a temporary clone name
	 * of %X is created (see dmu_recv_begin). In dmu_recv_existing_end we
	 * destroy this temporary clone which leads to here. We don't want to
	 * decrement the dataset counters in this case, since we never
	 * incremented them. To detect this case we check the tag for
	 * "tmp_dmu_recv_tag" to see if we're in that code path.
	 */
	if (dd->dd_parent != NULL && strcmp(tag, tmp_dmu_recv_tag) != 0)
		dsl_dir_dscount_adjust(dd->dd_parent, tx, -1, B_TRUE, B_TRUE);

	/*
	 * Remove our reservation. The impl() routine avoids setting the
	 * actual property, which would require the (already destroyed) ds.
	 */
	dsl_dir_set_reservation_sync_impl(dd, 0, tx);

	/* By now all space accounting for this dir must have drained to 0. */
	ASSERT0(dd->dd_phys->dd_used_bytes);
	ASSERT0(dd->dd_phys->dd_reserved);
	for (t = 0; t < DD_USED_NUM; t++)
		ASSERT0(dd->dd_phys->dd_used_breakdown[t]);

	VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
	VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
	VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
	VERIFY(0 == zap_remove(mos,
	    dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));

	/* Snapshot the object number before dsl_dir_close() frees dd. */
	obj = dd->dd_object;
	dsl_dir_close(dd, tag);
	VERIFY(0 == dmu_object_free(mos, obj, tx));
}


1404         if (psa->psa_effective_value == 0)
1405                 return (0);
1406 
1407         mutex_enter(&dd->dd_lock);
1408         /*
1409          * If we are doing the preliminary check in open context, and
1410          * there are pending changes, then don't fail it, since the
1411          * pending changes could under-estimate the amount of space to be
1412          * freed up.
1413          */
1414         towrite = dsl_dir_space_towrite(dd);
1415         if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
1416             (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
1417             psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
1418                 err = ENOSPC;
1419         }
1420         mutex_exit(&dd->dd_lock);
1421         return (err);
1422 }
1423 


1424 static void
1425 dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1426 {
1427         dsl_dataset_t *ds = arg1;
1428         dsl_dir_t *dd = ds->ds_dir;
1429         dsl_prop_setarg_t *psa = arg2;
1430         uint64_t effective_value = psa->psa_effective_value;
1431 
1432         dsl_prop_set_sync(ds, psa, tx);
1433         DSL_PROP_CHECK_PREDICTION(dd, psa);
1434 
1435         dmu_buf_will_dirty(dd->dd_dbuf, tx);
1436 
1437         mutex_enter(&dd->dd_lock);
1438         dd->dd_phys->dd_quota = effective_value;
1439         mutex_exit(&dd->dd_lock);
1440 }
1441 
1442 int
1443 dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)


1644                 return (EBUSY);
1645 
1646         /* check for existing name */
1647         err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
1648             ra->mynewname, 8, 1, &val);
1649         if (err == 0)
1650                 return (EEXIST);
1651         if (err != ENOENT)
1652                 return (err);
1653 
1654         if (ra->newparent != dd->dd_parent) {
1655                 /* is there enough space? */
1656                 uint64_t myspace =
1657                     MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
1658 
1659                 /* no rename into our descendant */
1660                 if (closest_common_ancestor(dd, ra->newparent) == dd)
1661                         return (EINVAL);
1662 
1663                 if (err = dsl_dir_transfer_possible(dd->dd_parent,
1664                     ra->newparent, dd, myspace, tx))
1665                         return (err);
1666         }
1667 
1668         return (0);
1669 }
1670 
1671 static void
1672 dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1673 {
1674         dsl_dir_t *dd = arg1;
1675         struct renamearg *ra = arg2;
1676         dsl_pool_t *dp = dd->dd_pool;
1677         objset_t *mos = dp->dp_meta_objset;
1678         int err;
1679         char namebuf[MAXNAMELEN];
1680 
1681         ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
1682 
1683         /* Log this before we change the name. */
1684         dsl_dir_name(ra->newparent, namebuf);
1685         spa_history_log_internal_dd(dd, "rename", tx,
1686             "-> %s/%s", namebuf, ra->mynewname);
1687 
1688         if (ra->newparent != dd->dd_parent) {
1689                 int cnt;
1690 
1691                 mutex_enter(&dd->dd_lock);
1692 
1693                 cnt = dd->dd_phys->dd_dataset_count + 1;
1694                 dsl_dir_dscount_adjust(dd->dd_parent, tx, -cnt, B_TRUE, B_TRUE);
1695                 dsl_dir_dscount_adjust(ra->newparent, tx, cnt, B_TRUE, B_TRUE);
1696 
1697                 cnt = dd->dd_phys->dd_snapshot_count;
1698                 dsl_snapcount_adjust(dd->dd_parent, tx, -cnt, B_TRUE);
1699                 dsl_snapcount_adjust(ra->newparent, tx, cnt, B_TRUE);
1700 
1701                 mutex_exit(&dd->dd_lock);
1702 
1703                 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1704                     -dd->dd_phys->dd_used_bytes,
1705                     -dd->dd_phys->dd_compressed_bytes,
1706                     -dd->dd_phys->dd_uncompressed_bytes, tx);
1707                 dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD,
1708                     dd->dd_phys->dd_used_bytes,
1709                     dd->dd_phys->dd_compressed_bytes,
1710                     dd->dd_phys->dd_uncompressed_bytes, tx);
1711 
1712                 if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
1713                         uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
1714                             dd->dd_phys->dd_used_bytes;
1715 
1716                         dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1717                             -unused_rsrv, 0, 0, tx);
1718                         dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV,
1719                             unused_rsrv, 0, 0, tx);
1720                 }
1721         }
1722 


1755         if (dd->dd_pool != ra.newparent->dd_pool) {
1756                 err = ENXIO;
1757                 goto out;
1758         }
1759 
1760         /* new name should not already exist */
1761         if (ra.mynewname == NULL) {
1762                 err = EEXIST;
1763                 goto out;
1764         }
1765 
1766         err = dsl_sync_task_do(dd->dd_pool,
1767             dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
1768 
1769 out:
1770         dsl_dir_close(ra.newparent, FTAG);
1771         return (err);
1772 }
1773 
1774 int
1775 dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, dsl_dir_t *moving_dd,
1776     uint64_t space, dmu_tx_t *tx)
1777 {
1778         dsl_dir_t *ancestor;
1779         int64_t adelta;
1780         uint64_t avail;
1781         int err;
1782 
1783         ancestor = closest_common_ancestor(sdd, tdd);
1784         adelta = would_change(sdd, -space, ancestor);
1785         avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
1786         if (avail < space)
1787                 return (ENOSPC);
1788 
1789         if (sdd != moving_dd) {
1790                 err = dsl_dir_dscount_check(tdd, tx,
1791                     moving_dd->dd_phys->dd_dataset_count + 1, ancestor);
1792                 if (err != 0)
1793                         return (err);
1794         }
1795         err = dsl_snapcount_check(tdd, tx,
1796             moving_dd->dd_phys->dd_snapshot_count, ancestor);
1797         if (err != 0)
1798                 return (err);
1799 
1800         return (0);
1801 }
1802 
1803 timestruc_t
1804 dsl_dir_snap_cmtime(dsl_dir_t *dd)
1805 {
1806         timestruc_t t;
1807 
1808         mutex_enter(&dd->dd_lock);
1809         t = dd->dd_snap_cmtime;
1810         mutex_exit(&dd->dd_lock);
1811 
1812         return (t);
1813 }
1814 
1815 void
1816 dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
1817 {
1818         timestruc_t t;
1819