/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright (c) 2012 Joyent, Inc. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
#include <sys/metaslab.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/sunddi.h>
#include <sys/zfs_zone.h>
#include <sys/zfeature.h>
#include "zfs_namecheck.h"

/*
 * Dataset and Snapshot Quotas
 * ---------------------------
 *
 * These quotas are used to limit the number of datasets and/or snapshots
 * that can be created at a given level in the tree or below. A common use-case
 * is with a delegated dataset where the administrator wants to ensure that
 * a user within the zone is not creating too many datasets or snapshots, even
 * though they're not exceeding their space quota.
 *
 * The count of datasets and snapshots is stored in the dsl_dir_phys_t which
 * impacts the on-disk format. As such, this capability is controlled by a
 * feature flag and must be enabled to be used. Once enabled, the feature is
 * not active until the first quota is set. At that point, future operations to
 * create/destroy datasets or snapshots will validate and update the counts.
 *
 * Because the on-disk counts will be incorrect (garbage) before the feature is
 * active, the counts are updated when the quota is first set. Starting at the
 * dataset with the new quota, the code descends into all sub-datasets and
 * updates the counts to be accurate. In practice this is lightweight since
 * a quota is typically set when the dataset is created and thus has no
 * children. Once set, changing the quota value won't require a traversal since
 * the counts are already valid. The counts in datasets above the one with the
 * new quota will still be incorrect, unless a quota is eventually set on one
 * of those datasets. If a dataset with a quota is encountered during the
 * descent, the counts are known to be valid and there is no need to descend
 * into that dataset's children. When a new quota value is set on a dataset
 * with an existing quota, the new value must not be less than the current
 * count at that level or an error is returned and the quota is not changed.
 *
 * Once the feature is active, whenever a dataset or snapshot is created,
 * the code recurses up the tree, validating the new count against the quota
 * at each level. In practice, most levels will not have a quota set. If there
 * is a quota at any level up the tree, the check must pass or the creation
 * will fail. Likewise, when a dataset or snapshot is destroyed, the counts
 * are recursively adjusted all the way up the tree. Renaming a dataset into a
 * different point in the tree will first validate, then update the counts on
 * each branch up to the common ancestor. A receive will also validate the
 * counts and then update them.
 *
 * Recursive snapshots behave a bit differently. The quota is only validated
 * against the top-level dataset at which the snapshot is being taken. This
 * is to prevent a denial-of-service in which a lower-level dataset could
 * max out its quota and thus block snapshots from being taken at a higher
 * level (in addition, the complexity to address this is not worth the cost).
 * Because of this, it is possible for the snapshot count to be over the quota,
 * and snapshots taken at a high level could cause a lower-level dataset to hit
 * or exceed its quota. The administrator taking the high-level recursive
 * snapshot should be aware of this side-effect and behave accordingly.
 *
 * The dataset quota is validated by dsl_dir_dscount_check() and updated by
 * dsl_dir_dscount_adjust(). The snapshot quota is validated by
 * dsl_snapcount_check() and updated by dsl_snapcount_adjust().
 * A new quota value is validated in dsl_dir_validate_ds_ss_quota() and the
 * dataset counts are adjusted, if necessary, by dsl_dir_set_ds_ss_count().
 */
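
/*
 * Informal sketch of the create path (hypothetical pool/a/b hierarchy, not
 * code from this file): creating a dataset under pool/a/b first validates
 * the new count against the quota at each level up the tree, then bumps
 * the counts in syncing context, again walking up via dd_parent:
 *
 *    dsl_dir_dscount_check(b) -> check(a) -> check(pool)
 *    dsl_dir_dscount_adjust(b, +1) -> adjust(a, +1) -> adjust(pool, +1)
 *
 * Destroying a dataset performs the same upward walk with a delta of -1.
 */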

static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd,
    uint64_t value, dmu_tx_t *tx);

extern dsl_syncfunc_t dsl_prop_set_sync;
extern char *tmp_dmu_recv_tag;

/* ARGSUSED */
static void
dsl_dir_evict(dmu_buf_t *db, void *arg)
{
        dsl_dir_t *dd = arg;
        dsl_pool_t *dp = dd->dd_pool;
        int t;

        for (t = 0; t < TXG_SIZE; t++) {
                ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
                ASSERT(dd->dd_tempreserved[t] == 0);
                ASSERT(dd->dd_space_towrite[t] == 0);
        }

        if (dd->dd_parent)
                dsl_dir_close(dd->dd_parent, dd);

        spa_close(dd->dd_pool->dp_spa, dd);

        /*
         * The props callback list should have been cleaned up by
         * objset_evict().
         */
        list_destroy(&dd->dd_prop_cbs);
        mutex_destroy(&dd->dd_lock);
        kmem_free(dd, sizeof (dsl_dir_t));
}

int
dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
    const char *tail, void *tag, dsl_dir_t **ddp)
{
        dmu_buf_t *dbuf;
        dsl_dir_t *dd;
        int err;

        ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
            dsl_pool_sync_context(dp));

        err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
        if (err)
                return (err);
        dd = dmu_buf_get_user(dbuf);
#ifdef ZFS_DEBUG
        {
                dmu_object_info_t doi;
                dmu_object_info_from_db(dbuf, &doi);
                ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
                ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
        }
#endif
        if (dd == NULL) {
                dsl_dir_t *winner;

                dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
                dd->dd_object = ddobj;
                dd->dd_dbuf = dbuf;
                dd->dd_pool = dp;
                dd->dd_phys = dbuf->db_data;
                mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);

                list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
                    offsetof(dsl_prop_cb_record_t, cbr_node));

                dsl_dir_snap_cmtime_update(dd);

                if (dd->dd_phys->dd_parent_obj) {
                        err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
                            NULL, dd, &dd->dd_parent);
                        if (err)
                                goto errout;
                        if (tail) {
#ifdef ZFS_DEBUG
                                uint64_t foundobj;

                                err = zap_lookup(dp->dp_meta_objset,
                                    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
                                    tail, sizeof (foundobj), 1, &foundobj);
                                ASSERT(err || foundobj == ddobj);
#endif
                                (void) strcpy(dd->dd_myname, tail);
                        } else {
                                err = zap_value_search(dp->dp_meta_objset,
                                    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
                                    ddobj, 0, dd->dd_myname);
                        }
                        if (err)
                                goto errout;
                } else {
                        (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
                }

                if (dsl_dir_is_clone(dd)) {
                        dmu_buf_t *origin_bonus;
                        dsl_dataset_phys_t *origin_phys;

                        /*
                         * We can't open the origin dataset, because
                         * that would require opening this dsl_dir.
                         * Just look at its phys directly instead.
                         */
                        err = dmu_bonus_hold(dp->dp_meta_objset,
                            dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
                        if (err)
                                goto errout;
                        origin_phys = origin_bonus->db_data;
                        dd->dd_origin_txg =
                            origin_phys->ds_creation_txg;
                        dmu_buf_rele(origin_bonus, FTAG);
                }

                winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
                    dsl_dir_evict);
                if (winner) {
                        if (dd->dd_parent)
                                dsl_dir_close(dd->dd_parent, dd);
                        mutex_destroy(&dd->dd_lock);
                        kmem_free(dd, sizeof (dsl_dir_t));
                        dd = winner;
                } else {
                        spa_open_ref(dp->dp_spa, dd);
                }
        }

        /*
         * The dsl_dir_t has both open-to-close and instantiate-to-evict
         * holds on the spa.  We need the open-to-close holds because
         * otherwise the spa_refcnt wouldn't change when we open a
         * dir which the spa also has open, so we could incorrectly
         * think it was OK to unload/export/destroy the pool.  We need
         * the instantiate-to-evict hold because the dsl_dir_t has a
         * pointer to the dd_pool, which has a pointer to the spa_t.
         */
        spa_open_ref(dp->dp_spa, tag);
        ASSERT3P(dd->dd_pool, ==, dp);
        ASSERT3U(dd->dd_object, ==, ddobj);
        ASSERT3P(dd->dd_dbuf, ==, dbuf);
        *ddp = dd;
        return (0);

errout:
        if (dd->dd_parent)
                dsl_dir_close(dd->dd_parent, dd);
        mutex_destroy(&dd->dd_lock);
        kmem_free(dd, sizeof (dsl_dir_t));
        dmu_buf_rele(dbuf, tag);
        return (err);
}

void
dsl_dir_close(dsl_dir_t *dd, void *tag)
{
        dprintf_dd(dd, "%s\n", "");
        spa_close(dd->dd_pool->dp_spa, tag);
        dmu_buf_rele(dd->dd_dbuf, tag);
}

/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
void
dsl_dir_name(dsl_dir_t *dd, char *buf)
{
        if (dd->dd_parent) {
                dsl_dir_name(dd->dd_parent, buf);
                (void) strcat(buf, "/");
        } else {
                buf[0] = '\0';
        }
        if (!MUTEX_HELD(&dd->dd_lock)) {
                /*
                 * recursive mutex so that we can use
                 * dprintf_dd() with dd_lock held
                 */
                mutex_enter(&dd->dd_lock);
                (void) strcat(buf, dd->dd_myname);
                mutex_exit(&dd->dd_lock);
        } else {
                (void) strcat(buf, dd->dd_myname);
        }
}

/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
int
dsl_dir_namelen(dsl_dir_t *dd)
{
        int result = 0;

        if (dd->dd_parent) {
                /* parent's name + 1 for the "/" */
                result = dsl_dir_namelen(dd->dd_parent) + 1;
        }

        if (!MUTEX_HELD(&dd->dd_lock)) {
                /* see dsl_dir_name */
                mutex_enter(&dd->dd_lock);
                result += strlen(dd->dd_myname);
                mutex_exit(&dd->dd_lock);
        } else {
                result += strlen(dd->dd_myname);
        }

        return (result);
}

static int
getcomponent(const char *path, char *component, const char **nextp)
{
        char *p;
        if ((path == NULL) || (path[0] == '\0'))
                return (ENOENT);
        /* This would be a good place to reserve some namespace... */
        p = strpbrk(path, "/@");
        if (p && (p[1] == '/' || p[1] == '@')) {
                /* two separators in a row */
                return (EINVAL);
        }
        if (p == NULL || p == path) {
                /*
                 * if the first thing is an @ or /, it had better be an
                 * @ and it had better not have any more ats or slashes,
                 * and it had better have something after the @.
                 */
                if (p != NULL &&
                    (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
                        return (EINVAL);
                if (strlen(path) >= MAXNAMELEN)
                        return (ENAMETOOLONG);
                (void) strcpy(component, path);
                p = NULL;
        } else if (p[0] == '/') {
                if (p-path >= MAXNAMELEN)
                        return (ENAMETOOLONG);
                (void) strncpy(component, path, p - path);
                component[p-path] = '\0';
                p++;
        } else if (p[0] == '@') {
                /*
                 * if the next separator is an @, there better not be
                 * any more slashes.
                 */
                if (strchr(path, '/'))
                        return (EINVAL);
                if (p-path >= MAXNAMELEN)
                        return (ENAMETOOLONG);
                (void) strncpy(component, path, p - path);
                component[p-path] = '\0';
        } else {
                ASSERT(!"invalid p");
        }
        *nextp = p;
        return (0);
}
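
/*
 * Example (informal): getcomponent("pool/fs@snap", buf, &next) copies
 * "pool" into buf and sets next to "fs@snap"; a second call copies "fs"
 * and leaves next pointing at "@snap"; a final call on "@snap" copies the
 * whole string (the caller interprets the leading '@') and sets next to
 * NULL.
 */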

/*
 * Same as dsl_dir_open, but ignore the first component of name and use the
 * spa instead.
 */
int
dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
    dsl_dir_t **ddp, const char **tailp)
{
        char buf[MAXNAMELEN];
        const char *next, *nextnext = NULL;
        int err;
        dsl_dir_t *dd;
        dsl_pool_t *dp;
        uint64_t ddobj;
        int openedspa = FALSE;

        dprintf("%s\n", name);

        err = getcomponent(name, buf, &next);
        if (err)
                return (err);
        if (spa == NULL) {
                err = spa_open(buf, &spa, FTAG);
                if (err) {
                        dprintf("spa_open(%s) failed\n", buf);
                        return (err);
                }
                openedspa = TRUE;

                /* XXX this assertion belongs in spa_open */
                ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
        }

        dp = spa_get_dsl(spa);

        rw_enter(&dp->dp_config_rwlock, RW_READER);
        err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
        if (err) {
                rw_exit(&dp->dp_config_rwlock);
                if (openedspa)
                        spa_close(spa, FTAG);
                return (err);
        }

        while (next != NULL) {
                dsl_dir_t *child_ds;
                err = getcomponent(next, buf, &nextnext);
                if (err)
                        break;
                ASSERT(next[0] != '\0');
                if (next[0] == '@')
                        break;
                dprintf("looking up %s in obj%lld\n",
                    buf, dd->dd_phys->dd_child_dir_zapobj);

                err = zap_lookup(dp->dp_meta_objset,
                    dd->dd_phys->dd_child_dir_zapobj,
                    buf, sizeof (ddobj), 1, &ddobj);
                if (err) {
                        if (err == ENOENT)
                                err = 0;
                        break;
                }

                err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds);
                if (err)
                        break;
                dsl_dir_close(dd, tag);
                dd = child_ds;
                next = nextnext;
        }
        rw_exit(&dp->dp_config_rwlock);

        if (err) {
                dsl_dir_close(dd, tag);
                if (openedspa)
                        spa_close(spa, FTAG);
                return (err);
        }

        /*
         * It's an error if there's more than one component left, or
         * tailp==NULL and there's any component left.
         */
        if (next != NULL &&
            (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
                /* bad path name */
                dsl_dir_close(dd, tag);
                dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
                err = ENOENT;
        }
        if (tailp)
                *tailp = next;
        if (openedspa)
                spa_close(spa, FTAG);
        *ddp = dd;
        return (err);
}

/*
 * Return the dsl_dir_t, and possibly the last component which couldn't
 * be found in *tail.  Return an error if the path is bogus, or if
 * tail==NULL and we couldn't parse the whole name.  (*tail)[0] == '@'
 * means that the last component is a snapshot.
 */
int
dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
{
        return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
}
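
/*
 * Usage sketch (illustrative only; "pool/fs" is a made-up name): a
 * successful open takes a hold that must be released with dsl_dir_close()
 * under the same tag, e.g.:
 *
 *    dsl_dir_t *dd;
 *    const char *tail;
 *
 *    if (dsl_dir_open("pool/fs", FTAG, &dd, &tail) == 0) {
 *            ... use dd; tail is non-NULL if a component was left over ...
 *            dsl_dir_close(dd, FTAG);
 *    }
 */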

/*
 * Check if there is already a dataset/snapshot quota set for the dataset. If
 * not, then the counts on this dataset, and those below, may be incorrect due
 * to the use of a pre-existing pool which did not support the dataset/snapshot
 * quota feature.
 *
 * Recursively descend the dataset tree and update the dataset/snapshot counts
 * on each dataset below, then update the cumulative count on the current
 * dataset. If the dataset already has a quota set on it, then we know that
 * its counts, and the counts on the datasets below it, have been updated to
 * be correct, so we can skip that dataset.
 */
static void
dsl_dir_set_ds_ss_count(const char *nm, dsl_dir_t *dd, dmu_tx_t *tx,
    uint64_t *dscnt, uint64_t *sscnt)
{
        uint64_t my_ds_cnt = 0;
        uint64_t my_ss_cnt = 0;
        objset_t *os = dd->dd_pool->dp_meta_objset;
        zap_cursor_t *zc;
        zap_attribute_t *za;
        char *namebuf;
        int err;
        boolean_t quota_set = B_FALSE;
        uint64_t dsquota, ssquota;
        dsl_dataset_t *ds;

        err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_DATASET_QUOTA),
            8, 1, &dsquota, NULL, B_FALSE);
        if (err == 0 && dsquota != 0)
                quota_set = B_TRUE;

        if (!quota_set) {
                err = dsl_prop_get_dd(dd,
                    zfs_prop_to_name(ZFS_PROP_SNAPSHOT_QUOTA), 8, 1, &ssquota,
                    NULL, B_FALSE);
                if (err == 0 && ssquota != 0)
                        quota_set = B_TRUE;
        }

        /*
         * If the dd has a quota, we know its count is already good and we
         * don't need to recurse down any further.
         */
        if (quota_set) {
                /* Return dataset count plus 1 for self */
                *dscnt = dd->dd_phys->dd_dataset_count + 1;
                *sscnt = dd->dd_phys->dd_snapshot_count;

                return;
        }

        zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
        za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
        namebuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);

        mutex_enter(&dd->dd_lock);

        /* Iterate datasets */
        for (zap_cursor_init(zc, os, dd->dd_phys->dd_child_dir_zapobj);
            zap_cursor_retrieve(zc, za) == 0;
            zap_cursor_advance(zc)) {
                dsl_dir_t *chld_dd;
                uint64_t chld_ds_cnt = 0;
                uint64_t chld_ss_cnt = 0;

                (void) snprintf(namebuf, MAXPATHLEN, "%s/%s", nm, za->za_name);

                if (dsl_dir_open(namebuf, FTAG, &chld_dd, NULL))
                        continue;

                dsl_dir_set_ds_ss_count(namebuf, chld_dd, tx, &chld_ds_cnt,
                    &chld_ss_cnt);

                dsl_dir_close(chld_dd, FTAG);

                my_ds_cnt += chld_ds_cnt;
                my_ss_cnt += chld_ss_cnt;
        }
        zap_cursor_fini(zc);

        kmem_free(namebuf, MAXPATHLEN);

        /* Iterate snapshots */
        if (dsl_dataset_hold(nm, FTAG, &ds) == 0) {
                for (zap_cursor_init(zc, os, ds->ds_phys->ds_snapnames_zapobj);
                    zap_cursor_retrieve(zc, za) == 0;
                    zap_cursor_advance(zc)) {
                        my_ss_cnt++;
                }
                zap_cursor_fini(zc);
                dsl_dataset_rele(ds, FTAG);
        }

        kmem_free(zc, sizeof (zap_cursor_t));
        kmem_free(za, sizeof (zap_attribute_t));

#ifdef _KERNEL
        extern void __dtrace_probe_zfs__ds__fix__count(char *, uint64_t,
            uint64_t);
        __dtrace_probe_zfs__ds__fix__count((char *)nm, my_ds_cnt, my_ss_cnt);
#endif

        /* save updated counts */
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
        dd->dd_phys->dd_dataset_count = my_ds_cnt;
        dd->dd_phys->dd_snapshot_count = my_ss_cnt;

        mutex_exit(&dd->dd_lock);

        /* Return child dataset count plus 1 for self */
        *dscnt = my_ds_cnt + 1;
        *sscnt = my_ss_cnt;
}

/*
 * Return ENOSPC if the new quota is less than the existing count, otherwise
 * return -1 to force the zfs_set_prop_nvlist code down the default path to
 * set the value in the nvlist.
 */
int
dsl_dir_validate_ds_ss_quota(const char *ddname, uint64_t quota,
    zfs_prop_t ptype)
{
        dsl_dir_t *dd;
        dsl_dataset_t *ds;
        int err = -1;
        uint64_t count;
        dmu_tx_t *tx;
        uint64_t my_ds_cnt = 0;
        uint64_t my_ss_cnt = 0;
        spa_t *spa;
        zfeature_info_t *quota_feat =
            &spa_feature_table[SPA_FEATURE_DS_SS_QUOTA];

        if (dsl_dataset_hold(ddname, FTAG, &ds))
                return (EACCES);

        spa = dsl_dataset_get_spa(ds);
        if (!spa_feature_is_enabled(spa,
            &spa_feature_table[SPA_FEATURE_DS_SS_QUOTA])) {
                dsl_dataset_rele(ds, FTAG);
                return (ENOTSUP);
        }

        /* 0 means no quota */
        if (quota == 0) {
                dsl_dataset_rele(ds, FTAG);
                return (-1);
        }

        if (dsl_dir_open(ddname, FTAG, &dd, NULL)) {
                dsl_dataset_rele(ds, FTAG);
                return (EACCES);
        }

        ASSERT(ds->ds_dir == dd);

        tx = dmu_tx_create_dd(dd);
        if (dmu_tx_assign(tx, TXG_WAIT)) {
                dmu_tx_abort(tx);
                /* Drop the holds taken above before bailing out. */
                dsl_dir_close(dd, FTAG);
                dsl_dataset_rele(ds, FTAG);
                return (ENOSPC);
        }

        /* set the feature active flag now */
        if (!spa_feature_is_active(spa, quota_feat))
                spa_feature_incr(spa, quota_feat, tx);

        /*
         * Since we are now setting a non-0 quota on the dataset, we need to
         * ensure the counts are correct. Descend down the tree from this
         * point and update all of the counts to be accurate.
         */
        rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
        dsl_dir_set_ds_ss_count(ddname, dd, tx, &my_ds_cnt, &my_ss_cnt);
        rw_exit(&dd->dd_pool->dp_config_rwlock);

        dmu_tx_commit(tx);

        if (ptype == ZFS_PROP_DATASET_QUOTA)
                count = dd->dd_phys->dd_dataset_count;
        else
                count = dd->dd_phys->dd_snapshot_count;

        if (quota < count)
                err = ENOSPC;

        dsl_dir_close(dd, FTAG);
        dsl_dataset_rele(ds, FTAG);

        return (err);
}

/*
 * Check if adding additional child dataset(s) would exceed any dataset
 * quotas.  Note that all dataset quotas up to the root dataset (i.e. the pool
 * itself) or the given ancestor must be satisfied. When receiving we don't
 * check if the tx is syncing. In this case, the tx is passed as NULL.
 */
int
dsl_dir_dscount_check(dsl_dir_t *dd, dmu_tx_t *tx, uint64_t cnt,
    dsl_dir_t *ancestor)
{
        uint64_t quota;
        int err = 0;

        VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));

        /*
         * As with dsl_dataset_set_reservation_check(), don't run this check in
         * open context.
         */
        if (tx != NULL && !dmu_tx_is_syncing(tx))
                return (0);

        /*
         * If an ancestor has been provided, stop checking the quota once we
         * hit that dir. We need this during rename so that we don't overcount
         * the check once we recurse up to the common ancestor.
         */
        if (ancestor == dd)
                return (0);

        /*
         * If there's no value for this property, there's no need to enforce a
         * dataset quota.
         */
        err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_DATASET_QUOTA),
            8, 1, &quota, NULL, B_FALSE);
        if (err == ENOENT)
                return (0);
        else if (err != 0)
                return (err);

#ifdef _KERNEL
        extern void __dtrace_probe_zfs__ds__quota(uint64_t, uint64_t, char *);
        __dtrace_probe_zfs__ds__quota((uint64_t)dd->dd_phys->dd_dataset_count,
            (uint64_t)quota, dd->dd_myname);
#endif

        if (quota > 0 && (dd->dd_phys->dd_dataset_count + cnt) > quota)
                return (EDQUOT);

        if (dd->dd_parent != NULL)
                err = dsl_dir_dscount_check(dd->dd_parent, tx, cnt, ancestor);

        return (err);
}

/*
 * Adjust the dataset count for the specified dsl_dir_t and all parent
 * datasets. When a new dataset is created, increment the count on all
 * parents, and when a dataset is destroyed, decrement the count.
 */
void
dsl_dir_dscount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
    boolean_t syncing, boolean_t first)
{
        /*
         * On initial entry we need to check if this feature is active, but
         * we don't want to re-check this on each recursive call. Note: the
         * feature cannot be active if it's not enabled. If the feature is not
         * active, don't touch the on-disk count fields.
         */
        if (first) {
                dsl_dataset_t *ds = NULL;
                spa_t *spa;
                zfeature_info_t *quota_feat =
                    &spa_feature_table[SPA_FEATURE_DS_SS_QUOTA];

                VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
                    dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
                spa = dsl_dataset_get_spa(ds);
                dsl_dataset_rele(ds, FTAG);
                if (!spa_feature_is_active(spa, quota_feat))
                        return;
        }

        VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
        if (syncing)
                VERIFY(dmu_tx_is_syncing(tx));

        dmu_buf_will_dirty(dd->dd_dbuf, tx);

        mutex_enter(&dd->dd_lock);

        /*
         * Counts may be incorrect if dealing with an existing pool and
         * there has never been a quota set in the dataset hierarchy.
         * This is not an error.
         */
        if (delta < 0 && dd->dd_phys->dd_dataset_count < (delta * -1)) {
#ifdef _KERNEL
                extern void __dtrace_probe_zfs__dscnt__adj__neg(char *);
                __dtrace_probe_zfs__dscnt__adj__neg(dd->dd_myname);
#endif
                mutex_exit(&dd->dd_lock);
                return;
        }

        dd->dd_phys->dd_dataset_count += delta;

        if (dd->dd_parent != NULL)
                dsl_dir_dscount_adjust(dd->dd_parent, tx, delta, syncing,
                    B_FALSE);

        mutex_exit(&dd->dd_lock);
}

uint64_t
dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
    dmu_tx_t *tx)
{
        objset_t *mos = dp->dp_meta_objset;
        uint64_t ddobj;
        dsl_dir_phys_t *ddphys;
        dmu_buf_t *dbuf;

        ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
            DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
        if (pds) {
                VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
                    name, sizeof (uint64_t), 1, &ddobj, tx));
        } else {
                /* it's the root dir */
                VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
        }
        VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
        dmu_buf_will_dirty(dbuf, tx);
        ddphys = dbuf->db_data;

        ddphys->dd_creation_time = gethrestime_sec();
        if (pds)
                ddphys->dd_parent_obj = pds->dd_object;
        ddphys->dd_props_zapobj = zap_create(mos,
            DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
        ddphys->dd_child_dir_zapobj = zap_create(mos,
            DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
        if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
                ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
        dmu_buf_rele(dbuf, FTAG);

        return (ddobj);
}

/* ARGSUSED */
int
dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dir_t *dd = arg1;
        dsl_pool_t *dp = dd->dd_pool;
        objset_t *mos = dp->dp_meta_objset;
        int err;
        uint64_t count;

        /*
         * There should be exactly two holds, both from
         * dsl_dataset_destroy: one on the dd directory, and one on its
         * head ds.  If there are more holds, then a concurrent thread is
         * performing a lookup inside this dir while we're trying to destroy
         * it.  To minimize this possibility, we perform this check only
         * in syncing context and fail the operation if we encounter
         * additional holds.  The dp_config_rwlock ensures that nobody else
         * opens it after we check.
         */
        if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 2)
                return (EBUSY);

        err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count);
        if (err)
                return (err);
        if (count != 0)
                return (EEXIST);

        return (0);
}

void
dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
        dsl_dir_t *dd = arg1;
        objset_t *mos = dd->dd_pool->dp_meta_objset;
        uint64_t obj;
        dd_used_t t;

        ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
        ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

        /*
         * Decrement the dataset count for all parent datasets.
         *
         * We have to worry about a special case where we are receiving a
         * dataset that already exists. In this case a temporary clone name
         * of %X is created (see dmu_recv_begin). In dmu_recv_existing_end we
         * destroy this temporary clone which leads to here. We don't want to
         * decrement the dataset counters in this case, since we never
         * incremented them. To detect this case we check the tag for
         * "tmp_dmu_recv_tag" to see if we're in that code path.
         */
        if (dd->dd_parent != NULL && strcmp(tag, tmp_dmu_recv_tag) != 0)
                dsl_dir_dscount_adjust(dd->dd_parent, tx, -1, B_TRUE, B_TRUE);

        /*
         * Remove our reservation. The impl() routine avoids setting the
         * actual property, which would require the (already destroyed) ds.
         */
        dsl_dir_set_reservation_sync_impl(dd, 0, tx);

        ASSERT0(dd->dd_phys->dd_used_bytes);
        ASSERT0(dd->dd_phys->dd_reserved);
        for (t = 0; t < DD_USED_NUM; t++)
                ASSERT0(dd->dd_phys->dd_used_breakdown[t]);

        VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
        VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
        VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
        VERIFY(0 == zap_remove(mos,
            dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));

        obj = dd->dd_object;
        dsl_dir_close(dd, tag);
        VERIFY(0 == dmu_object_free(mos, obj, tx));
}

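/*
 * A dir is a clone iff it has an origin, except when that origin is the
 * pool's internal $ORIGIN snapshot (dp_origin_snap), which non-clone
 * datasets descend from on pool versions that support it.
 */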
boolean_t
dsl_dir_is_clone(dsl_dir_t *dd)
{
        return (dd->dd_phys->dd_origin_obj &&
            (dd->dd_pool->dp_origin_snap == NULL ||
            dd->dd_phys->dd_origin_obj !=
            dd->dd_pool->dp_origin_snap->ds_object));
}

void
dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
{
        mutex_enter(&dd->dd_lock);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
            dd->dd_phys->dd_used_bytes);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
            dd->dd_phys->dd_reserved);
        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
            dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
            (dd->dd_phys->dd_uncompressed_bytes * 100 /
            dd->dd_phys->dd_compressed_bytes));
        if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
                    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
                dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
                    dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
                dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
                    dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
                dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
                    dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
                    dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
        }
        mutex_exit(&dd->dd_lock);

        rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
        if (dsl_dir_is_clone(dd)) {
                dsl_dataset_t *ds;
                char buf[MAXNAMELEN];

                VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
                    dd->dd_phys->dd_origin_obj, FTAG, &ds));
                dsl_dataset_name(ds, buf);
                dsl_dataset_rele(ds, FTAG);
                dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
        }
        rw_exit(&dd->dd_pool->dp_config_rwlock);
}

void
dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
{
        dsl_pool_t *dp = dd->dd_pool;

        ASSERT(dd->dd_phys);

        if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) {
                /* up the hold count until we can be written out */
                dmu_buf_add_ref(dd->dd_dbuf, dd);
        }
}

static int64_t
parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
{
        uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
        uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
        return (new_accounted - old_accounted);
}
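
/*
 * Worked example (illustrative numbers): with dd_reserved = 15 and
 * used = 10, a delta of +3 yields MAX(13, 15) - MAX(10, 15) = 0, since the
 * growth is still covered by the reservation; a delta of +8 yields
 * MAX(18, 15) - MAX(10, 15) = 3, the amount the parent must newly account.
 */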

void
dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
{
        ASSERT(dmu_tx_is_syncing(tx));

        mutex_enter(&dd->dd_lock);
        ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
        dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
            dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
        dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
        mutex_exit(&dd->dd_lock);

        /* release the hold from dsl_dir_dirty */
        dmu_buf_rele(dd->dd_dbuf, dd);
}

static uint64_t
dsl_dir_space_towrite(dsl_dir_t *dd)
{
        uint64_t space = 0;
        int i;

        ASSERT(MUTEX_HELD(&dd->dd_lock));

        for (i = 0; i < TXG_SIZE; i++) {
                space += dd->dd_space_towrite[i&TXG_MASK];
                ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
        }
        return (space);
}

/*
 * How much space would dd have available if ancestor had delta applied
 * to it?  If ondiskonly is set, we're only interested in what's
 * on-disk, not estimated pending changes.
 */
uint64_t
dsl_dir_space_available(dsl_dir_t *dd,
    dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
{
        uint64_t parentspace, myspace, quota, used;

        /*
         * If there are no restrictions otherwise, assume we have
         * unlimited space available.
         */
        quota = UINT64_MAX;
        parentspace = UINT64_MAX;

        if (dd->dd_parent != NULL) {
                parentspace = dsl_dir_space_available(dd->dd_parent,
                    ancestor, delta, ondiskonly);
        }

        mutex_enter(&dd->dd_lock);
        if (dd->dd_phys->dd_quota != 0)
                quota = dd->dd_phys->dd_quota;
        used = dd->dd_phys->dd_used_bytes;
        if (!ondiskonly)
                used += dsl_dir_space_towrite(dd);

        if (dd->dd_parent == NULL) {
                uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
                quota = MIN(quota, poolsize);
        }

        if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
                /*
                 * We have some space reserved, in addition to what our
                 * parent gave us.
                 */
                parentspace += dd->dd_phys->dd_reserved - used;
        }

        if (dd == ancestor) {
                ASSERT(delta <= 0);
                ASSERT(used >= -delta);
                used += delta;
                if (parentspace != UINT64_MAX)
                        parentspace -= delta;
        }

        if (used > quota) {
                /* over quota */
                myspace = 0;
        } else {
                /*
                 * the lesser of the space provided by our parent and
                 * the space left in our quota
                 */
                myspace = MIN(parentspace, quota - used);
        }

        mutex_exit(&dd->dd_lock);

        return (myspace);
}
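
/*
 * Worked example (illustrative numbers): with dd_quota = 100, used = 40
 * and parentspace = 50, myspace = MIN(50, 100 - 40) = 50; the parent is
 * the binding constraint even though this dir's own quota would allow 60.
 */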

struct tempreserve {
        list_node_t tr_node;
        dsl_pool_t *tr_dp;
        dsl_dir_t *tr_ds;
        uint64_t tr_size;
};
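
/*
 * Each tempreserve list entry records where a reservation was charged:
 * tr_dp set means a pool-level reservation, tr_ds set means a per-dir
 * reservation (charged against dd_tempreserved[txg]), and neither set
 * means an ARC reservation.  dsl_dir_tempreserve_clear() undoes each
 * flavor accordingly.
 */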

static int
dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
    boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
    dmu_tx_t *tx, boolean_t first)
{
        uint64_t txg = tx->tx_txg;
        uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
        uint64_t deferred = 0;
        struct tempreserve *tr;
        int retval = EDQUOT;
        int txgidx = txg & TXG_MASK;
        int i;
        uint64_t ref_rsrv = 0;

        ASSERT3U(txg, !=, 0);
        ASSERT3S(asize, >, 0);

        mutex_enter(&dd->dd_lock);

        /*
         * Check against the dsl_dir's quota.  We don't add in the delta
         * when checking for over-quota because they get one free hit.
         */
        est_inflight = dsl_dir_space_towrite(dd);
        for (i = 0; i < TXG_SIZE; i++)
                est_inflight += dd->dd_tempreserved[i];
        used_on_disk = dd->dd_phys->dd_used_bytes;

        /*
         * On the first iteration, fetch the dataset's used-on-disk and
         * refreservation values. Also, if checkrefquota is set, test if
         * allocating this space would exceed the dataset's refquota.
         */
        if (first && tx->tx_objset) {
                int error;
                dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;

                error = dsl_dataset_check_quota(ds, checkrefquota,
                    asize, est_inflight, &used_on_disk, &ref_rsrv);
                if (error) {
                        mutex_exit(&dd->dd_lock);
                        return (error);
                }
        }

        /*
         * If this transaction will result in a net free of space,
         * we want to let it through.
         */
        if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
                quota = UINT64_MAX;
        else
                quota = dd->dd_phys->dd_quota;

        /*
         * Adjust the quota against the actual pool size at the root
         * minus any outstanding deferred frees.
         * To ensure that it's possible to remove files from a full
         * pool without inducing transient overcommits, we throttle
         * netfree transactions against a quota that is slightly larger,
         * but still within the pool's allocation slop.  In cases where
         * we're very close to full, this will allow a steady trickle of
         * removes to get through.
         */
        if (dd->dd_parent == NULL) {
                spa_t *spa = dd->dd_pool->dp_spa;
                uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
                deferred = metaslab_class_get_deferred(spa_normal_class(spa));
                if (poolsize - deferred < quota) {
                        quota = poolsize - deferred;
                        retval = ENOSPC;
                }
        }

        /*
         * If they are requesting more space, and our current estimate
         * is over quota, they get to try again unless the actual
         * on-disk is over quota and there are no pending changes (which
         * may free up space for us).
         */
        if (used_on_disk + est_inflight >= quota) {
                if (est_inflight > 0 || used_on_disk < quota ||
                    (retval == ENOSPC && used_on_disk < quota + deferred))
                        retval = ERESTART;
                dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
                    "quota=%lluK tr=%lluK err=%d\n",
                    used_on_disk>>10, est_inflight>>10,
                    quota>>10, asize>>10, retval);
                mutex_exit(&dd->dd_lock);
                return (retval);
        }

        /* We need to up our estimated delta before dropping dd_lock */
        dd->dd_tempreserved[txgidx] += asize;

        parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
            asize - ref_rsrv);
        mutex_exit(&dd->dd_lock);

        tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
        tr->tr_ds = dd;
        tr->tr_size = asize;
        list_insert_tail(tr_list, tr);

        /* see if it's OK with our parent */
        if (dd->dd_parent && parent_rsrv) {
                boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);

                return (dsl_dir_tempreserve_impl(dd->dd_parent,
                    parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
        } else {
                return (0);
        }
}

/*
 * Reserve space in this dsl_dir, to be used in this tx's txg.
 * After the space has been dirtied (and dsl_dir_willuse_space()
 * has been called), the reservation should be canceled, using
 * dsl_dir_tempreserve_clear().
 */
int
dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
    uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
{
        int err;
        list_t *tr_list;

        if (asize == 0) {
                *tr_cookiep = NULL;
                return (0);
        }

        tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
        list_create(tr_list, sizeof (struct tempreserve),
            offsetof(struct tempreserve, tr_node));
        ASSERT3S(asize, >, 0);
        ASSERT3S(fsize, >=, 0);

        err = arc_tempreserve_space(lsize, tx->tx_txg);
        if (err == 0) {
                struct tempreserve *tr;

                tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
                tr->tr_size = lsize;
                list_insert_tail(tr_list, tr);

                err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
        } else {
                if (err == EAGAIN) {
                        txg_delay(dd->dd_pool, tx->tx_txg,
                            zfs_zone_txg_delay());
                        err = ERESTART;
                }
                dsl_pool_memory_pressure(dd->dd_pool);
        }

        if (err == 0) {
                struct tempreserve *tr;

                tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
                tr->tr_dp = dd->dd_pool;
                tr->tr_size = asize;
                list_insert_tail(tr_list, tr);

                err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
                    FALSE, asize > usize, tr_list, tx, TRUE);
        }

        if (err)
                dsl_dir_tempreserve_clear(tr_list, tx);
        else
                *tr_cookiep = tr_list;

        return (err);
}
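
/*
 * Sketch of the reserve/clear pairing (illustrative only; tr_cookie is
 * opaque to callers):
 *
 *    void *tr_cookie;
 *
 *    err = dsl_dir_tempreserve_space(dd, lsize, asize, fsize, usize,
 *        &tr_cookie, tx);
 *    if (err == 0) {
 *            ... dirty the space ...
 *            dsl_dir_tempreserve_clear(tr_cookie, tx);
 *    }
 */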

/*
 * Clear a temporary reservation that we previously made with
 * dsl_dir_tempreserve_space().
 */
void
dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
{
        int txgidx = tx->tx_txg & TXG_MASK;
        list_t *tr_list = tr_cookie;
        struct tempreserve *tr;

        ASSERT3U(tx->tx_txg, !=, 0);

        if (tr_cookie == NULL)
                return;

        while (tr = list_head(tr_list)) {
                if (tr->tr_dp) {
                        dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
                } else if (tr->tr_ds) {
                        mutex_enter(&tr->tr_ds->dd_lock);
                        ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
                            tr->tr_size);
                        tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
                        mutex_exit(&tr->tr_ds->dd_lock);
                } else {
                        arc_tempreserve_clear(tr->tr_size);
                }
                list_remove(tr_list, tr);
                kmem_free(tr, sizeof (struct tempreserve));
        }

        kmem_free(tr_list, sizeof (list_t));
}

static void
dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
{
        int64_t parent_space;
        uint64_t est_used;

        mutex_enter(&dd->dd_lock);
        if (space > 0)
                dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;

        est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
        parent_space = parent_delta(dd, est_used, space);
        mutex_exit(&dd->dd_lock);

        /* Make sure that we clean up dd_space_to* */
        dsl_dir_dirty(dd, tx);

        /* XXX this is potentially expensive and unnecessary... */
        if (parent_space && dd->dd_parent)
                dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
}

/*
 * Call in open context when we think we're going to write/free space,
 * eg. when dirtying data.  Be conservative (ie. OK to write less than
 * this or free more than this, but don't write more or free less).
 */
void
dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
{
        dsl_pool_willuse_space(dd->dd_pool, space, tx);
        dsl_dir_willuse_space_impl(dd, space, tx);
}

/* call from syncing context when we actually write/free space for this dd */
void
dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
{
        int64_t accounted_delta;
        boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);

        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(type < DD_USED_NUM);

        if (needlock)
                mutex_enter(&dd->dd_lock);
        accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
        ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
        ASSERT(compressed >= 0 ||
            dd->dd_phys->dd_compressed_bytes >= -compressed);
        ASSERT(uncompressed >= 0 ||
            dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
        dd->dd_phys->dd_used_bytes += used;
        dd->dd_phys->dd_uncompressed_bytes += uncompressed;
        dd->dd_phys->dd_compressed_bytes += compressed;

        if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                ASSERT(used > 0 ||
                    dd->dd_phys->dd_used_breakdown[type] >= -used);
                dd->dd_phys->dd_used_breakdown[type] += used;
#ifdef DEBUG
                dd_used_t t;
                uint64_t u = 0;
                for (t = 0; t < DD_USED_NUM; t++)
                        u += dd->dd_phys->dd_used_breakdown[t];
                ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
#endif
        }
        if (needlock)
                mutex_exit(&dd->dd_lock);

        if (dd->dd_parent != NULL) {
                dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
                    accounted_delta, compressed, uncompressed, tx);
                dsl_dir_transfer_space(dd->dd_parent,
                    used - accounted_delta,
                    DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
        }
}

void
dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
{
        boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);

        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(oldtype < DD_USED_NUM);
        ASSERT(newtype < DD_USED_NUM);

        if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
                return;

        if (needlock)
                mutex_enter(&dd->dd_lock);
        ASSERT(delta > 0 ?
            dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
            dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
        ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
        dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
        dd->dd_phys->dd_used_breakdown[newtype] += delta;
        if (needlock)
                mutex_exit(&dd->dd_lock);
}
1391 
1392 static int
1393 dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
1394 {
1395         dsl_dataset_t *ds = arg1;
1396         dsl_dir_t *dd = ds->ds_dir;
1397         dsl_prop_setarg_t *psa = arg2;
1398         int err;
1399         uint64_t towrite;
1400 
1401         if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
1402                 return (err);
1403 
1404         if (psa->psa_effective_value == 0)
1405                 return (0);
1406 
1407         mutex_enter(&dd->dd_lock);
	/*
	 * If we are doing the preliminary check in open context and
	 * there are pending changes, don't fail the check: those pending
	 * changes may free more space than our estimate accounts for.
	 * The check is repeated, authoritatively, in syncing context.
	 */
	towrite = dsl_dir_space_towrite(dd);
	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
	    (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
	    psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
		err = ENOSPC;
	}
	mutex_exit(&dd->dd_lock);
	return (err);
}

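/*
 * Sync callback for the quota-setting task: persist the property value,
 * then cache the effective quota in dd_phys under dd_lock.
 */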
static void
dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_dir_t *dd = ds->ds_dir;
	dsl_prop_setarg_t *psa = arg2;
	uint64_t effective_value = psa->psa_effective_value;

	dsl_prop_set_sync(ds, psa, tx);
	DSL_PROP_CHECK_PREDICTION(dd, psa);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	mutex_enter(&dd->dd_lock);
	dd->dd_phys->dd_quota = effective_value;
	mutex_exit(&dd->dd_lock);
}

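/*
 * Open-context entry point for setting the quota property.  A
 * hypothetical caller (in practice this is reached via the zfs(1M)
 * property ioctl path) would look like:
 *
 *	err = dsl_dir_set_quota("tank/home/user", ZPROP_SRC_LOCAL,
 *	    10ULL << 30);
 *
 * which sets a 10 GiB quota on "tank/home/user".
 */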
int
dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
{
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	dsl_prop_setarg_t psa;
	int err;

	dsl_prop_setarg_init_uint64(&psa, "quota", source, &quota);

	err = dsl_dataset_hold(ddname, FTAG, &ds);
	if (err)
		return (err);

	err = dsl_dir_open(ddname, FTAG, &dd, NULL);
	if (err) {
		dsl_dataset_rele(ds, FTAG);
		return (err);
	}

	ASSERT(ds->ds_dir == dd);

	/*
	 * If someone removes a file and then immediately sets the quota,
	 * we want the space freed by that removal to be reflected before
	 * the quota check runs; waiting for the currently open txg
	 * accomplishes that.
	 */
	txg_wait_open(dd->dd_pool, 0);

	err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
	    dsl_dir_set_quota_sync, ds, &psa, 0);

	dsl_dir_close(dd, FTAG);
	dsl_dataset_rele(ds, FTAG);
	return (err);
}

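/*
 * Check callback for the reservation-setting sync task.  Raising the
 * effective reservation above what is already used consumes new space
 * from our parent, so fail with ENOSPC if that delta exceeds the space
 * available there, or if the new reservation exceeds our own quota.
 */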
int
dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_dir_t *dd = ds->ds_dir;
	dsl_prop_setarg_t *psa = arg2;
	uint64_t effective_value;
	uint64_t used, avail;
	int err;

	if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
		return (err);

	effective_value = psa->psa_effective_value;

	/*
	 * If we are doing the preliminary check in open context, the
	 * space estimates may be inaccurate.
	 */
	if (!dmu_tx_is_syncing(tx))
		return (0);

	mutex_enter(&dd->dd_lock);
	used = dd->dd_phys->dd_used_bytes;
	mutex_exit(&dd->dd_lock);

	if (dd->dd_parent) {
		avail = dsl_dir_space_available(dd->dd_parent,
		    NULL, 0, FALSE);
	} else {
		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
	}

	if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) {
		uint64_t delta = MAX(used, effective_value) -
		    MAX(used, dd->dd_phys->dd_reserved);

		if (delta > avail)
			return (ENOSPC);
		if (dd->dd_phys->dd_quota > 0 &&
		    effective_value > dd->dd_phys->dd_quota)
			return (ENOSPC);
	}

	return (0);
}

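/*
 * Apply a new reservation value and roll the resulting change in our
 * charge to the parent, MAX(used, reserved), up into the parent's
 * DD_USED_CHILD_RSRV accounting.
 */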
static void
dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
{
	uint64_t used;
	int64_t delta;

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	mutex_enter(&dd->dd_lock);
	used = dd->dd_phys->dd_used_bytes;
	delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved);
	dd->dd_phys->dd_reserved = value;

	if (dd->dd_parent != NULL) {
		/* Roll up this additional usage into our ancestors */
		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
		    delta, 0, 0, tx);
	}
	mutex_exit(&dd->dd_lock);
}
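/*
 * Sync callback for the reservation-setting task: persist the property,
 * then apply the effective value via the helper above.
 */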
static void
dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_dir_t *dd = ds->ds_dir;
	dsl_prop_setarg_t *psa = arg2;
	uint64_t value = psa->psa_effective_value;

	dsl_prop_set_sync(ds, psa, tx);
	DSL_PROP_CHECK_PREDICTION(dd, psa);

	dsl_dir_set_reservation_sync_impl(dd, value, tx);
}

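/*
 * Open-context entry point for setting the reservation property.  The
 * shape mirrors dsl_dir_set_quota() above; note that no txg wait is
 * needed here, since the check depends only on space availability,
 * which syncing context re-evaluates with accurate numbers.
 */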
int
dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
    uint64_t reservation)
{
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	dsl_prop_setarg_t psa;
	int err;

	dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation);

	err = dsl_dataset_hold(ddname, FTAG, &ds);
	if (err)
		return (err);

	err = dsl_dir_open(ddname, FTAG, &dd, NULL);
	if (err) {
		dsl_dataset_rele(ds, FTAG);
		return (err);
	}

	ASSERT(ds->ds_dir == dd);

	err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
	    dsl_dir_set_reservation_sync, ds, &psa, 0);

	dsl_dir_close(dd, FTAG);
	dsl_dataset_rele(ds, FTAG);
	return (err);
}

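/*
 * Find the deepest dsl_dir that is an ancestor of (or equal to) both
 * ds1 and ds2, walking each of ds1's ancestors and scanning ds2's chain
 * for a match; e.g., for the directories of tank/a/b and tank/a/c this
 * returns tank/a's dir.  Within a single pool this always finds at
 * least the pool's root dir; NULL is only possible if the two are in
 * different pools.
 */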
static dsl_dir_t *
closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
{
	for (; ds1; ds1 = ds1->dd_parent) {
		dsl_dir_t *dd;
		for (dd = ds2; dd; dd = dd->dd_parent) {
			if (ds1 == dd)
				return (dd);
		}
	}
	return (NULL);
}

/*
 * If delta is applied to dd, how much of that delta would be applied to
 * ancestor?  Syncing context only.
 */
static int64_t
would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
{
	if (dd == ancestor)
		return (delta);

	mutex_enter(&dd->dd_lock);
	delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
	mutex_exit(&dd->dd_lock);
	return (would_change(dd->dd_parent, delta, ancestor));
}

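/* Arguments passed from dsl_dir_rename() to its check/sync callbacks. */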
struct renamearg {
	dsl_dir_t *newparent;
	const char *mynewname;
};

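/*
 * Check callback for the rename sync task: verify that we hold the only
 * long-lived reference, that the new name does not already exist in the
 * new parent's child ZAP, that we are not being moved into our own
 * descendant, and that the new parent can absorb our space and our
 * dataset/snapshot counts.
 */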
static int
dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct renamearg *ra = arg2;
	dsl_pool_t *dp = dd->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	int err;
	uint64_t val;

	/*
	 * There should only be one reference, from dmu_objset_rename().
	 * Fleeting holds are also possible (e.g., from "zfs list"
	 * getting stats), but any that are present in open context will
	 * likely be gone by syncing context, so only fail from syncing
	 * context.
	 */
	if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1)
		return (EBUSY);

	/* check for existing name */
	err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
	    ra->mynewname, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	if (ra->newparent != dd->dd_parent) {
		/* is there enough space? */
		uint64_t myspace =
		    MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);

		/* no rename into our descendant */
		if (closest_common_ancestor(dd, ra->newparent) == dd)
			return (EINVAL);

		if ((err = dsl_dir_transfer_possible(dd->dd_parent,
		    ra->newparent, dd, myspace, tx)) != 0)
			return (err);
	}

	return (0);
}

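/*
 * Sync callback for the rename: log the change and, when moving to a
 * new parent, transfer our dataset/snapshot counts, our space usage,
 * and any unused portion of our reservation from the old parent chain
 * to the new one before rewiring the child ZAP entries and the
 * dd_parent pointer.
 */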
static void
dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct renamearg *ra = arg2;
	dsl_pool_t *dp = dd->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	int err;
	char namebuf[MAXNAMELEN];

	ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);

	/* Log this before we change the name. */
	dsl_dir_name(ra->newparent, namebuf);
	spa_history_log_internal_dd(dd, "rename", tx,
	    "-> %s/%s", namebuf, ra->mynewname);

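	/*
	 * Moving to a new parent: shift the dataset/snapshot counts and
	 * the space accounting from the old parent chain to the new
	 * one.  The "+ 1" accounts for this directory itself.
	 */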
	if (ra->newparent != dd->dd_parent) {
		int cnt;

		mutex_enter(&dd->dd_lock);

		cnt = dd->dd_phys->dd_dataset_count + 1;
		dsl_dir_dscount_adjust(dd->dd_parent, tx, -cnt, B_TRUE, B_TRUE);
		dsl_dir_dscount_adjust(ra->newparent, tx, cnt, B_TRUE, B_TRUE);

		cnt = dd->dd_phys->dd_snapshot_count;
		dsl_snapcount_adjust(dd->dd_parent, tx, -cnt, B_TRUE);
		dsl_snapcount_adjust(ra->newparent, tx, cnt, B_TRUE);

		mutex_exit(&dd->dd_lock);

		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
		    -dd->dd_phys->dd_used_bytes,
		    -dd->dd_phys->dd_compressed_bytes,
		    -dd->dd_phys->dd_uncompressed_bytes, tx);
		dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD,
		    dd->dd_phys->dd_used_bytes,
		    dd->dd_phys->dd_compressed_bytes,
		    dd->dd_phys->dd_uncompressed_bytes, tx);

		if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
			uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
			    dd->dd_phys->dd_used_bytes;

			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
			    -unused_rsrv, 0, 0, tx);
			dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV,
			    unused_rsrv, 0, 0, tx);
		}
	}

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	/* remove from old parent zapobj */
	err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
	    dd->dd_myname, tx);
	ASSERT0(err);

	(void) strcpy(dd->dd_myname, ra->mynewname);
	dsl_dir_close(dd->dd_parent, dd);
	dd->dd_phys->dd_parent_obj = ra->newparent->dd_object;
	VERIFY(0 == dsl_dir_open_obj(dd->dd_pool,
	    ra->newparent->dd_object, NULL, dd, &dd->dd_parent));

	/* add to new parent zapobj */
	err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
	    dd->dd_myname, 8, 1, &dd->dd_object, tx);
	ASSERT0(err);
}

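/*
 * Rename and/or reparent dd to "newname".  Cross-pool renames fail with
 * ENXIO.  A NULL tail from dsl_dir_open() means "newname" resolved to
 * an existing directory, so that case fails with EEXIST.
 */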
int
dsl_dir_rename(dsl_dir_t *dd, const char *newname)
{
	struct renamearg ra;
	int err;

	/* new parent should exist */
	err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname);
	if (err)
		return (err);

	/* can't rename to different pool */
	if (dd->dd_pool != ra.newparent->dd_pool) {
		err = ENXIO;
		goto out;
	}

	/* new name should not already exist */
	if (ra.mynewname == NULL) {
		err = EEXIST;
		goto out;
	}

	err = dsl_sync_task_do(dd->dd_pool,
	    dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);

out:
	dsl_dir_close(ra.newparent, FTAG);
	return (err);
}

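/*
 * Determine whether "space" bytes (along with moving_dd's dataset and
 * snapshot counts) can move out of sdd's tree and into tdd's tree.  The
 * space effect is evaluated only up to the closest common ancestor,
 * since accounting above that point is unchanged by the move.
 */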
int
dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, dsl_dir_t *moving_dd,
    uint64_t space, dmu_tx_t *tx)
{
	dsl_dir_t *ancestor;
	int64_t adelta;
	uint64_t avail;
	int err;

	ancestor = closest_common_ancestor(sdd, tdd);
	adelta = would_change(sdd, -space, ancestor);
	avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
	if (avail < space)
		return (ENOSPC);

	if (sdd != moving_dd) {
		err = dsl_dir_dscount_check(tdd, tx,
		    moving_dd->dd_phys->dd_dataset_count + 1, ancestor);
		if (err != 0)
			return (err);
	}
	err = dsl_snapcount_check(tdd, tx,
	    moving_dd->dd_phys->dd_snapshot_count, ancestor);
	if (err != 0)
		return (err);

	return (0);
}

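/*
 * Return the time at which this directory's snapshot namespace last
 * changed (used, e.g., for the timestamps of the .zfs/snapshot control
 * directory).
 */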
timestruc_t
dsl_dir_snap_cmtime(dsl_dir_t *dd)
{
	timestruc_t t;

	mutex_enter(&dd->dd_lock);
	t = dd->dd_snap_cmtime;
	mutex_exit(&dd->dd_lock);

	return (t);
}

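/* Record the current time as the last snapshot-namespace change. */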
void
dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
{
	timestruc_t t;

	gethrestime(&t);
	mutex_enter(&dd->dd_lock);
	dd->dd_snap_cmtime = t;
	mutex_exit(&dd->dd_lock);
}