/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/unique.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_onexit.h>
#include <sys/zvol.h>
#include <sys/dsl_scan.h>
#include <sys/dsl_deadlist.h>
#include "zfs_prop.h"

static char *dsl_reaper = "the grim reaper";

static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;

#define	SWITCH64(x, y) \
{ \
	uint64_t __tmp = (x); \
	(x) = (y); \
	(y) = __tmp; \
}
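
/*
 * Illustrative use of SWITCH64() (a sketch): given uint64_t a = 1, b = 2,
 * SWITCH64(a, b) leaves a == 2 and b == 1. It is used below to swap the
 * deadlist object numbers of two datasets in place; see
 * process_old_deadlist().
 */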

#define	DS_REF_MAX	(1ULL << 62)

#define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE

#define	DSL_DATASET_IS_DESTROYED(ds)	((ds)->ds_owner == dsl_reaper)

/*
 * Figure out how much of this delta should be propagated to the dsl_dir
 * layer. If there's a refreservation, that space has already been
 * partially accounted for in our ancestors.
 */
static int64_t
parent_delta(dsl_dataset_t *ds, int64_t delta)
{
	uint64_t old_bytes, new_bytes;

	if (ds->ds_reserved == 0)
		return (delta);

	old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
	new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);

	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
	return (new_bytes - old_bytes);
}
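
/*
 * Worked example for parent_delta() (illustrative numbers): with
 * ds_reserved == 100, ds_unique_bytes == 80 and delta == +30,
 * old_bytes == MAX(80, 100) == 100 and new_bytes == MAX(110, 100) == 110,
 * so only 10 of the 30 bytes are charged to the dsl_dir layer; the
 * other 20 bytes were already accounted for by the refreservation.
 */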

void
dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);
	int64_t delta;

	dprintf_bp(bp, "ds=%p", ds);

	ASSERT(dmu_tx_is_syncing(tx));
	/* It could have been compressed away to nothing */
	if (BP_IS_HOLE(bp))
		return;
	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
	if (ds == NULL) {
		dsl_pool_mos_diduse_space(tx->tx_pool,
		    used, compressed, uncompressed);
		return;
	}
	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	mutex_enter(&ds->ds_dir->dd_lock);
	mutex_enter(&ds->ds_lock);
	delta = parent_delta(ds, used);
	ds->ds_phys->ds_referenced_bytes += used;
	ds->ds_phys->ds_compressed_bytes += compressed;
	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
	ds->ds_phys->ds_unique_bytes += used;
	mutex_exit(&ds->ds_lock);
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
	    compressed, uncompressed, tx);
	dsl_dir_transfer_space(ds->ds_dir, used - delta,
	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
	mutex_exit(&ds->ds_dir->dd_lock);
}

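/*
 * Free the space used by the given bp: either free the block outright
 * (if it was born after the most recent snapshot) or put it on this
 * dataset's deadlist, and update the space accounting. Returns the
 * number of bytes of on-disk ("dsize") space affected. If "async" is
 * set we are running in zio interrupt context and defer the deadlist
 * insertion to dsl_pool_sync().
 */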
int
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
    boolean_t async)
{
	if (BP_IS_HOLE(bp))
		return (0);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(bp->blk_birth <= tx->tx_txg);

	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);

	ASSERT(used > 0);
	if (ds == NULL) {
		dsl_free(tx->tx_pool, tx->tx_txg, bp);
		dsl_pool_mos_diduse_space(tx->tx_pool,
		    -used, -compressed, -uncompressed);
		return (used);
	}
	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

	ASSERT(!dsl_dataset_is_snapshot(ds));
	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
		int64_t delta;

		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
		dsl_free(tx->tx_pool, tx->tx_txg, bp);

		mutex_enter(&ds->ds_dir->dd_lock);
		mutex_enter(&ds->ds_lock);
		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
		    !DS_UNIQUE_IS_ACCURATE(ds));
		delta = parent_delta(ds, -used);
		ds->ds_phys->ds_unique_bytes -= used;
		mutex_exit(&ds->ds_lock);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
		    delta, -compressed, -uncompressed, tx);
		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
		mutex_exit(&ds->ds_dir->dd_lock);
	} else {
		dprintf_bp(bp, "putting on dead list: %s", "");
		if (async) {
			/*
			 * We are here as part of zio's write done callback,
			 * which means we're a zio interrupt thread. We can't
			 * call dsl_deadlist_insert() now because it may block
			 * waiting for I/O. Instead, put bp on the deferred
			 * queue and let dsl_pool_sync() finish the job.
			 */
			bplist_append(&ds->ds_pending_deadlist, bp);
		} else {
			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
		}
		ASSERT3U(ds->ds_prev->ds_object, ==,
		    ds->ds_phys->ds_prev_snap_obj);
		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
		    ds->ds_object && bp->blk_birth >
		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			mutex_enter(&ds->ds_prev->ds_lock);
			ds->ds_prev->ds_phys->ds_unique_bytes += used;
			mutex_exit(&ds->ds_prev->ds_lock);
		}
		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
			dsl_dir_transfer_space(ds->ds_dir, used,
			    DD_USED_HEAD, DD_USED_SNAP, tx);
		}
	}
	mutex_enter(&ds->ds_lock);
	ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
	ds->ds_phys->ds_referenced_bytes -= used;
	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
	ds->ds_phys->ds_compressed_bytes -= compressed;
	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
	mutex_exit(&ds->ds_lock);

	return (used);
}

uint64_t
dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
{
	uint64_t trysnap = 0;

	if (ds == NULL)
		return (0);
	/*
	 * The snapshot creation could fail, but that would cause an
	 * incorrect FALSE return, which would only result in an
	 * overestimation of the amount of space that an operation would
	 * consume, which is OK.
	 *
	 * There's also a small window where we could miss a pending
	 * snapshot, because we could set the sync task in the quiescing
	 * phase. So this should only be used as a guess.
	 */
	if (ds->ds_trysnap_txg >
	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
		trysnap = ds->ds_trysnap_txg;
	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
}

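/*
 * A block can be freed outright (rather than deadlisted) only if it
 * was born after the most recent snapshot. The ddt_prefetch() below
 * starts reading the relevant dedup-table entry, presumably so that a
 * subsequent free of the block does not stall on DDT I/O.
 */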
boolean_t
dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
    uint64_t blk_birth)
{
	if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
		return (B_FALSE);

	ddt_prefetch(dsl_dataset_get_spa(ds), bp);

	return (B_TRUE);
}

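/*
 * Eviction callback registered via dmu_buf_set_user_ie() in
 * dsl_dataset_get_ref(); it runs when the dataset's bonus dbuf is
 * evicted and tears down the in-core dsl_dataset_t.
 */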
/* ARGSUSED */
static void
dsl_dataset_evict(dmu_buf_t *db, void *dsv)
{
	dsl_dataset_t *ds = dsv;

	ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));

	unique_remove(ds->ds_fsid_guid);

	if (ds->ds_objset != NULL)
		dmu_objset_evict(ds->ds_objset);

	if (ds->ds_prev) {
		dsl_dataset_drop_ref(ds->ds_prev, ds);
		ds->ds_prev = NULL;
	}

	bplist_destroy(&ds->ds_pending_deadlist);
	if (db != NULL) {
		dsl_deadlist_close(&ds->ds_deadlist);
	} else {
		ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
		ASSERT(!ds->ds_deadlist.dl_oldfmt);
	}
	if (ds->ds_dir)
		dsl_dir_close(ds->ds_dir, ds);

	ASSERT(!list_link_active(&ds->ds_synced_link));

	mutex_destroy(&ds->ds_lock);
	mutex_destroy(&ds->ds_recvlock);
	mutex_destroy(&ds->ds_opening_lock);
	rw_destroy(&ds->ds_rwlock);
	cv_destroy(&ds->ds_exclusive_cv);

	kmem_free(ds, sizeof (dsl_dataset_t));
}

static int
dsl_dataset_get_snapname(dsl_dataset_t *ds)
{
	dsl_dataset_phys_t *headphys;
	int err;
	dmu_buf_t *headdbuf;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;

	if (ds->ds_snapname[0])
		return (0);
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (0);

	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
	    FTAG, &headdbuf);
	if (err)
		return (err);
	headphys = headdbuf->db_data;
	err = zap_value_search(dp->dp_meta_objset,
	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
	dmu_buf_rele(headdbuf, FTAG);
	return (err);
}

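/*
 * Look up a snapshot name in this dataset's snapshot-name ZAP. For
 * case-insensitive (DS_FLAG_CI_DATASET) datasets we first try a
 * normalizing lookup (MT_FIRST); if the ZAP turns out not to support
 * normalization (ENOTSUP), fall back to an exact-match lookup.
 */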
static int
dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
	    value, mt, NULL, 0, NULL);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_lookup(mos, snapobj, name, 8, 1, value);
	return (err);
}

static int
dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	dsl_dir_snap_cmtime_update(ds->ds_dir);

	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_remove_norm(mos, snapobj, name, mt, tx);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_remove(mos, snapobj, name, tx);

	if (err == 0)
		dsl_snapcount_adjust(ds->ds_dir, tx, -1, B_TRUE);

	return (err);
}

static int
dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	objset_t *mos = dp->dp_meta_objset;
	dmu_buf_t *dbuf;
	dsl_dataset_t *ds;
	int err;
	dmu_object_info_t doi;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));

	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
	if (err)
		return (err);

	/* Make sure dsobj has the correct object type. */
	dmu_object_info_from_db(dbuf, &doi);
	if (doi.doi_type != DMU_OT_DSL_DATASET) {
		/* Release the bonus hold so it isn't leaked on error. */
		dmu_buf_rele(dbuf, tag);
		return (EINVAL);
	}
380
381 ds = dmu_buf_get_user(dbuf);
382 if (ds == NULL) {
383 dsl_dataset_t *winner;
384
385 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
386 ds->ds_dbuf = dbuf;
387 ds->ds_object = dsobj;
388 ds->ds_phys = dbuf->db_data;
389
390 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
391 mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
392 mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
393 mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
394
395 rw_init(&ds->ds_rwlock, 0, 0, 0);
396 cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
397
398 bplist_create(&ds->ds_pending_deadlist);
399 dsl_deadlist_open(&ds->ds_deadlist,
400 mos, ds->ds_phys->ds_deadlist_obj);
401
402 list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
403 offsetof(dmu_sendarg_t, dsa_link));
404
405 if (err == 0) {
406 err = dsl_dir_open_obj(dp,
407 ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
408 }
409 if (err) {
410 mutex_destroy(&ds->ds_lock);
411 mutex_destroy(&ds->ds_recvlock);
412 mutex_destroy(&ds->ds_opening_lock);
413 rw_destroy(&ds->ds_rwlock);
414 cv_destroy(&ds->ds_exclusive_cv);
415 bplist_destroy(&ds->ds_pending_deadlist);
416 dsl_deadlist_close(&ds->ds_deadlist);
417 kmem_free(ds, sizeof (dsl_dataset_t));
418 dmu_buf_rele(dbuf, tag);
419 return (err);
420 }
421
422 if (!dsl_dataset_is_snapshot(ds)) {
423 ds->ds_snapname[0] = '\0';
424 if (ds->ds_phys->ds_prev_snap_obj) {
425 err = dsl_dataset_get_ref(dp,
426 ds->ds_phys->ds_prev_snap_obj,
427 ds, &ds->ds_prev);
428 }
429 } else {
430 if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
431 err = dsl_dataset_get_snapname(ds);
432 if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
433 err = zap_count(
434 ds->ds_dir->dd_pool->dp_meta_objset,
435 ds->ds_phys->ds_userrefs_obj,
436 &ds->ds_userrefs);
437 }
438 }
439
440 if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
441 /*
442 * In sync context, we're called with either no lock
443 * or with the write lock. If we're not syncing,
444 * we're always called with the read lock held.
445 */
446 boolean_t need_lock =
447 !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
448 dsl_pool_sync_context(dp);
449
450 if (need_lock)
451 rw_enter(&dp->dp_config_rwlock, RW_READER);
452
453 err = dsl_prop_get_ds(ds,
454 "refreservation", sizeof (uint64_t), 1,
455 &ds->ds_reserved, NULL);
456 if (err == 0) {
457 err = dsl_prop_get_ds(ds,
458 "refquota", sizeof (uint64_t), 1,
459 &ds->ds_quota, NULL);
460 }
461
462 if (need_lock)
463 rw_exit(&dp->dp_config_rwlock);
464 } else {
465 ds->ds_reserved = ds->ds_quota = 0;
466 }
467
468 if (err == 0) {
469 winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
470 dsl_dataset_evict);
471 }
472 if (err || winner) {
473 bplist_destroy(&ds->ds_pending_deadlist);
474 dsl_deadlist_close(&ds->ds_deadlist);
475 if (ds->ds_prev)
476 dsl_dataset_drop_ref(ds->ds_prev, ds);
477 dsl_dir_close(ds->ds_dir, ds);
478 mutex_destroy(&ds->ds_lock);
479 mutex_destroy(&ds->ds_recvlock);
480 mutex_destroy(&ds->ds_opening_lock);
481 rw_destroy(&ds->ds_rwlock);
482 cv_destroy(&ds->ds_exclusive_cv);
483 kmem_free(ds, sizeof (dsl_dataset_t));
484 if (err) {
485 dmu_buf_rele(dbuf, tag);
486 return (err);
487 }
488 ds = winner;
489 } else {
490 ds->ds_fsid_guid =
491 unique_insert(ds->ds_phys->ds_fsid_guid);
492 }
493 }
494 ASSERT3P(ds->ds_dbuf, ==, dbuf);
495 ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
496 ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
497 spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
498 dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
499 mutex_enter(&ds->ds_lock);
500 if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
501 mutex_exit(&ds->ds_lock);
502 dmu_buf_rele(ds->ds_dbuf, tag);
503 return (ENOENT);
504 }
505 mutex_exit(&ds->ds_lock);
506 *dsp = ds;
507 return (0);
508 }
509
510 static int
511 dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
512 {
513 dsl_pool_t *dp = ds->ds_dir->dd_pool;
514
515 /*
516 * In syncing context we don't want the rwlock lock: there
517 * may be an existing writer waiting for sync phase to
518 * finish. We don't need to worry about such writers, since
519 * sync phase is single-threaded, so the writer can't be
520 * doing anything while we are active.
521 */
522 if (dsl_pool_sync_context(dp)) {
523 ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
524 return (0);
525 }
526
527 /*
528 * Normal users will hold the ds_rwlock as a READER until they
529 * are finished (i.e., call dsl_dataset_rele()). "Owners" will
530 * drop their READER lock after they set the ds_owner field.
531 *
532 * If the dataset is being destroyed, the destroy thread will
533 * obtain a WRITER lock for exclusive access after it's done its
534 * open-context work and then change the ds_owner to
535 * dsl_reaper once destruction is assured. So threads
536 * may block here temporarily, until the "destructability" of
537 * the dataset is determined.
538 */
539 ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
540 mutex_enter(&ds->ds_lock);
541 while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
542 rw_exit(&dp->dp_config_rwlock);
543 cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
544 if (DSL_DATASET_IS_DESTROYED(ds)) {
545 mutex_exit(&ds->ds_lock);
546 dsl_dataset_drop_ref(ds, tag);
547 rw_enter(&dp->dp_config_rwlock, RW_READER);
548 return (ENOENT);
549 }
550 /*
551 * The dp_config_rwlock lives above the ds_lock. And
552 * we need to check DSL_DATASET_IS_DESTROYED() while
553 * holding the ds_lock, so we have to drop and reacquire
554 * the ds_lock here.
555 */
556 mutex_exit(&ds->ds_lock);
557 rw_enter(&dp->dp_config_rwlock, RW_READER);
558 mutex_enter(&ds->ds_lock);
559 }
560 mutex_exit(&ds->ds_lock);
561 return (0);
562 }
563
564 int
565 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
566 dsl_dataset_t **dsp)
567 {
568 int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
569
570 if (err)
571 return (err);
572 return (dsl_dataset_hold_ref(*dsp, tag));
573 }
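
/*
 * Typical hold/release usage (a sketch; error handling elided). Every
 * successful hold must be balanced by dsl_dataset_rele() with the same
 * tag:
 *
 *	dsl_dataset_t *ds;
 *	if (dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds) == 0) {
 *		... examine ds ...
 *		dsl_dataset_rele(ds, FTAG);
 *	}
 */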

int
dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
    void *tag, dsl_dataset_t **dsp)
{
	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
	if (err)
		return (err);
	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
		dsl_dataset_rele(*dsp, tag);
		*dsp = NULL;
		return (EBUSY);
	}
	return (0);
}

int
dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	const char *snapname;
	uint64_t obj;
	int err = 0;

	err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
	if (err)
		return (err);

	dp = dd->dd_pool;
	obj = dd->dd_phys->dd_head_dataset_obj;
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	if (obj)
		err = dsl_dataset_get_ref(dp, obj, tag, dsp);
	else
		err = ENOENT;
	if (err)
		goto out;

	err = dsl_dataset_hold_ref(*dsp, tag);

	/* we may be looking for a snapshot */
	if (err == 0 && snapname != NULL) {
		dsl_dataset_t *ds = NULL;

		if (*snapname++ != '@') {
			dsl_dataset_rele(*dsp, tag);
			err = ENOENT;
			goto out;
		}

		dprintf("looking for snapshot '%s'\n", snapname);
		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
		if (err == 0)
			err = dsl_dataset_get_ref(dp, obj, tag, &ds);
		dsl_dataset_rele(*dsp, tag);

		ASSERT3U((err == 0), ==, (ds != NULL));

		if (ds) {
			mutex_enter(&ds->ds_lock);
			if (ds->ds_snapname[0] == 0)
				(void) strlcpy(ds->ds_snapname, snapname,
				    sizeof (ds->ds_snapname));
			mutex_exit(&ds->ds_lock);
			err = dsl_dataset_hold_ref(ds, tag);
			*dsp = err ? NULL : ds;
		}
	}
out:
	rw_exit(&dp->dp_config_rwlock);
	dsl_dir_close(dd, FTAG);
	return (err);
}

int
dsl_dataset_own(const char *name, boolean_t inconsistentok,
    void *tag, dsl_dataset_t **dsp)
{
	int err = dsl_dataset_hold(name, tag, dsp);
	if (err)
		return (err);
	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
		dsl_dataset_rele(*dsp, tag);
		return (EBUSY);
	}
	return (0);
}

void
dsl_dataset_name(dsl_dataset_t *ds, char *name)
{
	if (ds == NULL) {
		(void) strcpy(name, "mos");
	} else {
		dsl_dir_name(ds->ds_dir, name);
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		if (ds->ds_snapname[0]) {
			(void) strcat(name, "@");
			/*
			 * We use a "recursive" mutex so that we
			 * can call dprintf_ds() with ds_lock held.
			 */
			if (!MUTEX_HELD(&ds->ds_lock)) {
				mutex_enter(&ds->ds_lock);
				(void) strcat(name, ds->ds_snapname);
				mutex_exit(&ds->ds_lock);
			} else {
				(void) strcat(name, ds->ds_snapname);
			}
		}
	}
}

static int
dsl_dataset_namelen(dsl_dataset_t *ds)
{
	int result;

	if (ds == NULL) {
		result = 3;	/* "mos" */
	} else {
		result = dsl_dir_namelen(ds->ds_dir);
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		if (ds->ds_snapname[0]) {
			++result;	/* adding one for the @-sign */
			if (!MUTEX_HELD(&ds->ds_lock)) {
				mutex_enter(&ds->ds_lock);
				result += strlen(ds->ds_snapname);
				mutex_exit(&ds->ds_lock);
			} else {
				result += strlen(ds->ds_snapname);
			}
		}
	}

	return (result);
}

void
dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
{
	dmu_buf_rele(ds->ds_dbuf, tag);
}

void
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
{
	if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
		rw_exit(&ds->ds_rwlock);
	}
	dsl_dataset_drop_ref(ds, tag);
}

void
dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
{
	ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
	    (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));

	mutex_enter(&ds->ds_lock);
	ds->ds_owner = NULL;
	if (RW_WRITE_HELD(&ds->ds_rwlock)) {
		rw_exit(&ds->ds_rwlock);
		cv_broadcast(&ds->ds_exclusive_cv);
	}
	mutex_exit(&ds->ds_lock);
	if (ds->ds_dbuf)
		dsl_dataset_drop_ref(ds, tag);
	else
		dsl_dataset_evict(NULL, ds);
}

boolean_t
dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
{
	boolean_t gotit = FALSE;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_owner == NULL &&
	    (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
		ds->ds_owner = tag;
		if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
			rw_exit(&ds->ds_rwlock);
		gotit = TRUE;
	}
	mutex_exit(&ds->ds_lock);
	return (gotit);
}
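
/*
 * Note on holds vs. ownership: a hold (dsl_dataset_hold*()) keeps the
 * in-core dataset alive, while ownership (dsl_dataset_tryown() or
 * dsl_dataset_own*()) additionally grants exclusive rights, e.g. to
 * destroy the dataset, and must be released with dsl_dataset_disown().
 */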

void
dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
{
	ASSERT3P(owner, ==, ds->ds_owner);
	if (!RW_WRITE_HELD(&ds->ds_rwlock))
		rw_enter(&ds->ds_rwlock, RW_WRITER);
}

uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
    uint64_t flags, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dd->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj;
	objset_t *mos = dp->dp_meta_objset;

	if (origin == NULL)
		origin = dp->dp_origin_snap;

	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	bzero(dsphys, sizeof (dsl_dataset_phys_t));
	dsphys->ds_dir_obj = dd->dd_object;
	dsphys->ds_flags = flags;
	dsphys->ds_fsid_guid = unique_create();
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	dsphys->ds_snapnames_zapobj =
	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
	    DMU_OT_NONE, 0, tx);
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;

	if (origin == NULL) {
		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
	} else {
		dsl_dataset_t *ohds;

		dsphys->ds_prev_snap_obj = origin->ds_object;
		dsphys->ds_prev_snap_txg =
		    origin->ds_phys->ds_creation_txg;
		dsphys->ds_referenced_bytes =
		    origin->ds_phys->ds_referenced_bytes;
		dsphys->ds_compressed_bytes =
		    origin->ds_phys->ds_compressed_bytes;
		dsphys->ds_uncompressed_bytes =
		    origin->ds_phys->ds_uncompressed_bytes;
		dsphys->ds_bp = origin->ds_phys->ds_bp;
		dsphys->ds_flags |= origin->ds_phys->ds_flags;

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		origin->ds_phys->ds_num_children++;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
		    origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
		dsl_dataset_rele(ohds, FTAG);

		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
			if (origin->ds_phys->ds_next_clones_obj == 0) {
				origin->ds_phys->ds_next_clones_obj =
				    zap_create(mos,
				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
			}
			VERIFY(0 == zap_add_int(mos,
			    origin->ds_phys->ds_next_clones_obj,
			    dsobj, tx));
		}

		dmu_buf_will_dirty(dd->dd_dbuf, tx);
		dd->dd_phys->dd_origin_obj = origin->ds_object;
		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
			if (origin->ds_dir->dd_phys->dd_clones == 0) {
				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
				origin->ds_dir->dd_phys->dd_clones =
				    zap_create(mos,
				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
			}
			VERIFY3U(0, ==, zap_add_int(mos,
			    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
		}
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

	dmu_buf_rele(dbuf, FTAG);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	dd->dd_phys->dd_head_dataset_obj = dsobj;

	return (dsobj);
}

uint64_t
dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
{
	dsl_pool_t *dp = pdd->dd_pool;
	uint64_t dsobj, ddobj;
	dsl_dir_t *dd;

	ASSERT(lastname[0] != '@');

	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));

	dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);

	dsl_deleg_set_create_perms(dd, tx, cr);

	dsl_dir_close(dd, FTAG);

	/*
	 * If we are creating a clone, make sure we zero out any stale
	 * data from the origin snapshot's zil header.
	 */
	if (origin != NULL) {
		dsl_dataset_t *ds;
		objset_t *os;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
		VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
		bzero(&os->os_zil_header, sizeof (os->os_zil_header));
		dsl_dataset_dirty(ds, tx);
		dsl_dataset_rele(ds, FTAG);
	}

	return (dsobj);
}

/*
 * The snapshots must all be in the same pool.
 */
int
dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer,
    nvlist_t *errlist)
{
	int err;
	dsl_sync_task_t *dst;
	spa_t *spa;
	nvpair_t *pair;
	dsl_sync_task_group_t *dstg;

	pair = nvlist_next_nvpair(snaps, NULL);
	if (pair == NULL)
		return (0);

	err = spa_open(nvpair_name(pair), &spa, FTAG);
	if (err)
		return (err);
	dstg = dsl_sync_task_group_create(spa_get_dsl(spa));

	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(snaps, pair)) {
		dsl_dataset_t *ds;

		err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
		if (err == 0) {
			struct dsl_ds_destroyarg *dsda;

			dsl_dataset_make_exclusive(ds, dstg);
			dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
			    KM_SLEEP);
			dsda->ds = ds;
			dsda->defer = defer;
			dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
			    dsl_dataset_destroy_sync, dsda, dstg, 0);
		} else if (err == ENOENT) {
			err = 0;
		} else {
			fnvlist_add_int32(errlist, nvpair_name(pair), err);
			break;
		}
	}

	if (err == 0)
		err = dsl_sync_task_group_wait(dstg);

	for (dst = list_head(&dstg->dstg_tasks); dst;
	    dst = list_next(&dstg->dstg_tasks, dst)) {
		struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
		dsl_dataset_t *ds = dsda->ds;

		/*
		 * Return the snapshots that triggered the error.
		 */
		if (dst->dst_err != 0) {
			char name[ZFS_MAXNAMELEN];
			dsl_dataset_name(ds, name);
			fnvlist_add_int32(errlist, name, dst->dst_err);
		}
		ASSERT3P(dsda->rm_origin, ==, NULL);
		dsl_dataset_disown(ds, dstg);
		kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
	}

	dsl_sync_task_group_destroy(dstg);
	spa_close(spa, FTAG);
	return (err);
}

static boolean_t
dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
{
	boolean_t might_destroy = B_FALSE;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
	    DS_IS_DEFER_DESTROY(ds))
		might_destroy = B_TRUE;
	mutex_exit(&ds->ds_lock);

	return (might_destroy);
}

/*
 * If we're removing a clone, and these three conditions are true:
 *	1) the clone's origin has no other children
 *	2) the clone's origin has no user references
 *	3) the clone's origin has been marked for deferred destruction
 * Then, prepare to remove the origin as part of this sync task group.
 */
static int
dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
{
	dsl_dataset_t *ds = dsda->ds;
	dsl_dataset_t *origin = ds->ds_prev;

	if (dsl_dataset_might_destroy_origin(origin)) {
		char *name;
		int namelen;
		int error;

		namelen = dsl_dataset_namelen(origin) + 1;
		name = kmem_alloc(namelen, KM_SLEEP);
		dsl_dataset_name(origin, name);
#ifdef _KERNEL
		error = zfs_unmount_snap(name, NULL);
		if (error) {
			kmem_free(name, namelen);
			return (error);
		}
#endif
		error = dsl_dataset_own(name, B_TRUE, tag, &origin);
		kmem_free(name, namelen);
		if (error)
			return (error);
		dsda->rm_origin = origin;
		dsl_dataset_make_exclusive(origin, tag);
	}

	return (0);
}

/*
 * ds must be opened as OWNER. On return (whether successful or not),
 * ds will be closed and caller can no longer dereference it.
 */
int
dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
{
	int err;
	dsl_sync_task_group_t *dstg;
	objset_t *os;
	dsl_dir_t *dd;
	uint64_t obj;
	struct dsl_ds_destroyarg dsda = { 0 };

	dsda.ds = ds;

	if (dsl_dataset_is_snapshot(ds)) {
		/* Destroying a snapshot is simpler */
		dsl_dataset_make_exclusive(ds, tag);

		dsda.defer = defer;
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
		    &dsda, tag, 0);
		ASSERT3P(dsda.rm_origin, ==, NULL);
		goto out;
	} else if (defer) {
		err = EINVAL;
		goto out;
	}

	dd = ds->ds_dir;

	if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
		/*
		 * Check for errors and mark this ds as inconsistent, in
		 * case we crash while freeing the objects.
		 */
		err = dsl_sync_task_do(dd->dd_pool,
		    dsl_dataset_destroy_begin_check,
		    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
		if (err)
			goto out;

		err = dmu_objset_from_ds(ds, &os);
		if (err)
			goto out;

		/*
		 * Remove all objects while in the open context so that
		 * there is less work to do in the syncing context.
		 */
		for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
		    ds->ds_phys->ds_prev_snap_txg)) {
			/*
			 * Ignore errors, if there is not enough disk space
			 * we will deal with it in dsl_dataset_destroy_sync().
			 */
			(void) dmu_free_object(os, obj);
		}
		if (err != ESRCH)
			goto out;

		/*
		 * Sync out all in-flight IO.
		 */
		txg_wait_synced(dd->dd_pool, 0);

		/*
		 * If we managed to free all the objects in open
		 * context, the user space accounting should be zero.
		 */
		if (ds->ds_phys->ds_bp.blk_fill == 0 &&
		    dmu_objset_userused_enabled(os)) {
			uint64_t count;

			ASSERT(zap_count(os, DMU_USERUSED_OBJECT,
			    &count) != 0 || count == 0);
			ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT,
			    &count) != 0 || count == 0);
		}
	}

	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
	err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
	rw_exit(&dd->dd_pool->dp_config_rwlock);

	if (err)
		goto out;

	/*
	 * Blow away the dsl_dir + head dataset.
	 */
	dsl_dataset_make_exclusive(ds, tag);
	/*
	 * If we're removing a clone, we might also need to remove its
	 * origin.
	 */
	do {
		dsda.need_prep = B_FALSE;
		if (dsl_dir_is_clone(dd)) {
			err = dsl_dataset_origin_rm_prep(&dsda, tag);
			if (err) {
				dsl_dir_close(dd, FTAG);
				goto out;
			}
		}

		dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
		dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
		    dsl_dataset_destroy_sync, &dsda, tag, 0);
		dsl_sync_task_create(dstg, dsl_dir_destroy_check,
		    dsl_dir_destroy_sync, dd, tag, 0);
		err = dsl_sync_task_group_wait(dstg);
		dsl_sync_task_group_destroy(dstg);

		/*
		 * We could be racing against 'zfs release' or 'zfs destroy -d'
		 * on the origin snap, in which case we can get EBUSY if we
		 * needed to destroy the origin snap but were not ready to
		 * do so.
		 */
		if (dsda.need_prep) {
			ASSERT(err == EBUSY);
			ASSERT(dsl_dir_is_clone(dd));
			ASSERT(dsda.rm_origin == NULL);
		}
	} while (dsda.need_prep);

	if (dsda.rm_origin != NULL)
		dsl_dataset_disown(dsda.rm_origin, tag);

	/* if it is successful, dsl_dir_destroy_sync will close the dd */
	if (err)
		dsl_dir_close(dd, FTAG);
out:
	dsl_dataset_disown(ds, tag);
	return (err);
}

blkptr_t *
dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
	return (&ds->ds_phys->ds_bp);
}

void
dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	/* If it's the meta-objset, set dp_meta_rootbp */
	if (ds == NULL) {
		tx->tx_pool->dp_meta_rootbp = *bp;
	} else {
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_bp = *bp;
	}
}

spa_t *
dsl_dataset_get_spa(dsl_dataset_t *ds)
{
	return (ds->ds_dir->dd_pool->dp_spa);
}

void
dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp;

	if (ds == NULL) /* this is the meta-objset */
		return;

	ASSERT(ds->ds_objset != NULL);

	if (ds->ds_phys->ds_next_snap_obj != 0)
		panic("dirtying snapshot!");

	dp = ds->ds_dir->dd_pool;

	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(ds->ds_dbuf, ds);
	}
}

boolean_t
dsl_dataset_is_dirty(dsl_dataset_t *ds)
{
	for (int t = 0; t < TXG_SIZE; t++) {
		if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
		    ds, t))
			return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * The unique space in the head dataset can be calculated by subtracting
 * the space used in the most recent snapshot, that is still being used
 * in this file system, from the space currently in use. To figure out
 * the space in the most recent snapshot still in use, we need to take
 * the total space used in the snapshot and subtract out the space that
 * has been freed up since the snapshot was taken.
 */
static void
dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
{
	uint64_t mrs_used;
	uint64_t dlused, dlcomp, dluncomp;

	ASSERT(!dsl_dataset_is_snapshot(ds));

	if (ds->ds_phys->ds_prev_snap_obj != 0)
		mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
	else
		mrs_used = 0;

	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);

	ASSERT3U(dlused, <=, mrs_used);
	ds->ds_phys->ds_unique_bytes =
	    ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);

	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
	    SPA_VERSION_UNIQUE_ACCURATE)
		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}
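
/*
 * Illustrative numbers: if the head references 10G, the most recent
 * snapshot references 8G (mrs_used), and 3G of the snapshot's blocks
 * have since been freed onto the head's deadlist (dlused), then 5G of
 * the snapshot is still in use and the head's unique space is
 * 10G - 5G == 5G.
 */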

struct killarg {
	dsl_dataset_t *ds;
	dmu_tx_t *tx;
};

/* ARGSUSED */
static int
kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct killarg *ka = arg;
	dmu_tx_t *tx = ka->tx;

	if (bp == NULL)
		return (0);

	if (zb->zb_level == ZB_ZIL_LEVEL) {
		ASSERT(zilog != NULL);
		/*
		 * It's a block in the intent log. It has no
		 * accounting, so just free it.
		 */
		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
	} else {
		ASSERT(zilog == NULL);
		ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
	}

	return (0);
}

/* ARGSUSED */
static int
dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count;
	int err;

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (EBUSY);

	/*
	 * This is really a dsl_dir thing, but check it here so that
	 * we'll be less likely to leave this dataset inconsistent &
	 * nearly destroyed.
	 */
	err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
	if (err)
		return (err);
	if (count != 0)
		return (EEXIST);

	return (0);
}

/* ARGSUSED */
static void
dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;

	/* Mark it as inconsistent on-disk, in case we crash */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	spa_history_log_internal_ds(ds, "destroy begin", tx, "");
}

static int
dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
    dmu_tx_t *tx)
{
	dsl_dataset_t *ds = dsda->ds;
	dsl_dataset_t *ds_prev = ds->ds_prev;

	if (dsl_dataset_might_destroy_origin(ds_prev)) {
		struct dsl_ds_destroyarg ndsda = {0};

		/*
		 * If we're not prepared to remove the origin, don't remove
		 * the clone either.
		 */
		if (dsda->rm_origin == NULL) {
			dsda->need_prep = B_TRUE;
			return (EBUSY);
		}

		ndsda.ds = ds_prev;
		ndsda.is_origin_rm = B_TRUE;
		return (dsl_dataset_destroy_check(&ndsda, tag, tx));
	}

	/*
	 * If we're not going to remove the origin after all,
	 * undo the open context setup.
	 */
	if (dsda->rm_origin != NULL) {
		dsl_dataset_disown(dsda->rm_origin, tag);
		dsda->rm_origin = NULL;
	}

	return (0);
}

/*
 * If you add new checks here, you may need to add
 * additional checks to the "temporary" case in
 * snapshot_check() in dmu_objset.c.
 */
/* ARGSUSED */
int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	struct dsl_ds_destroyarg *dsda = arg1;
	dsl_dataset_t *ds = dsda->ds;
	/* we have an owner hold, so no one else can destroy us */
	ASSERT(!DSL_DATASET_IS_DESTROYED(ds));

	/*
	 * Only allow deferred destroy on pools that support it.
	 * NOTE: deferred destroy is only supported on snapshots.
	 */
	if (dsda->defer) {
		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
		    SPA_VERSION_USERREFS)
			return (ENOTSUP);
		ASSERT(dsl_dataset_is_snapshot(ds));
		return (0);
	}

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (EBUSY);

	/*
	 * If we made changes this txg, traverse_dsl_dataset won't find
	 * them. Try again.
	 */
	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
		return (EAGAIN);

	if (dsl_dataset_is_snapshot(ds)) {
		/*
		 * If this snapshot has an elevated user reference count,
		 * we can't destroy it yet.
		 */
		if (ds->ds_userrefs > 0 && !dsda->releasing)
			return (EBUSY);

		mutex_enter(&ds->ds_lock);
		/*
		 * Can't delete a branch point. However, if we're destroying
		 * a clone and removing its origin due to it having a user
		 * hold count of 0 and having been marked for deferred destroy,
		 * it's OK for the origin to have a single clone.
		 */
		if (ds->ds_phys->ds_num_children >
		    (dsda->is_origin_rm ? 2 : 1)) {
			mutex_exit(&ds->ds_lock);
			return (EEXIST);
		}
		mutex_exit(&ds->ds_lock);
	} else if (dsl_dir_is_clone(ds->ds_dir)) {
		return (dsl_dataset_origin_check(dsda, arg2, tx));
	}

	/* XXX we should do some i/o error checking... */
	return (0);
}

struct refsarg {
	kmutex_t lock;
	boolean_t gone;
	kcondvar_t cv;
};

/* ARGSUSED */
static void
dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
{
	struct refsarg *arg = argv;

	mutex_enter(&arg->lock);
	arg->gone = TRUE;
	cv_signal(&arg->cv);
	mutex_exit(&arg->lock);
}

static void
dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
{
	struct refsarg arg;

	mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
	arg.gone = FALSE;
	(void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
	    dsl_dataset_refs_gone);
	dmu_buf_rele(ds->ds_dbuf, tag);
	mutex_enter(&arg.lock);
	while (!arg.gone)
		cv_wait(&arg.cv, &arg.lock);
	ASSERT(arg.gone);
	mutex_exit(&arg.lock);
	ds->ds_dbuf = NULL;
	ds->ds_phys = NULL;
	mutex_destroy(&arg.lock);
	cv_destroy(&arg.cv);
}

static void
remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count;
	int err;

	ASSERT(ds->ds_phys->ds_num_children >= 2);
	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
	/*
	 * The err should not be ENOENT, but a bug in a previous version
	 * of the code could cause upgrade_clones_cb() to not set
	 * ds_next_snap_obj when it should, leading to a missing entry.
	 * If we knew that the pool was created after
	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
	 * ENOENT. However, at least we can check that we don't have
	 * too many entries in the next_clones_obj even after failing to
	 * remove this one.
	 */
	if (err != ENOENT) {
		VERIFY0(err);
	}
	ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
	    &count));
	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
}

static void
dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;

	/*
	 * If it is the old version, dd_clones doesn't exist so we can't
	 * find the clones, but deadlist_remove_key() is a no-op so it
	 * doesn't matter.
	 */
	if (ds->ds_dir->dd_phys->dd_clones == 0)
		return;

	for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		dsl_dataset_t *clone;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
		    za.za_first_integer, FTAG, &clone));
		if (clone->ds_dir->dd_origin_txg > mintxg) {
			dsl_deadlist_remove_key(&clone->ds_deadlist,
			    mintxg, tx);
			dsl_dataset_remove_clones_key(clone, mintxg, tx);
		}
		dsl_dataset_rele(clone, FTAG);
	}
	zap_cursor_fini(&zc);
}

struct process_old_arg {
	dsl_dataset_t *ds;
	dsl_dataset_t *ds_prev;
	boolean_t after_branch_point;
	zio_t *pio;
	uint64_t used, comp, uncomp;
};

static int
process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	struct process_old_arg *poa = arg;
	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;

	if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
		if (poa->ds_prev && !poa->after_branch_point &&
		    bp->blk_birth >
		    poa->ds_prev->ds_phys->ds_prev_snap_txg) {
			poa->ds_prev->ds_phys->ds_unique_bytes +=
			    bp_get_dsize_sync(dp->dp_spa, bp);
		}
	} else {
		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
		poa->comp += BP_GET_PSIZE(bp);
		poa->uncomp += BP_GET_UCSIZE(bp);
		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
	}
	return (0);
}

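/*
 * Old-format deadlists cannot be merged, so instead partition the next
 * snapshot's deadlist: blocks born no later than this snapshot's
 * previous snapshot remain dead (they move onto our deadlist, which
 * then becomes next's via the SWITCH64() below), while younger blocks
 * are freed outright.
 */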
static void
process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
{
	struct process_old_arg poa = { 0 };
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;

	ASSERT(ds->ds_deadlist.dl_oldfmt);
	ASSERT(ds_next->ds_deadlist.dl_oldfmt);

	poa.ds = ds;
	poa.ds_prev = ds_prev;
	poa.after_branch_point = after_branch_point;
	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
	    process_old_cb, &poa, tx));
	VERIFY0(zio_wait(poa.pio));
	ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);

	/* change snapused */
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
	    -poa.used, -poa.comp, -poa.uncomp, tx);

	/* swap next's deadlist to our deadlist */
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_close(&ds_next->ds_deadlist);
	SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
	    ds->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
	    ds_next->ds_phys->ds_deadlist_obj);
}

static int
old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	int err;
	struct killarg ka;

	/*
	 * Free everything that we point to (that's born after
	 * the previous snapshot, if we are a clone)
	 *
	 * NB: this should be very quick, because we already
	 * freed all the objects in open context.
	 */
	ka.ds = ds;
	ka.tx = tx;
	err = traverse_dataset(ds,
	    ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
	    kill_blkptr, &ka);
	ASSERT0(err);
	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);

	return (err);
}

void
dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
	struct dsl_ds_destroyarg *dsda = arg1;
	dsl_dataset_t *ds = dsda->ds;
	int err;
	int after_branch_point = FALSE;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	dsl_dataset_t *ds_prev = NULL;
	boolean_t wont_destroy;
	uint64_t obj;

	wont_destroy = (dsda->defer &&
	    (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));

	ASSERT(ds->ds_owner || wont_destroy);
	ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
	ASSERT(ds->ds_prev == NULL ||
	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);

	if (wont_destroy) {
		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
		spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
		return;
	}

	/* We need to log before removing it from the namespace. */
	spa_history_log_internal_ds(ds, "destroy", tx, "");

	/* signal any waiters that this dataset is going away */
	mutex_enter(&ds->ds_lock);
	ds->ds_owner = dsl_reaper;
	cv_broadcast(&ds->ds_exclusive_cv);
	mutex_exit(&ds->ds_lock);

	/* Remove our reservation */
	if (ds->ds_reserved != 0) {
		dsl_prop_setarg_t psa;
		uint64_t value = 0;

		dsl_prop_setarg_init_uint64(&psa, "refreservation",
		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
		    &value);
		psa.psa_effective_value = 0;	/* predict default value */

		dsl_dataset_set_reservation_sync(ds, &psa, tx);
		ASSERT0(ds->ds_reserved);
	}

	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));

	dsl_scan_ds_destroyed(ds, tx);

	obj = ds->ds_object;

	if (ds->ds_phys->ds_prev_snap_obj != 0) {
		if (ds->ds_prev) {
			ds_prev = ds->ds_prev;
		} else {
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
		}
		after_branch_point =
		    (ds_prev->ds_phys->ds_next_snap_obj != obj);

		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
		if (after_branch_point &&
		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
			remove_from_next_clones(ds_prev, obj, tx);
			if (ds->ds_phys->ds_next_snap_obj != 0) {
				VERIFY(0 == zap_add_int(mos,
				    ds_prev->ds_phys->ds_next_clones_obj,
				    ds->ds_phys->ds_next_snap_obj, tx));
			}
		}
		if (after_branch_point &&
		    ds->ds_phys->ds_next_snap_obj == 0) {
			/* This clone is toast. */
			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
			ds_prev->ds_phys->ds_num_children--;

			/*
			 * If the clone's origin has no other clones, no
			 * user holds, and has been marked for deferred
			 * deletion, then we should have done the necessary
			 * destroy setup for it.
			 */
			if (ds_prev->ds_phys->ds_num_children == 1 &&
			    ds_prev->ds_userrefs == 0 &&
			    DS_IS_DEFER_DESTROY(ds_prev)) {
				ASSERT3P(dsda->rm_origin, !=, NULL);
			} else {
				ASSERT3P(dsda->rm_origin, ==, NULL);
			}
		} else if (!after_branch_point) {
			ds_prev->ds_phys->ds_next_snap_obj =
			    ds->ds_phys->ds_next_snap_obj;
		}
	}

	if (dsl_dataset_is_snapshot(ds)) {
		dsl_dataset_t *ds_next;
		uint64_t old_unique;
		uint64_t used = 0, comp = 0, uncomp = 0;

		VERIFY(0 == dsl_dataset_hold_obj(dp,
		    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);

		old_unique = ds_next->ds_phys->ds_unique_bytes;

		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
		ds_next->ds_phys->ds_prev_snap_obj =
		    ds->ds_phys->ds_prev_snap_obj;
		ds_next->ds_phys->ds_prev_snap_txg =
		    ds->ds_phys->ds_prev_snap_txg;
		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);

		if (ds_next->ds_deadlist.dl_oldfmt) {
			process_old_deadlist(ds, ds_prev, ds_next,
			    after_branch_point, tx);
		} else {
			/* Adjust prev's unique space. */
			if (ds_prev && !after_branch_point) {
				dsl_deadlist_space_range(&ds_next->ds_deadlist,
				    ds_prev->ds_phys->ds_prev_snap_txg,
				    ds->ds_phys->ds_prev_snap_txg,
				    &used, &comp, &uncomp);
				ds_prev->ds_phys->ds_unique_bytes += used;
			}

			/* Adjust snapused. */
			dsl_deadlist_space_range(&ds_next->ds_deadlist,
			    ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
			    &used, &comp, &uncomp);
			dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
			    -used, -comp, -uncomp, tx);

			/* Move blocks to be freed to pool's free list. */
			dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
			    &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
			    tx);
			dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
			    DD_USED_HEAD, used, comp, uncomp, tx);

			/* Merge our deadlist into next's and free it. */
			dsl_deadlist_merge(&ds_next->ds_deadlist,
			    ds->ds_phys->ds_deadlist_obj, tx);
		}
		dsl_deadlist_close(&ds->ds_deadlist);
		dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);

		/* Collapse range in clone heads */
		dsl_dataset_remove_clones_key(ds,
		    ds->ds_phys->ds_creation_txg, tx);

		if (dsl_dataset_is_snapshot(ds_next)) {
			dsl_dataset_t *ds_nextnext;

			/*
			 * Update next's unique to include blocks which
			 * were previously shared by only this snapshot
			 * and it. Those blocks will be born after the
			 * prev snap and before this snap, and will have
			 * died after the next snap and before the one
			 * after that (i.e. be on the snap after next's
			 * deadlist).
			 */
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds_next->ds_phys->ds_next_snap_obj,
			    FTAG, &ds_nextnext));
			dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
			    ds->ds_phys->ds_prev_snap_txg,
			    ds->ds_phys->ds_creation_txg,
			    &used, &comp, &uncomp);
			ds_next->ds_phys->ds_unique_bytes += used;
			dsl_dataset_rele(ds_nextnext, FTAG);
			ASSERT3P(ds_next->ds_prev, ==, NULL);

			/* Collapse range in this head. */
			dsl_dataset_t *hds;
			VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
			    ds->ds_dir->dd_phys->dd_head_dataset_obj,
			    FTAG, &hds));
			dsl_deadlist_remove_key(&hds->ds_deadlist,
			    ds->ds_phys->ds_creation_txg, tx);
			dsl_dataset_rele(hds, FTAG);
		} else {
			ASSERT3P(ds_next->ds_prev, ==, ds);
			dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
			ds_next->ds_prev = NULL;
			if (ds_prev) {
				VERIFY(0 == dsl_dataset_get_ref(dp,
				    ds->ds_phys->ds_prev_snap_obj,
				    ds_next, &ds_next->ds_prev));
			}

			dsl_dataset_recalc_head_uniq(ds_next);

			/*
			 * Reduce the amount of our unconsumed refreservation
1842 * being charged to our parent by the amount of
1843 * new unique data we have gained.
1844 */
1845 if (old_unique < ds_next->ds_reserved) {
1846 int64_t mrsdelta;
1847 uint64_t new_unique =
1848 ds_next->ds_phys->ds_unique_bytes;
1849
1850 ASSERT(old_unique <= new_unique);
1851 mrsdelta = MIN(new_unique - old_unique,
1852 ds_next->ds_reserved - old_unique);
1853 dsl_dir_diduse_space(ds->ds_dir,
1854 DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
1855 }
1856 }
1857 dsl_dataset_rele(ds_next, FTAG);
1858 } else {
1859 zfeature_info_t *async_destroy =
1860 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
1861 objset_t *os;
1862
1863 /*
1864 * There's no next snapshot, so this is a head dataset.
1865 * Destroy the deadlist. Unless it's a clone, the
1866 * deadlist should be empty. (If it's a clone, it's
1867 * safe to ignore the deadlist contents.)
1868 */
1869 dsl_deadlist_close(&ds->ds_deadlist);
1870 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1871 ds->ds_phys->ds_deadlist_obj = 0;
1872
1873 VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
1874
1875 if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
1876 err = old_synchronous_dataset_destroy(ds, tx);
1877 } else {
1878 /*
1879 * Move the bptree into the pool's list of trees to
1880 * clean up and update space accounting information.
1881 */
1882 uint64_t used, comp, uncomp;
1883
1884 zil_destroy_sync(dmu_objset_zil(os), tx);
1885
1886 if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
1887 spa_feature_incr(dp->dp_spa, async_destroy, tx);
1888 dp->dp_bptree_obj = bptree_alloc(mos, tx);
1889 VERIFY(zap_add(mos,
1890 DMU_POOL_DIRECTORY_OBJECT,
1891 DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
1892 &dp->dp_bptree_obj, tx) == 0);
1893 }
1894
1895 used = ds->ds_dir->dd_phys->dd_used_bytes;
1896 comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
1897 uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
1898
1899 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
1900 ds->ds_phys->ds_unique_bytes == used);
1901
1902 bptree_add(mos, dp->dp_bptree_obj,
1903 &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
1904 used, comp, uncomp, tx);
1905 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
1906 -used, -comp, -uncomp, tx);
1907 dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
1908 used, comp, uncomp, tx);
1909 }
1910
1911 if (ds->ds_prev != NULL) {
1912 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
1913 VERIFY3U(0, ==, zap_remove_int(mos,
1914 ds->ds_prev->ds_dir->dd_phys->dd_clones,
1915 ds->ds_object, tx));
1916 }
1917 dsl_dataset_rele(ds->ds_prev, ds);
1918 ds->ds_prev = ds_prev = NULL;
1919 }
1920 }
1921
1922 /*
1923 * This must be done after the dsl_traverse(), because it will
1924 * re-open the objset.
1925 */
1926 if (ds->ds_objset) {
1927 dmu_objset_evict(ds->ds_objset);
1928 ds->ds_objset = NULL;
1929 }
1930
1931 if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
1932 /* Erase the link in the dir */
1933 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1934 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
1935 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
1936 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
1937 ASSERT(err == 0);
1938 } else {
1939 /* remove from snapshot namespace */
1940 dsl_dataset_t *ds_head;
1941 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
1942 VERIFY(0 == dsl_dataset_hold_obj(dp,
1943 ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
1944 VERIFY(0 == dsl_dataset_get_snapname(ds));
1945 #ifdef ZFS_DEBUG
1946 {
1947 uint64_t val;
1948
1949 err = dsl_dataset_snap_lookup(ds_head,
1950 ds->ds_snapname, &val);
1951 ASSERT0(err);
1952 ASSERT3U(val, ==, obj);
1953 }
1954 #endif
1955 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
1956 ASSERT(err == 0);
1957 dsl_dataset_rele(ds_head, FTAG);
1958 }
1959
1960 if (ds_prev && ds->ds_prev != ds_prev)
1961 dsl_dataset_rele(ds_prev, FTAG);
1962
1963 spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
1964
1965 if (ds->ds_phys->ds_next_clones_obj != 0) {
1966 uint64_t count;
1967 ASSERT(0 == zap_count(mos,
1968 ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
1969 VERIFY(0 == dmu_object_free(mos,
1970 ds->ds_phys->ds_next_clones_obj, tx));
1971 }
1972 if (ds->ds_phys->ds_props_obj != 0)
1973 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
1974 if (ds->ds_phys->ds_userrefs_obj != 0)
1975 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
1976 dsl_dir_close(ds->ds_dir, ds);
1977 ds->ds_dir = NULL;
1978 dsl_dataset_drain_refs(ds, tag);
1979 VERIFY(0 == dmu_object_free(mos, obj, tx));
1980
1981 if (dsda->rm_origin) {
1982 /*
1983 * Remove the origin of the clone we just destroyed.
1984 */
1985 struct dsl_ds_destroyarg ndsda = {0};
1986
1987 ndsda.ds = dsda->rm_origin;
1988 dsl_dataset_destroy_sync(&ndsda, tag, tx);
1989 }
1990 }
1991
1992 static int
1993 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1994 {
1995 uint64_t asize;
1996
1997 if (!dmu_tx_is_syncing(tx))
1998 return (0);
1999
2000 /*
2001 * If there's an fs-only reservation, any blocks that might become
2002 * owned by the snapshot dataset must be accommodated by space
2003 * outside of the reservation.
2004 */
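	/*
	 * Illustrative example (hypothetical numbers): with
	 * refreservation=10G and ds_unique_bytes=4G, taking a snapshot
	 * transfers the 4G of unique data to the snapshot, so
	 * asize = MIN(4G, 10G) = 4G must be available outside the
	 * reservation.
	 */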
2005 ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
2006 asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2007 if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
2008 return (ENOSPC);
2009
2010 /*
2011 * Propagate any reserved space for this snapshot to other
2012 * snapshot checks in this sync group.
2013 */
2014 if (asize > 0)
2015 dsl_dir_willuse_space(ds->ds_dir, asize, tx);
2016
2017 return (0);
2018 }
2019
2020 /*
2021 * Check if adding additional snapshot(s) would exceed any snapshot limits.
2022 * Note that all snapshot limits up to the root dataset (i.e. the pool itself)
 * or the given ancestor must be satisfied. It is, however, valid for the
 * count to exceed the limit. This can happen if a snapshot is taken by an
 * administrative user in the global zone (e.g. a recursive snapshot by root).
2026 */
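/*
 * Illustrative example (hypothetical names and values): if tank/a has
 * snapshot_limit=10 and dd_snapshot_count=8, a request to create 3
 * snapshots under tank/a/b checks tank/a/b first, then recurses to
 * tank/a, where 8 + 3 > 10 yields EDQUOT. The recursion stops early at
 * the given ancestor (used by rename) or at any dir whose filesystem
 * count is uninitialized.
 */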
2027 int
2028 dsl_snapcount_check(dsl_dir_t *dd, uint64_t cnt, dsl_dir_t *ancestor)
2029 {
2030 uint64_t limit;
2031 int err = 0;
2032
2033 /*
2034 * The limit is never enforced for the admin user in global zone.
2035 * If we're not in the global zone then we need to run this check in
 * open context, since that's when we know what zone we're in and
2037 * syncing is only performed in the global zone.
2038 */
2039 if (INGLOBALZONE(curproc))
2040 return (0);
2041
2042 /*
2043 * If renaming a dataset with no snapshots, count adjustment is 0.
2044 */
2045 if (cnt == 0)
2046 return (0);
2047
2048 /*
2049 * If an ancestor has been provided, stop checking the limit once we
 * hit that dir. We need this during rename so that we don't double-count
 * once we recurse up to the common ancestor.
2052 */
2053 if (ancestor == dd)
2054 return (0);
2055
2056 /*
2057 * If we hit an uninitialized node while recursing up the tree, we can
2058 * stop since we know the counts are not valid on this node and we
2059 * know we won't touch this node's counts.
2060 */
2061 if (dd->dd_phys->dd_filesystem_count == 0)
2062 return (0);
2063
2064 /*
2065 * If there's no value for this property, there's no need to enforce a
2066 * snapshot limit.
2067 */
2068 err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_SNAPSHOT_LIMIT),
2069 8, 1, &limit, NULL, B_FALSE);
2070 if (err == ENOENT)
2071 return (0);
2072 else if (err != 0)
2073 return (err);
2074
2075 #ifdef _KERNEL
2076 extern void __dtrace_probe_zfs__ss__limit(uint64_t, uint64_t, char *);
2077 __dtrace_probe_zfs__ss__limit(
2078 (uint64_t)dd->dd_phys->dd_snapshot_count, (uint64_t)limit,
2079 dd->dd_myname);
2080 #endif
2081
2082 if (limit != MAXLIMIT &&
2083 (dd->dd_phys->dd_snapshot_count + cnt) > limit)
2084 return (EDQUOT);
2085
2086 if (dd->dd_parent != NULL)
2087 err = dsl_snapcount_check(dd->dd_parent, cnt, ancestor);
2088
2089 return (err);
2090 }
2091
2092 /*
2093 * Adjust the snapshot count for the specified dsl_dir_t and all parents.
2094 * When a new snapshot is created, increment the count on all parents, and when
2095 * a snapshot is destroyed, decrement the count.
2096 */
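/*
 * For example (illustrative), snapshotting tank/a/b increments
 * dd_snapshot_count on tank/a/b, tank/a, and tank (assuming the
 * fs/snapshot limit feature is active and the counts are initialized);
 * destroying that snapshot later walks the same path with delta = -1.
 */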
2097 void
2098 dsl_snapcount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
2099 boolean_t first)
2100 {
2101 /*
2102 * If we hit an uninitialized node while recursing up the tree, we can
2103 * stop since we know the counts are not valid on this node and we
2104 * know we shouldn't touch this node's counts. An uninitialized count
2105 * on the node indicates that either the feature has not yet been
2106 * activated or there are no limits on this part of the tree.
2107 */
2108 if (dd->dd_phys->dd_filesystem_count == 0)
2109 return;
2110
2111 /*
2112 * The feature might have previously been active, so there could be
2113 * non-0 counts on the nodes, but it might now be inactive.
2114 *
2115 * On initial entry we need to check if this feature is active, but
2116 * we don't want to re-check this on each recursive call. Note: the
	 * feature cannot be active if it's not enabled. If the feature is not
2118 * active, don't touch the on-disk count fields.
2119 */
2120 if (first) {
2121 dsl_dataset_t *ds = NULL;
2122 spa_t *spa;
2123 zfeature_info_t *quota_feat =
2124 &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
2125
2126 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
2127 dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
2128 spa = dsl_dataset_get_spa(ds);
2129 dsl_dataset_rele(ds, FTAG);
2130 if (!spa_feature_is_active(spa, quota_feat))
2131 return;
2132 }
2133
2134 /*
	 * As with dsl_dataset_set_reservation_check(), we don't want to run
2136 * this check in open context.
2137 */
2138 if (!dmu_tx_is_syncing(tx))
2139 return;
2140
2141 /* if renaming a dataset with no snapshots, count adjustment is 0 */
2142 if (delta == 0)
2143 return;
2144
2145 /*
2146 * If we hit an uninitialized node while recursing up the tree, we can
2147 * stop since we know the counts are not valid on this node and we
2148 * know we shouldn't touch this node's counts.
2149 */
2150 if (dd->dd_phys->dd_filesystem_count == 0)
2151 return;
2152
	/* Adjust the snapshot count on this dir (delta may be negative) */
2154 dmu_buf_will_dirty(dd->dd_dbuf, tx);
2155
2156 mutex_enter(&dd->dd_lock);
2157
2158 dd->dd_phys->dd_snapshot_count += delta;
2159
2160 /* Roll up this additional count into our ancestors */
2161 if (dd->dd_parent != NULL)
2162 dsl_snapcount_adjust(dd->dd_parent, tx, delta, B_FALSE);
2163
2164 mutex_exit(&dd->dd_lock);
2165 }
2166
2167 int
2168 dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname,
2169 uint64_t cnt, dmu_tx_t *tx)
2170 {
2171 int err;
2172 uint64_t value;
2173
2174 /*
2175 * We don't allow multiple snapshots of the same txg. If there
2176 * is already one, try again.
2177 */
2178 if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
2179 return (EAGAIN);
2180
2181 /*
2182 * Check for conflicting snapshot name.
2183 */
2184 err = dsl_dataset_snap_lookup(ds, snapname, &value);
2185 if (err == 0)
2186 return (EEXIST);
2187 if (err != ENOENT)
2188 return (err);
2189
2190 /*
	 * Check that the snapshot's full name is not too long. The name
	 * consists of the dataset name's length + 1 for the @-sign + the
	 * snapshot name's length.
2193 */
2194 if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
2195 return (ENAMETOOLONG);
2196
2197 err = dsl_snapcount_check(ds->ds_dir, cnt, NULL);
2198 if (err)
2199 return (err);
2200
2201 err = dsl_dataset_snapshot_reserve_space(ds, tx);
2202 if (err)
2203 return (err);
2204
2205 ds->ds_trysnap_txg = tx->tx_txg;
2206 return (0);
2207 }
2208
2209 void
2210 dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
2211 dmu_tx_t *tx)
2212 {
2213 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2214 dmu_buf_t *dbuf;
2215 dsl_dataset_phys_t *dsphys;
2216 uint64_t dsobj, crtxg;
2217 objset_t *mos = dp->dp_meta_objset;
2218 int err;
2219
2220 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
2221
2222 dsl_snapcount_adjust(ds->ds_dir, tx, 1, B_TRUE);
2223
2224 /*
2225 * The origin's ds_creation_txg has to be < TXG_INITIAL
2226 */
2227 if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
2228 crtxg = 1;
2229 else
2230 crtxg = tx->tx_txg;
2231
2232 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
2233 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
2234 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
2235 dmu_buf_will_dirty(dbuf, tx);
2236 dsphys = dbuf->db_data;
2237 bzero(dsphys, sizeof (dsl_dataset_phys_t));
2238 dsphys->ds_dir_obj = ds->ds_dir->dd_object;
2239 dsphys->ds_fsid_guid = unique_create();
2240 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
2241 sizeof (dsphys->ds_guid));
2242 dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
2243 dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
2244 dsphys->ds_next_snap_obj = ds->ds_object;
2245 dsphys->ds_num_children = 1;
2246 dsphys->ds_creation_time = gethrestime_sec();
2247 dsphys->ds_creation_txg = crtxg;
2248 dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
2249 dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
2250 dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
2251 dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
2252 dsphys->ds_flags = ds->ds_phys->ds_flags;
2253 dsphys->ds_bp = ds->ds_phys->ds_bp;
2254 dmu_buf_rele(dbuf, FTAG);
2255
2256 ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
2257 if (ds->ds_prev) {
2258 uint64_t next_clones_obj =
2259 ds->ds_prev->ds_phys->ds_next_clones_obj;
2260 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
2261 ds->ds_object ||
2262 ds->ds_prev->ds_phys->ds_num_children > 1);
2263 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
2264 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
2265 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
2266 ds->ds_prev->ds_phys->ds_creation_txg);
2267 ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
2268 } else if (next_clones_obj != 0) {
2269 remove_from_next_clones(ds->ds_prev,
2270 dsphys->ds_next_snap_obj, tx);
2271 VERIFY3U(0, ==, zap_add_int(mos,
2272 next_clones_obj, dsobj, tx));
2273 }
2274 }
2275
2276 /*
2277 * If we have a reference-reservation on this dataset, we will
2278 * need to increase the amount of refreservation being charged
2279 * since our unique space is going to zero.
2280 */
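	/*
	 * Illustrative: with refreservation=10G and 4G of unique data,
	 * delta = MIN(4G, 10G) = 4G; the new snapshot now owns those
	 * blocks, so 4G of previously consumed refreservation becomes
	 * unconsumed and is charged to DD_USED_REFRSRV.
	 */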
2281 if (ds->ds_reserved) {
2282 int64_t delta;
2283 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
2284 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2285 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
2286 delta, 0, 0, tx);
2287 }
2288
2289 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2290 zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
2291 ds->ds_dir->dd_myname, snapname, dsobj,
2292 ds->ds_phys->ds_prev_snap_txg);
2293 ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
2294 UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
2295 dsl_deadlist_close(&ds->ds_deadlist);
2296 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
2297 dsl_deadlist_add_key(&ds->ds_deadlist,
2298 ds->ds_phys->ds_prev_snap_txg, tx);
2299
2300 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
2301 ds->ds_phys->ds_prev_snap_obj = dsobj;
2302 ds->ds_phys->ds_prev_snap_txg = crtxg;
2303 ds->ds_phys->ds_unique_bytes = 0;
2304 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
2305 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
2306
2307 err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
2308 snapname, 8, 1, &dsobj, tx);
2309 ASSERT(err == 0);
2310
2311 if (ds->ds_prev)
2312 dsl_dataset_drop_ref(ds->ds_prev, ds);
2313 VERIFY(0 == dsl_dataset_get_ref(dp,
2314 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
2315
2316 dsl_scan_ds_snapshotted(ds, tx);
2317
2318 dsl_dir_snap_cmtime_update(ds->ds_dir);
2319
2320 spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
2321 }
2322
2323 void
2324 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
2325 {
2326 ASSERT(dmu_tx_is_syncing(tx));
2327 ASSERT(ds->ds_objset != NULL);
2328 ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
2329
2330 /*
2331 * in case we had to change ds_fsid_guid when we opened it,
2332 * sync it out now.
2333 */
2334 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2335 ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
2336
2337 dmu_objset_sync(ds->ds_objset, zio, tx);
2338 }
2339
2340 static void
2341 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
2342 {
2343 uint64_t count = 0;
2344 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
2345 zap_cursor_t zc;
2346 zap_attribute_t za;
2347 nvlist_t *propval;
2348 nvlist_t *val;
2349
2350 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2351 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2352 VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2353
2354 /*
	 * There may be missing entries in ds_next_clones_obj
2356 * due to a bug in a previous version of the code.
2357 * Only trust it if it has the right number of entries.
2358 */
2359 if (ds->ds_phys->ds_next_clones_obj != 0) {
2360 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
2361 &count));
2362 }
2363 if (count != ds->ds_phys->ds_num_children - 1) {
2364 goto fail;
2365 }
2366 for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
2367 zap_cursor_retrieve(&zc, &za) == 0;
2368 zap_cursor_advance(&zc)) {
2369 dsl_dataset_t *clone;
2370 char buf[ZFS_MAXNAMELEN];
2371 /*
2372 * Even though we hold the dp_config_rwlock, the dataset
2373 * may fail to open, returning ENOENT. If there is a
2374 * thread concurrently attempting to destroy this
2375 * dataset, it will have the ds_rwlock held for
2376 * RW_WRITER. Our call to dsl_dataset_hold_obj() ->
2377 * dsl_dataset_hold_ref() will fail its
2378 * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the
		 * dp_config_rwlock, and wait for the destroy to progress
		 * and signal ds_exclusive_cv. If the destroy was
2381 * successful, we will see that
2382 * DSL_DATASET_IS_DESTROYED(), and return ENOENT.
2383 */
2384 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
2385 za.za_first_integer, FTAG, &clone) != 0)
2386 continue;
2387 dsl_dir_name(clone->ds_dir, buf);
2388 VERIFY(nvlist_add_boolean(val, buf) == 0);
2389 dsl_dataset_rele(clone, FTAG);
2390 }
2391 zap_cursor_fini(&zc);
2392 VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0);
2393 VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
2394 propval) == 0);
2395 fail:
2396 nvlist_free(val);
2397 nvlist_free(propval);
2398 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2399 }
2400
2401 void
2402 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
2403 {
2404 uint64_t refd, avail, uobjs, aobjs, ratio;
2405
2406 ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
2407 (ds->ds_phys->ds_uncompressed_bytes * 100 /
2408 ds->ds_phys->ds_compressed_bytes);
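	/*
	 * e.g. (illustrative) 300MB of uncompressed data stored in 100MB
	 * on disk gives ratio = 300MB * 100 / 100MB = 300, which the
	 * userland tools display as "3.00x".
	 */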
2409
2410 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
2411
2412 if (dsl_dataset_is_snapshot(ds)) {
2413 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
2414 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
2415 ds->ds_phys->ds_unique_bytes);
2416 get_clones_stat(ds, nv);
2417 } else {
2418 dsl_dir_stats(ds->ds_dir, nv);
2419 }
2420
2421 dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
2422 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
2423 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
2424
2425 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
2426 ds->ds_phys->ds_creation_time);
2427 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
2428 ds->ds_phys->ds_creation_txg);
2429 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
2430 ds->ds_quota);
2431 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
2432 ds->ds_reserved);
2433 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
2434 ds->ds_phys->ds_guid);
2435 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
2436 ds->ds_phys->ds_unique_bytes);
2437 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
2438 ds->ds_object);
2439 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
2440 ds->ds_userrefs);
2441 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
2442 DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
2443
2444 if (ds->ds_phys->ds_prev_snap_obj != 0) {
2445 uint64_t written, comp, uncomp;
2446 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2447 dsl_dataset_t *prev;
2448
2449 rw_enter(&dp->dp_config_rwlock, RW_READER);
2450 int err = dsl_dataset_hold_obj(dp,
2451 ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
2452 rw_exit(&dp->dp_config_rwlock);
2453 if (err == 0) {
2454 err = dsl_dataset_space_written(prev, ds, &written,
2455 &comp, &uncomp);
2456 dsl_dataset_rele(prev, FTAG);
2457 if (err == 0) {
2458 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
2459 written);
2460 }
2461 }
2462 }
2463 }
2464
2465 void
2466 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
2467 {
2468 stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
2469 stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
2470 stat->dds_guid = ds->ds_phys->ds_guid;
2471 stat->dds_origin[0] = '\0';
2472 if (dsl_dataset_is_snapshot(ds)) {
2473 stat->dds_is_snapshot = B_TRUE;
2474 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
2475 } else {
2476 stat->dds_is_snapshot = B_FALSE;
2477 stat->dds_num_clones = 0;
2478
2479 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2480 if (dsl_dir_is_clone(ds->ds_dir)) {
2481 dsl_dataset_t *ods;
2482
2483 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
2484 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
2485 dsl_dataset_name(ods, stat->dds_origin);
2486 dsl_dataset_drop_ref(ods, FTAG);
2487 }
2488 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2489 }
2490 }
2491
2492 uint64_t
2493 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
2494 {
2495 return (ds->ds_fsid_guid);
2496 }
2497
2498 void
2499 dsl_dataset_space(dsl_dataset_t *ds,
2500 uint64_t *refdbytesp, uint64_t *availbytesp,
2501 uint64_t *usedobjsp, uint64_t *availobjsp)
2502 {
2503 *refdbytesp = ds->ds_phys->ds_referenced_bytes;
2504 *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
2505 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
2506 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
2507 if (ds->ds_quota != 0) {
2508 /*
2509 * Adjust available bytes according to refquota
2510 */
2511 if (*refdbytesp < ds->ds_quota)
2512 *availbytesp = MIN(*availbytesp,
2513 ds->ds_quota - *refdbytesp);
2514 else
2515 *availbytesp = 0;
2516 }
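	/*
	 * Illustrative: with refquota=10G and 7G referenced, available
	 * space is capped at 3G even if the pool has more free space;
	 * once referenced bytes reach the quota, available drops to 0.
	 */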
2517 *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
2518 *availobjsp = DN_MAX_OBJECT - *usedobjsp;
2519 }
2520
2521 boolean_t
2522 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
2523 {
2524 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2525
2526 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
2527 dsl_pool_sync_context(dp));
2528 if (ds->ds_prev == NULL)
2529 return (B_FALSE);
2530 if (ds->ds_phys->ds_bp.blk_birth >
2531 ds->ds_prev->ds_phys->ds_creation_txg) {
2532 objset_t *os, *os_prev;
2533 /*
2534 * It may be that only the ZIL differs, because it was
2535 * reset in the head. Don't count that as being
2536 * modified.
2537 */
2538 if (dmu_objset_from_ds(ds, &os) != 0)
2539 return (B_TRUE);
2540 if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
2541 return (B_TRUE);
2542 return (bcmp(&os->os_phys->os_meta_dnode,
2543 &os_prev->os_phys->os_meta_dnode,
2544 sizeof (os->os_phys->os_meta_dnode)) != 0);
2545 }
2546 return (B_FALSE);
2547 }
2548
2549 /* ARGSUSED */
2550 static int
2551 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
2552 {
2553 dsl_dataset_t *ds = arg1;
2554 char *newsnapname = arg2;
2555 dsl_dir_t *dd = ds->ds_dir;
2556 dsl_dataset_t *hds;
2557 uint64_t val;
2558 int err;
2559
2560 err = dsl_dataset_hold_obj(dd->dd_pool,
2561 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
2562 if (err)
2563 return (err);
2564
2565 /* new name better not be in use */
2566 err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
2567 dsl_dataset_rele(hds, FTAG);
2568
2569 if (err == 0)
2570 err = EEXIST;
2571 else if (err == ENOENT)
2572 err = 0;
2573
2574 /* dataset name + 1 for the "@" + the new snapshot name must fit */
2575 if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
2576 err = ENAMETOOLONG;
2577
2578 return (err);
2579 }
2580
2581 static void
2582 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
2583 {
2584 dsl_dataset_t *ds = arg1;
2585 const char *newsnapname = arg2;
2586 dsl_dir_t *dd = ds->ds_dir;
2587 objset_t *mos = dd->dd_pool->dp_meta_objset;
2588 dsl_dataset_t *hds;
2589 int err;
2590
2591 ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
2592
2593 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
2594 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));
2595
2596 VERIFY(0 == dsl_dataset_get_snapname(ds));
2597 err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
2598 ASSERT0(err);
2599 mutex_enter(&ds->ds_lock);
2600 (void) strcpy(ds->ds_snapname, newsnapname);
2601 mutex_exit(&ds->ds_lock);
2602 err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
2603 ds->ds_snapname, 8, 1, &ds->ds_object, tx);
2604 ASSERT0(err);
2605
2606 spa_history_log_internal_ds(ds, "rename", tx,
2607 "-> @%s", newsnapname);
2608 dsl_dataset_rele(hds, FTAG);
2609 }
2610
2611 struct renamesnaparg {
2612 dsl_sync_task_group_t *dstg;
2613 char failed[MAXPATHLEN];
2614 char *oldsnap;
2615 char *newsnap;
2616 };
2617
2618 static int
2619 dsl_snapshot_rename_one(const char *name, void *arg)
2620 {
2621 struct renamesnaparg *ra = arg;
2622 dsl_dataset_t *ds = NULL;
2623 char *snapname;
2624 int err;
2625
2626 snapname = kmem_asprintf("%s@%s", name, ra->oldsnap);
2627 (void) strlcpy(ra->failed, snapname, sizeof (ra->failed));
2628
2629 /*
2630 * For recursive snapshot renames the parent won't be changing
	 * so we just pass name for both the to/from arguments.
2632 */
2633 err = zfs_secpolicy_rename_perms(snapname, snapname, CRED());
2634 if (err != 0) {
2635 strfree(snapname);
2636 return (err == ENOENT ? 0 : err);
2637 }
2638
2639 #ifdef _KERNEL
2640 /*
	 * Each snapshot undergoing rename must be unmounted first.
2642 */
2643 (void) zfs_unmount_snap(snapname, NULL);
2644 #endif
2645 err = dsl_dataset_hold(snapname, ra->dstg, &ds);
2646 strfree(snapname);
2647 if (err != 0)
2648 return (err == ENOENT ? 0 : err);
2649
2650 dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
2651 dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
2652
2653 return (0);
2654 }
2655
2656 static int
2657 dsl_recursive_rename(char *oldname, const char *newname)
2658 {
2659 int err;
2660 struct renamesnaparg *ra;
2661 dsl_sync_task_t *dst;
2662 spa_t *spa;
2663 char *cp, *fsname = spa_strdup(oldname);
2664 int len = strlen(oldname) + 1;
2665
2666 /* truncate the snapshot name to get the fsname */
2667 cp = strchr(fsname, '@');
2668 *cp = '\0';
2669
2670 err = spa_open(fsname, &spa, FTAG);
2671 if (err) {
2672 kmem_free(fsname, len);
2673 return (err);
2674 }
2675 ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
2676 ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
2677
2678 ra->oldsnap = strchr(oldname, '@') + 1;
2679 ra->newsnap = strchr(newname, '@') + 1;
2680 *ra->failed = '\0';
2681
2682 err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
2683 DS_FIND_CHILDREN);
2684 kmem_free(fsname, len);
2685
2686 if (err == 0) {
2687 err = dsl_sync_task_group_wait(ra->dstg);
2688 }
2689
2690 for (dst = list_head(&ra->dstg->dstg_tasks); dst;
2691 dst = list_next(&ra->dstg->dstg_tasks, dst)) {
2692 dsl_dataset_t *ds = dst->dst_arg1;
2693 if (dst->dst_err) {
2694 dsl_dir_name(ds->ds_dir, ra->failed);
2695 (void) strlcat(ra->failed, "@", sizeof (ra->failed));
2696 (void) strlcat(ra->failed, ra->newsnap,
2697 sizeof (ra->failed));
2698 }
2699 dsl_dataset_rele(ds, ra->dstg);
2700 }
2701
2702 if (err)
2703 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed));
2704
2705 dsl_sync_task_group_destroy(ra->dstg);
2706 kmem_free(ra, sizeof (struct renamesnaparg));
2707 spa_close(spa, FTAG);
2708 return (err);
2709 }
2710
2711 static int
2712 dsl_valid_rename(const char *oldname, void *arg)
2713 {
2714 int delta = *(int *)arg;
2715
2716 if (strlen(oldname) + delta >= MAXNAMELEN)
2717 return (ENAMETOOLONG);
2718
2719 return (0);
2720 }
2721
2722 #pragma weak dmu_objset_rename = dsl_dataset_rename
2723 int
2724 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
2725 {
2726 dsl_dir_t *dd;
2727 dsl_dataset_t *ds;
2728 const char *tail;
2729 int err;
2730
2731 err = dsl_dir_open(oldname, FTAG, &dd, &tail);
2732 if (err)
2733 return (err);
2734
2735 if (tail == NULL) {
2736 int delta = strlen(newname) - strlen(oldname);
2737
2738 /* if we're growing, validate child name lengths */
2739 if (delta > 0)
2740 err = dmu_objset_find(oldname, dsl_valid_rename,
2741 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
2742
2743 if (err == 0)
2744 err = dsl_dir_rename(dd, newname);
2745 dsl_dir_close(dd, FTAG);
2746 return (err);
2747 }
2748
2749 if (tail[0] != '@') {
2750 /* the name ended in a nonexistent component */
2751 dsl_dir_close(dd, FTAG);
2752 return (ENOENT);
2753 }
2754
2755 dsl_dir_close(dd, FTAG);
2756
2757 /* new name must be snapshot in same filesystem */
2758 tail = strchr(newname, '@');
2759 if (tail == NULL)
2760 return (EINVAL);
2761 tail++;
2762 if (strncmp(oldname, newname, tail - newname) != 0)
2763 return (EXDEV);
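	/*
	 * e.g. (illustrative) renaming "tank/fs@a" to "tank/fs@b" passes
	 * this check, while "tank/fs@a" to "tank/other@b" fails with EXDEV.
	 */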
2764
2765 if (recursive) {
2766 err = dsl_recursive_rename(oldname, newname);
2767 } else {
2768 err = dsl_dataset_hold(oldname, FTAG, &ds);
2769 if (err)
2770 return (err);
2771
2772 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
2773 dsl_dataset_snapshot_rename_check,
2774 dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
2775
2776 dsl_dataset_rele(ds, FTAG);
2777 }
2778
2779 return (err);
2780 }
2781
2782 struct promotenode {
2783 list_node_t link;
2784 dsl_dataset_t *ds;
2785 };
2786
2787 struct promotearg {
2788 list_t shared_snaps, origin_snaps, clone_snaps;
2789 dsl_dataset_t *origin_origin;
2790 uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
2791 char *err_ds;
2792 };
2793
2794 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
2795 static boolean_t snaplist_unstable(list_t *l);
2796
2797 static int
2798 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
2799 {
2800 dsl_dataset_t *hds = arg1;
2801 struct promotearg *pa = arg2;
2802 struct promotenode *snap = list_head(&pa->shared_snaps);
2803 dsl_dataset_t *origin_ds = snap->ds;
2804 int err;
2805 uint64_t unused;
2806
2807 /* Check that it is a real clone */
2808 if (!dsl_dir_is_clone(hds->ds_dir))
2809 return (EINVAL);
2810
2811 /* Since this is so expensive, don't do the preliminary check */
2812 if (!dmu_tx_is_syncing(tx))
2813 return (0);
2814
2815 if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
2816 return (EXDEV);
2817
2818 /* compute origin's new unique space */
2819 snap = list_tail(&pa->clone_snaps);
2820 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
2821 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2822 origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
2823 &pa->unique, &unused, &unused);
2824
2825 /*
2826 * Walk the snapshots that we are moving
2827 *
2828 * Compute space to transfer. Consider the incremental changes
2829 * to used for each snapshot:
2830 * (my used) = (prev's used) + (blocks born) - (blocks killed)
2831 * So each snapshot gave birth to:
2832 * (blocks born) = (my used) - (prev's used) + (blocks killed)
2833 * So a sequence would look like:
2834 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
2835 * Which simplifies to:
2836 * uN + kN + kN-1 + ... + k1 + k0
2837 * Note however, if we stop before we reach the ORIGIN we get:
2838 * uN + kN + kN-1 + ... + kM - uM-1
2839 */
2840 pa->used = origin_ds->ds_phys->ds_referenced_bytes;
2841 pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
2842 pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
2843 for (snap = list_head(&pa->shared_snaps); snap;
2844 snap = list_next(&pa->shared_snaps, snap)) {
2845 uint64_t val, dlused, dlcomp, dluncomp;
2846 dsl_dataset_t *ds = snap->ds;
2847
2848 /* Check that the snapshot name does not conflict */
2849 VERIFY(0 == dsl_dataset_get_snapname(ds));
2850 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
2851 if (err == 0) {
2852 err = EEXIST;
2853 goto out;
2854 }
2855 if (err != ENOENT)
2856 goto out;
2857
2858 /* The very first snapshot does not have a deadlist */
2859 if (ds->ds_phys->ds_prev_snap_obj == 0)
2860 continue;
2861
2862 dsl_deadlist_space(&ds->ds_deadlist,
2863 &dlused, &dlcomp, &dluncomp);
2864 pa->used += dlused;
2865 pa->comp += dlcomp;
2866 pa->uncomp += dluncomp;
2867 }
2868
2869 /*
2870 * If we are a clone of a clone then we never reached ORIGIN,
2871 * so we need to subtract out the clone origin's used space.
2872 */
2873 if (pa->origin_origin) {
2874 pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
2875 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
2876 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
2877 }
2878
2879 /* Check that there is enough space and limit headroom here */
2880 err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
2881 origin_ds->ds_dir, pa->used, tx);
2882 if (err)
2883 return (err);
2884
2885 /*
2886 * Compute the amounts of space that will be used by snapshots
2887 * after the promotion (for both origin and clone). For each,
2888 * it is the amount of space that will be on all of their
2889 * deadlists (that was not born before their new origin).
2890 */
2891 if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2892 uint64_t space;
2893
2894 /*
2895 * Note, typically this will not be a clone of a clone,
2896 * so dd_origin_txg will be < TXG_INITIAL, so
2897 * these snaplist_space() -> dsl_deadlist_space_range()
2898 * calls will be fast because they do not have to
2899 * iterate over all bps.
2900 */
2901 snap = list_head(&pa->origin_snaps);
2902 err = snaplist_space(&pa->shared_snaps,
2903 snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
2904 if (err)
2905 return (err);
2906
2907 err = snaplist_space(&pa->clone_snaps,
2908 snap->ds->ds_dir->dd_origin_txg, &space);
2909 if (err)
2910 return (err);
2911 pa->cloneusedsnap += space;
2912 }
2913 if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2914 err = snaplist_space(&pa->origin_snaps,
2915 origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
2916 if (err)
2917 return (err);
2918 }
2919
2920 return (0);
2921 out:
2922 pa->err_ds = snap->ds->ds_snapname;
2923 return (err);
2924 }
2925
2926 static void
2927 dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
2928 {
2929 dsl_dataset_t *hds = arg1;
2930 struct promotearg *pa = arg2;
2931 struct promotenode *snap = list_head(&pa->shared_snaps);
2932 dsl_dataset_t *origin_ds = snap->ds;
2933 dsl_dataset_t *origin_head;
2934 dsl_dir_t *dd = hds->ds_dir;
2935 dsl_pool_t *dp = hds->ds_dir->dd_pool;
2936 dsl_dir_t *odd = NULL;
2937 uint64_t oldnext_obj;
2938 int64_t delta;
2939
2940 ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
2941
2942 snap = list_head(&pa->origin_snaps);
2943 origin_head = snap->ds;
2944
2945 /*
2946 * We need to explicitly open odd, since origin_ds's dd will be
2947 * changing.
2948 */
2949 VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
2950 NULL, FTAG, &odd));
2951
2952 /* change origin's next snap */
2953 dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
2954 oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
2955 snap = list_tail(&pa->clone_snaps);
2956 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
2957 origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;
2958
2959 /* change the origin's next clone */
2960 if (origin_ds->ds_phys->ds_next_clones_obj) {
2961 remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
2962 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
2963 origin_ds->ds_phys->ds_next_clones_obj,
2964 oldnext_obj, tx));
2965 }
2966
2967 /* change origin */
2968 dmu_buf_will_dirty(dd->dd_dbuf, tx);
2969 ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
2970 dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
2971 dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
2972 dmu_buf_will_dirty(odd->dd_dbuf, tx);
2973 odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
2974 origin_head->ds_dir->dd_origin_txg =
2975 origin_ds->ds_phys->ds_creation_txg;
2976
2977 /* change dd_clone entries */
2978 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2979 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
2980 odd->dd_phys->dd_clones, hds->ds_object, tx));
2981 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
2982 pa->origin_origin->ds_dir->dd_phys->dd_clones,
2983 hds->ds_object, tx));
2984
2985 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
2986 pa->origin_origin->ds_dir->dd_phys->dd_clones,
2987 origin_head->ds_object, tx));
2988 if (dd->dd_phys->dd_clones == 0) {
2989 dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
2990 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
2991 }
2992 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
2993 dd->dd_phys->dd_clones, origin_head->ds_object, tx));
2994
2995 }
2996
2997 /* move snapshots to this dir */
2998 for (snap = list_head(&pa->shared_snaps); snap;
2999 snap = list_next(&pa->shared_snaps, snap)) {
3000 dsl_dataset_t *ds = snap->ds;
3001
3002 /* unregister props as dsl_dir is changing */
3003 if (ds->ds_objset) {
3004 dmu_objset_evict(ds->ds_objset);
3005 ds->ds_objset = NULL;
3006 }
3007 /* move snap name entry */
3008 VERIFY(0 == dsl_dataset_get_snapname(ds));
3009 VERIFY(0 == dsl_dataset_snap_remove(origin_head,
3010 ds->ds_snapname, tx));
3011 VERIFY(0 == zap_add(dp->dp_meta_objset,
3012 hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
3013 8, 1, &ds->ds_object, tx));
3014 dsl_snapcount_adjust(hds->ds_dir, tx, 1, B_TRUE);
3015
3016 /* change containing dsl_dir */
3017 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3018 ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
3019 ds->ds_phys->ds_dir_obj = dd->dd_object;
3020 ASSERT3P(ds->ds_dir, ==, odd);
3021 dsl_dir_close(ds->ds_dir, ds);
3022 VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
3023 NULL, ds, &ds->ds_dir));
3024
3025 /* move any clone references */
3026 if (ds->ds_phys->ds_next_clones_obj &&
3027 spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
3028 zap_cursor_t zc;
3029 zap_attribute_t za;
3030
3031 for (zap_cursor_init(&zc, dp->dp_meta_objset,
3032 ds->ds_phys->ds_next_clones_obj);
3033 zap_cursor_retrieve(&zc, &za) == 0;
3034 zap_cursor_advance(&zc)) {
3035 dsl_dataset_t *cnds;
3036 uint64_t o;
3037
3038 if (za.za_first_integer == oldnext_obj) {
3039 /*
3040 * We've already moved the
3041 * origin's reference.
3042 */
3043 continue;
3044 }
3045
3046 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
3047 za.za_first_integer, FTAG, &cnds));
3048 o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
3049
3050 VERIFY3U(zap_remove_int(dp->dp_meta_objset,
3051 odd->dd_phys->dd_clones, o, tx), ==, 0);
3052 VERIFY3U(zap_add_int(dp->dp_meta_objset,
3053 dd->dd_phys->dd_clones, o, tx), ==, 0);
3054 dsl_dataset_rele(cnds, FTAG);
3055 }
3056 zap_cursor_fini(&zc);
3057 }
3058
3059 ASSERT0(dsl_prop_numcb(ds));
3060 }
3061
3062 /*
3063 * Change space accounting.
3064 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
3065 * both be valid, or both be 0 (resulting in delta == 0). This
3066 * is true for each of {clone,origin} independently.
3067 */
3068
3069 delta = pa->cloneusedsnap -
3070 dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
3071 ASSERT3S(delta, >=, 0);
3072 ASSERT3U(pa->used, >=, delta);
3073 dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
3074 dsl_dir_diduse_space(dd, DD_USED_HEAD,
3075 pa->used - delta, pa->comp, pa->uncomp, tx);
3076
3077 delta = pa->originusedsnap -
3078 odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
3079 ASSERT3S(delta, <=, 0);
3080 ASSERT3U(pa->used, >=, -delta);
3081 dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
3082 dsl_dir_diduse_space(odd, DD_USED_HEAD,
3083 -pa->used - delta, -pa->comp, -pa->uncomp, tx);
3084
3085 origin_ds->ds_phys->ds_unique_bytes = pa->unique;
3086
3087 /* log history record */
3088 spa_history_log_internal_ds(hds, "promote", tx, "");
3089
3090 dsl_dir_close(odd, FTAG);
3091 }
3092
3093 static char *snaplist_tag = "snaplist";
3094 /*
3095 * Make a list of dsl_dataset_t's for the snapshots between first_obj
3096 * (exclusive) and last_obj (inclusive). The list will be in reverse
3097 * order (last_obj will be the list_head()). If first_obj == 0, do all
3098 * snapshots back to this dataset's origin.
3099 */
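/*
 * For example (illustrative), snaplist_make(dp, B_FALSE, 0,
 * ds->ds_object, &l) holds ds and every snapshot newer than its origin
 * (the origin itself is excluded), with ds at the head of the list.
 */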
3100 static int
3101 snaplist_make(dsl_pool_t *dp, boolean_t own,
3102 uint64_t first_obj, uint64_t last_obj, list_t *l)
3103 {
3104 uint64_t obj = last_obj;
3105
3106 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
3107
3108 list_create(l, sizeof (struct promotenode),
3109 offsetof(struct promotenode, link));
3110
3111 while (obj != first_obj) {
3112 dsl_dataset_t *ds;
3113 struct promotenode *snap;
3114 int err;
3115
3116 if (own) {
3117 err = dsl_dataset_own_obj(dp, obj,
3118 0, snaplist_tag, &ds);
3119 if (err == 0)
3120 dsl_dataset_make_exclusive(ds, snaplist_tag);
3121 } else {
3122 err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds);
3123 }
3124 if (err == ENOENT) {
3125 /* lost race with snapshot destroy */
3126 struct promotenode *last = list_tail(l);
3127 ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj);
3128 obj = last->ds->ds_phys->ds_prev_snap_obj;
3129 continue;
3130 } else if (err) {
3131 return (err);
3132 }
3133
3134 if (first_obj == 0)
3135 first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
3136
3137 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
3138 snap->ds = ds;
3139 list_insert_tail(l, snap);
3140 obj = ds->ds_phys->ds_prev_snap_obj;
3141 }
3142
3143 return (0);
3144 }
3145
3146 static int
3147 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
3148 {
3149 struct promotenode *snap;
3150
3151 *spacep = 0;
3152 for (snap = list_head(l); snap; snap = list_next(l, snap)) {
3153 uint64_t used, comp, uncomp;
3154 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
3155 mintxg, UINT64_MAX, &used, &comp, &uncomp);
3156 *spacep += used;
3157 }
3158 return (0);
3159 }
3160
3161 static void
3162 snaplist_destroy(list_t *l, boolean_t own)
3163 {
3164 struct promotenode *snap;
3165
3166 if (!l || !list_link_active(&l->list_head))
3167 return;
3168
3169 while ((snap = list_tail(l)) != NULL) {
3170 list_remove(l, snap);
3171 if (own)
3172 dsl_dataset_disown(snap->ds, snaplist_tag);
3173 else
3174 dsl_dataset_rele(snap->ds, snaplist_tag);
3175 kmem_free(snap, sizeof (struct promotenode));
3176 }
3177 list_destroy(l);
3178 }
3179
3180 /*
3181 * Promote a clone. Nomenclature note:
3182 * "clone" or "cds": the original clone which is being promoted
3183 * "origin" or "ods": the snapshot which is originally clone's origin
3184 * "origin head" or "ohds": the dataset which is the head
3185 * (filesystem/volume) for the origin
3186 * "origin origin": the origin of the origin's filesystem (typically
3187 * NULL, indicating that the clone is not a clone of a clone).
3188 */
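/*
 * Typical usage (illustrative):
 *
 *	char confl[MAXNAMELEN];
 *	int error = dsl_dataset_promote("tank/clone", confl);
 *
 * On EEXIST, the snapshot name that conflicted with one already in the
 * clone is copied back through conflsnap.
 */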
3189 int
3190 dsl_dataset_promote(const char *name, char *conflsnap)
3191 {
3192 dsl_dataset_t *ds;
3193 dsl_dir_t *dd;
3194 dsl_pool_t *dp;
3195 dmu_object_info_t doi;
3196 struct promotearg pa = { 0 };
3197 struct promotenode *snap;
3198 int err;
3199
3200 err = dsl_dataset_hold(name, FTAG, &ds);
3201 if (err)
3202 return (err);
3203 dd = ds->ds_dir;
3204 dp = dd->dd_pool;
3205
3206 err = dmu_object_info(dp->dp_meta_objset,
3207 ds->ds_phys->ds_snapnames_zapobj, &doi);
3208 if (err) {
3209 dsl_dataset_rele(ds, FTAG);
3210 return (err);
3211 }
3212
3213 if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
3214 dsl_dataset_rele(ds, FTAG);
3215 return (EINVAL);
3216 }
3217
3218 /*
3219 * We are going to inherit all the snapshots taken before our
3220 * origin (i.e., our new origin will be our parent's origin).
3221 * Take ownership of them so that we can rename them into our
3222 * namespace.
3223 */
3224 rw_enter(&dp->dp_config_rwlock, RW_READER);
3225
3226 err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
3227 &pa.shared_snaps);
3228 if (err != 0)
3229 goto out;
3230
3231 err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
3232 if (err != 0)
3233 goto out;
3234
3235 snap = list_head(&pa.shared_snaps);
3236 ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
3237 err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
3238 snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
3239 if (err != 0)
3240 goto out;
3241
3242 if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
3243 err = dsl_dataset_hold_obj(dp,
3244 snap->ds->ds_dir->dd_phys->dd_origin_obj,
3245 FTAG, &pa.origin_origin);
3246 if (err != 0)
3247 goto out;
3248 }
3249
3250 out:
3251 rw_exit(&dp->dp_config_rwlock);
3252
3253 /*
3254 * Add in 128x the snapnames zapobj size, since we will be moving
3255 * a bunch of snapnames to the promoted ds, and dirtying their
3256 * bonus buffers.
3257 */
3258 if (err == 0) {
3259 err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
3260 dsl_dataset_promote_sync, ds, &pa,
3261 2 + 2 * doi.doi_physical_blocks_512);
3262 if (err && pa.err_ds && conflsnap)
3263 (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
3264 }
3265
3266 snaplist_destroy(&pa.shared_snaps, B_TRUE);
3267 snaplist_destroy(&pa.clone_snaps, B_FALSE);
3268 snaplist_destroy(&pa.origin_snaps, B_FALSE);
3269 if (pa.origin_origin)
3270 dsl_dataset_rele(pa.origin_origin, FTAG);
3271 dsl_dataset_rele(ds, FTAG);
3272 return (err);
3273 }
3274
3275 struct cloneswaparg {
3276 dsl_dataset_t *cds; /* clone dataset */
3277 dsl_dataset_t *ohds; /* origin's head dataset */
3278 boolean_t force;
3279 int64_t unused_refres_delta; /* change in unconsumed refreservation */
3280 };
3281
3282 /* ARGSUSED */
3283 static int
3284 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
3285 {
3286 struct cloneswaparg *csa = arg1;
3287
3288 /* they should both be heads */
3289 if (dsl_dataset_is_snapshot(csa->cds) ||
3290 dsl_dataset_is_snapshot(csa->ohds))
3291 return (EINVAL);
3292
3293 /* the branch point should be just before them */
3294 if (csa->cds->ds_prev != csa->ohds->ds_prev)
3295 return (EINVAL);
3296
3297 /* cds should be the clone (unless they are unrelated) */
3298 if (csa->cds->ds_prev != NULL &&
3299 csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
3300 csa->ohds->ds_object !=
3301 csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
3302 return (EINVAL);
3303
3304 /* the clone should be a child of the origin */
3305 if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
3306 return (EINVAL);
3307
3308 /* ohds shouldn't be modified unless 'force' */
3309 if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
3310 return (ETXTBSY);
3311
3312 /* adjust amount of any unconsumed refreservation */
3313 csa->unused_refres_delta =
3314 (int64_t)MIN(csa->ohds->ds_reserved,
3315 csa->ohds->ds_phys->ds_unique_bytes) -
3316 (int64_t)MIN(csa->ohds->ds_reserved,
3317 csa->cds->ds_phys->ds_unique_bytes);
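	/*
	 * Illustrative: with a 10G refreservation on ohds, ohds
	 * unique=4G and cds unique=1G, the delta is
	 * MIN(10G, 4G) - MIN(10G, 1G) = 3G; after the swap 3G more of
	 * the refreservation is unconsumed, so that much additional
	 * space must be available.
	 */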
3318
3319 if (csa->unused_refres_delta > 0 &&
3320 csa->unused_refres_delta >
3321 dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
3322 return (ENOSPC);
3323
3324 if (csa->ohds->ds_quota != 0 &&
3325 csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
3326 return (EDQUOT);
3327
3328 return (0);
3329 }
3330
3331 /* ARGSUSED */
3332 static void
3333 dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3334 {
3335 struct cloneswaparg *csa = arg1;
3336 dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
3337
3338 ASSERT(csa->cds->ds_reserved == 0);
3339 ASSERT(csa->ohds->ds_quota == 0 ||
3340 csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);
3341
3342 dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
3343 dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
3344
3345 if (csa->cds->ds_objset != NULL) {
3346 dmu_objset_evict(csa->cds->ds_objset);
3347 csa->cds->ds_objset = NULL;
3348 }
3349
3350 if (csa->ohds->ds_objset != NULL) {
3351 dmu_objset_evict(csa->ohds->ds_objset);
3352 csa->ohds->ds_objset = NULL;
3353 }
3354
3355 /*
3356 * Reset origin's unique bytes, if it exists.
3357 */
3358 if (csa->cds->ds_prev) {
3359 dsl_dataset_t *origin = csa->cds->ds_prev;
3360 uint64_t comp, uncomp;
3361
3362 dmu_buf_will_dirty(origin->ds_dbuf, tx);
3363 dsl_deadlist_space_range(&csa->cds->ds_deadlist,
3364 origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
3365 &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
3366 }
3367
3368 /* swap blkptrs */
3369 {
3370 blkptr_t tmp;
3371 tmp = csa->ohds->ds_phys->ds_bp;
3372 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
3373 csa->cds->ds_phys->ds_bp = tmp;
3374 }
3375
3376 /* set dd_*_bytes */
3377 {
3378 int64_t dused, dcomp, duncomp;
3379 uint64_t cdl_used, cdl_comp, cdl_uncomp;
3380 uint64_t odl_used, odl_comp, odl_uncomp;
3381
3382 ASSERT3U(csa->cds->ds_dir->dd_phys->
3383 dd_used_breakdown[DD_USED_SNAP], ==, 0);
3384
3385 dsl_deadlist_space(&csa->cds->ds_deadlist,
3386 &cdl_used, &cdl_comp, &cdl_uncomp);
3387 dsl_deadlist_space(&csa->ohds->ds_deadlist,
3388 &odl_used, &odl_comp, &odl_uncomp);
3389
3390 dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used -
3391 (csa->ohds->ds_phys->ds_referenced_bytes + odl_used);
3392 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
3393 (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
3394 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
3395 cdl_uncomp -
3396 (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
3397
3398 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
3399 dused, dcomp, duncomp, tx);
3400 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
3401 -dused, -dcomp, -duncomp, tx);
3402
3403 /*
3404 * The difference in the space used by snapshots is the
3405 * difference in snapshot space due to the head's
3406 * deadlist (since that's the only thing that's
3407 * changing that affects the snapused).
3408 */
3409 dsl_deadlist_space_range(&csa->cds->ds_deadlist,
3410 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3411 &cdl_used, &cdl_comp, &cdl_uncomp);
3412 dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
3413 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3414 &odl_used, &odl_comp, &odl_uncomp);
3415 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
3416 DD_USED_HEAD, DD_USED_SNAP, tx);
3417 }
3418
3419 /* swap ds_*_bytes */
3420 SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes,
3421 csa->cds->ds_phys->ds_referenced_bytes);
3422 SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
3423 csa->cds->ds_phys->ds_compressed_bytes);
3424 SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
3425 csa->cds->ds_phys->ds_uncompressed_bytes);
3426 SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
3427 csa->cds->ds_phys->ds_unique_bytes);
3428
3429 /* apply any parent delta for change in unconsumed refreservation */
3430 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
3431 csa->unused_refres_delta, 0, 0, tx);
3432
3433 /*
3434 * Swap deadlists.
3435 */
3436 dsl_deadlist_close(&csa->cds->ds_deadlist);
3437 dsl_deadlist_close(&csa->ohds->ds_deadlist);
3438 SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
3439 csa->cds->ds_phys->ds_deadlist_obj);
3440 dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
3441 csa->cds->ds_phys->ds_deadlist_obj);
3442 dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
3443 csa->ohds->ds_phys->ds_deadlist_obj);
3444
3445 dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
3446
3447 spa_history_log_internal_ds(csa->cds, "clone swap", tx,
3448 "parent=%s", csa->ohds->ds_dir->dd_myname);
3449 }
3450
3451 /*
 * Swap 'clone' with its origin head dataset. Used at the end of "zfs
3453 * recv" into an existing fs to swizzle the file system to the new
3454 * version, and by "zfs rollback". Can also be used to swap two
3455 * independent head datasets if neither has any snapshots.
3456 */
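/*
 * Sketch of a caller (illustrative); both datasets must already be
 * owned:
 *
 *	error = dsl_dataset_clone_swap(clone, origin_head, B_FALSE);
 *
 * With force == B_FALSE, the swap fails with ETXTBSY if origin_head
 * has been modified since its last snapshot.
 */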
3457 int
3458 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
3459 boolean_t force)
3460 {
3461 struct cloneswaparg csa;
3462 int error;
3463
3464 ASSERT(clone->ds_owner);
3465 ASSERT(origin_head->ds_owner);
3466 retry:
3467 /*
3468 * Need exclusive access for the swap. If we're swapping these
3469 * datasets back after an error, we already hold the locks.
3470 */
3471 if (!RW_WRITE_HELD(&clone->ds_rwlock))
3472 rw_enter(&clone->ds_rwlock, RW_WRITER);
3473 if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
3474 !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
3475 rw_exit(&clone->ds_rwlock);
3476 rw_enter(&origin_head->ds_rwlock, RW_WRITER);
3477 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
3478 rw_exit(&origin_head->ds_rwlock);
3479 goto retry;
3480 }
3481 }
3482 csa.cds = clone;
3483 csa.ohds = origin_head;
3484 csa.force = force;
3485 error = dsl_sync_task_do(clone->ds_dir->dd_pool,
3486 dsl_dataset_clone_swap_check,
3487 dsl_dataset_clone_swap_sync, &csa, NULL, 9);
3488 return (error);
3489 }
3490
3491 /*
3492 * Given a pool name and a dataset object number in that pool,
3493 * return the name of that dataset.
3494 */
3495 int
3496 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
3497 {
3498 spa_t *spa;
3499 dsl_pool_t *dp;
3500 dsl_dataset_t *ds;
3501 int error;
3502
3503 if ((error = spa_open(pname, &spa, FTAG)) != 0)
3504 return (error);
3505 dp = spa_get_dsl(spa);
3506 rw_enter(&dp->dp_config_rwlock, RW_READER);
3507 if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
3508 dsl_dataset_name(ds, buf);
3509 dsl_dataset_rele(ds, FTAG);
3510 }
3511 rw_exit(&dp->dp_config_rwlock);
3512 spa_close(spa, FTAG);
3513
3514 return (error);
3515 }
3516
3517 int
3518 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
3519 uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
3520 {
3521 int error = 0;
3522
3523 ASSERT3S(asize, >, 0);
3524
3525 /*
3526 * *ref_rsrv is the portion of asize that will come from any
3527 * unconsumed refreservation space.
3528 */
3529 *ref_rsrv = 0;
3530
3531 mutex_enter(&ds->ds_lock);
3532 /*
3533 * Make a space adjustment for reserved bytes.
3534 */
3535 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
3536 ASSERT3U(*used, >=,
3537 ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3538 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3539 *ref_rsrv =
3540 asize - MIN(asize, parent_delta(ds, asize + inflight));
3541 }
3542
3543 if (!check_quota || ds->ds_quota == 0) {
3544 mutex_exit(&ds->ds_lock);
3545 return (0);
3546 }
3547 /*
3548 * If they are requesting more space, and our current estimate
	 * is over quota, they get to try again unless the actual
	 * on-disk usage is over quota and there are no pending changes (which
3551 * may free up space for us).
3552 */
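	/*
	 * Illustrative: with refquota=10G, 9.5G referenced on disk and
	 * 1G inflight, the 10.5G estimate is over quota but pending
	 * frees may resolve it, so ERESTART lets the caller retry; with
	 * no inflight writes and referenced already at the quota, EDQUOT
	 * is final.
	 */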
3553 if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
3554 if (inflight > 0 ||
3555 ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
3556 error = ERESTART;
3557 else
3558 error = EDQUOT;
3559 }
3560 mutex_exit(&ds->ds_lock);
3561
3562 return (error);
3563 }
3564
3565 /* ARGSUSED */
3566 static int
3567 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
3568 {
3569 dsl_dataset_t *ds = arg1;
3570 dsl_prop_setarg_t *psa = arg2;
3571 int err;
3572
3573 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
3574 return (ENOTSUP);
3575
3576 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3577 return (err);
3578
3579 if (psa->psa_effective_value == 0)
3580 return (0);
3581
3582 if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
3583 psa->psa_effective_value < ds->ds_reserved)
3584 return (ENOSPC);
3585
3586 return (0);
3587 }
3588
3589 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
3590
3591 void
3592 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3593 {
3594 dsl_dataset_t *ds = arg1;
3595 dsl_prop_setarg_t *psa = arg2;
3596 uint64_t effective_value = psa->psa_effective_value;
3597
3598 dsl_prop_set_sync(ds, psa, tx);
3599 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3600
3601 if (ds->ds_quota != effective_value) {
3602 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3603 ds->ds_quota = effective_value;
3604 }
3605 }
3606
3607 int
3608 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
3609 {
3610 dsl_dataset_t *ds;
3611 dsl_prop_setarg_t psa;
3612 int err;
3613
	dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
3615
3616 err = dsl_dataset_hold(dsname, FTAG, &ds);
3617 if (err)
3618 return (err);
3619
3620 /*
3621 * If someone removes a file, then tries to set the quota, we
3622 * want to make sure the file freeing takes effect.
3623 */
3624 txg_wait_open(ds->ds_dir->dd_pool, 0);
3625
3626 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3627 dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
3628 ds, &psa, 0);
3629
3630 dsl_dataset_rele(ds, FTAG);
3631 return (err);
3632 }
3633
3634 static int
3635 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
3636 {
3637 dsl_dataset_t *ds = arg1;
3638 dsl_prop_setarg_t *psa = arg2;
3639 uint64_t effective_value;
3640 uint64_t unique;
3641 int err;
3642
3643 if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
3644 SPA_VERSION_REFRESERVATION)
3645 return (ENOTSUP);
3646
3647 if (dsl_dataset_is_snapshot(ds))
3648 return (EINVAL);
3649
3650 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3651 return (err);
3652
3653 effective_value = psa->psa_effective_value;
3654
3655 /*
3656 * If we are doing the preliminary check in open context, the
3657 * space estimates may be inaccurate.
3658 */
3659 if (!dmu_tx_is_syncing(tx))
3660 return (0);
3661
3662 mutex_enter(&ds->ds_lock);
3663 if (!DS_UNIQUE_IS_ACCURATE(ds))
3664 dsl_dataset_recalc_head_uniq(ds);
3665 unique = ds->ds_phys->ds_unique_bytes;
3666 mutex_exit(&ds->ds_lock);
3667
3668 if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
3669 uint64_t delta = MAX(unique, effective_value) -
3670 MAX(unique, ds->ds_reserved);
3671
3672 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
3673 return (ENOSPC);
3674 if (ds->ds_quota > 0 &&
3675 effective_value > ds->ds_quota)
3676 return (ENOSPC);
3677 }
3678
3679 return (0);
3680 }
3681
3682 static void
3683 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3684 {
3685 dsl_dataset_t *ds = arg1;
3686 dsl_prop_setarg_t *psa = arg2;
3687 uint64_t effective_value = psa->psa_effective_value;
3688 uint64_t unique;
3689 int64_t delta;
3690
3691 dsl_prop_set_sync(ds, psa, tx);
3692 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3693
3694 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3695
3696 mutex_enter(&ds->ds_dir->dd_lock);
3697 mutex_enter(&ds->ds_lock);
3698 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
3699 unique = ds->ds_phys->ds_unique_bytes;
3700 delta = MAX(0, (int64_t)(effective_value - unique)) -
3701 MAX(0, (int64_t)(ds->ds_reserved - unique));
3702 ds->ds_reserved = effective_value;
3703 mutex_exit(&ds->ds_lock);
3704
3705 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
3706 mutex_exit(&ds->ds_dir->dd_lock);
3707 }
3708
3709 int
3710 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
3711 uint64_t reservation)
3712 {
3713 dsl_dataset_t *ds;
3714 dsl_prop_setarg_t psa;
3715 int err;
3716
3717 dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
3718 &reservation);
3719
3720 err = dsl_dataset_hold(dsname, FTAG, &ds);
3721 if (err)
3722 return (err);
3723
3724 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3725 dsl_dataset_set_reservation_check,
3726 dsl_dataset_set_reservation_sync, ds, &psa, 0);
3727
3728 dsl_dataset_rele(ds, FTAG);
3729 return (err);
3730 }
3731
3732 typedef struct zfs_hold_cleanup_arg {
3733 dsl_pool_t *dp;
3734 uint64_t dsobj;
3735 char htag[MAXNAMELEN];
3736 } zfs_hold_cleanup_arg_t;
3737
3738 static void
3739 dsl_dataset_user_release_onexit(void *arg)
3740 {
3741 zfs_hold_cleanup_arg_t *ca = arg;
3742
3743 (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag,
3744 B_TRUE);
3745 kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
3746 }
3747
3748 void
3749 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
3750 minor_t minor)
3751 {
3752 zfs_hold_cleanup_arg_t *ca;
3753
3754 ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP);
3755 ca->dp = ds->ds_dir->dd_pool;
3756 ca->dsobj = ds->ds_object;
3757 (void) strlcpy(ca->htag, htag, sizeof (ca->htag));
3758 VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
3759 dsl_dataset_user_release_onexit, ca, NULL));
3760 }
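
/*
 * Example (illustrative sketch, not part of the original source): a
 * typical caller translates its cleanup file descriptor into an onexit
 * minor and then registers the hold for release at process exit.
 * "cleanup_fd", "ds" and the tag below are assumed to come from the
 * surrounding ioctl context.
 */
#if 0
	minor_t minor;

	if (zfs_onexit_fd_hold(cleanup_fd, &minor) == 0) {
		dsl_register_onexit_hold_cleanup(ds, ".send-12345", minor);
		zfs_onexit_fd_rele(cleanup_fd);
	}
#endif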
3761
3762 /*
3763 * If you add new checks here, you may need to add
3764 * additional checks to the "temporary" case in
3765 * snapshot_check() in dmu_objset.c.
3766 */
3767 static int
3768 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
3769 {
3770 dsl_dataset_t *ds = arg1;
3771 struct dsl_ds_holdarg *ha = arg2;
3772 const char *htag = ha->htag;
3773 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3774 int error = 0;
3775
3776 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3777 return (ENOTSUP);
3778
3779 if (!dsl_dataset_is_snapshot(ds))
3780 return (EINVAL);
3781
3782 /* tags must be unique */
3783 mutex_enter(&ds->ds_lock);
	if (ds->ds_phys->ds_userrefs_obj) {
		uint64_t tmp;

		/* probe into a throwaway buffer, not the tx */
		error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
		    8, 1, &tmp);
3787 if (error == 0)
3788 error = EEXIST;
3789 else if (error == ENOENT)
3790 error = 0;
3791 }
3792 mutex_exit(&ds->ds_lock);
3793
3794 if (error == 0 && ha->temphold &&
3795 strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
3796 error = E2BIG;
3797
3798 return (error);
3799 }
3800
3801 void
3802 dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3803 {
3804 dsl_dataset_t *ds = arg1;
3805 struct dsl_ds_holdarg *ha = arg2;
3806 const char *htag = ha->htag;
3807 dsl_pool_t *dp = ds->ds_dir->dd_pool;
3808 objset_t *mos = dp->dp_meta_objset;
3809 uint64_t now = gethrestime_sec();
3810 uint64_t zapobj;
3811
3812 mutex_enter(&ds->ds_lock);
3813 if (ds->ds_phys->ds_userrefs_obj == 0) {
3814 /*
3815 * This is the first user hold for this dataset. Create
3816 * the userrefs zap object.
3817 */
3818 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3819 zapobj = ds->ds_phys->ds_userrefs_obj =
3820 zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
3821 } else {
3822 zapobj = ds->ds_phys->ds_userrefs_obj;
3823 }
3824 ds->ds_userrefs++;
3825 mutex_exit(&ds->ds_lock);
3826
3827 VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
3828
3829 if (ha->temphold) {
3830 VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
3831 htag, &now, tx));
3832 }
3833
	spa_history_log_internal_ds(ds, "hold", tx,
	    "tag = %s temp = %d holds now = %llu", htag,
	    (int)ha->temphold, (u_longlong_t)ds->ds_userrefs);
3837 }
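
/*
 * Example of the resulting on-disk state (illustrative): after holding
 * tag "mytag" on a snapshot, its DMU_OT_USERREFS zap maps the tag to
 * the hold time, e.g. { "mytag" -> 1334870000 }.  For a temphold,
 * dsl_pool_user_hold() also records the hold in a pool-wide zap so
 * that a stale hold can be released at the next spa_load (see
 * dsl_dataset_user_release_tmp() below).
 */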
3838
3839 static int
3840 dsl_dataset_user_hold_one(const char *dsname, void *arg)
3841 {
3842 struct dsl_ds_holdarg *ha = arg;
3843 dsl_dataset_t *ds;
3844 int error;
3845 char *name;
3846
3847 /* alloc a buffer to hold dsname@snapname plus terminating NULL */
3848 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3849 error = dsl_dataset_hold(name, ha->dstg, &ds);
3850 strfree(name);
3851 if (error == 0) {
3852 ha->gotone = B_TRUE;
3853 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
3854 dsl_dataset_user_hold_sync, ds, ha, 0);
3855 } else if (error == ENOENT && ha->recursive) {
3856 error = 0;
3857 } else {
3858 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3859 }
3860 return (error);
3861 }
3862
3863 int
3864 dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
3865 boolean_t temphold)
3866 {
3867 struct dsl_ds_holdarg *ha;
3868 int error;
3869
3870 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3871 ha->htag = htag;
3872 ha->temphold = temphold;
3873 error = dsl_sync_task_do(ds->ds_dir->dd_pool,
3874 dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
3875 ds, ha, 0);
3876 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3877
3878 return (error);
3879 }
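
/*
 * Example (illustrative sketch, not part of the original source): a
 * sender could pin the snapshot it streams with a temporary hold so
 * that a concurrent "zfs destroy -d" defers the destroy instead of
 * removing the snapshot mid-stream.  The tag is hypothetical.
 */
#if 0
	char htag[] = ".send-hold";

	if (dsl_dataset_user_hold_for_send(ds, htag, B_TRUE) == 0) {
		/* ... stream the snapshot ... */
		(void) dsl_dataset_user_release_tmp(ds->ds_dir->dd_pool,
		    ds->ds_object, htag, B_FALSE);
	}
#endif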
3880
3881 int
3882 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
3883 boolean_t recursive, boolean_t temphold, int cleanup_fd)
3884 {
3885 struct dsl_ds_holdarg *ha;
3886 dsl_sync_task_t *dst;
3887 spa_t *spa;
3888 int error;
3889 minor_t minor = 0;
3890
3891 if (cleanup_fd != -1) {
3892 /* Currently we only support cleanup-on-exit of tempholds. */
3893 if (!temphold)
3894 return (EINVAL);
3895 error = zfs_onexit_fd_hold(cleanup_fd, &minor);
3896 if (error)
3897 return (error);
3898 }
3899
3900 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3901
3902 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3903
3904 error = spa_open(dsname, &spa, FTAG);
3905 if (error) {
3906 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3907 if (cleanup_fd != -1)
3908 zfs_onexit_fd_rele(cleanup_fd);
3909 return (error);
3910 }
3911
3912 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
3913 ha->htag = htag;
3914 ha->snapname = snapname;
3915 ha->recursive = recursive;
3916 ha->temphold = temphold;
3917
3918 if (recursive) {
3919 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
3920 ha, DS_FIND_CHILDREN);
3921 } else {
3922 error = dsl_dataset_user_hold_one(dsname, ha);
3923 }
3924 if (error == 0)
3925 error = dsl_sync_task_group_wait(ha->dstg);
3926
3927 for (dst = list_head(&ha->dstg->dstg_tasks); dst;
3928 dst = list_next(&ha->dstg->dstg_tasks, dst)) {
3929 dsl_dataset_t *ds = dst->dst_arg1;
3930
3931 if (dst->dst_err) {
3932 dsl_dataset_name(ds, ha->failed);
3933 *strchr(ha->failed, '@') = '\0';
3934 } else if (error == 0 && minor != 0 && temphold) {
3935 /*
3936 * If this hold is to be released upon process exit,
3937 * register that action now.
3938 */
3939 dsl_register_onexit_hold_cleanup(ds, htag, minor);
3940 }
3941 dsl_dataset_rele(ds, ha->dstg);
3942 }
3943
3944 if (error == 0 && recursive && !ha->gotone)
3945 error = ENOENT;
3946
3947 if (error)
3948 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
3949
3950 dsl_sync_task_group_destroy(ha->dstg);
3951
3952 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3953 spa_close(spa, FTAG);
3954 if (cleanup_fd != -1)
3955 zfs_onexit_fd_rele(cleanup_fd);
3956 return (error);
3957 }
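
/*
 * Example (illustrative sketch, not part of the original source):
 * "zfs hold -r mytag tank/fs@snap" reaches this function roughly as
 * below.  Note that dsname must be writable: on failure it is
 * overwritten with the name of the dataset that could not be held.
 */
#if 0
	char fsname[MAXNAMELEN];
	int error;

	(void) strlcpy(fsname, "tank/fs", sizeof (fsname));
	error = dsl_dataset_user_hold(fsname, "snap", "mytag",
	    B_TRUE, B_FALSE, -1);	/* recursive, permanent, no cleanup fd */
#endif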
3958
3959 struct dsl_ds_releasearg {
3960 dsl_dataset_t *ds;
3961 const char *htag;
3962 boolean_t own; /* do we own or just hold ds? */
3963 };
3964
3965 static int
3966 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
3967 boolean_t *might_destroy)
3968 {
3969 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3970 uint64_t zapobj;
3971 uint64_t tmp;
3972 int error;
3973
3974 *might_destroy = B_FALSE;
3975
3976 mutex_enter(&ds->ds_lock);
3977 zapobj = ds->ds_phys->ds_userrefs_obj;
3978 if (zapobj == 0) {
3979 /* The tag can't possibly exist */
3980 mutex_exit(&ds->ds_lock);
3981 return (ESRCH);
3982 }
3983
3984 /* Make sure the tag exists */
3985 error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
3986 if (error) {
3987 mutex_exit(&ds->ds_lock);
3988 if (error == ENOENT)
3989 error = ESRCH;
3990 return (error);
3991 }
3992
3993 if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
3994 DS_IS_DEFER_DESTROY(ds))
3995 *might_destroy = B_TRUE;
3996
3997 mutex_exit(&ds->ds_lock);
3998 return (0);
3999 }
4000
4001 static int
4002 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
4003 {
4004 struct dsl_ds_releasearg *ra = arg1;
4005 dsl_dataset_t *ds = ra->ds;
4006 boolean_t might_destroy;
4007 int error;
4008
4009 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
4010 return (ENOTSUP);
4011
4012 error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
4013 if (error)
4014 return (error);
4015
4016 if (might_destroy) {
4017 struct dsl_ds_destroyarg dsda = {0};
4018
4019 if (dmu_tx_is_syncing(tx)) {
4020 /*
4021 * If we're not prepared to remove the snapshot,
4022 * we can't allow the release to happen right now.
4023 */
4024 if (!ra->own)
4025 return (EBUSY);
4026 }
4027 dsda.ds = ds;
4028 dsda.releasing = B_TRUE;
4029 return (dsl_dataset_destroy_check(&dsda, tag, tx));
4030 }
4031
4032 return (0);
4033 }
4034
4035 static void
4036 dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
4037 {
4038 struct dsl_ds_releasearg *ra = arg1;
4039 dsl_dataset_t *ds = ra->ds;
4040 dsl_pool_t *dp = ds->ds_dir->dd_pool;
4041 objset_t *mos = dp->dp_meta_objset;
4042 uint64_t zapobj;
4043 uint64_t refs;
4044 int error;
4045
4046 mutex_enter(&ds->ds_lock);
4047 ds->ds_userrefs--;
4048 refs = ds->ds_userrefs;
4049 mutex_exit(&ds->ds_lock);
4050 error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
4051 VERIFY(error == 0 || error == ENOENT);
4052 zapobj = ds->ds_phys->ds_userrefs_obj;
4053 VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
4054
4055 spa_history_log_internal_ds(ds, "release", tx,
4056 "tag = %s refs now = %lld", ra->htag, (longlong_t)refs);
4057
4058 if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
4059 DS_IS_DEFER_DESTROY(ds)) {
4060 struct dsl_ds_destroyarg dsda = {0};
4061
4062 ASSERT(ra->own);
4063 dsda.ds = ds;
4064 dsda.releasing = B_TRUE;
4065 /* We already did the destroy_check */
4066 dsl_dataset_destroy_sync(&dsda, tag, tx);
4067 }
4068 }
4069
4070 static int
4071 dsl_dataset_user_release_one(const char *dsname, void *arg)
4072 {
4073 struct dsl_ds_holdarg *ha = arg;
4074 struct dsl_ds_releasearg *ra;
4075 dsl_dataset_t *ds;
4076 int error;
4077 void *dtag = ha->dstg;
4078 char *name;
4079 boolean_t own = B_FALSE;
4080 boolean_t might_destroy;
4081
4082 /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
4083 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
4084 error = dsl_dataset_hold(name, dtag, &ds);
4085 strfree(name);
4086 if (error == ENOENT && ha->recursive)
4087 return (0);
4088 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
4089 if (error)
4090 return (error);
4091
4092 ha->gotone = B_TRUE;
4093
4094 ASSERT(dsl_dataset_is_snapshot(ds));
4095
4096 error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
4097 if (error) {
4098 dsl_dataset_rele(ds, dtag);
4099 return (error);
4100 }
4101
4102 if (might_destroy) {
4103 #ifdef _KERNEL
4104 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
4105 error = zfs_unmount_snap(name, NULL);
4106 strfree(name);
4107 if (error) {
4108 dsl_dataset_rele(ds, dtag);
4109 return (error);
4110 }
4111 #endif
4112 if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
4113 dsl_dataset_rele(ds, dtag);
4114 return (EBUSY);
4115 } else {
4116 own = B_TRUE;
4117 dsl_dataset_make_exclusive(ds, dtag);
4118 }
4119 }
4120
4121 ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
4122 ra->ds = ds;
4123 ra->htag = ha->htag;
4124 ra->own = own;
4125 dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
4126 dsl_dataset_user_release_sync, ra, dtag, 0);
4127
4128 return (0);
4129 }
4130
4131 int
4132 dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
4133 boolean_t recursive)
4134 {
4135 struct dsl_ds_holdarg *ha;
4136 dsl_sync_task_t *dst;
4137 spa_t *spa;
4138 int error;
4139
4140 top:
4141 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
4142
4143 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
4144
4145 error = spa_open(dsname, &spa, FTAG);
4146 if (error) {
4147 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
4148 return (error);
4149 }
4150
4151 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
4152 ha->htag = htag;
4153 ha->snapname = snapname;
4154 ha->recursive = recursive;
4155 if (recursive) {
4156 error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
4157 ha, DS_FIND_CHILDREN);
4158 } else {
4159 error = dsl_dataset_user_release_one(dsname, ha);
4160 }
4161 if (error == 0)
4162 error = dsl_sync_task_group_wait(ha->dstg);
4163
4164 for (dst = list_head(&ha->dstg->dstg_tasks); dst;
4165 dst = list_next(&ha->dstg->dstg_tasks, dst)) {
4166 struct dsl_ds_releasearg *ra = dst->dst_arg1;
4167 dsl_dataset_t *ds = ra->ds;
4168
4169 if (dst->dst_err)
4170 dsl_dataset_name(ds, ha->failed);
4171
4172 if (ra->own)
4173 dsl_dataset_disown(ds, ha->dstg);
4174 else
4175 dsl_dataset_rele(ds, ha->dstg);
4176
4177 kmem_free(ra, sizeof (struct dsl_ds_releasearg));
4178 }
4179
4180 if (error == 0 && recursive && !ha->gotone)
4181 error = ENOENT;
4182
4183 if (error && error != EBUSY)
4184 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
4185
4186 dsl_sync_task_group_destroy(ha->dstg);
4187 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
4188 spa_close(spa, FTAG);
4189
4190 /*
4191 * We can get EBUSY if we were racing with deferred destroy and
4192 * dsl_dataset_user_release_check() hadn't done the necessary
4193 * open context setup. We can also get EBUSY if we're racing
4194 * with destroy and that thread is the ds_owner. Either way
4195 * the busy condition should be transient, and we should retry
4196 * the release operation.
4197 */
4198 if (error == EBUSY)
4199 goto top;
4200
4201 return (error);
4202 }
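
/*
 * Example (illustrative sketch, not part of the original source):
 * releasing the hold taken in the earlier example, as
 * "zfs release -r mytag tank/fs@snap" would.  If this was the last
 * reference on a deferred-destroy snapshot, the snapshot can be
 * destroyed as a side effect.
 */
#if 0
	char fsname[MAXNAMELEN];
	int error;

	(void) strlcpy(fsname, "tank/fs", sizeof (fsname));
	error = dsl_dataset_user_release(fsname, "snap", "mytag", B_TRUE);
#endif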
4203
4204 /*
4205 * Called at spa_load time (with retry == B_FALSE) to release a stale
4206 * temporary user hold. Also called by the onexit code (with retry == B_TRUE).
4207 */
4208 int
4209 dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
4210 boolean_t retry)
4211 {
4212 dsl_dataset_t *ds;
4213 char *snap;
4214 char *name;
4215 int namelen;
4216 int error;
4217
4218 do {
4219 rw_enter(&dp->dp_config_rwlock, RW_READER);
4220 error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
4221 rw_exit(&dp->dp_config_rwlock);
4222 if (error)
4223 return (error);
		namelen = dsl_dataset_namelen(ds) + 1;
4225 name = kmem_alloc(namelen, KM_SLEEP);
4226 dsl_dataset_name(ds, name);
4227 dsl_dataset_rele(ds, FTAG);
4228
4229 snap = strchr(name, '@');
4230 *snap = '\0';
4231 ++snap;
4232 error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
4233 kmem_free(name, namelen);
4234
4235 /*
4236 * The object can't have been destroyed because we have a hold,
4237 * but it might have been renamed, resulting in ENOENT. Retry
4238 * if we've been requested to do so.
4239 *
4240 * It would be nice if we could use the dsobj all the way
4241 * through and avoid ENOENT entirely. But we might need to
4242 * unmount the snapshot, and there's currently no way to lookup
4243 * a vfsp using a ZFS object id.
4244 */
4245 } while ((error == ENOENT) && retry);
4246
4247 return (error);
4248 }
4249
4250 int
4251 dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
4252 {
4253 dsl_dataset_t *ds;
4254 int err;
4255
4256 err = dsl_dataset_hold(dsname, FTAG, &ds);
4257 if (err)
4258 return (err);
4259
4260 VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
4261 if (ds->ds_phys->ds_userrefs_obj != 0) {
4262 zap_attribute_t *za;
4263 zap_cursor_t zc;
4264
4265 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
4266 for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
4267 ds->ds_phys->ds_userrefs_obj);
4268 zap_cursor_retrieve(&zc, za) == 0;
4269 zap_cursor_advance(&zc)) {
4270 VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
4271 za->za_first_integer));
4272 }
4273 zap_cursor_fini(&zc);
4274 kmem_free(za, sizeof (zap_attribute_t));
4275 }
4276 dsl_dataset_rele(ds, FTAG);
4277 return (0);
4278 }
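
/*
 * Example (illustrative sketch, not part of the original source):
 * consuming the nvlist built above.  Each pair maps a hold tag to its
 * creation time in seconds since the epoch; the dataset name is
 * hypothetical.
 */
#if 0
	nvlist_t *holds;
	nvpair_t *pair;

	if (dsl_dataset_get_holds("tank/fs@snap", &holds) == 0) {
		for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
		    pair = nvlist_next_nvpair(holds, pair)) {
			uint64_t when;

			VERIFY(0 == nvpair_value_uint64(pair, &when));
			dprintf("tag %s held since %llu\n",
			    nvpair_name(pair), (u_longlong_t)when);
		}
		nvlist_free(holds);
	}
#endif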
4279
4280 /*
4281 * Note, this function is used as the callback for dmu_objset_find(). We
4282 * always return 0 so that we will continue to find and process
4283 * inconsistent datasets, even if we encounter an error trying to
4284 * process one of them.
4285 */
4286 /* ARGSUSED */
4287 int
4288 dsl_destroy_inconsistent(const char *dsname, void *arg)
4289 {
4290 dsl_dataset_t *ds;
4291
4292 if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
4293 if (DS_IS_INCONSISTENT(ds))
4294 (void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
4295 else
4296 dsl_dataset_disown(ds, FTAG);
4297 }
4298 return (0);
4299 }
4300
4301 /*
 * Return (in *usedp) the amount of space written in new that is not
 * present in oldsnap.  New may be a snapshot or the head.  Old must be
 * a snapshot before new, in new's filesystem (or its origin); if not,
 * fail and return EINVAL.
 *
 * The written space is calculated from two components:  First, we ignore
 * any freed space and compute the written space as new's used space minus
 * old's used space.  Next, we add back the space that was freed between
 * the two snapshots, since that freeing reduced new's used space relative
 * to old's without reducing what new actually wrote.  Specifically, this
 * is the space that was born before old->ds_creation_txg, and freed
 * before new (ie. on new's deadlist or a previous deadlist).
4313 *
4314 * space freed [---------------------]
4315 * snapshots ---O-------O--------O-------O------
4316 * oldsnap new
4317 */
4318 int
4319 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
4320 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4321 {
4322 int err = 0;
4323 uint64_t snapobj;
4324 dsl_pool_t *dp = new->ds_dir->dd_pool;
4325
4326 *usedp = 0;
4327 *usedp += new->ds_phys->ds_referenced_bytes;
4328 *usedp -= oldsnap->ds_phys->ds_referenced_bytes;
4329
4330 *compp = 0;
4331 *compp += new->ds_phys->ds_compressed_bytes;
4332 *compp -= oldsnap->ds_phys->ds_compressed_bytes;
4333
4334 *uncompp = 0;
4335 *uncompp += new->ds_phys->ds_uncompressed_bytes;
4336 *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
4337
4338 rw_enter(&dp->dp_config_rwlock, RW_READER);
4339 snapobj = new->ds_object;
4340 while (snapobj != oldsnap->ds_object) {
4341 dsl_dataset_t *snap;
4342 uint64_t used, comp, uncomp;
4343
4344 if (snapobj == new->ds_object) {
4345 snap = new;
4346 } else {
4347 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
4348 if (err != 0)
4349 break;
4350 }
4351
4352 if (snap->ds_phys->ds_prev_snap_txg ==
4353 oldsnap->ds_phys->ds_creation_txg) {
4354 /*
4355 * The blocks in the deadlist can not be born after
4356 * ds_prev_snap_txg, so get the whole deadlist space,
4357 * which is more efficient (especially for old-format
4358 * deadlists). Unfortunately the deadlist code
4359 * doesn't have enough information to make this
4360 * optimization itself.
4361 */
4362 dsl_deadlist_space(&snap->ds_deadlist,
4363 &used, &comp, &uncomp);
4364 } else {
4365 dsl_deadlist_space_range(&snap->ds_deadlist,
4366 0, oldsnap->ds_phys->ds_creation_txg,
4367 &used, &comp, &uncomp);
4368 }
4369 *usedp += used;
4370 *compp += comp;
4371 *uncompp += uncomp;
4372
4373 /*
4374 * If we get to the beginning of the chain of snapshots
4375 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
4376 * was not a snapshot of/before new.
4377 */
4378 snapobj = snap->ds_phys->ds_prev_snap_obj;
4379 if (snap != new)
4380 dsl_dataset_rele(snap, FTAG);
4381 if (snapobj == 0) {
4382 err = EINVAL;
4383 break;
4384 }
	}
4387 rw_exit(&dp->dp_config_rwlock);
4388 return (err);
4389 }
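
/*
 * Worked example for dsl_dataset_space_written() (illustrative
 * numbers): suppose oldsnap references 10G, new references 12G, and 3G
 * of data born before oldsnap's creation txg was freed between the two
 * (so it falls in the deadlist range [0, oldsnap creation txg]).  Then
 *
 *	written = (12G - 10G) + 3G = 5G
 *
 * The naive difference of referenced space alone would report 2G,
 * missing the 3G that new wrote in place of data it no longer shares
 * with oldsnap.
 */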
4390
4391 /*
4392 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
4393 * lastsnap, and all snapshots in between are deleted.
4394 *
4395 * blocks that would be freed [---------------------------]
4396 * snapshots ---O-------O--------O-------O--------O
4397 * firstsnap lastsnap
4398 *
 * This is the set of blocks that were born after the snap before firstsnap
 * (birth > firstsnap->prev_snap_txg) and died before the snap after the
 * last snap (ie, on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
4402 * We calculate this by iterating over the relevant deadlists (from the snap
4403 * after lastsnap, backward to the snap after firstsnap), summing up the
4404 * space on the deadlist that was born after the snap before firstsnap.
4405 */
4406 int
dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *lastsnap,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4410 {
4411 int err = 0;
4412 uint64_t snapobj;
4413 dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
4414
4415 ASSERT(dsl_dataset_is_snapshot(firstsnap));
4416 ASSERT(dsl_dataset_is_snapshot(lastsnap));
4417
4418 /*
4419 * Check that the snapshots are in the same dsl_dir, and firstsnap
4420 * is before lastsnap.
4421 */
4422 if (firstsnap->ds_dir != lastsnap->ds_dir ||
4423 firstsnap->ds_phys->ds_creation_txg >
4424 lastsnap->ds_phys->ds_creation_txg)
4425 return (EINVAL);
4426
4427 *usedp = *compp = *uncompp = 0;
4428
4429 rw_enter(&dp->dp_config_rwlock, RW_READER);
4430 snapobj = lastsnap->ds_phys->ds_next_snap_obj;
4431 while (snapobj != firstsnap->ds_object) {
4432 dsl_dataset_t *ds;
4433 uint64_t used, comp, uncomp;
4434
4435 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
4436 if (err != 0)
4437 break;
4438
4439 dsl_deadlist_space_range(&ds->ds_deadlist,
4440 firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
4441 &used, &comp, &uncomp);
4442 *usedp += used;
4443 *compp += comp;
4444 *uncompp += uncomp;
4445
4446 snapobj = ds->ds_phys->ds_prev_snap_obj;
4447 ASSERT3U(snapobj, !=, 0);
4448 dsl_dataset_rele(ds, FTAG);
4449 }
4450 rw_exit(&dp->dp_config_rwlock);
4451 return (err);
4452 }
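
/*
 * Worked example for dsl_dataset_space_wouldfree(): with snapshots
 * ---@prev---@a---@b---@next--- and a call with firstsnap = @a and
 * lastsnap = @b, the loop above walks the deadlists of @next and @b
 * (stopping when it reaches @a) and sums only the entries born after
 * @prev's creation txg.  Blocks still visible in @prev are excluded by
 * that birth-time filter, so the total is the space that destroying
 * @a and @b would actually return to the pool.
 */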