1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  25  */
  26 
  27 #include <sys/dmu_objset.h>
  28 #include <sys/dsl_dataset.h>
  29 #include <sys/dsl_dir.h>
  30 #include <sys/dsl_prop.h>
  31 #include <sys/dsl_synctask.h>
  32 #include <sys/dmu_traverse.h>
  33 #include <sys/dmu_impl.h>
  34 #include <sys/dmu_tx.h>
  35 #include <sys/arc.h>
  36 #include <sys/zio.h>
  37 #include <sys/zap.h>
  38 #include <sys/zfeature.h>
  39 #include <sys/unique.h>
  40 #include <sys/zfs_context.h>
  41 #include <sys/zfs_ioctl.h>
  42 #include <sys/spa.h>
  43 #include <sys/zfs_znode.h>
  44 #include <sys/zfs_onexit.h>
  45 #include <sys/zvol.h>
  46 #include <sys/dsl_scan.h>
  47 #include <sys/dsl_deadlist.h>
  48 
/*
 * Sentinel stored in a dataset's ds_owner field.  The code in this file
 * compares ds_owner against this pointer (see DSL_DATASET_IS_DESTROYED());
 * the string contents themselves are not examined.
 */
static char *dsl_reaper = "the grim reaper";

/*
 * Forward declarations of check/sync functions (typed dsl_checkfunc_t and
 * dsl_syncfunc_t).  Their definitions are outside this section of the file.
 */
static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
  54 
  55 #define SWITCH64(x, y) \
  56         { \
  57                 uint64_t __tmp = (x); \
  58                 (x) = (y); \
  59                 (y) = __tmp; \
  60         }
  61 
/*
 * 2^62.  Presumably an upper bound on a dataset reference count; its users
 * are outside this section of the file (TODO confirm).
 */
#define DS_REF_MAX      (1ULL << 62)

/* Deadlist block size: defined as the SPA's maximum block size. */
#define DSL_DEADLIST_BLOCKSIZE  SPA_MAXBLOCKSIZE

/*
 * True when the dataset's owner is the dsl_reaper sentinel.  Per the
 * comment in dsl_dataset_hold_ref(), the destroy thread sets ds_owner to
 * dsl_reaper once destruction is assured.
 */
#define DSL_DATASET_IS_DESTROYED(ds)    ((ds)->ds_owner == dsl_reaper)
  67 
  68 
  69 /*
  70  * Figure out how much of this delta should be propogated to the dsl_dir
  71  * layer.  If there's a refreservation, that space has already been
  72  * partially accounted for in our ancestors.
  73  */
  74 static int64_t
  75 parent_delta(dsl_dataset_t *ds, int64_t delta)
  76 {
  77         uint64_t old_bytes, new_bytes;
  78 
  79         if (ds->ds_reserved == 0)
  80                 return (delta);
  81 
  82         old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
  83         new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
  84 
  85         ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
  86         return (new_bytes - old_bytes);
  87 }
  88 
  89 void
  90 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
  91 {
  92         int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
  93         int compressed = BP_GET_PSIZE(bp);
  94         int uncompressed = BP_GET_UCSIZE(bp);
  95         int64_t delta;
  96 
  97         dprintf_bp(bp, "ds=%p", ds);
  98 
  99         ASSERT(dmu_tx_is_syncing(tx));
 100         /* It could have been compressed away to nothing */
 101         if (BP_IS_HOLE(bp))
 102                 return;
 103         ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
 104         ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
 105         if (ds == NULL) {
 106                 /*
 107                  * Account for the meta-objset space in its placeholder
 108                  * dsl_dir.
 109                  */
 110                 ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
 111                 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
 112                     used, compressed, uncompressed, tx);
 113                 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
 114                 return;
 115         }
 116         dmu_buf_will_dirty(ds->ds_dbuf, tx);
 117 
 118         mutex_enter(&ds->ds_dir->dd_lock);
 119         mutex_enter(&ds->ds_lock);
 120         delta = parent_delta(ds, used);
 121         ds->ds_phys->ds_referenced_bytes += used;
 122         ds->ds_phys->ds_compressed_bytes += compressed;
 123         ds->ds_phys->ds_uncompressed_bytes += uncompressed;
 124         ds->ds_phys->ds_unique_bytes += used;
 125         mutex_exit(&ds->ds_lock);
 126         dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
 127             compressed, uncompressed, tx);
 128         dsl_dir_transfer_space(ds->ds_dir, used - delta,
 129             DD_USED_REFRSRV, DD_USED_HEAD, tx);
 130         mutex_exit(&ds->ds_dir->dd_lock);
 131 }
 132 
/*
 * Account for a block that a dataset no longer references (syncing context
 * only).
 *
 *      ds      head dataset giving up the block (asserted not to be a
 *              snapshot), or NULL when the block belongs to the meta-objset
 *      bp      block pointer being released; holes are ignored
 *      tx      syncing transaction
 *      async   when true, a block that must go on the deadlist is appended
 *              to ds_pending_deadlist instead of being inserted directly
 *
 * Returns 0 for a hole, otherwise the block's dsize ("used") in bytes.
 *
 * A block born after the dataset's previous snapshot is freed immediately;
 * an older block is still referenced by a snapshot and is placed on the
 * deadlist instead.
 */
int
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
    boolean_t async)
{
        /* A hole consumes no space, so there is nothing to account for. */
        if (BP_IS_HOLE(bp))
                return (0);

        ASSERT(dmu_tx_is_syncing(tx));
        ASSERT(bp->blk_birth <= tx->tx_txg);

        int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
        int compressed = BP_GET_PSIZE(bp);
        int uncompressed = BP_GET_UCSIZE(bp);

        ASSERT(used > 0);
        if (ds == NULL) {
                /*
                 * Account for the meta-objset space in its placeholder
                 * dataset.
                 */
                dsl_free(tx->tx_pool, tx->tx_txg, bp);

                dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
                    -used, -compressed, -uncompressed, tx);
                dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
                return (used);
        }
        ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

        ASSERT(!dsl_dataset_is_snapshot(ds));
        dmu_buf_will_dirty(ds->ds_dbuf, tx);

        if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
                /*
                 * Born after the previous snapshot: free the block now
                 * and take it out of this dataset's unique space.
                 */
                int64_t delta;

                dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
                dsl_free(tx->tx_pool, tx->tx_txg, bp);

                /* dd_lock is taken before ds_lock and released after it. */
                mutex_enter(&ds->ds_dir->dd_lock);
                mutex_enter(&ds->ds_lock);
                ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
                    !DS_UNIQUE_IS_ACCURATE(ds));
                /* parent_delta() must see the pre-decrement unique bytes. */
                delta = parent_delta(ds, -used);
                ds->ds_phys->ds_unique_bytes -= used;
                mutex_exit(&ds->ds_lock);
                dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
                    delta, -compressed, -uncompressed, tx);
                dsl_dir_transfer_space(ds->ds_dir, -used - delta,
                    DD_USED_REFRSRV, DD_USED_HEAD, tx);
                mutex_exit(&ds->ds_dir->dd_lock);
        } else {
                /*
                 * Born at or before the previous snapshot: record the
                 * block on the deadlist rather than freeing it.
                 */
                dprintf_bp(bp, "putting on dead list: %s", "");
                if (async) {
                        /*
                         * We are here as part of zio's write done callback,
                         * which means we're a zio interrupt thread.  We can't
                         * call dsl_deadlist_insert() now because it may block
                         * waiting for I/O.  Instead, put bp on the deferred
                         * queue and let dsl_pool_sync() finish the job.
                         */
                        bplist_append(&ds->ds_pending_deadlist, bp);
                } else {
                        dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
                }
                ASSERT3U(ds->ds_prev->ds_object, ==,
                    ds->ds_phys->ds_prev_snap_obj);
                ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
                /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
                if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
                    ds->ds_object && bp->blk_birth >
                    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
                        dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
                        mutex_enter(&ds->ds_prev->ds_lock);
                        ds->ds_prev->ds_phys->ds_unique_bytes += used;
                        mutex_exit(&ds->ds_prev->ds_lock);
                }
                /* Space born after the origin moves from HEAD to SNAP. */
                if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
                        dsl_dir_transfer_space(ds->ds_dir, used,
                            DD_USED_HEAD, DD_USED_SNAP, tx);
                }
        }
        /* In both cases the head dataset stops referencing the block. */
        mutex_enter(&ds->ds_lock);
        ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
        ds->ds_phys->ds_referenced_bytes -= used;
        ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
        ds->ds_phys->ds_compressed_bytes -= compressed;
        ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
        ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
        mutex_exit(&ds->ds_lock);

        return (used);
}
 225 
 226 uint64_t
 227 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
 228 {
 229         uint64_t trysnap = 0;
 230 
 231         if (ds == NULL)
 232                 return (0);
 233         /*
 234          * The snapshot creation could fail, but that would cause an
 235          * incorrect FALSE return, which would only result in an
 236          * overestimation of the amount of space that an operation would
 237          * consume, which is OK.
 238          *
 239          * There's also a small window where we could miss a pending
 240          * snapshot, because we could set the sync task in the quiescing
 241          * phase.  So this should only be used as a guess.
 242          */
 243         if (ds->ds_trysnap_txg >
 244             spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
 245                 trysnap = ds->ds_trysnap_txg;
 246         return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
 247 }
 248 
 249 boolean_t
 250 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
 251     uint64_t blk_birth)
 252 {
 253         if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
 254                 return (B_FALSE);
 255 
 256         ddt_prefetch(dsl_dataset_get_spa(ds), bp);
 257 
 258         return (B_TRUE);
 259 }
 260 
 261 /* ARGSUSED */
 262 static void
 263 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
 264 {
 265         dsl_dataset_t *ds = dsv;
 266 
 267         ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
 268 
 269         unique_remove(ds->ds_fsid_guid);
 270 
 271         if (ds->ds_objset != NULL)
 272                 dmu_objset_evict(ds->ds_objset);
 273 
 274         if (ds->ds_prev) {
 275                 dsl_dataset_drop_ref(ds->ds_prev, ds);
 276                 ds->ds_prev = NULL;
 277         }
 278 
 279         bplist_destroy(&ds->ds_pending_deadlist);
 280         if (db != NULL) {
 281                 dsl_deadlist_close(&ds->ds_deadlist);
 282         } else {
 283                 ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
 284                 ASSERT(!ds->ds_deadlist.dl_oldfmt);
 285         }
 286         if (ds->ds_dir)
 287                 dsl_dir_close(ds->ds_dir, ds);
 288 
 289         ASSERT(!list_link_active(&ds->ds_synced_link));
 290 
 291         mutex_destroy(&ds->ds_lock);
 292         mutex_destroy(&ds->ds_recvlock);
 293         mutex_destroy(&ds->ds_opening_lock);
 294         rw_destroy(&ds->ds_rwlock);
 295         cv_destroy(&ds->ds_exclusive_cv);
 296 
 297         kmem_free(ds, sizeof (dsl_dataset_t));
 298 }
 299 
 300 static int
 301 dsl_dataset_get_snapname(dsl_dataset_t *ds)
 302 {
 303         dsl_dataset_phys_t *headphys;
 304         int err;
 305         dmu_buf_t *headdbuf;
 306         dsl_pool_t *dp = ds->ds_dir->dd_pool;
 307         objset_t *mos = dp->dp_meta_objset;
 308 
 309         if (ds->ds_snapname[0])
 310                 return (0);
 311         if (ds->ds_phys->ds_next_snap_obj == 0)
 312                 return (0);
 313 
 314         err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
 315             FTAG, &headdbuf);
 316         if (err)
 317                 return (err);
 318         headphys = headdbuf->db_data;
 319         err = zap_value_search(dp->dp_meta_objset,
 320             headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
 321         dmu_buf_rele(headdbuf, FTAG);
 322         return (err);
 323 }
 324 
 325 static int
 326 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
 327 {
 328         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 329         uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
 330         matchtype_t mt;
 331         int err;
 332 
 333         if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
 334                 mt = MT_FIRST;
 335         else
 336                 mt = MT_EXACT;
 337 
 338         err = zap_lookup_norm(mos, snapobj, name, 8, 1,
 339             value, mt, NULL, 0, NULL);
 340         if (err == ENOTSUP && mt == MT_FIRST)
 341                 err = zap_lookup(mos, snapobj, name, 8, 1, value);
 342         return (err);
 343 }
 344 
 345 static int
 346 dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
 347 {
 348         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 349         uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
 350         matchtype_t mt;
 351         int err;
 352 
 353         dsl_dir_snap_cmtime_update(ds->ds_dir);
 354 
 355         if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
 356                 mt = MT_FIRST;
 357         else
 358                 mt = MT_EXACT;
 359 
 360         err = zap_remove_norm(mos, snapobj, name, mt, tx);
 361         if (err == ENOTSUP && mt == MT_FIRST)
 362                 err = zap_remove(mos, snapobj, name, tx);
 363         return (err);
 364 }
 365 
 366 static int
 367 dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 368     dsl_dataset_t **dsp)
 369 {
 370         objset_t *mos = dp->dp_meta_objset;
 371         dmu_buf_t *dbuf;
 372         dsl_dataset_t *ds;
 373         int err;
 374         dmu_object_info_t doi;
 375 
 376         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
 377             dsl_pool_sync_context(dp));
 378 
 379         err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
 380         if (err)
 381                 return (err);
 382 
 383         /* Make sure dsobj has the correct object type. */
 384         dmu_object_info_from_db(dbuf, &doi);
 385         if (doi.doi_type != DMU_OT_DSL_DATASET)
 386                 return (EINVAL);
 387 
 388         ds = dmu_buf_get_user(dbuf);
 389         if (ds == NULL) {
 390                 dsl_dataset_t *winner;
 391 
 392                 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
 393                 ds->ds_dbuf = dbuf;
 394                 ds->ds_object = dsobj;
 395                 ds->ds_phys = dbuf->db_data;
 396 
 397                 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
 398                 mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
 399                 mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
 400                 mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
 401 
 402                 rw_init(&ds->ds_rwlock, 0, 0, 0);
 403                 cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
 404 
 405                 bplist_create(&ds->ds_pending_deadlist);
 406                 dsl_deadlist_open(&ds->ds_deadlist,
 407                     mos, ds->ds_phys->ds_deadlist_obj);
 408 
 409                 list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
 410                     offsetof(dmu_sendarg_t, dsa_link));
 411 
 412                 if (err == 0) {
 413                         err = dsl_dir_open_obj(dp,
 414                             ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
 415                 }
 416                 if (err) {
 417                         mutex_destroy(&ds->ds_lock);
 418                         mutex_destroy(&ds->ds_recvlock);
 419                         mutex_destroy(&ds->ds_opening_lock);
 420                         rw_destroy(&ds->ds_rwlock);
 421                         cv_destroy(&ds->ds_exclusive_cv);
 422                         bplist_destroy(&ds->ds_pending_deadlist);
 423                         dsl_deadlist_close(&ds->ds_deadlist);
 424                         kmem_free(ds, sizeof (dsl_dataset_t));
 425                         dmu_buf_rele(dbuf, tag);
 426                         return (err);
 427                 }
 428 
 429                 if (!dsl_dataset_is_snapshot(ds)) {
 430                         ds->ds_snapname[0] = '\0';
 431                         if (ds->ds_phys->ds_prev_snap_obj) {
 432                                 err = dsl_dataset_get_ref(dp,
 433                                     ds->ds_phys->ds_prev_snap_obj,
 434                                     ds, &ds->ds_prev);
 435                         }
 436                 } else {
 437                         if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
 438                                 err = dsl_dataset_get_snapname(ds);
 439                         if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
 440                                 err = zap_count(
 441                                     ds->ds_dir->dd_pool->dp_meta_objset,
 442                                     ds->ds_phys->ds_userrefs_obj,
 443                                     &ds->ds_userrefs);
 444                         }
 445                 }
 446 
 447                 if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
 448                         /*
 449                          * In sync context, we're called with either no lock
 450                          * or with the write lock.  If we're not syncing,
 451                          * we're always called with the read lock held.
 452                          */
 453                         boolean_t need_lock =
 454                             !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
 455                             dsl_pool_sync_context(dp);
 456 
 457                         if (need_lock)
 458                                 rw_enter(&dp->dp_config_rwlock, RW_READER);
 459 
 460                         err = dsl_prop_get_ds(ds,
 461                             "refreservation", sizeof (uint64_t), 1,
 462                             &ds->ds_reserved, NULL);
 463                         if (err == 0) {
 464                                 err = dsl_prop_get_ds(ds,
 465                                     "refquota", sizeof (uint64_t), 1,
 466                                     &ds->ds_quota, NULL);
 467                         }
 468 
 469                         if (need_lock)
 470                                 rw_exit(&dp->dp_config_rwlock);
 471                 } else {
 472                         ds->ds_reserved = ds->ds_quota = 0;
 473                 }
 474 
 475                 if (err == 0) {
 476                         winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
 477                             dsl_dataset_evict);
 478                 }
 479                 if (err || winner) {
 480                         bplist_destroy(&ds->ds_pending_deadlist);
 481                         dsl_deadlist_close(&ds->ds_deadlist);
 482                         if (ds->ds_prev)
 483                                 dsl_dataset_drop_ref(ds->ds_prev, ds);
 484                         dsl_dir_close(ds->ds_dir, ds);
 485                         mutex_destroy(&ds->ds_lock);
 486                         mutex_destroy(&ds->ds_recvlock);
 487                         mutex_destroy(&ds->ds_opening_lock);
 488                         rw_destroy(&ds->ds_rwlock);
 489                         cv_destroy(&ds->ds_exclusive_cv);
 490                         kmem_free(ds, sizeof (dsl_dataset_t));
 491                         if (err) {
 492                                 dmu_buf_rele(dbuf, tag);
 493                                 return (err);
 494                         }
 495                         ds = winner;
 496                 } else {
 497                         ds->ds_fsid_guid =
 498                             unique_insert(ds->ds_phys->ds_fsid_guid);
 499                 }
 500         }
 501         ASSERT3P(ds->ds_dbuf, ==, dbuf);
 502         ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
 503         ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
 504             spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
 505             dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
 506         mutex_enter(&ds->ds_lock);
 507         if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
 508                 mutex_exit(&ds->ds_lock);
 509                 dmu_buf_rele(ds->ds_dbuf, tag);
 510                 return (ENOENT);
 511         }
 512         mutex_exit(&ds->ds_lock);
 513         *dsp = ds;
 514         return (0);
 515 }
 516 
/*
 * Turn a bare reference on ds into a hold by acquiring ds_rwlock as READER.
 *
 *      ds      dataset on which the caller already has a reference
 *      tag     tag of that reference; used to drop it on failure
 *
 * Returns 0 on success.  In syncing context nothing is locked and 0 is
 * returned immediately.  Returns ENOENT if the dataset turns out to be
 * destroyed while we wait for the lock; in that case the caller's
 * reference has been dropped.
 *
 * Outside syncing context this function may release and re-acquire
 * dp_config_rwlock (as READER) while waiting, so the caller is expected to
 * hold it on entry (it is asserted not to be write-held); it is held again
 * on return, whether the call succeeded or failed.
 */
static int
dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
{
        dsl_pool_t *dp = ds->ds_dir->dd_pool;

        /*
         * In syncing context we don't want the rwlock lock: there
         * may be an existing writer waiting for sync phase to
         * finish.  We don't need to worry about such writers, since
         * sync phase is single-threaded, so the writer can't be
         * doing anything while we are active.
         */
        if (dsl_pool_sync_context(dp)) {
                ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
                return (0);
        }

        /*
         * Normal users will hold the ds_rwlock as a READER until they
         * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
         * drop their READER lock after they set the ds_owner field.
         *
         * If the dataset is being destroyed, the destroy thread will
         * obtain a WRITER lock for exclusive access after it's done its
         * open-context work and then change the ds_owner to
         * dsl_reaper once destruction is assured.  So threads
         * may block here temporarily, until the "destructability" of
         * the dataset is determined.
         */
        ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
        mutex_enter(&ds->ds_lock);
        while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
                /* Don't sleep while holding the pool config lock. */
                rw_exit(&dp->dp_config_rwlock);
                cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
                if (DSL_DATASET_IS_DESTROYED(ds)) {
                        mutex_exit(&ds->ds_lock);
                        dsl_dataset_drop_ref(ds, tag);
                        /* Restore the caller's config lock before failing. */
                        rw_enter(&dp->dp_config_rwlock, RW_READER);
                        return (ENOENT);
                }
                /*
                 * The dp_config_rwlock lives above the ds_lock. And
                 * we need to check DSL_DATASET_IS_DESTROYED() while
                 * holding the ds_lock, so we have to drop and reacquire
                 * the ds_lock here.
                 */
                mutex_exit(&ds->ds_lock);
                rw_enter(&dp->dp_config_rwlock, RW_READER);
                mutex_enter(&ds->ds_lock);
        }
        mutex_exit(&ds->ds_lock);
        return (0);
}
 570 
 571 int
 572 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 573     dsl_dataset_t **dsp)
 574 {
 575         int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
 576 
 577         if (err)
 578                 return (err);
 579         return (dsl_dataset_hold_ref(*dsp, tag));
 580 }
 581 
 582 int
 583 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
 584     void *tag, dsl_dataset_t **dsp)
 585 {
 586         int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
 587         if (err)
 588                 return (err);
 589         if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
 590                 dsl_dataset_rele(*dsp, tag);
 591                 *dsp = NULL;
 592                 return (EBUSY);
 593         }
 594         return (0);
 595 }
 596 
/*
 * Hold a dataset by name.
 *
 *      name    dataset name, optionally followed by "@snapshot"
 *      tag     hold tag
 *      dsp     on success, receives the held dataset
 *
 * The dsl_dir for the name is opened first and its head dataset is held.
 * If dsl_dir_open_spa() reports a trailing component, it must start with
 * '@'; the snapshot is then looked up in the head dataset's snapshot-names
 * ZAP, the head is released, and the snapshot is held instead.
 *
 * Returns 0 on success, ENOENT if the dsl_dir has no head dataset or the
 * trailing component does not start with '@', or the error from the
 * failing lookup/hold.  dp_config_rwlock is held as READER for the
 * duration of the lookups and dropped before returning.
 *
 * NOTE(review): on some error paths *dsp is left pointing at a dataset
 * that has already been released; callers must not use *dsp unless 0 is
 * returned.
 */
int
dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
{
        dsl_dir_t *dd;
        dsl_pool_t *dp;
        const char *snapname;
        uint64_t obj;
        int err = 0;

        err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
        if (err)
                return (err);

        dp = dd->dd_pool;
        obj = dd->dd_phys->dd_head_dataset_obj;
        rw_enter(&dp->dp_config_rwlock, RW_READER);
        if (obj)
                err = dsl_dataset_get_ref(dp, obj, tag, dsp);
        else
                err = ENOENT;
        if (err)
                goto out;

        err = dsl_dataset_hold_ref(*dsp, tag);

        /* we may be looking for a snapshot */
        if (err == 0 && snapname != NULL) {
                dsl_dataset_t *ds = NULL;

                /* Skip the '@'; anything else is not a snapshot name. */
                if (*snapname++ != '@') {
                        dsl_dataset_rele(*dsp, tag);
                        err = ENOENT;
                        goto out;
                }

                dprintf("looking for snapshot '%s'\n", snapname);
                err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
                if (err == 0)
                        err = dsl_dataset_get_ref(dp, obj, tag, &ds);
                /* The head dataset was only needed for the lookup. */
                dsl_dataset_rele(*dsp, tag);

                ASSERT3U((err == 0), ==, (ds != NULL));

                if (ds) {
                        /* Cache the name we just resolved, if not set yet. */
                        mutex_enter(&ds->ds_lock);
                        if (ds->ds_snapname[0] == 0)
                                (void) strlcpy(ds->ds_snapname, snapname,
                                    sizeof (ds->ds_snapname));
                        mutex_exit(&ds->ds_lock);
                        err = dsl_dataset_hold_ref(ds, tag);
                        *dsp = err ? NULL : ds;
                }
        }
out:
        rw_exit(&dp->dp_config_rwlock);
        dsl_dir_close(dd, FTAG);
        return (err);
}
 655 
 656 int
 657 dsl_dataset_own(const char *name, boolean_t inconsistentok,
 658     void *tag, dsl_dataset_t **dsp)
 659 {
 660         int err = dsl_dataset_hold(name, tag, dsp);
 661         if (err)
 662                 return (err);
 663         if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
 664                 dsl_dataset_rele(*dsp, tag);
 665                 return (EBUSY);
 666         }
 667         return (0);
 668 }
 669 
 670 void
 671 dsl_dataset_name(dsl_dataset_t *ds, char *name)
 672 {
 673         if (ds == NULL) {
 674                 (void) strcpy(name, "mos");
 675         } else {
 676                 dsl_dir_name(ds->ds_dir, name);
 677                 VERIFY(0 == dsl_dataset_get_snapname(ds));
 678                 if (ds->ds_snapname[0]) {
 679                         (void) strcat(name, "@");
 680                         /*
 681                          * We use a "recursive" mutex so that we
 682                          * can call dprintf_ds() with ds_lock held.
 683                          */
 684                         if (!MUTEX_HELD(&ds->ds_lock)) {
 685                                 mutex_enter(&ds->ds_lock);
 686                                 (void) strcat(name, ds->ds_snapname);
 687                                 mutex_exit(&ds->ds_lock);
 688                         } else {
 689                                 (void) strcat(name, ds->ds_snapname);
 690                         }
 691                 }
 692         }
 693 }
 694 
 695 static int
 696 dsl_dataset_namelen(dsl_dataset_t *ds)
 697 {
 698         int result;
 699 
 700         if (ds == NULL) {
 701                 result = 3;     /* "mos" */
 702         } else {
 703                 result = dsl_dir_namelen(ds->ds_dir);
 704                 VERIFY(0 == dsl_dataset_get_snapname(ds));
 705                 if (ds->ds_snapname[0]) {
 706                         ++result;       /* adding one for the @-sign */
 707                         if (!MUTEX_HELD(&ds->ds_lock)) {
 708                                 mutex_enter(&ds->ds_lock);
 709                                 result += strlen(ds->ds_snapname);
 710                                 mutex_exit(&ds->ds_lock);
 711                         } else {
 712                                 result += strlen(ds->ds_snapname);
 713                         }
 714                 }
 715         }
 716 
 717         return (result);
 718 }
 719 
/*
 * Drop a reference on ds by releasing the hold that tag has on the
 * dataset's bonus buffer.  This does not touch ds_rwlock; use
 * dsl_dataset_rele() to undo a full hold.
 */
void
dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
{
        dmu_buf_rele(ds->ds_dbuf, tag);
}
 725 
 726 void
 727 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
 728 {
 729         if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
 730                 rw_exit(&ds->ds_rwlock);
 731         }
 732         dsl_dataset_drop_ref(ds, tag);
 733 }
 734 
 735 void
 736 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
 737 {
 738         ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
 739             (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
 740 
 741         mutex_enter(&ds->ds_lock);
 742         ds->ds_owner = NULL;
 743         if (RW_WRITE_HELD(&ds->ds_rwlock)) {
 744                 rw_exit(&ds->ds_rwlock);
 745                 cv_broadcast(&ds->ds_exclusive_cv);
 746         }
 747         mutex_exit(&ds->ds_lock);
 748         if (ds->ds_dbuf)
 749                 dsl_dataset_drop_ref(ds, tag);
 750         else
 751                 dsl_dataset_evict(NULL, ds);
 752 }
 753 
 754 boolean_t
 755 dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
 756 {
 757         boolean_t gotit = FALSE;
 758 
 759         mutex_enter(&ds->ds_lock);
 760         if (ds->ds_owner == NULL &&
 761             (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
 762                 ds->ds_owner = tag;
 763                 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
 764                         rw_exit(&ds->ds_rwlock);
 765                 gotit = TRUE;
 766         }
 767         mutex_exit(&ds->ds_lock);
 768         return (gotit);
 769 }
 770 
/*
 * Give the owner exclusive access to ds by taking ds_rwlock as WRITER,
 * unless it is already write-held.  The caller must be the current owner
 * (asserted).  The lock is released again by dsl_dataset_disown().
 */
void
dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
{
        ASSERT3P(owner, ==, ds->ds_owner);
        if (!RW_WRITE_HELD(&ds->ds_rwlock))
                rw_enter(&ds->ds_rwlock, RW_WRITER);
}
 778 
 779 uint64_t
 780 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
 781     uint64_t flags, dmu_tx_t *tx)
 782 {
 783         dsl_pool_t *dp = dd->dd_pool;
 784         dmu_buf_t *dbuf;
 785         dsl_dataset_phys_t *dsphys;
 786         uint64_t dsobj;
 787         objset_t *mos = dp->dp_meta_objset;
 788 
 789         if (origin == NULL)
 790                 origin = dp->dp_origin_snap;
 791 
 792         ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
 793         ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
 794         ASSERT(dmu_tx_is_syncing(tx));
 795         ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
 796 
 797         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 798             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 799         VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 800         dmu_buf_will_dirty(dbuf, tx);
 801         dsphys = dbuf->db_data;
 802         bzero(dsphys, sizeof (dsl_dataset_phys_t));
 803         dsphys->ds_dir_obj = dd->dd_object;
 804         dsphys->ds_flags = flags;
 805         dsphys->ds_fsid_guid = unique_create();
 806         (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 807             sizeof (dsphys->ds_guid));
 808         dsphys->ds_snapnames_zapobj =
 809             zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
 810             DMU_OT_NONE, 0, tx);
 811         dsphys->ds_creation_time = gethrestime_sec();
 812         dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
 813 
 814         if (origin == NULL) {
 815                 dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
 816         } else {
 817                 dsl_dataset_t *ohds;
 818 
 819                 dsphys->ds_prev_snap_obj = origin->ds_object;
 820                 dsphys->ds_prev_snap_txg =
 821                     origin->ds_phys->ds_creation_txg;
 822                 dsphys->ds_referenced_bytes =
 823                     origin->ds_phys->ds_referenced_bytes;
 824                 dsphys->ds_compressed_bytes =
 825                     origin->ds_phys->ds_compressed_bytes;
 826                 dsphys->ds_uncompressed_bytes =
 827                     origin->ds_phys->ds_uncompressed_bytes;
 828                 dsphys->ds_bp = origin->ds_phys->ds_bp;
 829                 dsphys->ds_flags |= origin->ds_phys->ds_flags;
 830 
 831                 dmu_buf_will_dirty(origin->ds_dbuf, tx);
 832                 origin->ds_phys->ds_num_children++;
 833 
 834                 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
 835                     origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
 836                 dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
 837                     dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
 838                 dsl_dataset_rele(ohds, FTAG);
 839 
 840                 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
 841                         if (origin->ds_phys->ds_next_clones_obj == 0) {
 842                                 origin->ds_phys->ds_next_clones_obj =
 843                                     zap_create(mos,
 844                                     DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 845                         }
 846                         VERIFY(0 == zap_add_int(mos,
 847                             origin->ds_phys->ds_next_clones_obj,
 848                             dsobj, tx));
 849                 }
 850 
 851                 dmu_buf_will_dirty(dd->dd_dbuf, tx);
 852                 dd->dd_phys->dd_origin_obj = origin->ds_object;
 853                 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 854                         if (origin->ds_dir->dd_phys->dd_clones == 0) {
 855                                 dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 856                                 origin->ds_dir->dd_phys->dd_clones =
 857                                     zap_create(mos,
 858                                     DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 859                         }
 860                         VERIFY3U(0, ==, zap_add_int(mos,
 861                             origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
 862                 }
 863         }
 864 
 865         if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
 866                 dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 867 
 868         dmu_buf_rele(dbuf, FTAG);
 869 
 870         dmu_buf_will_dirty(dd->dd_dbuf, tx);
 871         dd->dd_phys->dd_head_dataset_obj = dsobj;
 872 
 873         return (dsobj);
 874 }
 875 
 876 uint64_t
 877 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
 878     dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
 879 {
 880         dsl_pool_t *dp = pdd->dd_pool;
 881         uint64_t dsobj, ddobj;
 882         dsl_dir_t *dd;
 883 
 884         ASSERT(lastname[0] != '@');
 885 
 886         ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
 887         VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
 888 
 889         dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);
 890 
 891         dsl_deleg_set_create_perms(dd, tx, cr);
 892 
 893         dsl_dir_close(dd, FTAG);
 894 
 895         /*
 896          * If we are creating a clone, make sure we zero out any stale
 897          * data from the origin snapshots zil header.
 898          */
 899         if (origin != NULL) {
 900                 dsl_dataset_t *ds;
 901                 objset_t *os;
 902 
 903                 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 904                 VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
 905                 bzero(&os->os_zil_header, sizeof (os->os_zil_header));
 906                 dsl_dataset_dirty(ds, tx);
 907                 dsl_dataset_rele(ds, FTAG);
 908         }
 909 
 910         return (dsobj);
 911 }
 912 
 913 /*
 914  * The snapshots must all be in the same pool.
 915  */
 916 int
 917 dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer,
 918     nvlist_t *errlist)
 919 {
 920         int err;
 921         dsl_sync_task_t *dst;
 922         spa_t *spa;
 923         nvpair_t *pair;
 924         dsl_sync_task_group_t *dstg;
 925 
 926         pair = nvlist_next_nvpair(snaps, NULL);
 927         if (pair == NULL)
 928                 return (0);
 929 
 930         err = spa_open(nvpair_name(pair), &spa, FTAG);
 931         if (err)
 932                 return (err);
 933         dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 934 
 935         for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 936             pair = nvlist_next_nvpair(snaps, pair)) {
 937                 dsl_dataset_t *ds;
 938 
 939                 err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
 940                 if (err == 0) {
 941                         struct dsl_ds_destroyarg *dsda;
 942 
 943                         dsl_dataset_make_exclusive(ds, dstg);
 944                         dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
 945                             KM_SLEEP);
 946                         dsda->ds = ds;
 947                         dsda->defer = defer;
 948                         dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
 949                             dsl_dataset_destroy_sync, dsda, dstg, 0);
 950                 } else if (err == ENOENT) {
 951                         err = 0;
 952                 } else {
 953                         fnvlist_add_int32(errlist, nvpair_name(pair), err);
 954                         break;
 955                 }
 956         }
 957 
 958         if (err == 0)
 959                 err = dsl_sync_task_group_wait(dstg);
 960 
 961         for (dst = list_head(&dstg->dstg_tasks); dst;
 962             dst = list_next(&dstg->dstg_tasks, dst)) {
 963                 struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
 964                 dsl_dataset_t *ds = dsda->ds;
 965 
 966                 /*
 967                  * Return the snapshots that triggered the error.
 968                  */
 969                 if (dst->dst_err != 0) {
 970                         char name[ZFS_MAXNAMELEN];
 971                         dsl_dataset_name(ds, name);
 972                         fnvlist_add_int32(errlist, name, dst->dst_err);
 973                 }
 974                 ASSERT3P(dsda->rm_origin, ==, NULL);
 975                 dsl_dataset_disown(ds, dstg);
 976                 kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
 977         }
 978 
 979         dsl_sync_task_group_destroy(dstg);
 980         spa_close(spa, FTAG);
 981         return (err);
 982 
 983 }
 984 
 985 static boolean_t
 986 dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
 987 {
 988         boolean_t might_destroy = B_FALSE;
 989 
 990         mutex_enter(&ds->ds_lock);
 991         if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
 992             DS_IS_DEFER_DESTROY(ds))
 993                 might_destroy = B_TRUE;
 994         mutex_exit(&ds->ds_lock);
 995 
 996         return (might_destroy);
 997 }
 998 
 999 /*
1000  * If we're removing a clone, and these three conditions are true:
1001  *      1) the clone's origin has no other children
1002  *      2) the clone's origin has no user references
1003  *      3) the clone's origin has been marked for deferred destruction
1004  * Then, prepare to remove the origin as part of this sync task group.
1005  */
1006 static int
1007 dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
1008 {
1009         dsl_dataset_t *ds = dsda->ds;
1010         dsl_dataset_t *origin = ds->ds_prev;
1011 
1012         if (dsl_dataset_might_destroy_origin(origin)) {
1013                 char *name;
1014                 int namelen;
1015                 int error;
1016 
1017                 namelen = dsl_dataset_namelen(origin) + 1;
1018                 name = kmem_alloc(namelen, KM_SLEEP);
1019                 dsl_dataset_name(origin, name);
1020 #ifdef _KERNEL
1021                 error = zfs_unmount_snap(name, NULL);
1022                 if (error) {
1023                         kmem_free(name, namelen);
1024                         return (error);
1025                 }
1026 #endif
1027                 error = dsl_dataset_own(name, B_TRUE, tag, &origin);
1028                 kmem_free(name, namelen);
1029                 if (error)
1030                         return (error);
1031                 dsda->rm_origin = origin;
1032                 dsl_dataset_make_exclusive(origin, tag);
1033         }
1034 
1035         return (0);
1036 }
1037 
1038 /*
1039  * ds must be opened as OWNER.  On return (whether successful or not),
1040  * ds will be closed and caller can no longer dereference it.
1041  */
1042 int
1043 dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
1044 {
1045         int err;
1046         dsl_sync_task_group_t *dstg;
1047         objset_t *os;
1048         dsl_dir_t *dd;
1049         uint64_t obj;
1050         struct dsl_ds_destroyarg dsda = { 0 };
1051 
1052         dsda.ds = ds;
1053 
1054         if (dsl_dataset_is_snapshot(ds)) {
1055                 /* Destroying a snapshot is simpler */
1056                 dsl_dataset_make_exclusive(ds, tag);
1057 
1058                 dsda.defer = defer;
1059                 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
1060                     dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
1061                     &dsda, tag, 0);
1062                 ASSERT3P(dsda.rm_origin, ==, NULL);
1063                 goto out;
1064         } else if (defer) {
1065                 err = EINVAL;
1066                 goto out;
1067         }
1068 
1069         dd = ds->ds_dir;
1070 
1071         /*
1072          * Check for errors and mark this ds as inconsistent, in
1073          * case we crash while freeing the objects.
1074          */
1075         err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
1076             dsl_dataset_destroy_begin_sync, ds, NULL, 0);
1077         if (err)
1078                 goto out;
1079 
1080         err = dmu_objset_from_ds(ds, &os);
1081         if (err)
1082                 goto out;
1083 
1084         /*
1085          * If async destruction is not enabled try to remove all objects
1086          * while in the open context so that there is less work to do in
1087          * the syncing context.
1088          */
1089         if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
1090             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
1091                 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
1092                     ds->ds_phys->ds_prev_snap_txg)) {
1093                         /*
1094                          * Ignore errors, if there is not enough disk space
1095                          * we will deal with it in dsl_dataset_destroy_sync().
1096                          */
1097                         (void) dmu_free_object(os, obj);
1098                 }
1099                 if (err != ESRCH)
1100                         goto out;
1101         }
1102 
1103         /*
1104          * Only the ZIL knows how to free log blocks.
1105          */
1106         zil_destroy(dmu_objset_zil(os), B_FALSE);
1107 
1108         /*
1109          * Sync out all in-flight IO.
1110          */
1111         txg_wait_synced(dd->dd_pool, 0);
1112 
1113         /*
1114          * If we managed to free all the objects in open
1115          * context, the user space accounting should be zero.
1116          */
1117         if (ds->ds_phys->ds_bp.blk_fill == 0 &&
1118             dmu_objset_userused_enabled(os)) {
1119                 uint64_t count;
1120 
1121                 ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
1122                     count == 0);
1123                 ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 ||
1124                     count == 0);
1125         }
1126 
1127         rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
1128         err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
1129         rw_exit(&dd->dd_pool->dp_config_rwlock);
1130 
1131         if (err)
1132                 goto out;
1133 
1134         /*
1135          * Blow away the dsl_dir + head dataset.
1136          */
1137         dsl_dataset_make_exclusive(ds, tag);
1138         /*
1139          * If we're removing a clone, we might also need to remove its
1140          * origin.
1141          */
1142         do {
1143                 dsda.need_prep = B_FALSE;
1144                 if (dsl_dir_is_clone(dd)) {
1145                         err = dsl_dataset_origin_rm_prep(&dsda, tag);
1146                         if (err) {
1147                                 dsl_dir_close(dd, FTAG);
1148                                 goto out;
1149                         }
1150                 }
1151 
1152                 dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
1153                 dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
1154                     dsl_dataset_destroy_sync, &dsda, tag, 0);
1155                 dsl_sync_task_create(dstg, dsl_dir_destroy_check,
1156                     dsl_dir_destroy_sync, dd, FTAG, 0);
1157                 err = dsl_sync_task_group_wait(dstg);
1158                 dsl_sync_task_group_destroy(dstg);
1159 
1160                 /*
1161                  * We could be racing against 'zfs release' or 'zfs destroy -d'
1162                  * on the origin snap, in which case we can get EBUSY if we
1163                  * needed to destroy the origin snap but were not ready to
1164                  * do so.
1165                  */
1166                 if (dsda.need_prep) {
1167                         ASSERT(err == EBUSY);
1168                         ASSERT(dsl_dir_is_clone(dd));
1169                         ASSERT(dsda.rm_origin == NULL);
1170                 }
1171         } while (dsda.need_prep);
1172 
1173         if (dsda.rm_origin != NULL)
1174                 dsl_dataset_disown(dsda.rm_origin, tag);
1175 
1176         /* if it is successful, dsl_dir_destroy_sync will close the dd */
1177         if (err)
1178                 dsl_dir_close(dd, FTAG);
1179 out:
1180         dsl_dataset_disown(ds, tag);
1181         return (err);
1182 }
1183 
1184 blkptr_t *
1185 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
1186 {
1187         return (&ds->ds_phys->ds_bp);
1188 }
1189 
1190 void
1191 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
1192 {
1193         ASSERT(dmu_tx_is_syncing(tx));
1194         /* If it's the meta-objset, set dp_meta_rootbp */
1195         if (ds == NULL) {
1196                 tx->tx_pool->dp_meta_rootbp = *bp;
1197         } else {
1198                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1199                 ds->ds_phys->ds_bp = *bp;
1200         }
1201 }
1202 
1203 spa_t *
1204 dsl_dataset_get_spa(dsl_dataset_t *ds)
1205 {
1206         return (ds->ds_dir->dd_pool->dp_spa);
1207 }
1208 
1209 void
1210 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
1211 {
1212         dsl_pool_t *dp;
1213 
1214         if (ds == NULL) /* this is the meta-objset */
1215                 return;
1216 
1217         ASSERT(ds->ds_objset != NULL);
1218 
1219         if (ds->ds_phys->ds_next_snap_obj != 0)
1220                 panic("dirtying snapshot!");
1221 
1222         dp = ds->ds_dir->dd_pool;
1223 
1224         if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
1225                 /* up the hold count until we can be written out */
1226                 dmu_buf_add_ref(ds->ds_dbuf, ds);
1227         }
1228 }
1229 
1230 /*
1231  * The unique space in the head dataset can be calculated by subtracting
1232  * the space used in the most recent snapshot, that is still being used
1233  * in this file system, from the space currently in use.  To figure out
1234  * the space in the most recent snapshot still in use, we need to take
1235  * the total space used in the snapshot and subtract out the space that
1236  * has been freed up since the snapshot was taken.
1237  */
1238 static void
1239 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
1240 {
1241         uint64_t mrs_used;
1242         uint64_t dlused, dlcomp, dluncomp;
1243 
1244         ASSERT(!dsl_dataset_is_snapshot(ds));
1245 
1246         if (ds->ds_phys->ds_prev_snap_obj != 0)
1247                 mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
1248         else
1249                 mrs_used = 0;
1250 
1251         dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
1252 
1253         ASSERT3U(dlused, <=, mrs_used);
1254         ds->ds_phys->ds_unique_bytes =
1255             ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
1256 
1257         if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
1258             SPA_VERSION_UNIQUE_ACCURATE)
1259                 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1260 }
1261 
1262 struct killarg {
1263         dsl_dataset_t *ds;
1264         dmu_tx_t *tx;
1265 };
1266 
1267 /* ARGSUSED */
1268 static int
1269 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
1270     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1271 {
1272         struct killarg *ka = arg;
1273         dmu_tx_t *tx = ka->tx;
1274 
1275         if (bp == NULL)
1276                 return (0);
1277 
1278         if (zb->zb_level == ZB_ZIL_LEVEL) {
1279                 ASSERT(zilog != NULL);
1280                 /*
1281                  * It's a block in the intent log.  It has no
1282                  * accounting, so just free it.
1283                  */
1284                 dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
1285         } else {
1286                 ASSERT(zilog == NULL);
1287                 ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
1288                 (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
1289         }
1290 
1291         return (0);
1292 }
1293 
1294 /* ARGSUSED */
1295 static int
1296 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
1297 {
1298         dsl_dataset_t *ds = arg1;
1299         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1300         uint64_t count;
1301         int err;
1302 
1303         /*
1304          * Can't delete a head dataset if there are snapshots of it.
1305          * (Except if the only snapshots are from the branch we cloned
1306          * from.)
1307          */
1308         if (ds->ds_prev != NULL &&
1309             ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1310                 return (EBUSY);
1311 
1312         /*
1313          * This is really a dsl_dir thing, but check it here so that
1314          * we'll be less likely to leave this dataset inconsistent &
1315          * nearly destroyed.
1316          */
1317         err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
1318         if (err)
1319                 return (err);
1320         if (count != 0)
1321                 return (EEXIST);
1322 
1323         return (0);
1324 }
1325 
1326 /* ARGSUSED */
1327 static void
1328 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1329 {
1330         dsl_dataset_t *ds = arg1;
1331 
1332         /* Mark it as inconsistent on-disk, in case we crash */
1333         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1334         ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
1335 
1336         spa_history_log_internal_ds(ds, "destroy begin", tx, "");
1337 }
1338 
1339 static int
1340 dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
1341     dmu_tx_t *tx)
1342 {
1343         dsl_dataset_t *ds = dsda->ds;
1344         dsl_dataset_t *ds_prev = ds->ds_prev;
1345 
1346         if (dsl_dataset_might_destroy_origin(ds_prev)) {
1347                 struct dsl_ds_destroyarg ndsda = {0};
1348 
1349                 /*
1350                  * If we're not prepared to remove the origin, don't remove
1351                  * the clone either.
1352                  */
1353                 if (dsda->rm_origin == NULL) {
1354                         dsda->need_prep = B_TRUE;
1355                         return (EBUSY);
1356                 }
1357 
1358                 ndsda.ds = ds_prev;
1359                 ndsda.is_origin_rm = B_TRUE;
1360                 return (dsl_dataset_destroy_check(&ndsda, tag, tx));
1361         }
1362 
1363         /*
1364          * If we're not going to remove the origin after all,
1365          * undo the open context setup.
1366          */
1367         if (dsda->rm_origin != NULL) {
1368                 dsl_dataset_disown(dsda->rm_origin, tag);
1369                 dsda->rm_origin = NULL;
1370         }
1371 
1372         return (0);
1373 }
1374 
1375 /*
1376  * If you add new checks here, you may need to add
1377  * additional checks to the "temporary" case in
1378  * snapshot_check() in dmu_objset.c.
1379  */
1380 /* ARGSUSED */
1381 int
1382 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
1383 {
1384         struct dsl_ds_destroyarg *dsda = arg1;
1385         dsl_dataset_t *ds = dsda->ds;
1386 
1387         /* we have an owner hold, so noone else can destroy us */
1388         ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
1389 
1390         /*
1391          * Only allow deferred destroy on pools that support it.
1392          * NOTE: deferred destroy is only supported on snapshots.
1393          */
1394         if (dsda->defer) {
1395                 if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
1396                     SPA_VERSION_USERREFS)
1397                         return (ENOTSUP);
1398                 ASSERT(dsl_dataset_is_snapshot(ds));
1399                 return (0);
1400         }
1401 
1402         /*
1403          * Can't delete a head dataset if there are snapshots of it.
1404          * (Except if the only snapshots are from the branch we cloned
1405          * from.)
1406          */
1407         if (ds->ds_prev != NULL &&
1408             ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1409                 return (EBUSY);
1410 
1411         /*
1412          * If we made changes this txg, traverse_dsl_dataset won't find
1413          * them.  Try again.
1414          */
1415         if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
1416                 return (EAGAIN);
1417 
1418         if (dsl_dataset_is_snapshot(ds)) {
1419                 /*
1420                  * If this snapshot has an elevated user reference count,
1421                  * we can't destroy it yet.
1422                  */
1423                 if (ds->ds_userrefs > 0 && !dsda->releasing)
1424                         return (EBUSY);
1425 
1426                 mutex_enter(&ds->ds_lock);
1427                 /*
1428                  * Can't delete a branch point. However, if we're destroying
1429                  * a clone and removing its origin due to it having a user
1430                  * hold count of 0 and having been marked for deferred destroy,
1431                  * it's OK for the origin to have a single clone.
1432                  */
1433                 if (ds->ds_phys->ds_num_children >
1434                     (dsda->is_origin_rm ? 2 : 1)) {
1435                         mutex_exit(&ds->ds_lock);
1436                         return (EEXIST);
1437                 }
1438                 mutex_exit(&ds->ds_lock);
1439         } else if (dsl_dir_is_clone(ds->ds_dir)) {
1440                 return (dsl_dataset_origin_check(dsda, arg2, tx));
1441         }
1442 
1443         /* XXX we should do some i/o error checking... */
1444         return (0);
1445 }
1446 
1447 struct refsarg {
1448         kmutex_t lock;
1449         boolean_t gone;
1450         kcondvar_t cv;
1451 };
1452 
1453 /* ARGSUSED */
1454 static void
1455 dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
1456 {
1457         struct refsarg *arg = argv;
1458 
1459         mutex_enter(&arg->lock);
1460         arg->gone = TRUE;
1461         cv_signal(&arg->cv);
1462         mutex_exit(&arg->lock);
1463 }
1464 
1465 static void
1466 dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
1467 {
1468         struct refsarg arg;
1469 
1470         mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
1471         cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
1472         arg.gone = FALSE;
1473         (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
1474             dsl_dataset_refs_gone);
1475         dmu_buf_rele(ds->ds_dbuf, tag);
1476         mutex_enter(&arg.lock);
1477         while (!arg.gone)
1478                 cv_wait(&arg.cv, &arg.lock);
1479         ASSERT(arg.gone);
1480         mutex_exit(&arg.lock);
1481         ds->ds_dbuf = NULL;
1482         ds->ds_phys = NULL;
1483         mutex_destroy(&arg.lock);
1484         cv_destroy(&arg.cv);
1485 }
1486 
1487 static void
1488 remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
1489 {
1490         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1491         uint64_t count;
1492         int err;
1493 
1494         ASSERT(ds->ds_phys->ds_num_children >= 2);
1495         err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
1496         /*
1497          * The err should not be ENOENT, but a bug in a previous version
1498          * of the code could cause upgrade_clones_cb() to not set
1499          * ds_next_snap_obj when it should, leading to a missing entry.
1500          * If we knew that the pool was created after
1501          * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
1502          * ENOENT.  However, at least we can check that we don't have
1503          * too many entries in the next_clones_obj even after failing to
1504          * remove this one.
1505          */
1506         if (err != ENOENT) {
1507                 VERIFY3U(err, ==, 0);
1508         }
1509         ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
1510             &count));
1511         ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
1512 }
1513 
1514 static void
1515 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
1516 {
1517         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1518         zap_cursor_t zc;
1519         zap_attribute_t za;
1520 
1521         /*
1522          * If it is the old version, dd_clones doesn't exist so we can't
1523          * find the clones, but deadlist_remove_key() is a no-op so it
1524          * doesn't matter.
1525          */
1526         if (ds->ds_dir->dd_phys->dd_clones == 0)
1527                 return;
1528 
1529         for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
1530             zap_cursor_retrieve(&zc, &za) == 0;
1531             zap_cursor_advance(&zc)) {
1532                 dsl_dataset_t *clone;
1533 
1534                 VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1535                     za.za_first_integer, FTAG, &clone));
1536                 if (clone->ds_dir->dd_origin_txg > mintxg) {
1537                         dsl_deadlist_remove_key(&clone->ds_deadlist,
1538                             mintxg, tx);
1539                         dsl_dataset_remove_clones_key(clone, mintxg, tx);
1540                 }
1541                 dsl_dataset_rele(clone, FTAG);
1542         }
1543         zap_cursor_fini(&zc);
1544 }
1545 
1546 struct process_old_arg {
1547         dsl_dataset_t *ds;
1548         dsl_dataset_t *ds_prev;
1549         boolean_t after_branch_point;
1550         zio_t *pio;
1551         uint64_t used, comp, uncomp;
1552 };
1553 
1554 static int
1555 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1556 {
1557         struct process_old_arg *poa = arg;
1558         dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
1559 
1560         if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
1561                 dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
1562                 if (poa->ds_prev && !poa->after_branch_point &&
1563                     bp->blk_birth >
1564                     poa->ds_prev->ds_phys->ds_prev_snap_txg) {
1565                         poa->ds_prev->ds_phys->ds_unique_bytes +=
1566                             bp_get_dsize_sync(dp->dp_spa, bp);
1567                 }
1568         } else {
1569                 poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
1570                 poa->comp += BP_GET_PSIZE(bp);
1571                 poa->uncomp += BP_GET_UCSIZE(bp);
1572                 dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
1573         }
1574         return (0);
1575 }
1576 
1577 static void
1578 process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
1579     dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
1580 {
1581         struct process_old_arg poa = { 0 };
1582         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1583         objset_t *mos = dp->dp_meta_objset;
1584 
1585         ASSERT(ds->ds_deadlist.dl_oldfmt);
1586         ASSERT(ds_next->ds_deadlist.dl_oldfmt);
1587 
1588         poa.ds = ds;
1589         poa.ds_prev = ds_prev;
1590         poa.after_branch_point = after_branch_point;
1591         poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
1592         VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
1593             process_old_cb, &poa, tx));
1594         VERIFY3U(zio_wait(poa.pio), ==, 0);
1595         ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
1596 
1597         /* change snapused */
1598         dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1599             -poa.used, -poa.comp, -poa.uncomp, tx);
1600 
1601         /* swap next's deadlist to our deadlist */
1602         dsl_deadlist_close(&ds->ds_deadlist);
1603         dsl_deadlist_close(&ds_next->ds_deadlist);
1604         SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
1605             ds->ds_phys->ds_deadlist_obj);
1606         dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
1607         dsl_deadlist_open(&ds_next->ds_deadlist, mos,
1608             ds_next->ds_phys->ds_deadlist_obj);
1609 }
1610 
1611 static int
1612 old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
1613 {
1614         int err;
1615         struct killarg ka;
1616 
1617         /*
1618          * Free everything that we point to (that's born after
1619          * the previous snapshot, if we are a clone)
1620          *
1621          * NB: this should be very quick, because we already
1622          * freed all the objects in open context.
1623          */
1624         ka.ds = ds;
1625         ka.tx = tx;
1626         err = traverse_dataset(ds,
1627             ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
1628             kill_blkptr, &ka);
1629         ASSERT3U(err, ==, 0);
1630         ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
1631 
1632         return (err);
1633 }
1634 
1635 void
1636 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
1637 {
1638         struct dsl_ds_destroyarg *dsda = arg1;
1639         dsl_dataset_t *ds = dsda->ds;
1640         int err;
1641         int after_branch_point = FALSE;
1642         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1643         objset_t *mos = dp->dp_meta_objset;
1644         dsl_dataset_t *ds_prev = NULL;
1645         boolean_t wont_destroy;
1646         uint64_t obj;
1647 
1648         wont_destroy = (dsda->defer &&
1649             (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
1650 
1651         ASSERT(ds->ds_owner || wont_destroy);
1652         ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
1653         ASSERT(ds->ds_prev == NULL ||
1654             ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
1655         ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
1656 
1657         if (wont_destroy) {
1658                 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
1659                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1660                 ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
1661                 spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
1662                 return;
1663         }
1664 
1665         /* We need to log before removing it from the namespace. */
1666         spa_history_log_internal_ds(ds, "destroy", tx, "");
1667 
1668         /* signal any waiters that this dataset is going away */
1669         mutex_enter(&ds->ds_lock);
1670         ds->ds_owner = dsl_reaper;
1671         cv_broadcast(&ds->ds_exclusive_cv);
1672         mutex_exit(&ds->ds_lock);
1673 
1674         /* Remove our reservation */
1675         if (ds->ds_reserved != 0) {
1676                 dsl_prop_setarg_t psa;
1677                 uint64_t value = 0;
1678 
1679                 dsl_prop_setarg_init_uint64(&psa, "refreservation",
1680                     (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
1681                     &value);
1682                 psa.psa_effective_value = 0;    /* predict default value */
1683 
1684                 dsl_dataset_set_reservation_sync(ds, &psa, tx);
1685                 ASSERT3U(ds->ds_reserved, ==, 0);
1686         }
1687 
1688         ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
1689 
1690         dsl_scan_ds_destroyed(ds, tx);
1691 
1692         obj = ds->ds_object;
1693 
1694         if (ds->ds_phys->ds_prev_snap_obj != 0) {
1695                 if (ds->ds_prev) {
1696                         ds_prev = ds->ds_prev;
1697                 } else {
1698                         VERIFY(0 == dsl_dataset_hold_obj(dp,
1699                             ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
1700                 }
1701                 after_branch_point =
1702                     (ds_prev->ds_phys->ds_next_snap_obj != obj);
1703 
1704                 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
1705                 if (after_branch_point &&
1706                     ds_prev->ds_phys->ds_next_clones_obj != 0) {
1707                         remove_from_next_clones(ds_prev, obj, tx);
1708                         if (ds->ds_phys->ds_next_snap_obj != 0) {
1709                                 VERIFY(0 == zap_add_int(mos,
1710                                     ds_prev->ds_phys->ds_next_clones_obj,
1711                                     ds->ds_phys->ds_next_snap_obj, tx));
1712                         }
1713                 }
1714                 if (after_branch_point &&
1715                     ds->ds_phys->ds_next_snap_obj == 0) {
1716                         /* This clone is toast. */
1717                         ASSERT(ds_prev->ds_phys->ds_num_children > 1);
1718                         ds_prev->ds_phys->ds_num_children--;
1719 
1720                         /*
1721                          * If the clone's origin has no other clones, no
1722                          * user holds, and has been marked for deferred
1723                          * deletion, then we should have done the necessary
1724                          * destroy setup for it.
1725                          */
1726                         if (ds_prev->ds_phys->ds_num_children == 1 &&
1727                             ds_prev->ds_userrefs == 0 &&
1728                             DS_IS_DEFER_DESTROY(ds_prev)) {
1729                                 ASSERT3P(dsda->rm_origin, !=, NULL);
1730                         } else {
1731                                 ASSERT3P(dsda->rm_origin, ==, NULL);
1732                         }
1733                 } else if (!after_branch_point) {
1734                         ds_prev->ds_phys->ds_next_snap_obj =
1735                             ds->ds_phys->ds_next_snap_obj;
1736                 }
1737         }
1738 
1739         if (dsl_dataset_is_snapshot(ds)) {
1740                 dsl_dataset_t *ds_next;
1741                 uint64_t old_unique;
1742                 uint64_t used = 0, comp = 0, uncomp = 0;
1743 
1744                 VERIFY(0 == dsl_dataset_hold_obj(dp,
1745                     ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
1746                 ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
1747 
1748                 old_unique = ds_next->ds_phys->ds_unique_bytes;
1749 
1750                 dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
1751                 ds_next->ds_phys->ds_prev_snap_obj =
1752                     ds->ds_phys->ds_prev_snap_obj;
1753                 ds_next->ds_phys->ds_prev_snap_txg =
1754                     ds->ds_phys->ds_prev_snap_txg;
1755                 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1756                     ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
1757 
1758 
1759                 if (ds_next->ds_deadlist.dl_oldfmt) {
1760                         process_old_deadlist(ds, ds_prev, ds_next,
1761                             after_branch_point, tx);
1762                 } else {
1763                         /* Adjust prev's unique space. */
1764                         if (ds_prev && !after_branch_point) {
1765                                 dsl_deadlist_space_range(&ds_next->ds_deadlist,
1766                                     ds_prev->ds_phys->ds_prev_snap_txg,
1767                                     ds->ds_phys->ds_prev_snap_txg,
1768                                     &used, &comp, &uncomp);
1769                                 ds_prev->ds_phys->ds_unique_bytes += used;
1770                         }
1771 
1772                         /* Adjust snapused. */
1773                         dsl_deadlist_space_range(&ds_next->ds_deadlist,
1774                             ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
1775                             &used, &comp, &uncomp);
1776                         dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1777                             -used, -comp, -uncomp, tx);
1778 
1779                         /* Move blocks to be freed to pool's free list. */
1780                         dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
1781                             &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
1782                             tx);
1783                         dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
1784                             DD_USED_HEAD, used, comp, uncomp, tx);
1785 
1786                         /* Merge our deadlist into next's and free it. */
1787                         dsl_deadlist_merge(&ds_next->ds_deadlist,
1788                             ds->ds_phys->ds_deadlist_obj, tx);
1789                 }
1790                 dsl_deadlist_close(&ds->ds_deadlist);
1791                 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1792 
1793                 /* Collapse range in clone heads */
1794                 dsl_dataset_remove_clones_key(ds,
1795                     ds->ds_phys->ds_creation_txg, tx);
1796 
1797                 if (dsl_dataset_is_snapshot(ds_next)) {
1798                         dsl_dataset_t *ds_nextnext;
1799 
1800                         /*
1801                          * Update next's unique to include blocks which
1802                          * were previously shared by only this snapshot
1803                          * and it.  Those blocks will be born after the
1804                          * prev snap and before this snap, and will have
1805                          * died after the next snap and before the one
1806                          * after that (ie. be on the snap after next's
1807                          * deadlist).
1808                          */
1809                         VERIFY(0 == dsl_dataset_hold_obj(dp,
1810                             ds_next->ds_phys->ds_next_snap_obj,
1811                             FTAG, &ds_nextnext));
1812                         dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
1813                             ds->ds_phys->ds_prev_snap_txg,
1814                             ds->ds_phys->ds_creation_txg,
1815                             &used, &comp, &uncomp);
1816                         ds_next->ds_phys->ds_unique_bytes += used;
1817                         dsl_dataset_rele(ds_nextnext, FTAG);
1818                         ASSERT3P(ds_next->ds_prev, ==, NULL);
1819 
1820                         /* Collapse range in this head. */
1821                         dsl_dataset_t *hds;
1822                         VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
1823                             ds->ds_dir->dd_phys->dd_head_dataset_obj,
1824                             FTAG, &hds));
1825                         dsl_deadlist_remove_key(&hds->ds_deadlist,
1826                             ds->ds_phys->ds_creation_txg, tx);
1827                         dsl_dataset_rele(hds, FTAG);
1828 
1829                 } else {
1830                         ASSERT3P(ds_next->ds_prev, ==, ds);
1831                         dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
1832                         ds_next->ds_prev = NULL;
1833                         if (ds_prev) {
1834                                 VERIFY(0 == dsl_dataset_get_ref(dp,
1835                                     ds->ds_phys->ds_prev_snap_obj,
1836                                     ds_next, &ds_next->ds_prev));
1837                         }
1838 
1839                         dsl_dataset_recalc_head_uniq(ds_next);
1840 
1841                         /*
1842                          * Reduce the amount of our unconsmed refreservation
1843                          * being charged to our parent by the amount of
1844                          * new unique data we have gained.
1845                          */
1846                         if (old_unique < ds_next->ds_reserved) {
1847                                 int64_t mrsdelta;
1848                                 uint64_t new_unique =
1849                                     ds_next->ds_phys->ds_unique_bytes;
1850 
1851                                 ASSERT(old_unique <= new_unique);
1852                                 mrsdelta = MIN(new_unique - old_unique,
1853                                     ds_next->ds_reserved - old_unique);
1854                                 dsl_dir_diduse_space(ds->ds_dir,
1855                                     DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
1856                         }
1857                 }
1858                 dsl_dataset_rele(ds_next, FTAG);
1859         } else {
1860                 zfeature_info_t *async_destroy =
1861                     &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
1862 
1863                 /*
1864                  * There's no next snapshot, so this is a head dataset.
1865                  * Destroy the deadlist.  Unless it's a clone, the
1866                  * deadlist should be empty.  (If it's a clone, it's
1867                  * safe to ignore the deadlist contents.)
1868                  */
1869                 dsl_deadlist_close(&ds->ds_deadlist);
1870                 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1871                 ds->ds_phys->ds_deadlist_obj = 0;
1872 
1873                 if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
1874                         err = old_synchronous_dataset_destroy(ds, tx);
1875                 } else {
1876                         /*
1877                          * Move the bptree into the pool's list of trees to
1878                          * clean up and update space accounting information.
1879                          */
1880                         uint64_t used, comp, uncomp;
1881 
1882                         ASSERT(err == 0 || err == EBUSY);
1883                         if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
1884                                 spa_feature_incr(dp->dp_spa, async_destroy, tx);
1885                                 dp->dp_bptree_obj = bptree_alloc(
1886                                     dp->dp_meta_objset, tx);
1887                                 VERIFY(zap_add(dp->dp_meta_objset,
1888                                     DMU_POOL_DIRECTORY_OBJECT,
1889                                     DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
1890                                     &dp->dp_bptree_obj, tx) == 0);
1891                         }
1892 
1893                         used = ds->ds_dir->dd_phys->dd_used_bytes;
1894                         comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
1895                         uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
1896 
1897                         ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
1898                             ds->ds_phys->ds_unique_bytes == used);
1899 
1900                         bptree_add(dp->dp_meta_objset, dp->dp_bptree_obj,
1901                             &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
1902                             used, comp, uncomp, tx);
1903                         dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
1904                             -used, -comp, -uncomp, tx);
1905                         dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
1906                             used, comp, uncomp, tx);
1907                 }
1908 
1909                 if (ds->ds_prev != NULL) {
1910                         if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
1911                                 VERIFY3U(0, ==, zap_remove_int(mos,
1912                                     ds->ds_prev->ds_dir->dd_phys->dd_clones,
1913                                     ds->ds_object, tx));
1914                         }
1915                         dsl_dataset_rele(ds->ds_prev, ds);
1916                         ds->ds_prev = ds_prev = NULL;
1917                 }
1918         }
1919 
1920         /*
1921          * This must be done after the dsl_traverse(), because it will
1922          * re-open the objset.
1923          */
1924         if (ds->ds_objset) {
1925                 dmu_objset_evict(ds->ds_objset);
1926                 ds->ds_objset = NULL;
1927         }
1928 
1929         if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
1930                 /* Erase the link in the dir */
1931                 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1932                 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
1933                 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
1934                 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
1935                 ASSERT(err == 0);
1936         } else {
1937                 /* remove from snapshot namespace */
1938                 dsl_dataset_t *ds_head;
1939                 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
1940                 VERIFY(0 == dsl_dataset_hold_obj(dp,
1941                     ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
1942                 VERIFY(0 == dsl_dataset_get_snapname(ds));
1943 #ifdef ZFS_DEBUG
1944                 {
1945                         uint64_t val;
1946 
1947                         err = dsl_dataset_snap_lookup(ds_head,
1948                             ds->ds_snapname, &val);
1949                         ASSERT3U(err, ==, 0);
1950                         ASSERT3U(val, ==, obj);
1951                 }
1952 #endif
1953                 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
1954                 ASSERT(err == 0);
1955                 dsl_dataset_rele(ds_head, FTAG);
1956         }
1957 
1958         if (ds_prev && ds->ds_prev != ds_prev)
1959                 dsl_dataset_rele(ds_prev, FTAG);
1960 
1961         spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
1962 
1963         if (ds->ds_phys->ds_next_clones_obj != 0) {
1964                 uint64_t count;
1965                 ASSERT(0 == zap_count(mos,
1966                     ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
1967                 VERIFY(0 == dmu_object_free(mos,
1968                     ds->ds_phys->ds_next_clones_obj, tx));
1969         }
1970         if (ds->ds_phys->ds_props_obj != 0)
1971                 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
1972         if (ds->ds_phys->ds_userrefs_obj != 0)
1973                 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
1974         dsl_dir_close(ds->ds_dir, ds);
1975         ds->ds_dir = NULL;
1976         dsl_dataset_drain_refs(ds, tag);
1977         VERIFY(0 == dmu_object_free(mos, obj, tx));
1978 
1979         if (dsda->rm_origin) {
1980                 /*
1981                  * Remove the origin of the clone we just destroyed.
1982                  */
1983                 struct dsl_ds_destroyarg ndsda = {0};
1984 
1985                 ndsda.ds = dsda->rm_origin;
1986                 dsl_dataset_destroy_sync(&ndsda, tag, tx);
1987         }
1988 }
1989 
1990 static int
1991 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1992 {
1993         uint64_t asize;
1994 
1995         if (!dmu_tx_is_syncing(tx))
1996                 return (0);
1997 
1998         /*
1999          * If there's an fs-only reservation, any blocks that might become
2000          * owned by the snapshot dataset must be accommodated by space
2001          * outside of the reservation.
2002          */
2003         ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
2004         asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2005         if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
2006                 return (ENOSPC);
2007 
2008         /*
2009          * Propagate any reserved space for this snapshot to other
2010          * snapshot checks in this sync group.
2011          */
2012         if (asize > 0)
2013                 dsl_dir_willuse_space(ds->ds_dir, asize, tx);
2014 
2015         return (0);
2016 }
2017 
2018 int
2019 dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname,
2020     dmu_tx_t *tx)
2021 {
2022         int err;
2023         uint64_t value;
2024 
2025         /*
2026          * We don't allow multiple snapshots of the same txg.  If there
2027          * is already one, try again.
2028          */
2029         if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
2030                 return (EAGAIN);
2031 
2032         /*
2033          * Check for conflicting snapshot name.
2034          */
2035         err = dsl_dataset_snap_lookup(ds, snapname, &value);
2036         if (err == 0)
2037                 return (EEXIST);
2038         if (err != ENOENT)
2039                 return (err);
2040 
2041         /*
2042          * Check that the dataset's name is not too long.  Name consists
2043          * of the dataset's length + 1 for the @-sign + snapshot name's length
2044          */
2045         if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
2046                 return (ENAMETOOLONG);
2047 
2048         err = dsl_dataset_snapshot_reserve_space(ds, tx);
2049         if (err)
2050                 return (err);
2051 
2052         ds->ds_trysnap_txg = tx->tx_txg;
2053         return (0);
2054 }
2055 
2056 void
2057 dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
2058     dmu_tx_t *tx)
2059 {
2060         dsl_pool_t *dp = ds->ds_dir->dd_pool;
2061         dmu_buf_t *dbuf;
2062         dsl_dataset_phys_t *dsphys;
2063         uint64_t dsobj, crtxg;
2064         objset_t *mos = dp->dp_meta_objset;
2065         int err;
2066 
2067         ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
2068 
2069         /*
2070          * The origin's ds_creation_txg has to be < TXG_INITIAL
2071          */
2072         if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
2073                 crtxg = 1;
2074         else
2075                 crtxg = tx->tx_txg;
2076 
2077         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
2078             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
2079         VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
2080         dmu_buf_will_dirty(dbuf, tx);
2081         dsphys = dbuf->db_data;
2082         bzero(dsphys, sizeof (dsl_dataset_phys_t));
2083         dsphys->ds_dir_obj = ds->ds_dir->dd_object;
2084         dsphys->ds_fsid_guid = unique_create();
2085         (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
2086             sizeof (dsphys->ds_guid));
2087         dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
2088         dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
2089         dsphys->ds_next_snap_obj = ds->ds_object;
2090         dsphys->ds_num_children = 1;
2091         dsphys->ds_creation_time = gethrestime_sec();
2092         dsphys->ds_creation_txg = crtxg;
2093         dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
2094         dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
2095         dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
2096         dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
2097         dsphys->ds_flags = ds->ds_phys->ds_flags;
2098         dsphys->ds_bp = ds->ds_phys->ds_bp;
2099         dmu_buf_rele(dbuf, FTAG);
2100 
2101         ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
2102         if (ds->ds_prev) {
2103                 uint64_t next_clones_obj =
2104                     ds->ds_prev->ds_phys->ds_next_clones_obj;
2105                 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
2106                     ds->ds_object ||
2107                     ds->ds_prev->ds_phys->ds_num_children > 1);
2108                 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
2109                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
2110                         ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
2111                             ds->ds_prev->ds_phys->ds_creation_txg);
2112                         ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
2113                 } else if (next_clones_obj != 0) {
2114                         remove_from_next_clones(ds->ds_prev,
2115                             dsphys->ds_next_snap_obj, tx);
2116                         VERIFY3U(0, ==, zap_add_int(mos,
2117                             next_clones_obj, dsobj, tx));
2118                 }
2119         }
2120 
2121         /*
2122          * If we have a reference-reservation on this dataset, we will
2123          * need to increase the amount of refreservation being charged
2124          * since our unique space is going to zero.
2125          */
2126         if (ds->ds_reserved) {
2127                 int64_t delta;
2128                 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
2129                 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2130                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
2131                     delta, 0, 0, tx);
2132         }
2133 
2134         dmu_buf_will_dirty(ds->ds_dbuf, tx);
2135         zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
2136             ds->ds_dir->dd_myname, snapname, dsobj,
2137             ds->ds_phys->ds_prev_snap_txg);
2138         ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
2139             UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
2140         dsl_deadlist_close(&ds->ds_deadlist);
2141         dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
2142         dsl_deadlist_add_key(&ds->ds_deadlist,
2143             ds->ds_phys->ds_prev_snap_txg, tx);
2144 
2145         ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
2146         ds->ds_phys->ds_prev_snap_obj = dsobj;
2147         ds->ds_phys->ds_prev_snap_txg = crtxg;
2148         ds->ds_phys->ds_unique_bytes = 0;
2149         if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
2150                 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
2151 
2152         err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
2153             snapname, 8, 1, &dsobj, tx);
2154         ASSERT(err == 0);
2155 
2156         if (ds->ds_prev)
2157                 dsl_dataset_drop_ref(ds->ds_prev, ds);
2158         VERIFY(0 == dsl_dataset_get_ref(dp,
2159             ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
2160 
2161         dsl_scan_ds_snapshotted(ds, tx);
2162 
2163         dsl_dir_snap_cmtime_update(ds->ds_dir);
2164 
2165         spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
2166 }
2167 
2168 void
2169 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
2170 {
2171         ASSERT(dmu_tx_is_syncing(tx));
2172         ASSERT(ds->ds_objset != NULL);
2173         ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
2174 
2175         /*
2176          * in case we had to change ds_fsid_guid when we opened it,
2177          * sync it out now.
2178          */
2179         dmu_buf_will_dirty(ds->ds_dbuf, tx);
2180         ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
2181 
2182         dsl_dir_dirty(ds->ds_dir, tx);
2183         dmu_objset_sync(ds->ds_objset, zio, tx);
2184 }
2185 
2186 static void
2187 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
2188 {
2189         uint64_t count = 0;
2190         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
2191         zap_cursor_t zc;
2192         zap_attribute_t za;
2193         nvlist_t *propval;
2194         nvlist_t *val;
2195 
2196         rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2197         VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2198         VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2199 
2200         /*
2201          * There may me missing entries in ds_next_clones_obj
2202          * due to a bug in a previous version of the code.
2203          * Only trust it if it has the right number of entries.
2204          */
2205         if (ds->ds_phys->ds_next_clones_obj != 0) {
2206                 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
2207                     &count));
2208         }
2209         if (count != ds->ds_phys->ds_num_children - 1) {
2210                 goto fail;
2211         }
2212         for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
2213             zap_cursor_retrieve(&zc, &za) == 0;
2214             zap_cursor_advance(&zc)) {
2215                 dsl_dataset_t *clone;
2216                 char buf[ZFS_MAXNAMELEN];
2217                 /*
2218                  * Even though we hold the dp_config_rwlock, the dataset
2219                  * may fail to open, returning ENOENT.  If there is a
2220                  * thread concurrently attempting to destroy this
2221                  * dataset, it will have the ds_rwlock held for
2222                  * RW_WRITER.  Our call to dsl_dataset_hold_obj() ->
2223                  * dsl_dataset_hold_ref() will fail its
2224                  * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the
2225                  * dp_config_rwlock, and wait for the destroy progress
2226                  * and signal ds_exclusive_cv.  If the destroy was
2227                  * successful, we will see that
2228                  * DSL_DATASET_IS_DESTROYED(), and return ENOENT.
2229                  */
2230                 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
2231                     za.za_first_integer, FTAG, &clone) != 0)
2232                         continue;
2233                 dsl_dir_name(clone->ds_dir, buf);
2234                 VERIFY(nvlist_add_boolean(val, buf) == 0);
2235                 dsl_dataset_rele(clone, FTAG);
2236         }
2237         zap_cursor_fini(&zc);
2238         VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0);
2239         VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
2240             propval) == 0);
2241 fail:
2242         nvlist_free(val);
2243         nvlist_free(propval);
2244         rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2245 }
2246 
2247 void
2248 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
2249 {
2250         uint64_t refd, avail, uobjs, aobjs, ratio;
2251 
2252         ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
2253             (ds->ds_phys->ds_uncompressed_bytes * 100 /
2254             ds->ds_phys->ds_compressed_bytes);
2255 
2256         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
2257 
2258         if (dsl_dataset_is_snapshot(ds)) {
2259                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
2260                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
2261                     ds->ds_phys->ds_unique_bytes);
2262                 get_clones_stat(ds, nv);
2263         } else {
2264                 dsl_dir_stats(ds->ds_dir, nv);
2265         }
2266 
2267         dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
2268         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
2269         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
2270 
2271         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
2272             ds->ds_phys->ds_creation_time);
2273         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
2274             ds->ds_phys->ds_creation_txg);
2275         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
2276             ds->ds_quota);
2277         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
2278             ds->ds_reserved);
2279         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
2280             ds->ds_phys->ds_guid);
2281         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
2282             ds->ds_phys->ds_unique_bytes);
2283         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
2284             ds->ds_object);
2285         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
2286             ds->ds_userrefs);
2287         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
2288             DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
2289 
2290         if (ds->ds_phys->ds_prev_snap_obj != 0) {
2291                 uint64_t written, comp, uncomp;
2292                 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2293                 dsl_dataset_t *prev;
2294 
2295                 rw_enter(&dp->dp_config_rwlock, RW_READER);
2296                 int err = dsl_dataset_hold_obj(dp,
2297                     ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
2298                 rw_exit(&dp->dp_config_rwlock);
2299                 if (err == 0) {
2300                         err = dsl_dataset_space_written(prev, ds, &written,
2301                             &comp, &uncomp);
2302                         dsl_dataset_rele(prev, FTAG);
2303                         if (err == 0) {
2304                                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
2305                                     written);
2306                         }
2307                 }
2308         }
2309 
2310 }
2311 
2312 void
2313 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
2314 {
2315         stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
2316         stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
2317         stat->dds_guid = ds->ds_phys->ds_guid;
2318         stat->dds_origin[0] = '\0';
2319         if (dsl_dataset_is_snapshot(ds)) {
2320                 stat->dds_is_snapshot = B_TRUE;
2321                 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
2322         } else {
2323                 stat->dds_is_snapshot = B_FALSE;
2324                 stat->dds_num_clones = 0;
2325 
2326                 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2327                 if (dsl_dir_is_clone(ds->ds_dir)) {
2328                         dsl_dataset_t *ods;
2329 
2330                         VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
2331                             ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
2332                         dsl_dataset_name(ods, stat->dds_origin);
2333                         dsl_dataset_drop_ref(ods, FTAG);
2334                 }
2335                 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2336         }
2337 }
2338 
2339 uint64_t
2340 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
2341 {
2342         return (ds->ds_fsid_guid);
2343 }
2344 
2345 void
2346 dsl_dataset_space(dsl_dataset_t *ds,
2347     uint64_t *refdbytesp, uint64_t *availbytesp,
2348     uint64_t *usedobjsp, uint64_t *availobjsp)
2349 {
2350         *refdbytesp = ds->ds_phys->ds_referenced_bytes;
2351         *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
2352         if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
2353                 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
2354         if (ds->ds_quota != 0) {
2355                 /*
2356                  * Adjust available bytes according to refquota
2357                  */
2358                 if (*refdbytesp < ds->ds_quota)
2359                         *availbytesp = MIN(*availbytesp,
2360                             ds->ds_quota - *refdbytesp);
2361                 else
2362                         *availbytesp = 0;
2363         }
2364         *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
2365         *availobjsp = DN_MAX_OBJECT - *usedobjsp;
2366 }
2367 
2368 boolean_t
2369 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
2370 {
2371         dsl_pool_t *dp = ds->ds_dir->dd_pool;
2372 
2373         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
2374             dsl_pool_sync_context(dp));
2375         if (ds->ds_prev == NULL)
2376                 return (B_FALSE);
2377         if (ds->ds_phys->ds_bp.blk_birth >
2378             ds->ds_prev->ds_phys->ds_creation_txg) {
2379                 objset_t *os, *os_prev;
2380                 /*
2381                  * It may be that only the ZIL differs, because it was
2382                  * reset in the head.  Don't count that as being
2383                  * modified.
2384                  */
2385                 if (dmu_objset_from_ds(ds, &os) != 0)
2386                         return (B_TRUE);
2387                 if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
2388                         return (B_TRUE);
2389                 return (bcmp(&os->os_phys->os_meta_dnode,
2390                     &os_prev->os_phys->os_meta_dnode,
2391                     sizeof (os->os_phys->os_meta_dnode)) != 0);
2392         }
2393         return (B_FALSE);
2394 }
2395 
2396 /* ARGSUSED */
2397 static int
2398 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
2399 {
2400         dsl_dataset_t *ds = arg1;
2401         char *newsnapname = arg2;
2402         dsl_dir_t *dd = ds->ds_dir;
2403         dsl_dataset_t *hds;
2404         uint64_t val;
2405         int err;
2406 
2407         err = dsl_dataset_hold_obj(dd->dd_pool,
2408             dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
2409         if (err)
2410                 return (err);
2411 
2412         /* new name better not be in use */
2413         err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
2414         dsl_dataset_rele(hds, FTAG);
2415 
2416         if (err == 0)
2417                 err = EEXIST;
2418         else if (err == ENOENT)
2419                 err = 0;
2420 
2421         /* dataset name + 1 for the "@" + the new snapshot name must fit */
2422         if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
2423                 err = ENAMETOOLONG;
2424 
2425         return (err);
2426 }
2427 
2428 static void
2429 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
2430 {
2431         dsl_dataset_t *ds = arg1;
2432         const char *newsnapname = arg2;
2433         dsl_dir_t *dd = ds->ds_dir;
2434         objset_t *mos = dd->dd_pool->dp_meta_objset;
2435         dsl_dataset_t *hds;
2436         int err;
2437 
2438         ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
2439 
2440         VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
2441             dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));
2442 
2443         VERIFY(0 == dsl_dataset_get_snapname(ds));
2444         err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
2445         ASSERT3U(err, ==, 0);
2446         mutex_enter(&ds->ds_lock);
2447         (void) strcpy(ds->ds_snapname, newsnapname);
2448         mutex_exit(&ds->ds_lock);
2449         err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
2450             ds->ds_snapname, 8, 1, &ds->ds_object, tx);
2451         ASSERT3U(err, ==, 0);
2452 
2453         spa_history_log_internal_ds(ds, "rename", tx,
2454             "-> @%s", newsnapname);
2455         dsl_dataset_rele(hds, FTAG);
2456 }
2457 
/*
 * State shared between dsl_recursive_rename() and its per-filesystem
 * callback dsl_snapshot_rename_one().
 */
struct renamesnaparg {
        /* task group the per-snapshot rename tasks are added to; also */
        /* used as the tag for the dataset holds taken by the callback */
        dsl_sync_task_group_t *dstg;
        /* name of the snapshot most recently processed, or of a failed */
        /* one; copied back to the caller's buffer on error */
        char failed[MAXPATHLEN];
        /* old snapshot name (text after the '@') */
        char *oldsnap;
        /* new snapshot name (text after the '@') */
        char *newsnap;
};
2464 
2465 static int
2466 dsl_snapshot_rename_one(const char *name, void *arg)
2467 {
2468         struct renamesnaparg *ra = arg;
2469         dsl_dataset_t *ds = NULL;
2470         char *snapname;
2471         int err;
2472 
2473         snapname = kmem_asprintf("%s@%s", name, ra->oldsnap);
2474         (void) strlcpy(ra->failed, snapname, sizeof (ra->failed));
2475 
2476         /*
2477          * For recursive snapshot renames the parent won't be changing
2478          * so we just pass name for both the to/from argument.
2479          */
2480         err = zfs_secpolicy_rename_perms(snapname, snapname, CRED());
2481         if (err != 0) {
2482                 strfree(snapname);
2483                 return (err == ENOENT ? 0 : err);
2484         }
2485 
2486 #ifdef _KERNEL
2487         /*
2488          * For all filesystems undergoing rename, we'll need to unmount it.
2489          */
2490         (void) zfs_unmount_snap(snapname, NULL);
2491 #endif
2492         err = dsl_dataset_hold(snapname, ra->dstg, &ds);
2493         strfree(snapname);
2494         if (err != 0)
2495                 return (err == ENOENT ? 0 : err);
2496 
2497         dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
2498             dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
2499 
2500         return (0);
2501 }
2502 
2503 static int
2504 dsl_recursive_rename(char *oldname, const char *newname)
2505 {
2506         int err;
2507         struct renamesnaparg *ra;
2508         dsl_sync_task_t *dst;
2509         spa_t *spa;
2510         char *cp, *fsname = spa_strdup(oldname);
2511         int len = strlen(oldname) + 1;
2512 
2513         /* truncate the snapshot name to get the fsname */
2514         cp = strchr(fsname, '@');
2515         *cp = '\0';
2516 
2517         err = spa_open(fsname, &spa, FTAG);
2518         if (err) {
2519                 kmem_free(fsname, len);
2520                 return (err);
2521         }
2522         ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
2523         ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
2524 
2525         ra->oldsnap = strchr(oldname, '@') + 1;
2526         ra->newsnap = strchr(newname, '@') + 1;
2527         *ra->failed = '\0';
2528 
2529         err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
2530             DS_FIND_CHILDREN);
2531         kmem_free(fsname, len);
2532 
2533         if (err == 0) {
2534                 err = dsl_sync_task_group_wait(ra->dstg);
2535         }
2536 
2537         for (dst = list_head(&ra->dstg->dstg_tasks); dst;
2538             dst = list_next(&ra->dstg->dstg_tasks, dst)) {
2539                 dsl_dataset_t *ds = dst->dst_arg1;
2540                 if (dst->dst_err) {
2541                         dsl_dir_name(ds->ds_dir, ra->failed);
2542                         (void) strlcat(ra->failed, "@", sizeof (ra->failed));
2543                         (void) strlcat(ra->failed, ra->newsnap,
2544                             sizeof (ra->failed));
2545                 }
2546                 dsl_dataset_rele(ds, ra->dstg);
2547         }
2548 
2549         if (err)
2550                 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed));
2551 
2552         dsl_sync_task_group_destroy(ra->dstg);
2553         kmem_free(ra, sizeof (struct renamesnaparg));
2554         spa_close(spa, FTAG);
2555         return (err);
2556 }
2557 
2558 static int
2559 dsl_valid_rename(const char *oldname, void *arg)
2560 {
2561         int delta = *(int *)arg;
2562 
2563         if (strlen(oldname) + delta >= MAXNAMELEN)
2564                 return (ENAMETOOLONG);
2565 
2566         return (0);
2567 }
2568 
2569 #pragma weak dmu_objset_rename = dsl_dataset_rename
2570 int
2571 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
2572 {
2573         dsl_dir_t *dd;
2574         dsl_dataset_t *ds;
2575         const char *tail;
2576         int err;
2577 
2578         err = dsl_dir_open(oldname, FTAG, &dd, &tail);
2579         if (err)
2580                 return (err);
2581 
2582         if (tail == NULL) {
2583                 int delta = strlen(newname) - strlen(oldname);
2584 
2585                 /* if we're growing, validate child name lengths */
2586                 if (delta > 0)
2587                         err = dmu_objset_find(oldname, dsl_valid_rename,
2588                             &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
2589 
2590                 if (err == 0)
2591                         err = dsl_dir_rename(dd, newname);
2592                 dsl_dir_close(dd, FTAG);
2593                 return (err);
2594         }
2595 
2596         if (tail[0] != '@') {
2597                 /* the name ended in a nonexistent component */
2598                 dsl_dir_close(dd, FTAG);
2599                 return (ENOENT);
2600         }
2601 
2602         dsl_dir_close(dd, FTAG);
2603 
2604         /* new name must be snapshot in same filesystem */
2605         tail = strchr(newname, '@');
2606         if (tail == NULL)
2607                 return (EINVAL);
2608         tail++;
2609         if (strncmp(oldname, newname, tail - newname) != 0)
2610                 return (EXDEV);
2611 
2612         if (recursive) {
2613                 err = dsl_recursive_rename(oldname, newname);
2614         } else {
2615                 err = dsl_dataset_hold(oldname, FTAG, &ds);
2616                 if (err)
2617                         return (err);
2618 
2619                 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
2620                     dsl_dataset_snapshot_rename_check,
2621                     dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
2622 
2623                 dsl_dataset_rele(ds, FTAG);
2624         }
2625 
2626         return (err);
2627 }
2628 
/*
 * Element of the snapshot lists built by snaplist_make().
 */
struct promotenode {
        list_node_t link;       /* list linkage (see list_create() call) */
        dsl_dataset_t *ds;      /* dataset held or owned with snaplist_tag */
};
2633 
/*
 * State for a promote operation, built by dsl_dataset_promote() and
 * passed as arg2 to dsl_dataset_promote_check()/_sync().
 */
struct promotearg {
        /*
         * shared_snaps: the clone's origin snapshot and the snapshots
         *   before it; these are the ones moved into the clone's dir.
         * origin_snaps: from the origin's head dataset back to (but not
         *   including) the origin snapshot.
         * clone_snaps: from the clone's head dataset back to its origin.
         */
        list_t shared_snaps, origin_snaps, clone_snaps;
        /* origin of the origin's dir; NULL when its dd_origin_obj is 0 */
        dsl_dataset_t *origin_origin;
        /* values computed by the check function, consumed by sync */
        uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
        /* snapshot name being checked when the name lookup failed */
        char *err_ds;
};
2640 
/*
 * Forward declarations.  snaplist_space() is defined after its first use
 * in dsl_dataset_promote_check().
 */
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
static boolean_t snaplist_unstable(list_t *l);
2643 
/*
 * Check function for the promote sync task.  arg1 is the head dataset of
 * the clone being promoted, arg2 the struct promotearg prepared by
 * dsl_dataset_promote().
 *
 * Returns EINVAL if the dataset's dir is not a clone.  In open context
 * nothing else is checked and 0 is returned.  In syncing context it
 * returns EXDEV if DS_FLAG_NOPROMOTE is set, EEXIST if a snapshot being
 * moved has the same name as an existing snapshot of the clone, and
 * otherwise passes on any error from the name lookup,
 * dsl_dir_transfer_possible() or snaplist_space().  When the name lookup
 * fails, pa->err_ds is set to the snapshot name being checked.
 *
 * On success pa->unique, pa->used, pa->comp and pa->uncomp are filled
 * in.  pa->cloneusedsnap and pa->originusedsnap are filled in only for
 * dirs that have DD_FLAG_USED_BREAKDOWN set.
 */
static int
dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dataset_t *hds = arg1;
        struct promotearg *pa = arg2;
        struct promotenode *snap = list_head(&pa->shared_snaps);
        dsl_dataset_t *origin_ds = snap->ds;
        int err;
        uint64_t unused;

        /* Check that it is a real clone */
        if (!dsl_dir_is_clone(hds->ds_dir))
                return (EINVAL);

        /* Since this is so expensive, don't do the preliminary check */
        if (!dmu_tx_is_syncing(tx))
                return (0);

        if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
                return (EXDEV);

        /* compute origin's new unique space */
        snap = list_tail(&pa->clone_snaps);
        ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
        dsl_deadlist_space_range(&snap->ds->ds_deadlist,
            origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
            &pa->unique, &unused, &unused);

        /*
         * Walk the snapshots that we are moving
         *
         * Compute space to transfer.  Consider the incremental changes
         * to used for each snapshot:
         * (my used) = (prev's used) + (blocks born) - (blocks killed)
         * So each snapshot gave birth to:
         * (blocks born) = (my used) - (prev's used) + (blocks killed)
         * So a sequence would look like:
         * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
         * Which simplifies to:
         * uN + kN + kN-1 + ... + k1 + k0
         * Note however, if we stop before we reach the ORIGIN we get:
         * uN + kN + kN-1 + ... + kM - uM-1
         */
        pa->used = origin_ds->ds_phys->ds_referenced_bytes;
        pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
        pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
        for (snap = list_head(&pa->shared_snaps); snap;
            snap = list_next(&pa->shared_snaps, snap)) {
                uint64_t val, dlused, dlcomp, dluncomp;
                dsl_dataset_t *ds = snap->ds;

                /* Check that the snapshot name does not conflict */
                VERIFY(0 == dsl_dataset_get_snapname(ds));
                err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
                if (err == 0) {
                        err = EEXIST;
                        goto out;
                }
                if (err != ENOENT)
                        goto out;

                /* The very first snapshot does not have a deadlist */
                if (ds->ds_phys->ds_prev_snap_obj == 0)
                        continue;

                dsl_deadlist_space(&ds->ds_deadlist,
                    &dlused, &dlcomp, &dluncomp);
                pa->used += dlused;
                pa->comp += dlcomp;
                pa->uncomp += dluncomp;
        }

        /*
         * If we are a clone of a clone then we never reached ORIGIN,
         * so we need to subtract out the clone origin's used space.
         */
        if (pa->origin_origin) {
                pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
                pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
                pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
        }

        /* Check that there is enough space here */
        err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
            pa->used);
        if (err)
                return (err);

        /*
         * Compute the amounts of space that will be used by snapshots
         * after the promotion (for both origin and clone).  For each,
         * it is the amount of space that will be on all of their
         * deadlists (that was not born before their new origin).
         */
        if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                uint64_t space;

                /*
                 * Note, typically this will not be a clone of a clone,
                 * so dd_origin_txg will be < TXG_INITIAL, so
                 * these snaplist_space() -> dsl_deadlist_space_range()
                 * calls will be fast because they do not have to
                 * iterate over all bps.
                 */
                snap = list_head(&pa->origin_snaps);
                err = snaplist_space(&pa->shared_snaps,
                    snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
                if (err)
                        return (err);

                err = snaplist_space(&pa->clone_snaps,
                    snap->ds->ds_dir->dd_origin_txg, &space);
                if (err)
                        return (err);
                pa->cloneusedsnap += space;
        }
        if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                err = snaplist_space(&pa->origin_snaps,
                    origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
                if (err)
                        return (err);
        }

        return (0);
out:
        /* Only reached from the name-conflict loop; snap is its cursor. */
        pa->err_ds =  snap->ds->ds_snapname;
        return (err);
}
2772 
/*
 * Sync function for the promote sync task.  arg1 is the head dataset of
 * the clone being promoted, arg2 the struct promotearg filled in by
 * dsl_dataset_promote() and dsl_dataset_promote_check().
 *
 * In order, it: repoints the origin snapshot's next-snapshot and
 * next-clones entries, exchanges the origin relationship between the
 * clone's dir and the origin's dir, updates the dd_clones ZAPs, moves
 * every shared snapshot (name entry, ds_dir_obj and in-core ds_dir) into
 * the clone's dir, transfers the space accounting, stores the origin
 * snapshot's new unique bytes and logs a history record.
 */
static void
dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dataset_t *hds = arg1;
        struct promotearg *pa = arg2;
        struct promotenode *snap = list_head(&pa->shared_snaps);
        dsl_dataset_t *origin_ds = snap->ds;
        dsl_dataset_t *origin_head;
        dsl_dir_t *dd = hds->ds_dir;
        dsl_pool_t *dp = hds->ds_dir->dd_pool;
        dsl_dir_t *odd = NULL;
        uint64_t oldnext_obj;
        int64_t delta;

        ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));

        /* The head of origin_snaps is the origin's head dataset. */
        snap = list_head(&pa->origin_snaps);
        origin_head = snap->ds;

        /*
         * We need to explicitly open odd, since origin_ds's dd will be
         * changing.
         */
        VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
            NULL, FTAG, &odd));

        /* change origin's next snap */
        dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
        oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
        snap = list_tail(&pa->clone_snaps);
        ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
        origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;

        /* change the origin's next clone */
        if (origin_ds->ds_phys->ds_next_clones_obj) {
                remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
                VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
                    origin_ds->ds_phys->ds_next_clones_obj,
                    oldnext_obj, tx));
        }

        /* change origin */
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
        ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
        dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
        dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
        dmu_buf_will_dirty(odd->dd_dbuf, tx);
        odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
        origin_head->ds_dir->dd_origin_txg =
            origin_ds->ds_phys->ds_creation_txg;

        /* change dd_clone entries */
        /* NOTE(review): pa->origin_origin is dereferenced unchecked here */
        /* -- presumably never NULL at this pool version; confirm */
        if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
                VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
                    odd->dd_phys->dd_clones, hds->ds_object, tx));
                VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
                    pa->origin_origin->ds_dir->dd_phys->dd_clones,
                    hds->ds_object, tx));

                VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
                    pa->origin_origin->ds_dir->dd_phys->dd_clones,
                    origin_head->ds_object, tx));
                if (dd->dd_phys->dd_clones == 0) {
                        dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
                            DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
                }
                VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
                    dd->dd_phys->dd_clones, origin_head->ds_object, tx));

        }

        /* move snapshots to this dir */
        for (snap = list_head(&pa->shared_snaps); snap;
            snap = list_next(&pa->shared_snaps, snap)) {
                dsl_dataset_t *ds = snap->ds;

                /* unregister props as dsl_dir is changing */
                if (ds->ds_objset) {
                        dmu_objset_evict(ds->ds_objset);
                        ds->ds_objset = NULL;
                }
                /* move snap name entry */
                VERIFY(0 == dsl_dataset_get_snapname(ds));
                VERIFY(0 == dsl_dataset_snap_remove(origin_head,
                    ds->ds_snapname, tx));
                VERIFY(0 == zap_add(dp->dp_meta_objset,
                    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
                    8, 1, &ds->ds_object, tx));

                /* change containing dsl_dir */
                dmu_buf_will_dirty(ds->ds_dbuf, tx);
                ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
                ds->ds_phys->ds_dir_obj = dd->dd_object;
                ASSERT3P(ds->ds_dir, ==, odd);
                dsl_dir_close(ds->ds_dir, ds);
                VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
                    NULL, ds, &ds->ds_dir));

                /* move any clone references */
                if (ds->ds_phys->ds_next_clones_obj &&
                    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
                        zap_cursor_t zc;
                        zap_attribute_t za;

                        for (zap_cursor_init(&zc, dp->dp_meta_objset,
                            ds->ds_phys->ds_next_clones_obj);
                            zap_cursor_retrieve(&zc, &za) == 0;
                            zap_cursor_advance(&zc)) {
                                dsl_dataset_t *cnds;
                                uint64_t o;

                                if (za.za_first_integer == oldnext_obj) {
                                        /*
                                         * We've already moved the
                                         * origin's reference.
                                         */
                                        continue;
                                }

                                VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
                                    za.za_first_integer, FTAG, &cnds));
                                o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;

                                VERIFY3U(zap_remove_int(dp->dp_meta_objset,
                                    odd->dd_phys->dd_clones, o, tx), ==, 0);
                                VERIFY3U(zap_add_int(dp->dp_meta_objset,
                                    dd->dd_phys->dd_clones, o, tx), ==, 0);
                                dsl_dataset_rele(cnds, FTAG);
                        }
                        zap_cursor_fini(&zc);
                }

                ASSERT3U(dsl_prop_numcb(ds), ==, 0);
        }

        /*
         * Change space accounting.
         * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
         * both be valid, or both be 0 (resulting in delta == 0).  This
         * is true for each of {clone,origin} independently.
         */

        /* The clone's dir gains pa->used, split between SNAP and HEAD. */
        delta = pa->cloneusedsnap -
            dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
        ASSERT3S(delta, >=, 0);
        ASSERT3U(pa->used, >=, delta);
        dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
        dsl_dir_diduse_space(dd, DD_USED_HEAD,
            pa->used - delta, pa->comp, pa->uncomp, tx);

        /* The origin's dir loses the same amount. */
        delta = pa->originusedsnap -
            odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
        ASSERT3S(delta, <=, 0);
        ASSERT3U(pa->used, >=, -delta);
        dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
        dsl_dir_diduse_space(odd, DD_USED_HEAD,
            -pa->used - delta, -pa->comp, -pa->uncomp, tx);

        origin_ds->ds_phys->ds_unique_bytes = pa->unique;

        /* log history record */
        spa_history_log_internal_ds(hds, "promote", tx, "");

        dsl_dir_close(odd, FTAG);
}
2938 
/* Tag for the holds taken by snaplist_make(), dropped by snaplist_destroy() */
static char *snaplist_tag = "snaplist";
2940 /*
2941  * Make a list of dsl_dataset_t's for the snapshots between first_obj
2942  * (exclusive) and last_obj (inclusive).  The list will be in reverse
2943  * order (last_obj will be the list_head()).  If first_obj == 0, do all
2944  * snapshots back to this dataset's origin.
2945  */
2946 static int
2947 snaplist_make(dsl_pool_t *dp, boolean_t own,
2948     uint64_t first_obj, uint64_t last_obj, list_t *l)
2949 {
2950         uint64_t obj = last_obj;
2951 
2952         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
2953 
2954         list_create(l, sizeof (struct promotenode),
2955             offsetof(struct promotenode, link));
2956 
2957         while (obj != first_obj) {
2958                 dsl_dataset_t *ds;
2959                 struct promotenode *snap;
2960                 int err;
2961 
2962                 if (own) {
2963                         err = dsl_dataset_own_obj(dp, obj,
2964                             0, snaplist_tag, &ds);
2965                         if (err == 0)
2966                                 dsl_dataset_make_exclusive(ds, snaplist_tag);
2967                 } else {
2968                         err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds);
2969                 }
2970                 if (err == ENOENT) {
2971                         /* lost race with snapshot destroy */
2972                         struct promotenode *last = list_tail(l);
2973                         ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj);
2974                         obj = last->ds->ds_phys->ds_prev_snap_obj;
2975                         continue;
2976                 } else if (err) {
2977                         return (err);
2978                 }
2979 
2980                 if (first_obj == 0)
2981                         first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
2982 
2983                 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
2984                 snap->ds = ds;
2985                 list_insert_tail(l, snap);
2986                 obj = ds->ds_phys->ds_prev_snap_obj;
2987         }
2988 
2989         return (0);
2990 }
2991 
2992 static int
2993 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2994 {
2995         struct promotenode *snap;
2996 
2997         *spacep = 0;
2998         for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2999                 uint64_t used, comp, uncomp;
3000                 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
3001                     mintxg, UINT64_MAX, &used, &comp, &uncomp);
3002                 *spacep += used;
3003         }
3004         return (0);
3005 }
3006 
3007 static void
3008 snaplist_destroy(list_t *l, boolean_t own)
3009 {
3010         struct promotenode *snap;
3011 
3012         if (!l || !list_link_active(&l->list_head))
3013                 return;
3014 
3015         while ((snap = list_tail(l)) != NULL) {
3016                 list_remove(l, snap);
3017                 if (own)
3018                         dsl_dataset_disown(snap->ds, snaplist_tag);
3019                 else
3020                         dsl_dataset_rele(snap->ds, snaplist_tag);
3021                 kmem_free(snap, sizeof (struct promotenode));
3022         }
3023         list_destroy(l);
3024 }
3025 
3026 /*
3027  * Promote a clone.  Nomenclature note:
3028  * "clone" or "cds": the original clone which is being promoted
3029  * "origin" or "ods": the snapshot which is originally clone's origin
3030  * "origin head" or "ohds": the dataset which is the head
3031  * (filesystem/volume) for the origin
3032  * "origin origin": the origin of the origin's filesystem (typically
3033  * NULL, indicating that the clone is not a clone of a clone).
3034  */
3035 int
3036 dsl_dataset_promote(const char *name, char *conflsnap)
3037 {
3038         dsl_dataset_t *ds;
3039         dsl_dir_t *dd;
3040         dsl_pool_t *dp;
3041         dmu_object_info_t doi;
3042         struct promotearg pa = { 0 };
3043         struct promotenode *snap;
3044         int err;
3045 
3046         err = dsl_dataset_hold(name, FTAG, &ds);
3047         if (err)
3048                 return (err);
3049         dd = ds->ds_dir;
3050         dp = dd->dd_pool;
3051 
3052         err = dmu_object_info(dp->dp_meta_objset,
3053             ds->ds_phys->ds_snapnames_zapobj, &doi);
3054         if (err) {
3055                 dsl_dataset_rele(ds, FTAG);
3056                 return (err);
3057         }
3058 
3059         if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
3060                 dsl_dataset_rele(ds, FTAG);
3061                 return (EINVAL);
3062         }
3063 
3064         /*
3065          * We are going to inherit all the snapshots taken before our
3066          * origin (i.e., our new origin will be our parent's origin).
3067          * Take ownership of them so that we can rename them into our
3068          * namespace.
3069          */
3070         rw_enter(&dp->dp_config_rwlock, RW_READER);
3071 
3072         err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
3073             &pa.shared_snaps);
3074         if (err != 0)
3075                 goto out;
3076 
3077         err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
3078         if (err != 0)
3079                 goto out;
3080 
3081         snap = list_head(&pa.shared_snaps);
3082         ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
3083         err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
3084             snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
3085         if (err != 0)
3086                 goto out;
3087 
3088         if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
3089                 err = dsl_dataset_hold_obj(dp,
3090                     snap->ds->ds_dir->dd_phys->dd_origin_obj,
3091                     FTAG, &pa.origin_origin);
3092                 if (err != 0)
3093                         goto out;
3094         }
3095 
3096 out:
3097         rw_exit(&dp->dp_config_rwlock);
3098 
3099         /*
3100          * Add in 128x the snapnames zapobj size, since we will be moving
3101          * a bunch of snapnames to the promoted ds, and dirtying their
3102          * bonus buffers.
3103          */
3104         if (err == 0) {
3105                 err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
3106                     dsl_dataset_promote_sync, ds, &pa,
3107                     2 + 2 * doi.doi_physical_blocks_512);
3108                 if (err && pa.err_ds && conflsnap)
3109                         (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
3110         }
3111 
3112         snaplist_destroy(&pa.shared_snaps, B_TRUE);
3113         snaplist_destroy(&pa.clone_snaps, B_FALSE);
3114         snaplist_destroy(&pa.origin_snaps, B_FALSE);
3115         if (pa.origin_origin)
3116                 dsl_dataset_rele(pa.origin_origin, FTAG);
3117         dsl_dataset_rele(ds, FTAG);
3118         return (err);
3119 }
3120 
/*
 * Argument bundle shared by dsl_dataset_clone_swap_check() and
 * dsl_dataset_clone_swap_sync().  The first three members are filled in
 * by dsl_dataset_clone_swap(); unused_refres_delta is computed by the
 * check function and applied by the sync function.
 */
struct cloneswaparg {
        dsl_dataset_t *cds; /* clone dataset */
        dsl_dataset_t *ohds; /* origin's head dataset */
        boolean_t force; /* skip the "ohds modified since last snap" test */
        int64_t unused_refres_delta; /* change in unconsumed refreservation */
};
3127 
3128 /* ARGSUSED */
3129 static int
3130 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
3131 {
3132         struct cloneswaparg *csa = arg1;
3133 
3134         /* they should both be heads */
3135         if (dsl_dataset_is_snapshot(csa->cds) ||
3136             dsl_dataset_is_snapshot(csa->ohds))
3137                 return (EINVAL);
3138 
3139         /* the branch point should be just before them */
3140         if (csa->cds->ds_prev != csa->ohds->ds_prev)
3141                 return (EINVAL);
3142 
3143         /* cds should be the clone (unless they are unrelated) */
3144         if (csa->cds->ds_prev != NULL &&
3145             csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
3146             csa->ohds->ds_object !=
3147             csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
3148                 return (EINVAL);
3149 
3150         /* the clone should be a child of the origin */
3151         if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
3152                 return (EINVAL);
3153 
3154         /* ohds shouldn't be modified unless 'force' */
3155         if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
3156                 return (ETXTBSY);
3157 
3158         /* adjust amount of any unconsumed refreservation */
3159         csa->unused_refres_delta =
3160             (int64_t)MIN(csa->ohds->ds_reserved,
3161             csa->ohds->ds_phys->ds_unique_bytes) -
3162             (int64_t)MIN(csa->ohds->ds_reserved,
3163             csa->cds->ds_phys->ds_unique_bytes);
3164 
3165         if (csa->unused_refres_delta > 0 &&
3166             csa->unused_refres_delta >
3167             dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
3168                 return (ENOSPC);
3169 
3170         if (csa->ohds->ds_quota != 0 &&
3171             csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
3172                 return (EDQUOT);
3173 
3174         return (0);
3175 }
3176 
3177 /* ARGSUSED */
3178 static void
3179 dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3180 {
3181         struct cloneswaparg *csa = arg1;
3182         dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
3183 
3184         ASSERT(csa->cds->ds_reserved == 0);
3185         ASSERT(csa->ohds->ds_quota == 0 ||
3186             csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);
3187 
3188         dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
3189         dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
3190 
3191         if (csa->cds->ds_objset != NULL) {
3192                 dmu_objset_evict(csa->cds->ds_objset);
3193                 csa->cds->ds_objset = NULL;
3194         }
3195 
3196         if (csa->ohds->ds_objset != NULL) {
3197                 dmu_objset_evict(csa->ohds->ds_objset);
3198                 csa->ohds->ds_objset = NULL;
3199         }
3200 
3201         /*
3202          * Reset origin's unique bytes, if it exists.
3203          */
3204         if (csa->cds->ds_prev) {
3205                 dsl_dataset_t *origin = csa->cds->ds_prev;
3206                 uint64_t comp, uncomp;
3207 
3208                 dmu_buf_will_dirty(origin->ds_dbuf, tx);
3209                 dsl_deadlist_space_range(&csa->cds->ds_deadlist,
3210                     origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
3211                     &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
3212         }
3213 
3214         /* swap blkptrs */
3215         {
3216                 blkptr_t tmp;
3217                 tmp = csa->ohds->ds_phys->ds_bp;
3218                 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
3219                 csa->cds->ds_phys->ds_bp = tmp;
3220         }
3221 
3222         /* set dd_*_bytes */
3223         {
3224                 int64_t dused, dcomp, duncomp;
3225                 uint64_t cdl_used, cdl_comp, cdl_uncomp;
3226                 uint64_t odl_used, odl_comp, odl_uncomp;
3227 
3228                 ASSERT3U(csa->cds->ds_dir->dd_phys->
3229                     dd_used_breakdown[DD_USED_SNAP], ==, 0);
3230 
3231                 dsl_deadlist_space(&csa->cds->ds_deadlist,
3232                     &cdl_used, &cdl_comp, &cdl_uncomp);
3233                 dsl_deadlist_space(&csa->ohds->ds_deadlist,
3234                     &odl_used, &odl_comp, &odl_uncomp);
3235 
3236                 dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used -
3237                     (csa->ohds->ds_phys->ds_referenced_bytes + odl_used);
3238                 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
3239                     (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
3240                 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
3241                     cdl_uncomp -
3242                     (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
3243 
3244                 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
3245                     dused, dcomp, duncomp, tx);
3246                 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
3247                     -dused, -dcomp, -duncomp, tx);
3248 
3249                 /*
3250                  * The difference in the space used by snapshots is the
3251                  * difference in snapshot space due to the head's
3252                  * deadlist (since that's the only thing that's
3253                  * changing that affects the snapused).
3254                  */
3255                 dsl_deadlist_space_range(&csa->cds->ds_deadlist,
3256                     csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3257                     &cdl_used, &cdl_comp, &cdl_uncomp);
3258                 dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
3259                     csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3260                     &odl_used, &odl_comp, &odl_uncomp);
3261                 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
3262                     DD_USED_HEAD, DD_USED_SNAP, tx);
3263         }
3264 
3265         /* swap ds_*_bytes */
3266         SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes,
3267             csa->cds->ds_phys->ds_referenced_bytes);
3268         SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
3269             csa->cds->ds_phys->ds_compressed_bytes);
3270         SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
3271             csa->cds->ds_phys->ds_uncompressed_bytes);
3272         SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
3273             csa->cds->ds_phys->ds_unique_bytes);
3274 
3275         /* apply any parent delta for change in unconsumed refreservation */
3276         dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
3277             csa->unused_refres_delta, 0, 0, tx);
3278 
3279         /*
3280          * Swap deadlists.
3281          */
3282         dsl_deadlist_close(&csa->cds->ds_deadlist);
3283         dsl_deadlist_close(&csa->ohds->ds_deadlist);
3284         SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
3285             csa->cds->ds_phys->ds_deadlist_obj);
3286         dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
3287             csa->cds->ds_phys->ds_deadlist_obj);
3288         dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
3289             csa->ohds->ds_phys->ds_deadlist_obj);
3290 
3291         dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
3292 
3293         spa_history_log_internal_ds(csa->cds, "clone swap", tx,
3294             "parent=%s", csa->ohds->ds_dir->dd_myname);
3295 }
3296 
3297 /*
3298  * Swap 'clone' with its origin head datasets.  Used at the end of "zfs
3299  * recv" into an existing fs to swizzle the file system to the new
3300  * version, and by "zfs rollback".  Can also be used to swap two
3301  * independent head datasets if neither has any snapshots.
3302  */
3303 int
3304 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
3305     boolean_t force)
3306 {
3307         struct cloneswaparg csa;
3308         int error;
3309 
3310         ASSERT(clone->ds_owner);
3311         ASSERT(origin_head->ds_owner);
3312 retry:
3313         /*
3314          * Need exclusive access for the swap. If we're swapping these
3315          * datasets back after an error, we already hold the locks.
3316          */
3317         if (!RW_WRITE_HELD(&clone->ds_rwlock))
3318                 rw_enter(&clone->ds_rwlock, RW_WRITER);
3319         if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
3320             !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
3321                 rw_exit(&clone->ds_rwlock);
3322                 rw_enter(&origin_head->ds_rwlock, RW_WRITER);
3323                 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
3324                         rw_exit(&origin_head->ds_rwlock);
3325                         goto retry;
3326                 }
3327         }
3328         csa.cds = clone;
3329         csa.ohds = origin_head;
3330         csa.force = force;
3331         error = dsl_sync_task_do(clone->ds_dir->dd_pool,
3332             dsl_dataset_clone_swap_check,
3333             dsl_dataset_clone_swap_sync, &csa, NULL, 9);
3334         return (error);
3335 }
3336 
3337 /*
3338  * Given a pool name and a dataset object number in that pool,
3339  * return the name of that dataset.
3340  */
3341 int
3342 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
3343 {
3344         spa_t *spa;
3345         dsl_pool_t *dp;
3346         dsl_dataset_t *ds;
3347         int error;
3348 
3349         if ((error = spa_open(pname, &spa, FTAG)) != 0)
3350                 return (error);
3351         dp = spa_get_dsl(spa);
3352         rw_enter(&dp->dp_config_rwlock, RW_READER);
3353         if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
3354                 dsl_dataset_name(ds, buf);
3355                 dsl_dataset_rele(ds, FTAG);
3356         }
3357         rw_exit(&dp->dp_config_rwlock);
3358         spa_close(spa, FTAG);
3359 
3360         return (error);
3361 }
3362 
3363 int
3364 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
3365     uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
3366 {
3367         int error = 0;
3368 
3369         ASSERT3S(asize, >, 0);
3370 
3371         /*
3372          * *ref_rsrv is the portion of asize that will come from any
3373          * unconsumed refreservation space.
3374          */
3375         *ref_rsrv = 0;
3376 
3377         mutex_enter(&ds->ds_lock);
3378         /*
3379          * Make a space adjustment for reserved bytes.
3380          */
3381         if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
3382                 ASSERT3U(*used, >=,
3383                     ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3384                 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3385                 *ref_rsrv =
3386                     asize - MIN(asize, parent_delta(ds, asize + inflight));
3387         }
3388 
3389         if (!check_quota || ds->ds_quota == 0) {
3390                 mutex_exit(&ds->ds_lock);
3391                 return (0);
3392         }
3393         /*
3394          * If they are requesting more space, and our current estimate
3395          * is over quota, they get to try again unless the actual
3396          * on-disk is over quota and there are no pending changes (which
3397          * may free up space for us).
3398          */
3399         if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
3400                 if (inflight > 0 ||
3401                     ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
3402                         error = ERESTART;
3403                 else
3404                         error = EDQUOT;
3405         }
3406         mutex_exit(&ds->ds_lock);
3407 
3408         return (error);
3409 }
3410 
3411 /* ARGSUSED */
3412 static int
3413 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
3414 {
3415         dsl_dataset_t *ds = arg1;
3416         dsl_prop_setarg_t *psa = arg2;
3417         int err;
3418 
3419         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
3420                 return (ENOTSUP);
3421 
3422         if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3423                 return (err);
3424 
3425         if (psa->psa_effective_value == 0)
3426                 return (0);
3427 
3428         if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
3429             psa->psa_effective_value < ds->ds_reserved)
3430                 return (ENOSPC);
3431 
3432         return (0);
3433 }
3434 
3435 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
3436 
3437 void
3438 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3439 {
3440         dsl_dataset_t *ds = arg1;
3441         dsl_prop_setarg_t *psa = arg2;
3442         uint64_t effective_value = psa->psa_effective_value;
3443 
3444         dsl_prop_set_sync(ds, psa, tx);
3445         DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3446 
3447         if (ds->ds_quota != effective_value) {
3448                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3449                 ds->ds_quota = effective_value;
3450 
3451                 spa_history_log_internal_ds(ds, "set refquota", tx,
3452                     "refquota=%lld", (longlong_t)ds->ds_quota);
3453         }
3454 }
3455 
3456 int
3457 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
3458 {
3459         dsl_dataset_t *ds;
3460         dsl_prop_setarg_t psa;
3461         int err;
3462 
3463         dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
3464 
3465         err = dsl_dataset_hold(dsname, FTAG, &ds);
3466         if (err)
3467                 return (err);
3468 
3469         /*
3470          * If someone removes a file, then tries to set the quota, we
3471          * want to make sure the file freeing takes effect.
3472          */
3473         txg_wait_open(ds->ds_dir->dd_pool, 0);
3474 
3475         err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3476             dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
3477             ds, &psa, 0);
3478 
3479         dsl_dataset_rele(ds, FTAG);
3480         return (err);
3481 }
3482 
3483 static int
3484 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
3485 {
3486         dsl_dataset_t *ds = arg1;
3487         dsl_prop_setarg_t *psa = arg2;
3488         uint64_t effective_value;
3489         uint64_t unique;
3490         int err;
3491 
3492         if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
3493             SPA_VERSION_REFRESERVATION)
3494                 return (ENOTSUP);
3495 
3496         if (dsl_dataset_is_snapshot(ds))
3497                 return (EINVAL);
3498 
3499         if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3500                 return (err);
3501 
3502         effective_value = psa->psa_effective_value;
3503 
3504         /*
3505          * If we are doing the preliminary check in open context, the
3506          * space estimates may be inaccurate.
3507          */
3508         if (!dmu_tx_is_syncing(tx))
3509                 return (0);
3510 
3511         mutex_enter(&ds->ds_lock);
3512         if (!DS_UNIQUE_IS_ACCURATE(ds))
3513                 dsl_dataset_recalc_head_uniq(ds);
3514         unique = ds->ds_phys->ds_unique_bytes;
3515         mutex_exit(&ds->ds_lock);
3516 
3517         if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
3518                 uint64_t delta = MAX(unique, effective_value) -
3519                     MAX(unique, ds->ds_reserved);
3520 
3521                 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
3522                         return (ENOSPC);
3523                 if (ds->ds_quota > 0 &&
3524                     effective_value > ds->ds_quota)
3525                         return (ENOSPC);
3526         }
3527 
3528         return (0);
3529 }
3530 
3531 static void
3532 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3533 {
3534         dsl_dataset_t *ds = arg1;
3535         dsl_prop_setarg_t *psa = arg2;
3536         uint64_t effective_value = psa->psa_effective_value;
3537         uint64_t unique;
3538         int64_t delta;
3539 
3540         dsl_prop_set_sync(ds, psa, tx);
3541         DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3542 
3543         dmu_buf_will_dirty(ds->ds_dbuf, tx);
3544 
3545         mutex_enter(&ds->ds_dir->dd_lock);
3546         mutex_enter(&ds->ds_lock);
3547         ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
3548         unique = ds->ds_phys->ds_unique_bytes;
3549         delta = MAX(0, (int64_t)(effective_value - unique)) -
3550             MAX(0, (int64_t)(ds->ds_reserved - unique));
3551         ds->ds_reserved = effective_value;
3552         mutex_exit(&ds->ds_lock);
3553 
3554         dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
3555         mutex_exit(&ds->ds_dir->dd_lock);
3556 
3557         spa_history_log_internal_ds(ds, "set refreservation", tx,
3558             "refreservation=%lld", (longlong_t)effective_value);
3559 }
3560 
3561 int
3562 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
3563     uint64_t reservation)
3564 {
3565         dsl_dataset_t *ds;
3566         dsl_prop_setarg_t psa;
3567         int err;
3568 
3569         dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
3570             &reservation);
3571 
3572         err = dsl_dataset_hold(dsname, FTAG, &ds);
3573         if (err)
3574                 return (err);
3575 
3576         err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3577             dsl_dataset_set_reservation_check,
3578             dsl_dataset_set_reservation_sync, ds, &psa, 0);
3579 
3580         dsl_dataset_rele(ds, FTAG);
3581         return (err);
3582 }
3583 
/*
 * Context allocated by dsl_register_onexit_hold_cleanup() and consumed
 * (then freed) by dsl_dataset_user_release_onexit().
 */
typedef struct zfs_hold_cleanup_arg {
        dsl_pool_t *dp;         /* pool of the held dataset */
        uint64_t dsobj;         /* object number of the held dataset */
        char htag[MAXNAMELEN];  /* private copy of the hold tag */
} zfs_hold_cleanup_arg_t;
3589 
3590 static void
3591 dsl_dataset_user_release_onexit(void *arg)
3592 {
3593         zfs_hold_cleanup_arg_t *ca = arg;
3594 
3595         (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag,
3596             B_TRUE);
3597         kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
3598 }
3599 
3600 void
3601 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
3602     minor_t minor)
3603 {
3604         zfs_hold_cleanup_arg_t *ca;
3605 
3606         ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP);
3607         ca->dp = ds->ds_dir->dd_pool;
3608         ca->dsobj = ds->ds_object;
3609         (void) strlcpy(ca->htag, htag, sizeof (ca->htag));
3610         VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
3611             dsl_dataset_user_release_onexit, ca, NULL));
3612 }
3613 
3614 /*
3615  * If you add new checks here, you may need to add
3616  * additional checks to the "temporary" case in
3617  * snapshot_check() in dmu_objset.c.
3618  */
3619 static int
3620 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
3621 {
3622         dsl_dataset_t *ds = arg1;
3623         struct dsl_ds_holdarg *ha = arg2;
3624         const char *htag = ha->htag;
3625         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3626         int error = 0;
3627 
3628         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3629                 return (ENOTSUP);
3630 
3631         if (!dsl_dataset_is_snapshot(ds))
3632                 return (EINVAL);
3633 
3634         /* tags must be unique */
3635         mutex_enter(&ds->ds_lock);
3636         if (ds->ds_phys->ds_userrefs_obj) {
3637                 error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
3638                     8, 1, tx);
3639                 if (error == 0)
3640                         error = EEXIST;
3641                 else if (error == ENOENT)
3642                         error = 0;
3643         }
3644         mutex_exit(&ds->ds_lock);
3645 
3646         if (error == 0 && ha->temphold &&
3647             strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
3648                 error = E2BIG;
3649 
3650         return (error);
3651 }
3652 
3653 void
3654 dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3655 {
3656         dsl_dataset_t *ds = arg1;
3657         struct dsl_ds_holdarg *ha = arg2;
3658         const char *htag = ha->htag;
3659         dsl_pool_t *dp = ds->ds_dir->dd_pool;
3660         objset_t *mos = dp->dp_meta_objset;
3661         uint64_t now = gethrestime_sec();
3662         uint64_t zapobj;
3663 
3664         mutex_enter(&ds->ds_lock);
3665         if (ds->ds_phys->ds_userrefs_obj == 0) {
3666                 /*
3667                  * This is the first user hold for this dataset.  Create
3668                  * the userrefs zap object.
3669                  */
3670                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3671                 zapobj = ds->ds_phys->ds_userrefs_obj =
3672                     zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
3673         } else {
3674                 zapobj = ds->ds_phys->ds_userrefs_obj;
3675         }
3676         ds->ds_userrefs++;
3677         mutex_exit(&ds->ds_lock);
3678 
3679         VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
3680 
3681         if (ha->temphold) {
3682                 VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
3683                     htag, &now, tx));
3684         }
3685 
3686         spa_history_log_internal_ds(ds, "hold", tx,
3687             "tag = %s temp = %d holds now = %llu",
3688             htag, (int)ha->temphold, ds->ds_userrefs);
3689 }
3690 
3691 static int
3692 dsl_dataset_user_hold_one(const char *dsname, void *arg)
3693 {
3694         struct dsl_ds_holdarg *ha = arg;
3695         dsl_dataset_t *ds;
3696         int error;
3697         char *name;
3698 
3699         /* alloc a buffer to hold dsname@snapname plus terminating NULL */
3700         name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3701         error = dsl_dataset_hold(name, ha->dstg, &ds);
3702         strfree(name);
3703         if (error == 0) {
3704                 ha->gotone = B_TRUE;
3705                 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
3706                     dsl_dataset_user_hold_sync, ds, ha, 0);
3707         } else if (error == ENOENT && ha->recursive) {
3708                 error = 0;
3709         } else {
3710                 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3711         }
3712         return (error);
3713 }
3714 
3715 int
3716 dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
3717     boolean_t temphold)
3718 {
3719         struct dsl_ds_holdarg *ha;
3720         int error;
3721 
3722         ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3723         ha->htag = htag;
3724         ha->temphold = temphold;
3725         error = dsl_sync_task_do(ds->ds_dir->dd_pool,
3726             dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
3727             ds, ha, 0);
3728         kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3729 
3730         return (error);
3731 }
3732 
3733 int
3734 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
3735     boolean_t recursive, boolean_t temphold, int cleanup_fd)
3736 {
3737         struct dsl_ds_holdarg *ha;
3738         dsl_sync_task_t *dst;
3739         spa_t *spa;
3740         int error;
3741         minor_t minor = 0;
3742 
3743         if (cleanup_fd != -1) {
3744                 /* Currently we only support cleanup-on-exit of tempholds. */
3745                 if (!temphold)
3746                         return (EINVAL);
3747                 error = zfs_onexit_fd_hold(cleanup_fd, &minor);
3748                 if (error)
3749                         return (error);
3750         }
3751 
3752         ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3753 
3754         (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3755 
3756         error = spa_open(dsname, &spa, FTAG);
3757         if (error) {
3758                 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3759                 if (cleanup_fd != -1)
3760                         zfs_onexit_fd_rele(cleanup_fd);
3761                 return (error);
3762         }
3763 
3764         ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
3765         ha->htag = htag;
3766         ha->snapname = snapname;
3767         ha->recursive = recursive;
3768         ha->temphold = temphold;
3769 
3770         if (recursive) {
3771                 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
3772                     ha, DS_FIND_CHILDREN);
3773         } else {
3774                 error = dsl_dataset_user_hold_one(dsname, ha);
3775         }
3776         if (error == 0)
3777                 error = dsl_sync_task_group_wait(ha->dstg);
3778 
3779         for (dst = list_head(&ha->dstg->dstg_tasks); dst;
3780             dst = list_next(&ha->dstg->dstg_tasks, dst)) {
3781                 dsl_dataset_t *ds = dst->dst_arg1;
3782 
3783                 if (dst->dst_err) {
3784                         dsl_dataset_name(ds, ha->failed);
3785                         *strchr(ha->failed, '@') = '\0';
3786                 } else if (error == 0 && minor != 0 && temphold) {
3787                         /*
3788                          * If this hold is to be released upon process exit,
3789                          * register that action now.
3790                          */
3791                         dsl_register_onexit_hold_cleanup(ds, htag, minor);
3792                 }
3793                 dsl_dataset_rele(ds, ha->dstg);
3794         }
3795 
3796         if (error == 0 && recursive && !ha->gotone)
3797                 error = ENOENT;
3798 
3799         if (error)
3800                 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
3801 
3802         dsl_sync_task_group_destroy(ha->dstg);
3803 
3804         kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3805         spa_close(spa, FTAG);
3806         if (cleanup_fd != -1)
3807                 zfs_onexit_fd_rele(cleanup_fd);
3808         return (error);
3809 }
3810 
/*
 * Per-dataset argument of the user-release sync task; allocated in
 * dsl_dataset_user_release_one() and read by
 * dsl_dataset_user_release_check() and dsl_dataset_user_release_sync().
 */
struct dsl_ds_releasearg {
        dsl_dataset_t *ds;      /* dataset whose hold is being released */
        const char *htag;       /* tag of the hold to release */
        boolean_t own;          /* do we own or just hold ds? */
};
3816 
3817 static int
3818 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
3819     boolean_t *might_destroy)
3820 {
3821         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3822         uint64_t zapobj;
3823         uint64_t tmp;
3824         int error;
3825 
3826         *might_destroy = B_FALSE;
3827 
3828         mutex_enter(&ds->ds_lock);
3829         zapobj = ds->ds_phys->ds_userrefs_obj;
3830         if (zapobj == 0) {
3831                 /* The tag can't possibly exist */
3832                 mutex_exit(&ds->ds_lock);
3833                 return (ESRCH);
3834         }
3835 
3836         /* Make sure the tag exists */
3837         error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
3838         if (error) {
3839                 mutex_exit(&ds->ds_lock);
3840                 if (error == ENOENT)
3841                         error = ESRCH;
3842                 return (error);
3843         }
3844 
3845         if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
3846             DS_IS_DEFER_DESTROY(ds))
3847                 *might_destroy = B_TRUE;
3848 
3849         mutex_exit(&ds->ds_lock);
3850         return (0);
3851 }
3852 
3853 static int
3854 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
3855 {
3856         struct dsl_ds_releasearg *ra = arg1;
3857         dsl_dataset_t *ds = ra->ds;
3858         boolean_t might_destroy;
3859         int error;
3860 
3861         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3862                 return (ENOTSUP);
3863 
3864         error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
3865         if (error)
3866                 return (error);
3867 
3868         if (might_destroy) {
3869                 struct dsl_ds_destroyarg dsda = {0};
3870 
3871                 if (dmu_tx_is_syncing(tx)) {
3872                         /*
3873                          * If we're not prepared to remove the snapshot,
3874                          * we can't allow the release to happen right now.
3875                          */
3876                         if (!ra->own)
3877                                 return (EBUSY);
3878                 }
3879                 dsda.ds = ds;
3880                 dsda.releasing = B_TRUE;
3881                 return (dsl_dataset_destroy_check(&dsda, tag, tx));
3882         }
3883 
3884         return (0);
3885 }
3886 
3887 static void
3888 dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
3889 {
3890         struct dsl_ds_releasearg *ra = arg1;
3891         dsl_dataset_t *ds = ra->ds;
3892         dsl_pool_t *dp = ds->ds_dir->dd_pool;
3893         objset_t *mos = dp->dp_meta_objset;
3894         uint64_t zapobj;
3895         uint64_t refs;
3896         int error;
3897 
3898         mutex_enter(&ds->ds_lock);
3899         ds->ds_userrefs--;
3900         refs = ds->ds_userrefs;
3901         mutex_exit(&ds->ds_lock);
3902         error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
3903         VERIFY(error == 0 || error == ENOENT);
3904         zapobj = ds->ds_phys->ds_userrefs_obj;
3905         VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
3906         if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
3907             DS_IS_DEFER_DESTROY(ds)) {
3908                 struct dsl_ds_destroyarg dsda = {0};
3909 
3910                 ASSERT(ra->own);
3911                 dsda.ds = ds;
3912                 dsda.releasing = B_TRUE;
3913                 /* We already did the destroy_check */
3914                 dsl_dataset_destroy_sync(&dsda, tag, tx);
3915         }
3916 
3917         spa_history_log_internal_ds(ds, "release", tx,
3918             "tag = %s refs now = %lld", ra->htag, (longlong_t)refs);
3919 }
3920 
3921 static int
3922 dsl_dataset_user_release_one(const char *dsname, void *arg)
3923 {
3924         struct dsl_ds_holdarg *ha = arg;
3925         struct dsl_ds_releasearg *ra;
3926         dsl_dataset_t *ds;
3927         int error;
3928         void *dtag = ha->dstg;
3929         char *name;
3930         boolean_t own = B_FALSE;
3931         boolean_t might_destroy;
3932 
3933         /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
3934         name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3935         error = dsl_dataset_hold(name, dtag, &ds);
3936         strfree(name);
3937         if (error == ENOENT && ha->recursive)
3938                 return (0);
3939         (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3940         if (error)
3941                 return (error);
3942 
3943         ha->gotone = B_TRUE;
3944 
3945         ASSERT(dsl_dataset_is_snapshot(ds));
3946 
3947         error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
3948         if (error) {
3949                 dsl_dataset_rele(ds, dtag);
3950                 return (error);
3951         }
3952 
3953         if (might_destroy) {
3954 #ifdef _KERNEL
3955                 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3956                 error = zfs_unmount_snap(name, NULL);
3957                 strfree(name);
3958                 if (error) {
3959                         dsl_dataset_rele(ds, dtag);
3960                         return (error);
3961                 }
3962 #endif
3963                 if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
3964                         dsl_dataset_rele(ds, dtag);
3965                         return (EBUSY);
3966                 } else {
3967                         own = B_TRUE;
3968                         dsl_dataset_make_exclusive(ds, dtag);
3969                 }
3970         }
3971 
3972         ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
3973         ra->ds = ds;
3974         ra->htag = ha->htag;
3975         ra->own = own;
3976         dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
3977             dsl_dataset_user_release_sync, ra, dtag, 0);
3978 
3979         return (0);
3980 }
3981 
3982 int
3983 dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
3984     boolean_t recursive)
3985 {
3986         struct dsl_ds_holdarg *ha;
3987         dsl_sync_task_t *dst;
3988         spa_t *spa;
3989         int error;
3990 
3991 top:
3992         ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3993 
3994         (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3995 
3996         error = spa_open(dsname, &spa, FTAG);
3997         if (error) {
3998                 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3999                 return (error);
4000         }
4001 
4002         ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
4003         ha->htag = htag;
4004         ha->snapname = snapname;
4005         ha->recursive = recursive;
4006         if (recursive) {
4007                 error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
4008                     ha, DS_FIND_CHILDREN);
4009         } else {
4010                 error = dsl_dataset_user_release_one(dsname, ha);
4011         }
4012         if (error == 0)
4013                 error = dsl_sync_task_group_wait(ha->dstg);
4014 
4015         for (dst = list_head(&ha->dstg->dstg_tasks); dst;
4016             dst = list_next(&ha->dstg->dstg_tasks, dst)) {
4017                 struct dsl_ds_releasearg *ra = dst->dst_arg1;
4018                 dsl_dataset_t *ds = ra->ds;
4019 
4020                 if (dst->dst_err)
4021                         dsl_dataset_name(ds, ha->failed);
4022 
4023                 if (ra->own)
4024                         dsl_dataset_disown(ds, ha->dstg);
4025                 else
4026                         dsl_dataset_rele(ds, ha->dstg);
4027 
4028                 kmem_free(ra, sizeof (struct dsl_ds_releasearg));
4029         }
4030 
4031         if (error == 0 && recursive && !ha->gotone)
4032                 error = ENOENT;
4033 
4034         if (error && error != EBUSY)
4035                 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
4036 
4037         dsl_sync_task_group_destroy(ha->dstg);
4038         kmem_free(ha, sizeof (struct dsl_ds_holdarg));
4039         spa_close(spa, FTAG);
4040 
4041         /*
4042          * We can get EBUSY if we were racing with deferred destroy and
4043          * dsl_dataset_user_release_check() hadn't done the necessary
4044          * open context setup.  We can also get EBUSY if we're racing
4045          * with destroy and that thread is the ds_owner.  Either way
4046          * the busy condition should be transient, and we should retry
4047          * the release operation.
4048          */
4049         if (error == EBUSY)
4050                 goto top;
4051 
4052         return (error);
4053 }
4054 
4055 /*
4056  * Called at spa_load time (with retry == B_FALSE) to release a stale
4057  * temporary user hold. Also called by the onexit code (with retry == B_TRUE).
4058  */
4059 int
4060 dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
4061     boolean_t retry)
4062 {
4063         dsl_dataset_t *ds;
4064         char *snap;
4065         char *name;
4066         int namelen;
4067         int error;
4068 
4069         do {
4070                 rw_enter(&dp->dp_config_rwlock, RW_READER);
4071                 error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
4072                 rw_exit(&dp->dp_config_rwlock);
4073                 if (error)
4074                         return (error);
4075                 namelen = dsl_dataset_namelen(ds)+1;
4076                 name = kmem_alloc(namelen, KM_SLEEP);
4077                 dsl_dataset_name(ds, name);
4078                 dsl_dataset_rele(ds, FTAG);
4079 
4080                 snap = strchr(name, '@');
4081                 *snap = '\0';
4082                 ++snap;
4083                 error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
4084                 kmem_free(name, namelen);
4085 
4086                 /*
4087                  * The object can't have been destroyed because we have a hold,
4088                  * but it might have been renamed, resulting in ENOENT.  Retry
4089                  * if we've been requested to do so.
4090                  *
4091                  * It would be nice if we could use the dsobj all the way
4092                  * through and avoid ENOENT entirely.  But we might need to
4093                  * unmount the snapshot, and there's currently no way to lookup
4094                  * a vfsp using a ZFS object id.
4095                  */
4096         } while ((error == ENOENT) && retry);
4097 
4098         return (error);
4099 }
4100 
4101 int
4102 dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
4103 {
4104         dsl_dataset_t *ds;
4105         int err;
4106 
4107         err = dsl_dataset_hold(dsname, FTAG, &ds);
4108         if (err)
4109                 return (err);
4110 
4111         VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
4112         if (ds->ds_phys->ds_userrefs_obj != 0) {
4113                 zap_attribute_t *za;
4114                 zap_cursor_t zc;
4115 
4116                 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
4117                 for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
4118                     ds->ds_phys->ds_userrefs_obj);
4119                     zap_cursor_retrieve(&zc, za) == 0;
4120                     zap_cursor_advance(&zc)) {
4121                         VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
4122                             za->za_first_integer));
4123                 }
4124                 zap_cursor_fini(&zc);
4125                 kmem_free(za, sizeof (zap_attribute_t));
4126         }
4127         dsl_dataset_rele(ds, FTAG);
4128         return (0);
4129 }
4130 
4131 /*
4132  * Note, this function is used as the callback for dmu_objset_find().  We
4133  * always return 0 so that we will continue to find and process
4134  * inconsistent datasets, even if we encounter an error trying to
4135  * process one of them.
4136  */
4137 /* ARGSUSED */
4138 int
4139 dsl_destroy_inconsistent(const char *dsname, void *arg)
4140 {
4141         dsl_dataset_t *ds;
4142 
4143         if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
4144                 if (DS_IS_INCONSISTENT(ds))
4145                         (void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
4146                 else
4147                         dsl_dataset_disown(ds, FTAG);
4148         }
4149         return (0);
4150 }
4151 
4152 /*
4153  * Return (in *usedp) the amount of space written in new that is not
4154  * present in oldsnap.  New may be a snapshot or the head.  Old must be
4155  * a snapshot before new, in new's filesystem (or its origin).  If not then
4156  * fail and return EINVAL.
4157  *
4158  * The written space is calculated by considering two components:  First, we
4159  * ignore any freed space, and calculate the written as new's used space
4160  * minus old's used space.  Next, we add in the amount of space that was freed
4161  * between the two snapshots, thus reducing new's used space relative to old's.
4162  * Specifically, this is the space that was born before old->ds_creation_txg,
4163  * and freed before new (ie. on new's deadlist or a previous deadlist).
4164  *
4165  * space freed                         [---------------------]
4166  * snapshots                       ---O-------O--------O-------O------
4167  *                                         oldsnap            new
4168  */
4169 int
4170 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
4171     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4172 {
4173         int err = 0;
4174         uint64_t snapobj;
4175         dsl_pool_t *dp = new->ds_dir->dd_pool;
4176 
4177         *usedp = 0;
4178         *usedp += new->ds_phys->ds_referenced_bytes;
4179         *usedp -= oldsnap->ds_phys->ds_referenced_bytes;
4180 
4181         *compp = 0;
4182         *compp += new->ds_phys->ds_compressed_bytes;
4183         *compp -= oldsnap->ds_phys->ds_compressed_bytes;
4184 
4185         *uncompp = 0;
4186         *uncompp += new->ds_phys->ds_uncompressed_bytes;
4187         *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
4188 
4189         rw_enter(&dp->dp_config_rwlock, RW_READER);
4190         snapobj = new->ds_object;
4191         while (snapobj != oldsnap->ds_object) {
4192                 dsl_dataset_t *snap;
4193                 uint64_t used, comp, uncomp;
4194 
4195                 if (snapobj == new->ds_object) {
4196                         snap = new;
4197                 } else {
4198                         err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
4199                         if (err != 0)
4200                                 break;
4201                 }
4202 
4203                 if (snap->ds_phys->ds_prev_snap_txg ==
4204                     oldsnap->ds_phys->ds_creation_txg) {
4205                         /*
4206                          * The blocks in the deadlist can not be born after
4207                          * ds_prev_snap_txg, so get the whole deadlist space,
4208                          * which is more efficient (especially for old-format
4209                          * deadlists).  Unfortunately the deadlist code
4210                          * doesn't have enough information to make this
4211                          * optimization itself.
4212                          */
4213                         dsl_deadlist_space(&snap->ds_deadlist,
4214                             &used, &comp, &uncomp);
4215                 } else {
4216                         dsl_deadlist_space_range(&snap->ds_deadlist,
4217                             0, oldsnap->ds_phys->ds_creation_txg,
4218                             &used, &comp, &uncomp);
4219                 }
4220                 *usedp += used;
4221                 *compp += comp;
4222                 *uncompp += uncomp;
4223 
4224                 /*
4225                  * If we get to the beginning of the chain of snapshots
4226                  * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
4227                  * was not a snapshot of/before new.
4228                  */
4229                 snapobj = snap->ds_phys->ds_prev_snap_obj;
4230                 if (snap != new)
4231                         dsl_dataset_rele(snap, FTAG);
4232                 if (snapobj == 0) {
4233                         err = EINVAL;
4234                         break;
4235                 }
4236 
4237         }
4238         rw_exit(&dp->dp_config_rwlock);
4239         return (err);
4240 }
4241 
4242 /*
4243  * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
4244  * lastsnap, and all snapshots in between are deleted.
4245  *
4246  * blocks that would be freed            [---------------------------]
4247  * snapshots                       ---O-------O--------O-------O--------O
4248  *                                        firstsnap        lastsnap
4249  *
4250  * This is the set of blocks that were born after the snap before firstsnap,
4251  * (birth > firstsnap->prev_snap_txg) and died before the snap after the
4252  * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
4253  * We calculate this by iterating over the relevant deadlists (from the snap
4254  * after lastsnap, backward to the snap after firstsnap), summing up the
4255  * space on the deadlist that was born after the snap before firstsnap.
4256  */
4257 int
4258 dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
4259     dsl_dataset_t *lastsnap,
4260     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4261 {
4262         int err = 0;
4263         uint64_t snapobj;
4264         dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
4265 
4266         ASSERT(dsl_dataset_is_snapshot(firstsnap));
4267         ASSERT(dsl_dataset_is_snapshot(lastsnap));
4268 
4269         /*
4270          * Check that the snapshots are in the same dsl_dir, and firstsnap
4271          * is before lastsnap.
4272          */
4273         if (firstsnap->ds_dir != lastsnap->ds_dir ||
4274             firstsnap->ds_phys->ds_creation_txg >
4275             lastsnap->ds_phys->ds_creation_txg)
4276                 return (EINVAL);
4277 
4278         *usedp = *compp = *uncompp = 0;
4279 
4280         rw_enter(&dp->dp_config_rwlock, RW_READER);
4281         snapobj = lastsnap->ds_phys->ds_next_snap_obj;
4282         while (snapobj != firstsnap->ds_object) {
4283                 dsl_dataset_t *ds;
4284                 uint64_t used, comp, uncomp;
4285 
4286                 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
4287                 if (err != 0)
4288                         break;
4289 
4290                 dsl_deadlist_space_range(&ds->ds_deadlist,
4291                     firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
4292                     &used, &comp, &uncomp);
4293                 *usedp += used;
4294                 *compp += comp;
4295                 *uncompp += uncomp;
4296 
4297                 snapobj = ds->ds_phys->ds_prev_snap_obj;
4298                 ASSERT3U(snapobj, !=, 0);
4299                 dsl_dataset_rele(ds, FTAG);
4300         }
4301         rw_exit(&dp->dp_config_rwlock);
4302         return (err);
4303 }