illumos-gate New usr/src/uts/common/fs/zfs/dsl

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  25  */
  26 
  27 #include <sys/dmu_objset.h>
  28 #include <sys/dsl_dataset.h>
  29 #include <sys/dsl_dir.h>
  30 #include <sys/dsl_prop.h>
  31 #include <sys/dsl_synctask.h>
  32 #include <sys/dmu_traverse.h>
  33 #include <sys/dmu_impl.h>
  34 #include <sys/dmu_tx.h>
  35 #include <sys/arc.h>
  36 #include <sys/zio.h>
  37 #include <sys/zap.h>
  38 #include <sys/zfeature.h>
  39 #include <sys/unique.h>
  40 #include <sys/zfs_context.h>
  41 #include <sys/zfs_ioctl.h>
  42 #include <sys/spa.h>
  43 #include <sys/zfs_znode.h>
  44 #include <sys/zfs_onexit.h>
  45 #include <sys/zvol.h>
  46 #include <sys/dsl_scan.h>
  47 #include <sys/dsl_deadlist.h>
  48 
/*
 * Sentinel owner value.  DSL_DATASET_IS_DESTROYED() tests whether a
 * dataset's ds_owner is equal to this pointer.
 */
static char *dsl_reaper = "the grim reaper";

/*
 * Forward declarations of sync-task check/sync callbacks (presumably
 * defined later in this file -- they are not visible in this section).
 */
static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
  54 
  55 #define SWITCH64(x, y) \
  56         { \
  57                 uint64_t __tmp = (x); \
  58                 (x) = (y); \
  59                 (y) = __tmp; \
  60         }
  61 
  62 #define DS_REF_MAX      (1ULL << 62)
  63 
  64 #define DSL_DEADLIST_BLOCKSIZE  SPA_MAXBLOCKSIZE
  65 
  66 #define DSL_DATASET_IS_DESTROYED(ds)    ((ds)->ds_owner == dsl_reaper)
  67 
  68 
  69 /*
  70  * Figure out how much of this delta should be propogated to the dsl_dir
  71  * layer.  If there's a refreservation, that space has already been
  72  * partially accounted for in our ancestors.
  73  */
  74 static int64_t
  75 parent_delta(dsl_dataset_t *ds, int64_t delta)
  76 {
  77         uint64_t old_bytes, new_bytes;
  78 
  79         if (ds->ds_reserved == 0)
  80                 return (delta);
  81 
  82         old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
  83         new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
  84 
  85         ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
  86         return (new_bytes - old_bytes);
  87 }
  88 
  89 void
  90 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
  91 {
  92         int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
  93         int compressed = BP_GET_PSIZE(bp);
  94         int uncompressed = BP_GET_UCSIZE(bp);
  95         int64_t delta;
  96 
  97         dprintf_bp(bp, "ds=%p", ds);
  98 
  99         ASSERT(dmu_tx_is_syncing(tx));
 100         /* It could have been compressed away to nothing */
 101         if (BP_IS_HOLE(bp))
 102                 return;
 103         ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
 104         ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
 105         if (ds == NULL) {
 106                 /*
 107                  * Account for the meta-objset space in its placeholder
 108                  * dsl_dir.
 109                  */
 110                 ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
 111                 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
 112                     used, compressed, uncompressed, tx);
 113                 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
 114                 return;
 115         }
 116         dmu_buf_will_dirty(ds->ds_dbuf, tx);
 117 
 118         mutex_enter(&ds->ds_dir->dd_lock);
 119         mutex_enter(&ds->ds_lock);
 120         delta = parent_delta(ds, used);
 121         ds->ds_phys->ds_referenced_bytes += used;
 122         ds->ds_phys->ds_compressed_bytes += compressed;
 123         ds->ds_phys->ds_uncompressed_bytes += uncompressed;
 124         ds->ds_phys->ds_unique_bytes += used;
 125         mutex_exit(&ds->ds_lock);
 126         dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
 127             compressed, uncompressed, tx);
 128         dsl_dir_transfer_space(ds->ds_dir, used - delta,
 129             DD_USED_REFRSRV, DD_USED_HEAD, tx);
 130         mutex_exit(&ds->ds_dir->dd_lock);
 131 }
 132 
 133 int
 134 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 135     boolean_t async)
 136 {
 137         if (BP_IS_HOLE(bp))
 138                 return (0);
 139 
 140         ASSERT(dmu_tx_is_syncing(tx));
 141         ASSERT(bp->blk_birth <= tx->tx_txg);
 142 
 143         int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 144         int compressed = BP_GET_PSIZE(bp);
 145         int uncompressed = BP_GET_UCSIZE(bp);
 146 
 147         ASSERT(used > 0);
 148         if (ds == NULL) {
 149                 /*
 150                  * Account for the meta-objset space in its placeholder
 151                  * dataset.
 152                  */
 153                 dsl_free(tx->tx_pool, tx->tx_txg, bp);
 154 
 155                 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
 156                     -used, -compressed, -uncompressed, tx);
 157                 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
 158                 return (used);
 159         }
 160         ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
 161 
 162         ASSERT(!dsl_dataset_is_snapshot(ds));
 163         dmu_buf_will_dirty(ds->ds_dbuf, tx);
 164 
 165         if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
 166                 int64_t delta;
 167 
 168                 dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
 169                 dsl_free(tx->tx_pool, tx->tx_txg, bp);
 170 
 171                 mutex_enter(&ds->ds_dir->dd_lock);
 172                 mutex_enter(&ds->ds_lock);
 173                 ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
 174                     !DS_UNIQUE_IS_ACCURATE(ds));
 175                 delta = parent_delta(ds, -used);
 176                 ds->ds_phys->ds_unique_bytes -= used;
 177                 mutex_exit(&ds->ds_lock);
 178                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
 179                     delta, -compressed, -uncompressed, tx);
 180                 dsl_dir_transfer_space(ds->ds_dir, -used - delta,
 181                     DD_USED_REFRSRV, DD_USED_HEAD, tx);
 182                 mutex_exit(&ds->ds_dir->dd_lock);
 183         } else {
 184                 dprintf_bp(bp, "putting on dead list: %s", "");
 185                 if (async) {
 186                         /*
 187                          * We are here as part of zio's write done callback,
 188                          * which means we're a zio interrupt thread.  We can't
 189                          * call dsl_deadlist_insert() now because it may block
 190                          * waiting for I/O.  Instead, put bp on the deferred
 191                          * queue and let dsl_pool_sync() finish the job.
 192                          */
 193                         bplist_append(&ds->ds_pending_deadlist, bp);
 194                 } else {
 195                         dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
 196                 }
 197                 ASSERT3U(ds->ds_prev->ds_object, ==,
 198                     ds->ds_phys->ds_prev_snap_obj);
 199                 ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
 200                 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
 201                 if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
 202                     ds->ds_object && bp->blk_birth >
 203                     ds->ds_prev->ds_phys->ds_prev_snap_txg) {
 204                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 205                         mutex_enter(&ds->ds_prev->ds_lock);
 206                         ds->ds_prev->ds_phys->ds_unique_bytes += used;
 207                         mutex_exit(&ds->ds_prev->ds_lock);
 208                 }
 209                 if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
 210                         dsl_dir_transfer_space(ds->ds_dir, used,
 211                             DD_USED_HEAD, DD_USED_SNAP, tx);
 212                 }
 213         }
 214         mutex_enter(&ds->ds_lock);
 215         ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
 216         ds->ds_phys->ds_referenced_bytes -= used;
 217         ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
 218         ds->ds_phys->ds_compressed_bytes -= compressed;
 219         ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
 220         ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
 221         mutex_exit(&ds->ds_lock);
 222 
 223         return (used);
 224 }
 225 
 226 uint64_t
 227 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
 228 {
 229         uint64_t trysnap = 0;
 230 
 231         if (ds == NULL)
 232                 return (0);
 233         /*
 234          * The snapshot creation could fail, but that would cause an
 235          * incorrect FALSE return, which would only result in an
 236          * overestimation of the amount of space that an operation would
 237          * consume, which is OK.
 238          *
 239          * There's also a small window where we could miss a pending
 240          * snapshot, because we could set the sync task in the quiescing
 241          * phase.  So this should only be used as a guess.
 242          */
 243         if (ds->ds_trysnap_txg >
 244             spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
 245                 trysnap = ds->ds_trysnap_txg;
 246         return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
 247 }
 248 
 249 boolean_t
 250 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
 251     uint64_t blk_birth)
 252 {
 253         if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
 254                 return (B_FALSE);
 255 
 256         ddt_prefetch(dsl_dataset_get_spa(ds), bp);
 257 
 258         return (B_TRUE);
 259 }
 260 
 261 /* ARGSUSED */
 262 static void
 263 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
 264 {
 265         dsl_dataset_t *ds = dsv;
 266 
 267         ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
 268 
 269         unique_remove(ds->ds_fsid_guid);
 270 
 271         if (ds->ds_objset != NULL)
 272                 dmu_objset_evict(ds->ds_objset);
 273 
 274         if (ds->ds_prev) {
 275                 dsl_dataset_drop_ref(ds->ds_prev, ds);
 276                 ds->ds_prev = NULL;
 277         }
 278 
 279         bplist_destroy(&ds->ds_pending_deadlist);
 280         if (db != NULL) {
 281                 dsl_deadlist_close(&ds->ds_deadlist);
 282         } else {
 283                 ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
 284                 ASSERT(!ds->ds_deadlist.dl_oldfmt);
 285         }
 286         if (ds->ds_dir)
 287                 dsl_dir_close(ds->ds_dir, ds);
 288 
 289         ASSERT(!list_link_active(&ds->ds_synced_link));
 290 
 291         mutex_destroy(&ds->ds_lock);
 292         mutex_destroy(&ds->ds_recvlock);
 293         mutex_destroy(&ds->ds_opening_lock);
 294         rw_destroy(&ds->ds_rwlock);
 295         cv_destroy(&ds->ds_exclusive_cv);
 296 
 297         kmem_free(ds, sizeof (dsl_dataset_t));
 298 }
 299 
 300 static int
 301 dsl_dataset_get_snapname(dsl_dataset_t *ds)
 302 {
 303         dsl_dataset_phys_t *headphys;
 304         int err;
 305         dmu_buf_t *headdbuf;
 306         dsl_pool_t *dp = ds->ds_dir->dd_pool;
 307         objset_t *mos = dp->dp_meta_objset;
 308 
 309         if (ds->ds_snapname[0])
 310                 return (0);
 311         if (ds->ds_phys->ds_next_snap_obj == 0)
 312                 return (0);
 313 
 314         err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
 315             FTAG, &headdbuf);
 316         if (err)
 317                 return (err);
 318         headphys = headdbuf->db_data;
 319         err = zap_value_search(dp->dp_meta_objset,
 320             headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
 321         dmu_buf_rele(headdbuf, FTAG);
 322         return (err);
 323 }
 324 
 325 static int
 326 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
 327 {
 328         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 329         uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
 330         matchtype_t mt;
 331         int err;
 332 
 333         if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
 334                 mt = MT_FIRST;
 335         else
 336                 mt = MT_EXACT;
 337 
 338         err = zap_lookup_norm(mos, snapobj, name, 8, 1,
 339             value, mt, NULL, 0, NULL);
 340         if (err == ENOTSUP && mt == MT_FIRST)
 341                 err = zap_lookup(mos, snapobj, name, 8, 1, value);
 342         return (err);
 343 }
 344 
 345 static int
 346 dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
 347 {
 348         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 349         uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
 350         matchtype_t mt;
 351         int err;
 352 
 353         dsl_dir_snap_cmtime_update(ds->ds_dir);
 354 
 355         if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
 356                 mt = MT_FIRST;
 357         else
 358                 mt = MT_EXACT;
 359 
 360         err = zap_remove_norm(mos, snapobj, name, mt, tx);
 361         if (err == ENOTSUP && mt == MT_FIRST)
 362                 err = zap_remove(mos, snapobj, name, tx);
 363         return (err);
 364 }
 365 
 366 static int
 367 dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 368     dsl_dataset_t **dsp)
 369 {
 370         objset_t *mos = dp->dp_meta_objset;
 371         dmu_buf_t *dbuf;
 372         dsl_dataset_t *ds;
 373         int err;
 374         dmu_object_info_t doi;
 375 
 376         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
 377             dsl_pool_sync_context(dp));
 378 
 379         err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
 380         if (err)
 381                 return (err);
 382 
 383         /* Make sure dsobj has the correct object type. */
 384         dmu_object_info_from_db(dbuf, &doi);
 385         if (doi.doi_type != DMU_OT_DSL_DATASET)
 386                 return (EINVAL);
 387 
 388         ds = dmu_buf_get_user(dbuf);
 389         if (ds == NULL) {
 390                 dsl_dataset_t *winner;
 391 
 392                 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
 393                 ds->ds_dbuf = dbuf;
 394                 ds->ds_object = dsobj;
 395                 ds->ds_phys = dbuf->db_data;
 396 
 397                 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
 398                 mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
 399                 mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
 400                 mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
 401 
 402                 rw_init(&ds->ds_rwlock, 0, 0, 0);
 403                 cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
 404 
 405                 bplist_create(&ds->ds_pending_deadlist);
 406                 dsl_deadlist_open(&ds->ds_deadlist,
 407                     mos, ds->ds_phys->ds_deadlist_obj);
 408 
 409                 list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
 410                     offsetof(dmu_sendarg_t, dsa_link));
 411 
 412                 if (err == 0) {
 413                         err = dsl_dir_open_obj(dp,
 414                             ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
 415                 }
 416                 if (err) {
 417                         mutex_destroy(&ds->ds_lock);
 418                         mutex_destroy(&ds->ds_recvlock);
 419                         mutex_destroy(&ds->ds_opening_lock);
 420                         rw_destroy(&ds->ds_rwlock);
 421                         cv_destroy(&ds->ds_exclusive_cv);
 422                         bplist_destroy(&ds->ds_pending_deadlist);
 423                         dsl_deadlist_close(&ds->ds_deadlist);
 424                         kmem_free(ds, sizeof (dsl_dataset_t));
 425                         dmu_buf_rele(dbuf, tag);
 426                         return (err);
 427                 }
 428 
 429                 if (!dsl_dataset_is_snapshot(ds)) {
 430                         ds->ds_snapname[0] = '\0';
 431                         if (ds->ds_phys->ds_prev_snap_obj) {
 432                                 err = dsl_dataset_get_ref(dp,
 433                                     ds->ds_phys->ds_prev_snap_obj,
 434                                     ds, &ds->ds_prev);
 435                         }
 436                 } else {
 437                         if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
 438                                 err = dsl_dataset_get_snapname(ds);
 439                         if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
 440                                 err = zap_count(
 441                                     ds->ds_dir->dd_pool->dp_meta_objset,
 442                                     ds->ds_phys->ds_userrefs_obj,
 443                                     &ds->ds_userrefs);
 444                         }
 445                 }
 446 
 447                 if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
 448                         /*
 449                          * In sync context, we're called with either no lock
 450                          * or with the write lock.  If we're not syncing,
 451                          * we're always called with the read lock held.
 452                          */
 453                         boolean_t need_lock =
 454                             !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
 455                             dsl_pool_sync_context(dp);
 456 
 457                         if (need_lock)
 458                                 rw_enter(&dp->dp_config_rwlock, RW_READER);
 459 
 460                         err = dsl_prop_get_ds(ds,
 461                             "refreservation", sizeof (uint64_t), 1,
 462                             &ds->ds_reserved, NULL);
 463                         if (err == 0) {
 464                                 err = dsl_prop_get_ds(ds,
 465                                     "refquota", sizeof (uint64_t), 1,
 466                                     &ds->ds_quota, NULL);
 467                         }
 468 
 469                         if (need_lock)
 470                                 rw_exit(&dp->dp_config_rwlock);
 471                 } else {
 472                         ds->ds_reserved = ds->ds_quota = 0;
 473                 }
 474 
 475                 if (err == 0) {
 476                         winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
 477                             dsl_dataset_evict);
 478                 }
 479                 if (err || winner) {
 480                         bplist_destroy(&ds->ds_pending_deadlist);
 481                         dsl_deadlist_close(&ds->ds_deadlist);
 482                         if (ds->ds_prev)
 483                                 dsl_dataset_drop_ref(ds->ds_prev, ds);
 484                         dsl_dir_close(ds->ds_dir, ds);
 485                         mutex_destroy(&ds->ds_lock);
 486                         mutex_destroy(&ds->ds_recvlock);
 487                         mutex_destroy(&ds->ds_opening_lock);
 488                         rw_destroy(&ds->ds_rwlock);
 489                         cv_destroy(&ds->ds_exclusive_cv);
 490                         kmem_free(ds, sizeof (dsl_dataset_t));
 491                         if (err) {
 492                                 dmu_buf_rele(dbuf, tag);
 493                                 return (err);
 494                         }
 495                         ds = winner;
 496                 } else {
 497                         ds->ds_fsid_guid =
 498                             unique_insert(ds->ds_phys->ds_fsid_guid);
 499                 }
 500         }
 501         ASSERT3P(ds->ds_dbuf, ==, dbuf);
 502         ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
 503         ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
 504             spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
 505             dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
 506         mutex_enter(&ds->ds_lock);
 507         if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
 508                 mutex_exit(&ds->ds_lock);
 509                 dmu_buf_rele(ds->ds_dbuf, tag);
 510                 return (ENOENT);
 511         }
 512         mutex_exit(&ds->ds_lock);
 513         *dsp = ds;
 514         return (0);
 515 }
 516 
 517 static int
 518 dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
 519 {
 520         dsl_pool_t *dp = ds->ds_dir->dd_pool;
 521 
 522         /*
 523          * In syncing context we don't want the rwlock lock: there
 524          * may be an existing writer waiting for sync phase to
 525          * finish.  We don't need to worry about such writers, since
 526          * sync phase is single-threaded, so the writer can't be
 527          * doing anything while we are active.
 528          */
 529         if (dsl_pool_sync_context(dp)) {
 530                 ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
 531                 return (0);
 532         }
 533 
 534         /*
 535          * Normal users will hold the ds_rwlock as a READER until they
 536          * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
 537          * drop their READER lock after they set the ds_owner field.
 538          *
 539          * If the dataset is being destroyed, the destroy thread will
 540          * obtain a WRITER lock for exclusive access after it's done its
 541          * open-context work and then change the ds_owner to
 542          * dsl_reaper once destruction is assured.  So threads
 543          * may block here temporarily, until the "destructability" of
 544          * the dataset is determined.
 545          */
 546         ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
 547         mutex_enter(&ds->ds_lock);
 548         while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
 549                 rw_exit(&dp->dp_config_rwlock);
 550                 cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
 551                 if (DSL_DATASET_IS_DESTROYED(ds)) {
 552                         mutex_exit(&ds->ds_lock);
 553                         dsl_dataset_drop_ref(ds, tag);
 554                         rw_enter(&dp->dp_config_rwlock, RW_READER);
 555                         return (ENOENT);
 556                 }
 557                 /*
 558                  * The dp_config_rwlock lives above the ds_lock. And
 559                  * we need to check DSL_DATASET_IS_DESTROYED() while
 560                  * holding the ds_lock, so we have to drop and reacquire
 561                  * the ds_lock here.
 562                  */
 563                 mutex_exit(&ds->ds_lock);
 564                 rw_enter(&dp->dp_config_rwlock, RW_READER);
 565                 mutex_enter(&ds->ds_lock);
 566         }
 567         mutex_exit(&ds->ds_lock);
 568         return (0);
 569 }
 570 
 571 int
 572 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
 573     dsl_dataset_t **dsp)
 574 {
 575         int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
 576 
 577         if (err)
 578                 return (err);
 579         return (dsl_dataset_hold_ref(*dsp, tag));
 580 }
 581 
 582 int
 583 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
 584     void *tag, dsl_dataset_t **dsp)
 585 {
 586         int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
 587         if (err)
 588                 return (err);
 589         if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
 590                 dsl_dataset_rele(*dsp, tag);
 591                 *dsp = NULL;
 592                 return (EBUSY);
 593         }
 594         return (0);
 595 }
 596 
 597 int
 598 dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
 599 {
 600         dsl_dir_t *dd;
 601         dsl_pool_t *dp;
 602         const char *snapname;
 603         uint64_t obj;
 604         int err = 0;
 605 
 606         err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
 607         if (err)
 608                 return (err);
 609 
 610         dp = dd->dd_pool;
 611         obj = dd->dd_phys->dd_head_dataset_obj;
 612         rw_enter(&dp->dp_config_rwlock, RW_READER);
 613         if (obj)
 614                 err = dsl_dataset_get_ref(dp, obj, tag, dsp);
 615         else
 616                 err = ENOENT;
 617         if (err)
 618                 goto out;
 619 
 620         err = dsl_dataset_hold_ref(*dsp, tag);
 621 
 622         /* we may be looking for a snapshot */
 623         if (err == 0 && snapname != NULL) {
 624                 dsl_dataset_t *ds = NULL;
 625 
 626                 if (*snapname++ != '@') {
 627                         dsl_dataset_rele(*dsp, tag);
 628                         err = ENOENT;
 629                         goto out;
 630                 }
 631 
 632                 dprintf("looking for snapshot '%s'\n", snapname);
 633                 err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
 634                 if (err == 0)
 635                         err = dsl_dataset_get_ref(dp, obj, tag, &ds);
 636                 dsl_dataset_rele(*dsp, tag);
 637 
 638                 ASSERT3U((err == 0), ==, (ds != NULL));
 639 
 640                 if (ds) {
 641                         mutex_enter(&ds->ds_lock);
 642                         if (ds->ds_snapname[0] == 0)
 643                                 (void) strlcpy(ds->ds_snapname, snapname,
 644                                     sizeof (ds->ds_snapname));
 645                         mutex_exit(&ds->ds_lock);
 646                         err = dsl_dataset_hold_ref(ds, tag);
 647                         *dsp = err ? NULL : ds;
 648                 }
 649         }
 650 out:
 651         rw_exit(&dp->dp_config_rwlock);
 652         dsl_dir_close(dd, FTAG);
 653         return (err);
 654 }
 655 
 656 int
 657 dsl_dataset_own(const char *name, boolean_t inconsistentok,
 658     void *tag, dsl_dataset_t **dsp)
 659 {
 660         int err = dsl_dataset_hold(name, tag, dsp);
 661         if (err)
 662                 return (err);
 663         if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
 664                 dsl_dataset_rele(*dsp, tag);
 665                 return (EBUSY);
 666         }
 667         return (0);
 668 }
 669 
 670 void
 671 dsl_dataset_name(dsl_dataset_t *ds, char *name)
 672 {
 673         if (ds == NULL) {
 674                 (void) strcpy(name, "mos");
 675         } else {
 676                 dsl_dir_name(ds->ds_dir, name);
 677                 VERIFY(0 == dsl_dataset_get_snapname(ds));
 678                 if (ds->ds_snapname[0]) {
 679                         (void) strcat(name, "@");
 680                         /*
 681                          * We use a "recursive" mutex so that we
 682                          * can call dprintf_ds() with ds_lock held.
 683                          */
 684                         if (!MUTEX_HELD(&ds->ds_lock)) {
 685                                 mutex_enter(&ds->ds_lock);
 686                                 (void) strcat(name, ds->ds_snapname);
 687                                 mutex_exit(&ds->ds_lock);
 688                         } else {
 689                                 (void) strcat(name, ds->ds_snapname);
 690                         }
 691                 }
 692         }
 693 }
 694 
 695 static int
 696 dsl_dataset_namelen(dsl_dataset_t *ds)
 697 {
 698         int result;
 699 
 700         if (ds == NULL) {
 701                 result = 3;     /* "mos" */
 702         } else {
 703                 result = dsl_dir_namelen(ds->ds_dir);
 704                 VERIFY(0 == dsl_dataset_get_snapname(ds));
 705                 if (ds->ds_snapname[0]) {
 706                         ++result;       /* adding one for the @-sign */
 707                         if (!MUTEX_HELD(&ds->ds_lock)) {
 708                                 mutex_enter(&ds->ds_lock);
 709                                 result += strlen(ds->ds_snapname);
 710                                 mutex_exit(&ds->ds_lock);
 711                         } else {
 712                                 result += strlen(ds->ds_snapname);
 713                         }
 714                 }
 715         }
 716 
 717         return (result);
 718 }
 719 
 720 void
 721 dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
 722 {
 723         dmu_buf_rele(ds->ds_dbuf, tag);
 724 }
 725 
 726 void
 727 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
 728 {
 729         if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
 730                 rw_exit(&ds->ds_rwlock);
 731         }
 732         dsl_dataset_drop_ref(ds, tag);
 733 }
 734 
 735 void
 736 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
 737 {
 738         ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
 739             (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
 740 
 741         mutex_enter(&ds->ds_lock);
 742         ds->ds_owner = NULL;
 743         if (RW_WRITE_HELD(&ds->ds_rwlock)) {
 744                 rw_exit(&ds->ds_rwlock);
 745                 cv_broadcast(&ds->ds_exclusive_cv);
 746         }
 747         mutex_exit(&ds->ds_lock);
 748         if (ds->ds_dbuf)
 749                 dsl_dataset_drop_ref(ds, tag);
 750         else
 751                 dsl_dataset_evict(NULL, ds);
 752 }
 753 
 754 boolean_t
 755 dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
 756 {
 757         boolean_t gotit = FALSE;
 758 
 759         mutex_enter(&ds->ds_lock);
 760         if (ds->ds_owner == NULL &&
 761             (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
 762                 ds->ds_owner = tag;
 763                 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
 764                         rw_exit(&ds->ds_rwlock);
 765                 gotit = TRUE;
 766         }
 767         mutex_exit(&ds->ds_lock);
 768         return (gotit);
 769 }
 770 
 771 void
 772 dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
 773 {
 774         ASSERT3P(owner, ==, ds->ds_owner);
 775         if (!RW_WRITE_HELD(&ds->ds_rwlock))
 776                 rw_enter(&ds->ds_rwlock, RW_WRITER);
 777 }
 778 
 779 uint64_t
 780 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
 781     uint64_t flags, dmu_tx_t *tx)
 782 {
 783         dsl_pool_t *dp = dd->dd_pool;
 784         dmu_buf_t *dbuf;
 785         dsl_dataset_phys_t *dsphys;
 786         uint64_t dsobj;
 787         objset_t *mos = dp->dp_meta_objset;
 788 
 789         if (origin == NULL)
 790                 origin = dp->dp_origin_snap;
 791 
 792         ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
 793         ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
 794         ASSERT(dmu_tx_is_syncing(tx));
 795         ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
 796 
 797         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 798             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 799         VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 800         dmu_buf_will_dirty(dbuf, tx);
 801         dsphys = dbuf->db_data;
 802         bzero(dsphys, sizeof (dsl_dataset_phys_t));
 803         dsphys->ds_dir_obj = dd->dd_object;
 804         dsphys->ds_flags = flags;
 805         dsphys->ds_fsid_guid = unique_create();
 806         (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 807             sizeof (dsphys->ds_guid));
 808         dsphys->ds_snapnames_zapobj =
 809             zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
 810             DMU_OT_NONE, 0, tx);
 811         dsphys->ds_creation_time = gethrestime_sec();
 812         dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
 813 
 814         if (origin == NULL) {
 815                 dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
 816         } else {
 817                 dsl_dataset_t *ohds;
 818 
 819                 dsphys->ds_prev_snap_obj = origin->ds_object;
 820                 dsphys->ds_prev_snap_txg =
 821                     origin->ds_phys->ds_creation_txg;
 822                 dsphys->ds_referenced_bytes =
 823                     origin->ds_phys->ds_referenced_bytes;
 824                 dsphys->ds_compressed_bytes =
 825                     origin->ds_phys->ds_compressed_bytes;
 826                 dsphys->ds_uncompressed_bytes =
 827                     origin->ds_phys->ds_uncompressed_bytes;
 828                 dsphys->ds_bp = origin->ds_phys->ds_bp;
 829                 dsphys->ds_flags |= origin->ds_phys->ds_flags;
 830 
 831                 dmu_buf_will_dirty(origin->ds_dbuf, tx);
 832                 origin->ds_phys->ds_num_children++;
 833 
 834                 VERIFY0(dsl_dataset_hold_obj(dp,
 835                     origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
 836                 dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
 837                     dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
 838                 dsl_dataset_rele(ohds, FTAG);
 839 
 840                 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
 841                         if (origin->ds_phys->ds_next_clones_obj == 0) {
 842                                 origin->ds_phys->ds_next_clones_obj =
 843                                     zap_create(mos,
 844                                     DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 845                         }
 846                         VERIFY(0 == zap_add_int(mos,
 847                             origin->ds_phys->ds_next_clones_obj,
 848                             dsobj, tx));
 849                 }
 850 
 851                 dmu_buf_will_dirty(dd->dd_dbuf, tx);
 852                 dd->dd_phys->dd_origin_obj = origin->ds_object;
 853                 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
 854                         if (origin->ds_dir->dd_phys->dd_clones == 0) {
 855                                 dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 856                                 origin->ds_dir->dd_phys->dd_clones =
 857                                     zap_create(mos,
 858                                     DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
 859                         }
 860                         VERIFY0(zap_add_int(mos,
 861                             origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
 862                 }
 863         }
 864 
 865         if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
 866                 dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 867 
 868         dmu_buf_rele(dbuf, FTAG);
 869 
 870         dmu_buf_will_dirty(dd->dd_dbuf, tx);
 871         dd->dd_phys->dd_head_dataset_obj = dsobj;
 872 
 873         return (dsobj);
 874 }
 875 
 876 uint64_t
 877 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
 878     dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
 879 {
 880         dsl_pool_t *dp = pdd->dd_pool;
 881         uint64_t dsobj, ddobj;
 882         dsl_dir_t *dd;
 883 
 884         ASSERT(lastname[0] != '@');
 885 
 886         ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
 887         VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
 888 
 889         dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);
 890 
 891         dsl_deleg_set_create_perms(dd, tx, cr);
 892 
 893         dsl_dir_close(dd, FTAG);
 894 
 895         /*
 896          * If we are creating a clone, make sure we zero out any stale
 897          * data from the origin snapshots zil header.
 898          */
 899         if (origin != NULL) {
 900                 dsl_dataset_t *ds;
 901                 objset_t *os;
 902 
 903                 VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 904                 VERIFY0(dmu_objset_from_ds(ds, &os));
 905                 bzero(&os->os_zil_header, sizeof (os->os_zil_header));
 906                 dsl_dataset_dirty(ds, tx);
 907                 dsl_dataset_rele(ds, FTAG);
 908         }
 909 
 910         return (dsobj);
 911 }
 912 
 913 /*
 914  * The snapshots must all be in the same pool.
 915  */
 916 int
 917 dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer,
 918     nvlist_t *errlist)
 919 {
 920         int err;
 921         dsl_sync_task_t *dst;
 922         spa_t *spa;
 923         nvpair_t *pair;
 924         dsl_sync_task_group_t *dstg;
 925 
 926         pair = nvlist_next_nvpair(snaps, NULL);
 927         if (pair == NULL)
 928                 return (0);
 929 
 930         err = spa_open(nvpair_name(pair), &spa, FTAG);
 931         if (err)
 932                 return (err);
 933         dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 934 
 935         for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 936             pair = nvlist_next_nvpair(snaps, pair)) {
 937                 dsl_dataset_t *ds;
 938 
 939                 err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
 940                 if (err == 0) {
 941                         struct dsl_ds_destroyarg *dsda;
 942 
 943                         dsl_dataset_make_exclusive(ds, dstg);
 944                         dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
 945                             KM_SLEEP);
 946                         dsda->ds = ds;
 947                         dsda->defer = defer;
 948                         dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
 949                             dsl_dataset_destroy_sync, dsda, dstg, 0);
 950                 } else if (err == ENOENT) {
 951                         err = 0;
 952                 } else {
 953                         fnvlist_add_int32(errlist, nvpair_name(pair), err);
 954                         break;
 955                 }
 956         }
 957 
 958         if (err == 0)
 959                 err = dsl_sync_task_group_wait(dstg);
 960 
 961         for (dst = list_head(&dstg->dstg_tasks); dst;
 962             dst = list_next(&dstg->dstg_tasks, dst)) {
 963                 struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
 964                 dsl_dataset_t *ds = dsda->ds;
 965 
 966                 /*
 967                  * Return the snapshots that triggered the error.
 968                  */
 969                 if (dst->dst_err != 0) {
 970                         char name[ZFS_MAXNAMELEN];
 971                         dsl_dataset_name(ds, name);
 972                         fnvlist_add_int32(errlist, name, dst->dst_err);
 973                 }
 974                 ASSERT3P(dsda->rm_origin, ==, NULL);
 975                 dsl_dataset_disown(ds, dstg);
 976                 kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
 977         }
 978 
 979         dsl_sync_task_group_destroy(dstg);
 980         spa_close(spa, FTAG);
 981         return (err);
 982 
 983 }
 984 
 985 static boolean_t
 986 dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
 987 {
 988         boolean_t might_destroy = B_FALSE;
 989 
 990         mutex_enter(&ds->ds_lock);
 991         if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
 992             DS_IS_DEFER_DESTROY(ds))
 993                 might_destroy = B_TRUE;
 994         mutex_exit(&ds->ds_lock);
 995 
 996         return (might_destroy);
 997 }
 998 
 999 /*
1000  * If we're removing a clone, and these three conditions are true:
1001  *      1) the clone's origin has no other children
1002  *      2) the clone's origin has no user references
1003  *      3) the clone's origin has been marked for deferred destruction
1004  * Then, prepare to remove the origin as part of this sync task group.
1005  */
1006 static int
1007 dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
1008 {
1009         dsl_dataset_t *ds = dsda->ds;
1010         dsl_dataset_t *origin = ds->ds_prev;
1011 
1012         if (dsl_dataset_might_destroy_origin(origin)) {
1013                 char *name;
1014                 int namelen;
1015                 int error;
1016 
1017                 namelen = dsl_dataset_namelen(origin) + 1;
1018                 name = kmem_alloc(namelen, KM_SLEEP);
1019                 dsl_dataset_name(origin, name);
1020 #ifdef _KERNEL
1021                 error = zfs_unmount_snap(name, NULL);
1022                 if (error) {
1023                         kmem_free(name, namelen);
1024                         return (error);
1025                 }
1026 #endif
1027                 error = dsl_dataset_own(name, B_TRUE, tag, &origin);
1028                 kmem_free(name, namelen);
1029                 if (error)
1030                         return (error);
1031                 dsda->rm_origin = origin;
1032                 dsl_dataset_make_exclusive(origin, tag);
1033         }
1034 
1035         return (0);
1036 }
1037 
1038 /*
1039  * ds must be opened as OWNER.  On return (whether successful or not),
1040  * ds will be closed and caller can no longer dereference it.
1041  */
1042 int
1043 dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
1044 {
1045         int err;
1046         dsl_sync_task_group_t *dstg;
1047         objset_t *os;
1048         dsl_dir_t *dd;
1049         uint64_t obj;
1050         struct dsl_ds_destroyarg dsda = { 0 };
1051 
1052         dsda.ds = ds;
1053 
1054         if (dsl_dataset_is_snapshot(ds)) {
1055                 /* Destroying a snapshot is simpler */
1056                 dsl_dataset_make_exclusive(ds, tag);
1057 
1058                 dsda.defer = defer;
1059                 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
1060                     dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
1061                     &dsda, tag, 0);
1062                 ASSERT3P(dsda.rm_origin, ==, NULL);
1063                 goto out;
1064         } else if (defer) {
1065                 err = EINVAL;
1066                 goto out;
1067         }
1068 
1069         dd = ds->ds_dir;
1070 
1071         /*
1072          * Check for errors and mark this ds as inconsistent, in
1073          * case we crash while freeing the objects.
1074          */
1075         err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
1076             dsl_dataset_destroy_begin_sync, ds, NULL, 0);
1077         if (err)
1078                 goto out;
1079 
1080         err = dmu_objset_from_ds(ds, &os);
1081         if (err)
1082                 goto out;
1083 
1084         /*
1085          * If async destruction is not enabled try to remove all objects
1086          * while in the open context so that there is less work to do in
1087          * the syncing context.
1088          */
1089         if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
1090             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
1091                 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
1092                     ds->ds_phys->ds_prev_snap_txg)) {
1093                         /*
1094                          * Ignore errors, if there is not enough disk space
1095                          * we will deal with it in dsl_dataset_destroy_sync().
1096                          */
1097                         (void) dmu_free_object(os, obj);
1098                 }
1099                 if (err != ESRCH)
1100                         goto out;
1101         }
1102 
1103         /*
1104          * Only the ZIL knows how to free log blocks.
1105          */
1106         zil_destroy(dmu_objset_zil(os), B_FALSE);
1107 
1108         /*
1109          * Sync out all in-flight IO.
1110          */
1111         txg_wait_synced(dd->dd_pool, 0);
1112 
1113         /*
1114          * If we managed to free all the objects in open
1115          * context, the user space accounting should be zero.
1116          */
1117         if (ds->ds_phys->ds_bp.blk_fill == 0 &&
1118             dmu_objset_userused_enabled(os)) {
1119                 uint64_t count;
1120 
1121                 ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
1122                     count == 0);
1123                 ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 ||
1124                     count == 0);
1125         }
1126 
1127         rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
1128         err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
1129         rw_exit(&dd->dd_pool->dp_config_rwlock);
1130 
1131         if (err)
1132                 goto out;
1133 
1134         /*
1135          * Blow away the dsl_dir + head dataset.
1136          */
1137         dsl_dataset_make_exclusive(ds, tag);
1138         /*
1139          * If we're removing a clone, we might also need to remove its
1140          * origin.
1141          */
1142         do {
1143                 dsda.need_prep = B_FALSE;
1144                 if (dsl_dir_is_clone(dd)) {
1145                         err = dsl_dataset_origin_rm_prep(&dsda, tag);
1146                         if (err) {
1147                                 dsl_dir_close(dd, FTAG);
1148                                 goto out;
1149                         }
1150                 }
1151 
1152                 dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
1153                 dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
1154                     dsl_dataset_destroy_sync, &dsda, tag, 0);
1155                 dsl_sync_task_create(dstg, dsl_dir_destroy_check,
1156                     dsl_dir_destroy_sync, dd, FTAG, 0);
1157                 err = dsl_sync_task_group_wait(dstg);
1158                 dsl_sync_task_group_destroy(dstg);
1159 
1160                 /*
1161                  * We could be racing against 'zfs release' or 'zfs destroy -d'
1162                  * on the origin snap, in which case we can get EBUSY if we
1163                  * needed to destroy the origin snap but were not ready to
1164                  * do so.
1165                  */
1166                 if (dsda.need_prep) {
1167                         ASSERT(err == EBUSY);
1168                         ASSERT(dsl_dir_is_clone(dd));
1169                         ASSERT(dsda.rm_origin == NULL);
1170                 }
1171         } while (dsda.need_prep);
1172 
1173         if (dsda.rm_origin != NULL)
1174                 dsl_dataset_disown(dsda.rm_origin, tag);
1175 
1176         /* if it is successful, dsl_dir_destroy_sync will close the dd */
1177         if (err)
1178                 dsl_dir_close(dd, FTAG);
1179 out:
1180         dsl_dataset_disown(ds, tag);
1181         return (err);
1182 }
1183 
1184 blkptr_t *
1185 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
1186 {
1187         return (&ds->ds_phys->ds_bp);
1188 }
1189 
1190 void
1191 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
1192 {
1193         ASSERT(dmu_tx_is_syncing(tx));
1194         /* If it's the meta-objset, set dp_meta_rootbp */
1195         if (ds == NULL) {
1196                 tx->tx_pool->dp_meta_rootbp = *bp;
1197         } else {
1198                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1199                 ds->ds_phys->ds_bp = *bp;
1200         }
1201 }
1202 
1203 spa_t *
1204 dsl_dataset_get_spa(dsl_dataset_t *ds)
1205 {
1206         return (ds->ds_dir->dd_pool->dp_spa);
1207 }
1208 
1209 void
1210 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
1211 {
1212         dsl_pool_t *dp;
1213 
1214         if (ds == NULL) /* this is the meta-objset */
1215                 return;
1216 
1217         ASSERT(ds->ds_objset != NULL);
1218 
1219         if (ds->ds_phys->ds_next_snap_obj != 0)
1220                 panic("dirtying snapshot!");
1221 
1222         dp = ds->ds_dir->dd_pool;
1223 
1224         if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
1225                 /* up the hold count until we can be written out */
1226                 dmu_buf_add_ref(ds->ds_dbuf, ds);
1227         }
1228 }
1229 
1230 /*
1231  * The unique space in the head dataset can be calculated by subtracting
1232  * the space used in the most recent snapshot, that is still being used
1233  * in this file system, from the space currently in use.  To figure out
1234  * the space in the most recent snapshot still in use, we need to take
1235  * the total space used in the snapshot and subtract out the space that
1236  * has been freed up since the snapshot was taken.
1237  */
1238 static void
1239 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
1240 {
1241         uint64_t mrs_used;
1242         uint64_t dlused, dlcomp, dluncomp;
1243 
1244         ASSERT(!dsl_dataset_is_snapshot(ds));
1245 
1246         if (ds->ds_phys->ds_prev_snap_obj != 0)
1247                 mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
1248         else
1249                 mrs_used = 0;
1250 
1251         dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
1252 
1253         ASSERT3U(dlused, <=, mrs_used);
1254         ds->ds_phys->ds_unique_bytes =
1255             ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
1256 
1257         if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
1258             SPA_VERSION_UNIQUE_ACCURATE)
1259                 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1260 }
1261 
1262 struct killarg {
1263         dsl_dataset_t *ds;
1264         dmu_tx_t *tx;
1265 };
1266 
1267 /* ARGSUSED */
1268 static int
1269 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
1270     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1271 {
1272         struct killarg *ka = arg;
1273         dmu_tx_t *tx = ka->tx;
1274 
1275         if (bp == NULL)
1276                 return (0);
1277 
1278         if (zb->zb_level == ZB_ZIL_LEVEL) {
1279                 ASSERT(zilog != NULL);
1280                 /*
1281                  * It's a block in the intent log.  It has no
1282                  * accounting, so just free it.
1283                  */
1284                 dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
1285         } else {
1286                 ASSERT(zilog == NULL);
1287                 ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
1288                 (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
1289         }
1290 
1291         return (0);
1292 }
1293 
1294 /* ARGSUSED */
1295 static int
1296 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
1297 {
1298         dsl_dataset_t *ds = arg1;
1299         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1300         uint64_t count;
1301         int err;
1302 
1303         /*
1304          * Can't delete a head dataset if there are snapshots of it.
1305          * (Except if the only snapshots are from the branch we cloned
1306          * from.)
1307          */
1308         if (ds->ds_prev != NULL &&
1309             ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1310                 return (EBUSY);
1311 
1312         /*
1313          * This is really a dsl_dir thing, but check it here so that
1314          * we'll be less likely to leave this dataset inconsistent &
1315          * nearly destroyed.
1316          */
1317         err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
1318         if (err)
1319                 return (err);
1320         if (count != 0)
1321                 return (EEXIST);
1322 
1323         return (0);
1324 }
1325 
1326 /* ARGSUSED */
1327 static void
1328 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1329 {
1330         dsl_dataset_t *ds = arg1;
1331 
1332         /* Mark it as inconsistent on-disk, in case we crash */
1333         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1334         ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
1335 
1336         spa_history_log_internal_ds(ds, "destroy begin", tx, "");
1337 }
1338 
1339 static int
1340 dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
1341     dmu_tx_t *tx)
1342 {
1343         dsl_dataset_t *ds = dsda->ds;
1344         dsl_dataset_t *ds_prev = ds->ds_prev;
1345 
1346         if (dsl_dataset_might_destroy_origin(ds_prev)) {
1347                 struct dsl_ds_destroyarg ndsda = {0};
1348 
1349                 /*
1350                  * If we're not prepared to remove the origin, don't remove
1351                  * the clone either.
1352                  */
1353                 if (dsda->rm_origin == NULL) {
1354                         dsda->need_prep = B_TRUE;
1355                         return (EBUSY);
1356                 }
1357 
1358                 ndsda.ds = ds_prev;
1359                 ndsda.is_origin_rm = B_TRUE;
1360                 return (dsl_dataset_destroy_check(&ndsda, tag, tx));
1361         }
1362 
1363         /*
1364          * If we're not going to remove the origin after all,
1365          * undo the open context setup.
1366          */
1367         if (dsda->rm_origin != NULL) {
1368                 dsl_dataset_disown(dsda->rm_origin, tag);
1369                 dsda->rm_origin = NULL;
1370         }
1371 
1372         return (0);
1373 }
1374 
1375 /*
1376  * If you add new checks here, you may need to add
1377  * additional checks to the "temporary" case in
1378  * snapshot_check() in dmu_objset.c.
1379  */
1380 /* ARGSUSED */
1381 int
1382 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
1383 {
1384         struct dsl_ds_destroyarg *dsda = arg1;
1385         dsl_dataset_t *ds = dsda->ds;
1386 
1387         /* we have an owner hold, so noone else can destroy us */
1388         ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
1389 
1390         /*
1391          * Only allow deferred destroy on pools that support it.
1392          * NOTE: deferred destroy is only supported on snapshots.
1393          */
1394         if (dsda->defer) {
1395                 if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
1396                     SPA_VERSION_USERREFS)
1397                         return (ENOTSUP);
1398                 ASSERT(dsl_dataset_is_snapshot(ds));
1399                 return (0);
1400         }
1401 
1402         /*
1403          * Can't delete a head dataset if there are snapshots of it.
1404          * (Except if the only snapshots are from the branch we cloned
1405          * from.)
1406          */
1407         if (ds->ds_prev != NULL &&
1408             ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1409                 return (EBUSY);
1410 
1411         /*
1412          * If we made changes this txg, traverse_dsl_dataset won't find
1413          * them.  Try again.
1414          */
1415         if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
1416                 return (EAGAIN);
1417 
1418         if (dsl_dataset_is_snapshot(ds)) {
1419                 /*
1420                  * If this snapshot has an elevated user reference count,
1421                  * we can't destroy it yet.
1422                  */
1423                 if (ds->ds_userrefs > 0 && !dsda->releasing)
1424                         return (EBUSY);
1425 
1426                 mutex_enter(&ds->ds_lock);
1427                 /*
1428                  * Can't delete a branch point. However, if we're destroying
1429                  * a clone and removing its origin due to it having a user
1430                  * hold count of 0 and having been marked for deferred destroy,
1431                  * it's OK for the origin to have a single clone.
1432                  */
1433                 if (ds->ds_phys->ds_num_children >
1434                     (dsda->is_origin_rm ? 2 : 1)) {
1435                         mutex_exit(&ds->ds_lock);
1436                         return (EEXIST);
1437                 }
1438                 mutex_exit(&ds->ds_lock);
1439         } else if (dsl_dir_is_clone(ds->ds_dir)) {
1440                 return (dsl_dataset_origin_check(dsda, arg2, tx));
1441         }
1442 
1443         /* XXX we should do some i/o error checking... */
1444         return (0);
1445 }
1446 
1447 struct refsarg {
1448         kmutex_t lock;
1449         boolean_t gone;
1450         kcondvar_t cv;
1451 };
1452 
1453 /* ARGSUSED */
1454 static void
1455 dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
1456 {
1457         struct refsarg *arg = argv;
1458 
1459         mutex_enter(&arg->lock);
1460         arg->gone = TRUE;
1461         cv_signal(&arg->cv);
1462         mutex_exit(&arg->lock);
1463 }
1464 
1465 static void
1466 dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
1467 {
1468         struct refsarg arg;
1469 
1470         mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
1471         cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
1472         arg.gone = FALSE;
1473         (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
1474             dsl_dataset_refs_gone);
1475         dmu_buf_rele(ds->ds_dbuf, tag);
1476         mutex_enter(&arg.lock);
1477         while (!arg.gone)
1478                 cv_wait(&arg.cv, &arg.lock);
1479         ASSERT(arg.gone);
1480         mutex_exit(&arg.lock);
1481         ds->ds_dbuf = NULL;
1482         ds->ds_phys = NULL;
1483         mutex_destroy(&arg.lock);
1484         cv_destroy(&arg.cv);
1485 }
1486 
1487 static void
1488 remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
1489 {
1490         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1491         uint64_t count;
1492         int err;
1493 
1494         ASSERT(ds->ds_phys->ds_num_children >= 2);
1495         err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
1496         /*
1497          * The err should not be ENOENT, but a bug in a previous version
1498          * of the code could cause upgrade_clones_cb() to not set
1499          * ds_next_snap_obj when it should, leading to a missing entry.
1500          * If we knew that the pool was created after
1501          * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
1502          * ENOENT.  However, at least we can check that we don't have
1503          * too many entries in the next_clones_obj even after failing to
1504          * remove this one.
1505          */
1506         if (err != ENOENT) {
1507                 VERIFY0(err);
1508         }
1509         ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj, &count));
1510         ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
1511 }
1512 
1513 static void
1514 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
1515 {
1516         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1517         zap_cursor_t zc;
1518         zap_attribute_t za;
1519 
1520         /*
1521          * If it is the old version, dd_clones doesn't exist so we can't
1522          * find the clones, but deadlist_remove_key() is a no-op so it
1523          * doesn't matter.
1524          */
1525         if (ds->ds_dir->dd_phys->dd_clones == 0)
1526                 return;
1527 
1528         for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
1529             zap_cursor_retrieve(&zc, &za) == 0;
1530             zap_cursor_advance(&zc)) {
1531                 dsl_dataset_t *clone;
1532 
1533                 VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1534                     za.za_first_integer, FTAG, &clone));
1535                 if (clone->ds_dir->dd_origin_txg > mintxg) {
1536                         dsl_deadlist_remove_key(&clone->ds_deadlist,
1537                             mintxg, tx);
1538                         dsl_dataset_remove_clones_key(clone, mintxg, tx);
1539                 }
1540                 dsl_dataset_rele(clone, FTAG);
1541         }
1542         zap_cursor_fini(&zc);
1543 }
1544 
1545 struct process_old_arg {
1546         dsl_dataset_t *ds;
1547         dsl_dataset_t *ds_prev;
1548         boolean_t after_branch_point;
1549         zio_t *pio;
1550         uint64_t used, comp, uncomp;
1551 };
1552 
1553 static int
1554 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1555 {
1556         struct process_old_arg *poa = arg;
1557         dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
1558 
1559         if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
1560                 dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
1561                 if (poa->ds_prev && !poa->after_branch_point &&
1562                     bp->blk_birth >
1563                     poa->ds_prev->ds_phys->ds_prev_snap_txg) {
1564                         poa->ds_prev->ds_phys->ds_unique_bytes +=
1565                             bp_get_dsize_sync(dp->dp_spa, bp);
1566                 }
1567         } else {
1568                 poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
1569                 poa->comp += BP_GET_PSIZE(bp);
1570                 poa->uncomp += BP_GET_UCSIZE(bp);
1571                 dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
1572         }
1573         return (0);
1574 }
1575 
1576 static void
1577 process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
1578     dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
1579 {
1580         struct process_old_arg poa = { 0 };
1581         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1582         objset_t *mos = dp->dp_meta_objset;
1583 
1584         ASSERT(ds->ds_deadlist.dl_oldfmt);
1585         ASSERT(ds_next->ds_deadlist.dl_oldfmt);
1586 
1587         poa.ds = ds;
1588         poa.ds_prev = ds_prev;
1589         poa.after_branch_point = after_branch_point;
1590         poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
1591         VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
1592             process_old_cb, &poa, tx));
1593         VERIFY0(zio_wait(poa.pio));
1594         ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
1595 
1596         /* change snapused */
1597         dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1598             -poa.used, -poa.comp, -poa.uncomp, tx);
1599 
1600         /* swap next's deadlist to our deadlist */
1601         dsl_deadlist_close(&ds->ds_deadlist);
1602         dsl_deadlist_close(&ds_next->ds_deadlist);
1603         SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
1604             ds->ds_phys->ds_deadlist_obj);
1605         dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
1606         dsl_deadlist_open(&ds_next->ds_deadlist, mos,
1607             ds_next->ds_phys->ds_deadlist_obj);
1608 }
1609 
1610 static int
1611 old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
1612 {
1613         int err;
1614         struct killarg ka;
1615 
1616         /*
1617          * Free everything that we point to (that's born after
1618          * the previous snapshot, if we are a clone)
1619          *
1620          * NB: this should be very quick, because we already
1621          * freed all the objects in open context.
1622          */
1623         ka.ds = ds;
1624         ka.tx = tx;
1625         err = traverse_dataset(ds,
1626             ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
1627             kill_blkptr, &ka);
1628         ASSERT0(err);
1629         ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
1630 
1631         return (err);
1632 }
1633 
1634 void
1635 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
1636 {
1637         struct dsl_ds_destroyarg *dsda = arg1;
1638         dsl_dataset_t *ds = dsda->ds;
1639         int err;
1640         int after_branch_point = FALSE;
1641         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1642         objset_t *mos = dp->dp_meta_objset;
1643         dsl_dataset_t *ds_prev = NULL;
1644         boolean_t wont_destroy;
1645         uint64_t obj;
1646 
1647         wont_destroy = (dsda->defer &&
1648             (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
1649 
1650         ASSERT(ds->ds_owner || wont_destroy);
1651         ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
1652         ASSERT(ds->ds_prev == NULL ||
1653             ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
1654         ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
1655 
1656         if (wont_destroy) {
1657                 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
1658                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1659                 ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
1660                 spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
1661                 return;
1662         }
1663 
1664         /* We need to log before removing it from the namespace. */
1665         spa_history_log_internal_ds(ds, "destroy", tx, "");
1666 
1667         /* signal any waiters that this dataset is going away */
1668         mutex_enter(&ds->ds_lock);
1669         ds->ds_owner = dsl_reaper;
1670         cv_broadcast(&ds->ds_exclusive_cv);
1671         mutex_exit(&ds->ds_lock);
1672 
1673         /* Remove our reservation */
1674         if (ds->ds_reserved != 0) {
1675                 dsl_prop_setarg_t psa;
1676                 uint64_t value = 0;
1677 
1678                 dsl_prop_setarg_init_uint64(&psa, "refreservation",
1679                     (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
1680                     &value);
1681                 psa.psa_effective_value = 0;    /* predict default value */
1682 
1683                 dsl_dataset_set_reservation_sync(ds, &psa, tx);
1684                 ASSERT0(ds->ds_reserved);
1685         }
1686 
1687         ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
1688 
1689         dsl_scan_ds_destroyed(ds, tx);
1690 
1691         obj = ds->ds_object;
1692 
1693         if (ds->ds_phys->ds_prev_snap_obj != 0) {
1694                 if (ds->ds_prev) {
1695                         ds_prev = ds->ds_prev;
1696                 } else {
1697                         VERIFY(0 == dsl_dataset_hold_obj(dp,
1698                             ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
1699                 }
1700                 after_branch_point =
1701                     (ds_prev->ds_phys->ds_next_snap_obj != obj);
1702 
1703                 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
1704                 if (after_branch_point &&
1705                     ds_prev->ds_phys->ds_next_clones_obj != 0) {
1706                         remove_from_next_clones(ds_prev, obj, tx);
1707                         if (ds->ds_phys->ds_next_snap_obj != 0) {
1708                                 VERIFY(0 == zap_add_int(mos,
1709                                     ds_prev->ds_phys->ds_next_clones_obj,
1710                                     ds->ds_phys->ds_next_snap_obj, tx));
1711                         }
1712                 }
1713                 if (after_branch_point &&
1714                     ds->ds_phys->ds_next_snap_obj == 0) {
1715                         /* This clone is toast. */
1716                         ASSERT(ds_prev->ds_phys->ds_num_children > 1);
1717                         ds_prev->ds_phys->ds_num_children--;
1718 
1719                         /*
1720                          * If the clone's origin has no other clones, no
1721                          * user holds, and has been marked for deferred
1722                          * deletion, then we should have done the necessary
1723                          * destroy setup for it.
1724                          */
1725                         if (ds_prev->ds_phys->ds_num_children == 1 &&
1726                             ds_prev->ds_userrefs == 0 &&
1727                             DS_IS_DEFER_DESTROY(ds_prev)) {
1728                                 ASSERT3P(dsda->rm_origin, !=, NULL);
1729                         } else {
1730                                 ASSERT3P(dsda->rm_origin, ==, NULL);
1731                         }
1732                 } else if (!after_branch_point) {
1733                         ds_prev->ds_phys->ds_next_snap_obj =
1734                             ds->ds_phys->ds_next_snap_obj;
1735                 }
1736         }
1737 
1738         if (dsl_dataset_is_snapshot(ds)) {
1739                 dsl_dataset_t *ds_next;
1740                 uint64_t old_unique;
1741                 uint64_t used = 0, comp = 0, uncomp = 0;
1742 
1743                 VERIFY(0 == dsl_dataset_hold_obj(dp,
1744                     ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
1745                 ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
1746 
1747                 old_unique = ds_next->ds_phys->ds_unique_bytes;
1748 
1749                 dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
1750                 ds_next->ds_phys->ds_prev_snap_obj =
1751                     ds->ds_phys->ds_prev_snap_obj;
1752                 ds_next->ds_phys->ds_prev_snap_txg =
1753                     ds->ds_phys->ds_prev_snap_txg;
1754                 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1755                     ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
1756 
1757 
1758                 if (ds_next->ds_deadlist.dl_oldfmt) {
1759                         process_old_deadlist(ds, ds_prev, ds_next,
1760                             after_branch_point, tx);
1761                 } else {
1762                         /* Adjust prev's unique space. */
1763                         if (ds_prev && !after_branch_point) {
1764                                 dsl_deadlist_space_range(&ds_next->ds_deadlist,
1765                                     ds_prev->ds_phys->ds_prev_snap_txg,
1766                                     ds->ds_phys->ds_prev_snap_txg,
1767                                     &used, &comp, &uncomp);
1768                                 ds_prev->ds_phys->ds_unique_bytes += used;
1769                         }
1770 
1771                         /* Adjust snapused. */
1772                         dsl_deadlist_space_range(&ds_next->ds_deadlist,
1773                             ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
1774                             &used, &comp, &uncomp);
1775                         dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1776                             -used, -comp, -uncomp, tx);
1777 
1778                         /* Move blocks to be freed to pool's free list. */
1779                         dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
1780                             &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
1781                             tx);
1782                         dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
1783                             DD_USED_HEAD, used, comp, uncomp, tx);
1784 
1785                         /* Merge our deadlist into next's and free it. */
1786                         dsl_deadlist_merge(&ds_next->ds_deadlist,
1787                             ds->ds_phys->ds_deadlist_obj, tx);
1788                 }
1789                 dsl_deadlist_close(&ds->ds_deadlist);
1790                 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1791 
1792                 /* Collapse range in clone heads */
1793                 dsl_dataset_remove_clones_key(ds,
1794                     ds->ds_phys->ds_creation_txg, tx);
1795 
1796                 if (dsl_dataset_is_snapshot(ds_next)) {
1797                         dsl_dataset_t *ds_nextnext;
1798 
1799                         /*
1800                          * Update next's unique to include blocks which
1801                          * were previously shared by only this snapshot
1802                          * and it.  Those blocks will be born after the
1803                          * prev snap and before this snap, and will have
1804                          * died after the next snap and before the one
1805                          * after that (ie. be on the snap after next's
1806                          * deadlist).
1807                          */
1808                         VERIFY(0 == dsl_dataset_hold_obj(dp,
1809                             ds_next->ds_phys->ds_next_snap_obj,
1810                             FTAG, &ds_nextnext));
1811                         dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
1812                             ds->ds_phys->ds_prev_snap_txg,
1813                             ds->ds_phys->ds_creation_txg,
1814                             &used, &comp, &uncomp);
1815                         ds_next->ds_phys->ds_unique_bytes += used;
1816                         dsl_dataset_rele(ds_nextnext, FTAG);
1817                         ASSERT3P(ds_next->ds_prev, ==, NULL);
1818 
1819                         /* Collapse range in this head. */
1820                         dsl_dataset_t *hds;
1821                         VERIFY0(dsl_dataset_hold_obj(dp,
1822                             ds->ds_dir->dd_phys->dd_head_dataset_obj,
1823                             FTAG, &hds));
1824                         dsl_deadlist_remove_key(&hds->ds_deadlist,
1825                             ds->ds_phys->ds_creation_txg, tx);
1826                         dsl_dataset_rele(hds, FTAG);
1827 
1828                 } else {
1829                         ASSERT3P(ds_next->ds_prev, ==, ds);
1830                         dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
1831                         ds_next->ds_prev = NULL;
1832                         if (ds_prev) {
1833                                 VERIFY(0 == dsl_dataset_get_ref(dp,
1834                                     ds->ds_phys->ds_prev_snap_obj,
1835                                     ds_next, &ds_next->ds_prev));
1836                         }
1837 
1838                         dsl_dataset_recalc_head_uniq(ds_next);
1839 
1840                         /*
1841                          * Reduce the amount of our unconsmed refreservation
1842                          * being charged to our parent by the amount of
1843                          * new unique data we have gained.
1844                          */
1845                         if (old_unique < ds_next->ds_reserved) {
1846                                 int64_t mrsdelta;
1847                                 uint64_t new_unique =
1848                                     ds_next->ds_phys->ds_unique_bytes;
1849 
1850                                 ASSERT(old_unique <= new_unique);
1851                                 mrsdelta = MIN(new_unique - old_unique,
1852                                     ds_next->ds_reserved - old_unique);
1853                                 dsl_dir_diduse_space(ds->ds_dir,
1854                                     DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
1855                         }
1856                 }
1857                 dsl_dataset_rele(ds_next, FTAG);
1858         } else {
1859                 zfeature_info_t *async_destroy =
1860                     &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
1861 
1862                 /*
1863                  * There's no next snapshot, so this is a head dataset.
1864                  * Destroy the deadlist.  Unless it's a clone, the
1865                  * deadlist should be empty.  (If it's a clone, it's
1866                  * safe to ignore the deadlist contents.)
1867                  */
1868                 dsl_deadlist_close(&ds->ds_deadlist);
1869                 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1870                 ds->ds_phys->ds_deadlist_obj = 0;
1871 
1872                 if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
1873                         err = old_synchronous_dataset_destroy(ds, tx);
1874                 } else {
1875                         /*
1876                          * Move the bptree into the pool's list of trees to
1877                          * clean up and update space accounting information.
1878                          */
1879                         uint64_t used, comp, uncomp;
1880 
1881                         ASSERT(err == 0 || err == EBUSY);
1882                         if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
1883                                 spa_feature_incr(dp->dp_spa, async_destroy, tx);
1884                                 dp->dp_bptree_obj = bptree_alloc(
1885                                     dp->dp_meta_objset, tx);
1886                                 VERIFY(zap_add(dp->dp_meta_objset,
1887                                     DMU_POOL_DIRECTORY_OBJECT,
1888                                     DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
1889                                     &dp->dp_bptree_obj, tx) == 0);
1890                         }
1891 
1892                         used = ds->ds_dir->dd_phys->dd_used_bytes;
1893                         comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
1894                         uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
1895 
1896                         ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
1897                             ds->ds_phys->ds_unique_bytes == used);
1898 
1899                         bptree_add(dp->dp_meta_objset, dp->dp_bptree_obj,
1900                             &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
1901                             used, comp, uncomp, tx);
1902                         dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
1903                             -used, -comp, -uncomp, tx);
1904                         dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
1905                             used, comp, uncomp, tx);
1906                 }
1907 
1908                 if (ds->ds_prev != NULL) {
1909                         if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
1910                                 VERIFY0(zap_remove_int(mos,
1911                                     ds->ds_prev->ds_dir->dd_phys->dd_clones,
1912                                     ds->ds_object, tx));
1913                         }
1914                         dsl_dataset_rele(ds->ds_prev, ds);
1915                         ds->ds_prev = ds_prev = NULL;
1916                 }
1917         }
1918 
1919         /*
1920          * This must be done after the dsl_traverse(), because it will
1921          * re-open the objset.
1922          */
1923         if (ds->ds_objset) {
1924                 dmu_objset_evict(ds->ds_objset);
1925                 ds->ds_objset = NULL;
1926         }
1927 
1928         if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
1929                 /* Erase the link in the dir */
1930                 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1931                 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
1932                 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
1933                 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
1934                 ASSERT(err == 0);
1935         } else {
1936                 /* remove from snapshot namespace */
1937                 dsl_dataset_t *ds_head;
1938                 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
1939                 VERIFY(0 == dsl_dataset_hold_obj(dp,
1940                     ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
1941                 VERIFY(0 == dsl_dataset_get_snapname(ds));
1942 #ifdef ZFS_DEBUG
1943                 {
1944                         uint64_t val;
1945 
1946                         err = dsl_dataset_snap_lookup(ds_head,
1947                             ds->ds_snapname, &val);
1948                         ASSERT0(err);
1949                         ASSERT3U(val, ==, obj);
1950                 }
1951 #endif
1952                 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
1953                 ASSERT(err == 0);
1954                 dsl_dataset_rele(ds_head, FTAG);
1955         }
1956 
1957         if (ds_prev && ds->ds_prev != ds_prev)
1958                 dsl_dataset_rele(ds_prev, FTAG);
1959 
1960         spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
1961 
1962         if (ds->ds_phys->ds_next_clones_obj != 0) {
1963                 uint64_t count;
1964                 ASSERT(0 == zap_count(mos,
1965                     ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
1966                 VERIFY(0 == dmu_object_free(mos,
1967                     ds->ds_phys->ds_next_clones_obj, tx));
1968         }
1969         if (ds->ds_phys->ds_props_obj != 0)
1970                 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
1971         if (ds->ds_phys->ds_userrefs_obj != 0)
1972                 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
1973         dsl_dir_close(ds->ds_dir, ds);
1974         ds->ds_dir = NULL;
1975         dsl_dataset_drain_refs(ds, tag);
1976         VERIFY(0 == dmu_object_free(mos, obj, tx));
1977 
1978         if (dsda->rm_origin) {
1979                 /*
1980                  * Remove the origin of the clone we just destroyed.
1981                  */
1982                 struct dsl_ds_destroyarg ndsda = {0};
1983 
1984                 ndsda.ds = dsda->rm_origin;
1985                 dsl_dataset_destroy_sync(&ndsda, tag, tx);
1986         }
1987 }
1988 
1989 static int
1990 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1991 {
1992         uint64_t asize;
1993 
1994         if (!dmu_tx_is_syncing(tx))
1995                 return (0);
1996 
1997         /*
1998          * If there's an fs-only reservation, any blocks that might become
1999          * owned by the snapshot dataset must be accommodated by space
2000          * outside of the reservation.
2001          */
2002         ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
2003         asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2004         if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
2005                 return (ENOSPC);
2006 
2007         /*
2008          * Propagate any reserved space for this snapshot to other
2009          * snapshot checks in this sync group.
2010          */
2011         if (asize > 0)
2012                 dsl_dir_willuse_space(ds->ds_dir, asize, tx);
2013 
2014         return (0);
2015 }
2016 
2017 int
2018 dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname,
2019     dmu_tx_t *tx)
2020 {
2021         int err;
2022         uint64_t value;
2023 
2024         /*
2025          * We don't allow multiple snapshots of the same txg.  If there
2026          * is already one, try again.
2027          */
2028         if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
2029                 return (EAGAIN);
2030 
2031         /*
2032          * Check for conflicting snapshot name.
2033          */
2034         err = dsl_dataset_snap_lookup(ds, snapname, &value);
2035         if (err == 0)
2036                 return (EEXIST);
2037         if (err != ENOENT)
2038                 return (err);
2039 
2040         /*
2041          * Check that the dataset's name is not too long.  Name consists
2042          * of the dataset's length + 1 for the @-sign + snapshot name's length
2043          */
2044         if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
2045                 return (ENAMETOOLONG);
2046 
2047         err = dsl_dataset_snapshot_reserve_space(ds, tx);
2048         if (err)
2049                 return (err);
2050 
2051         ds->ds_trysnap_txg = tx->tx_txg;
2052         return (0);
2053 }
2054 
2055 void
2056 dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
2057     dmu_tx_t *tx)
2058 {
2059         dsl_pool_t *dp = ds->ds_dir->dd_pool;
2060         dmu_buf_t *dbuf;
2061         dsl_dataset_phys_t *dsphys;
2062         uint64_t dsobj, crtxg;
2063         objset_t *mos = dp->dp_meta_objset;
2064         int err;
2065 
2066         ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
2067 
2068         /*
2069          * The origin's ds_creation_txg has to be < TXG_INITIAL
2070          */
2071         if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
2072                 crtxg = 1;
2073         else
2074                 crtxg = tx->tx_txg;
2075 
2076         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
2077             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
2078         VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
2079         dmu_buf_will_dirty(dbuf, tx);
2080         dsphys = dbuf->db_data;
2081         bzero(dsphys, sizeof (dsl_dataset_phys_t));
2082         dsphys->ds_dir_obj = ds->ds_dir->dd_object;
2083         dsphys->ds_fsid_guid = unique_create();
2084         (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
2085             sizeof (dsphys->ds_guid));
2086         dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
2087         dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
2088         dsphys->ds_next_snap_obj = ds->ds_object;
2089         dsphys->ds_num_children = 1;
2090         dsphys->ds_creation_time = gethrestime_sec();
2091         dsphys->ds_creation_txg = crtxg;
2092         dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
2093         dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
2094         dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
2095         dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
2096         dsphys->ds_flags = ds->ds_phys->ds_flags;
2097         dsphys->ds_bp = ds->ds_phys->ds_bp;
2098         dmu_buf_rele(dbuf, FTAG);
2099 
2100         ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
2101         if (ds->ds_prev) {
2102                 uint64_t next_clones_obj =
2103                     ds->ds_prev->ds_phys->ds_next_clones_obj;
2104                 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
2105                     ds->ds_object ||
2106                     ds->ds_prev->ds_phys->ds_num_children > 1);
2107                 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
2108                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
2109                         ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
2110                             ds->ds_prev->ds_phys->ds_creation_txg);
2111                         ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
2112                 } else if (next_clones_obj != 0) {
2113                         remove_from_next_clones(ds->ds_prev,
2114                             dsphys->ds_next_snap_obj, tx);
2115                         VERIFY0(zap_add_int(mos,
2116                             next_clones_obj, dsobj, tx));
2117                 }
2118         }
2119 
2120         /*
2121          * If we have a reference-reservation on this dataset, we will
2122          * need to increase the amount of refreservation being charged
2123          * since our unique space is going to zero.
2124          */
2125         if (ds->ds_reserved) {
2126                 int64_t delta;
2127                 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
2128                 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2129                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
2130                     delta, 0, 0, tx);
2131         }
2132 
2133         dmu_buf_will_dirty(ds->ds_dbuf, tx);
2134         zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
2135             ds->ds_dir->dd_myname, snapname, dsobj,
2136             ds->ds_phys->ds_prev_snap_txg);
2137         ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
2138             UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
2139         dsl_deadlist_close(&ds->ds_deadlist);
2140         dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
2141         dsl_deadlist_add_key(&ds->ds_deadlist,
2142             ds->ds_phys->ds_prev_snap_txg, tx);
2143 
2144         ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
2145         ds->ds_phys->ds_prev_snap_obj = dsobj;
2146         ds->ds_phys->ds_prev_snap_txg = crtxg;
2147         ds->ds_phys->ds_unique_bytes = 0;
2148         if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
2149                 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
2150 
2151         err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
2152             snapname, 8, 1, &dsobj, tx);
2153         ASSERT(err == 0);
2154 
2155         if (ds->ds_prev)
2156                 dsl_dataset_drop_ref(ds->ds_prev, ds);
2157         VERIFY(0 == dsl_dataset_get_ref(dp,
2158             ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
2159 
2160         dsl_scan_ds_snapshotted(ds, tx);
2161 
2162         dsl_dir_snap_cmtime_update(ds->ds_dir);
2163 
2164         spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
2165 }
2166 
2167 void
2168 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
2169 {
2170         ASSERT(dmu_tx_is_syncing(tx));
2171         ASSERT(ds->ds_objset != NULL);
2172         ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
2173 
2174         /*
2175          * in case we had to change ds_fsid_guid when we opened it,
2176          * sync it out now.
2177          */
2178         dmu_buf_will_dirty(ds->ds_dbuf, tx);
2179         ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
2180 
2181         dsl_dir_dirty(ds->ds_dir, tx);
2182         dmu_objset_sync(ds->ds_objset, zio, tx);
2183 }
2184 
2185 static void
2186 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
2187 {
2188         uint64_t count = 0;
2189         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
2190         zap_cursor_t zc;
2191         zap_attribute_t za;
2192         nvlist_t *propval;
2193         nvlist_t *val;
2194 
2195         rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2196         VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2197         VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2198 
2199         /*
2200          * There may me missing entries in ds_next_clones_obj
2201          * due to a bug in a previous version of the code.
2202          * Only trust it if it has the right number of entries.
2203          */
2204         if (ds->ds_phys->ds_next_clones_obj != 0) {
2205                 ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
2206                     &count));
2207         }
2208         if (count != ds->ds_phys->ds_num_children - 1) {
2209                 goto fail;
2210         }
2211         for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
2212             zap_cursor_retrieve(&zc, &za) == 0;
2213             zap_cursor_advance(&zc)) {
2214                 dsl_dataset_t *clone;
2215                 char buf[ZFS_MAXNAMELEN];
2216                 /*
2217                  * Even though we hold the dp_config_rwlock, the dataset
2218                  * may fail to open, returning ENOENT.  If there is a
2219                  * thread concurrently attempting to destroy this
2220                  * dataset, it will have the ds_rwlock held for
2221                  * RW_WRITER.  Our call to dsl_dataset_hold_obj() ->
2222                  * dsl_dataset_hold_ref() will fail its
2223                  * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the
2224                  * dp_config_rwlock, and wait for the destroy progress
2225                  * and signal ds_exclusive_cv.  If the destroy was
2226                  * successful, we will see that
2227                  * DSL_DATASET_IS_DESTROYED(), and return ENOENT.
2228                  */
2229                 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
2230                     za.za_first_integer, FTAG, &clone) != 0)
2231                         continue;
2232                 dsl_dir_name(clone->ds_dir, buf);
2233                 VERIFY(nvlist_add_boolean(val, buf) == 0);
2234                 dsl_dataset_rele(clone, FTAG);
2235         }
2236         zap_cursor_fini(&zc);
2237         VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0);
2238         VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
2239             propval) == 0);
2240 fail:
2241         nvlist_free(val);
2242         nvlist_free(propval);
2243         rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2244 }
2245 
2246 void
2247 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
2248 {
2249         uint64_t refd, avail, uobjs, aobjs, ratio;
2250 
2251         ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
2252             (ds->ds_phys->ds_uncompressed_bytes * 100 /
2253             ds->ds_phys->ds_compressed_bytes);
2254 
2255         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
2256 
2257         if (dsl_dataset_is_snapshot(ds)) {
2258                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
2259                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
2260                     ds->ds_phys->ds_unique_bytes);
2261                 get_clones_stat(ds, nv);
2262         } else {
2263                 dsl_dir_stats(ds->ds_dir, nv);
2264         }
2265 
2266         dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
2267         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
2268         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
2269 
2270         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
2271             ds->ds_phys->ds_creation_time);
2272         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
2273             ds->ds_phys->ds_creation_txg);
2274         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
2275             ds->ds_quota);
2276         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
2277             ds->ds_reserved);
2278         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
2279             ds->ds_phys->ds_guid);
2280         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
2281             ds->ds_phys->ds_unique_bytes);
2282         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
2283             ds->ds_object);
2284         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
2285             ds->ds_userrefs);
2286         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
2287             DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
2288 
2289         if (ds->ds_phys->ds_prev_snap_obj != 0) {
2290                 uint64_t written, comp, uncomp;
2291                 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2292                 dsl_dataset_t *prev;
2293 
2294                 rw_enter(&dp->dp_config_rwlock, RW_READER);
2295                 int err = dsl_dataset_hold_obj(dp,
2296                     ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
2297                 rw_exit(&dp->dp_config_rwlock);
2298                 if (err == 0) {
2299                         err = dsl_dataset_space_written(prev, ds, &written,
2300                             &comp, &uncomp);
2301                         dsl_dataset_rele(prev, FTAG);
2302                         if (err == 0) {
2303                                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
2304                                     written);
2305                         }
2306                 }
2307         }
2308 
2309 }
2310 
2311 void
2312 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
2313 {
2314         stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
2315         stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
2316         stat->dds_guid = ds->ds_phys->ds_guid;
2317         stat->dds_origin[0] = '\0';
2318         if (dsl_dataset_is_snapshot(ds)) {
2319                 stat->dds_is_snapshot = B_TRUE;
2320                 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
2321         } else {
2322                 stat->dds_is_snapshot = B_FALSE;
2323                 stat->dds_num_clones = 0;
2324 
2325                 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2326                 if (dsl_dir_is_clone(ds->ds_dir)) {
2327                         dsl_dataset_t *ods;
2328 
2329                         VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
2330                             ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
2331                         dsl_dataset_name(ods, stat->dds_origin);
2332                         dsl_dataset_drop_ref(ods, FTAG);
2333                 }
2334                 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2335         }
2336 }
2337 
2338 uint64_t
2339 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
2340 {
2341         return (ds->ds_fsid_guid);
2342 }
2343 
2344 void
2345 dsl_dataset_space(dsl_dataset_t *ds,
2346     uint64_t *refdbytesp, uint64_t *availbytesp,
2347     uint64_t *usedobjsp, uint64_t *availobjsp)
2348 {
2349         *refdbytesp = ds->ds_phys->ds_referenced_bytes;
2350         *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
2351         if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
2352                 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
2353         if (ds->ds_quota != 0) {
2354                 /*
2355                  * Adjust available bytes according to refquota
2356                  */
2357                 if (*refdbytesp < ds->ds_quota)
2358                         *availbytesp = MIN(*availbytesp,
2359                             ds->ds_quota - *refdbytesp);
2360                 else
2361                         *availbytesp = 0;
2362         }
2363         *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
2364         *availobjsp = DN_MAX_OBJECT - *usedobjsp;
2365 }
2366 
2367 boolean_t
2368 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
2369 {
2370         dsl_pool_t *dp = ds->ds_dir->dd_pool;
2371 
2372         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
2373             dsl_pool_sync_context(dp));
2374         if (ds->ds_prev == NULL)
2375                 return (B_FALSE);
2376         if (ds->ds_phys->ds_bp.blk_birth >
2377             ds->ds_prev->ds_phys->ds_creation_txg) {
2378                 objset_t *os, *os_prev;
2379                 /*
2380                  * It may be that only the ZIL differs, because it was
2381                  * reset in the head.  Don't count that as being
2382                  * modified.
2383                  */
2384                 if (dmu_objset_from_ds(ds, &os) != 0)
2385                         return (B_TRUE);
2386                 if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
2387                         return (B_TRUE);
2388                 return (bcmp(&os->os_phys->os_meta_dnode,
2389                     &os_prev->os_phys->os_meta_dnode,
2390                     sizeof (os->os_phys->os_meta_dnode)) != 0);
2391         }
2392         return (B_FALSE);
2393 }
2394 
2395 /* ARGSUSED */
2396 static int
2397 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
2398 {
2399         dsl_dataset_t *ds = arg1;
2400         char *newsnapname = arg2;
2401         dsl_dir_t *dd = ds->ds_dir;
2402         dsl_dataset_t *hds;
2403         uint64_t val;
2404         int err;
2405 
2406         err = dsl_dataset_hold_obj(dd->dd_pool,
2407             dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
2408         if (err)
2409                 return (err);
2410 
2411         /* new name better not be in use */
2412         err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
2413         dsl_dataset_rele(hds, FTAG);
2414 
2415         if (err == 0)
2416                 err = EEXIST;
2417         else if (err == ENOENT)
2418                 err = 0;
2419 
2420         /* dataset name + 1 for the "@" + the new snapshot name must fit */
2421         if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
2422                 err = ENAMETOOLONG;
2423 
2424         return (err);
2425 }
2426 
/*
 * Sync function for a snapshot rename sync task.
 *
 * arg1 is the snapshot dataset being renamed and arg2 is the new
 * snapshot name (the part after the '@').  The old name is removed
 * from the head dataset, the in-core ds_snapname is replaced, and a new
 * entry mapping the new name to ds_object is added to the head's
 * ds_snapnames_zapobj.  The rename is recorded in the pool history.
 */
static void
dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dataset_t *ds = arg1;
        const char *newsnapname = arg2;
        dsl_dir_t *dd = ds->ds_dir;
        objset_t *mos = dd->dd_pool->dp_meta_objset;
        dsl_dataset_t *hds;
        int err;

        /* Only datasets that have a next snapshot are expected here. */
        ASSERT(ds->ds_phys->ds_next_snap_obj != 0);

        VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
            dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));

        /* Make sure ds_snapname holds the current (old) name. */
        VERIFY(0 == dsl_dataset_get_snapname(ds));
        err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
        ASSERT0(err);
        /*
         * Unbounded copy: relies on dsl_dataset_snapshot_rename_check()
         * having rejected names that are too long.
         * NOTE(review): assumes ds_snapname is at least MAXNAMELEN
         * bytes -- confirm against the dsl_dataset_t definition.
         */
        mutex_enter(&ds->ds_lock);
        (void) strcpy(ds->ds_snapname, newsnapname);
        mutex_exit(&ds->ds_lock);
        /* One 8-byte integer: the snapshot's object number. */
        err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
            ds->ds_snapname, 8, 1, &ds->ds_object, tx);
        ASSERT0(err);

        spa_history_log_internal_ds(ds, "rename", tx,
            "-> @%s", newsnapname);
        dsl_dataset_rele(hds, FTAG);
}
2456 
/*
 * State shared between dsl_recursive_rename() and its per-dataset
 * callback dsl_snapshot_rename_one().
 */
struct renamesnaparg {
        /* task group the rename tasks join; also used as the hold tag */
        dsl_sync_task_group_t *dstg;
        /* name of the snapshot being processed / that failed */
        char failed[MAXPATHLEN];
        /* old snapshot name (after the '@'); points into caller's string */
        char *oldsnap;
        /* new snapshot name (after the '@'); points into caller's string */
        char *newsnap;
};
2463 
2464 static int
2465 dsl_snapshot_rename_one(const char *name, void *arg)
2466 {
2467         struct renamesnaparg *ra = arg;
2468         dsl_dataset_t *ds = NULL;
2469         char *snapname;
2470         int err;
2471 
2472         snapname = kmem_asprintf("%s@%s", name, ra->oldsnap);
2473         (void) strlcpy(ra->failed, snapname, sizeof (ra->failed));
2474 
2475         /*
2476          * For recursive snapshot renames the parent won't be changing
2477          * so we just pass name for both the to/from argument.
2478          */
2479         err = zfs_secpolicy_rename_perms(snapname, snapname, CRED());
2480         if (err != 0) {
2481                 strfree(snapname);
2482                 return (err == ENOENT ? 0 : err);
2483         }
2484 
2485 #ifdef _KERNEL
2486         /*
2487          * For all filesystems undergoing rename, we'll need to unmount it.
2488          */
2489         (void) zfs_unmount_snap(snapname, NULL);
2490 #endif
2491         err = dsl_dataset_hold(snapname, ra->dstg, &ds);
2492         strfree(snapname);
2493         if (err != 0)
2494                 return (err == ENOENT ? 0 : err);
2495 
2496         dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
2497             dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
2498 
2499         return (0);
2500 }
2501 
/*
 * Recursively rename snapshot "<fs>@<old>" to "<fs>@<new>".
 *
 * dmu_objset_find() is run from <fs> with DS_FIND_CHILDREN and
 * dsl_snapshot_rename_one() queues one rename task per dataset visited;
 * the tasks are then executed as a single sync task group.
 *
 * Both oldname and newname must contain an '@'; this is not checked
 * here.  On failure oldname is overwritten with the name recorded in
 * ra->failed.
 *
 * Returns 0 on success, otherwise the error from spa_open(),
 * dmu_objset_find() or dsl_sync_task_group_wait().
 */
static int
dsl_recursive_rename(char *oldname, const char *newname)
{
        int err;
        struct renamesnaparg *ra;
        dsl_sync_task_t *dst;
        spa_t *spa;
        char *cp, *fsname = spa_strdup(oldname);
        /*
         * Size used to free fsname; captured from the untruncated name
         * because fsname is shortened in place below.
         * NOTE(review): presumes spa_strdup() allocates strlen + 1
         * bytes -- confirm.
         */
        int len = strlen(oldname) + 1;

        /* truncate the snapshot name to get the fsname */
        cp = strchr(fsname, '@');
        *cp = '\0';

        err = spa_open(fsname, &spa, FTAG);
        if (err) {
                kmem_free(fsname, len);
                return (err);
        }
        ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
        ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));

        /* These point into the caller's strings, just past the '@'. */
        ra->oldsnap = strchr(oldname, '@') + 1;
        ra->newsnap = strchr(newname, '@') + 1;
        *ra->failed = '\0';

        err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
            DS_FIND_CHILDREN);
        kmem_free(fsname, len);

        /* Only run the queued tasks if every dataset was set up cleanly. */
        if (err == 0) {
                err = dsl_sync_task_group_wait(ra->dstg);
        }

        /*
         * Walk every queued task, whether or not the group ran: record
         * the new name of a task that failed and drop the hold that
         * dsl_snapshot_rename_one() took with the task group as tag.
         */
        for (dst = list_head(&ra->dstg->dstg_tasks); dst;
            dst = list_next(&ra->dstg->dstg_tasks, dst)) {
                dsl_dataset_t *ds = dst->dst_arg1;
                if (dst->dst_err) {
                        dsl_dir_name(ds->ds_dir, ra->failed);
                        (void) strlcat(ra->failed, "@", sizeof (ra->failed));
                        (void) strlcat(ra->failed, ra->newsnap,
                            sizeof (ra->failed));
                }
                dsl_dataset_rele(ds, ra->dstg);
        }

        /*
         * Report the offending name back through oldname.
         * NOTE(review): the bound is the size of the source buffer
         * (MAXPATHLEN), so oldname must be at least that large --
         * confirm against callers.
         */
        if (err)
                (void) strlcpy(oldname, ra->failed, sizeof (ra->failed));

        dsl_sync_task_group_destroy(ra->dstg);
        kmem_free(ra, sizeof (struct renamesnaparg));
        spa_close(spa, FTAG);
        return (err);
}
2556 
/*
 * dmu_objset_find() callback used when a rename makes names longer:
 * arg points to an int holding the growth in name length.  Returns
 * ENAMETOOLONG if oldname grown by that amount would reach MAXNAMELEN,
 * otherwise 0.
 */
static int
dsl_valid_rename(const char *oldname, void *arg)
{
        size_t newlen = strlen(oldname) + *(int *)arg;

        return (newlen >= MAXNAMELEN ? ENAMETOOLONG : 0);
}
2567 
2568 #pragma weak dmu_objset_rename = dsl_dataset_rename
2569 int
2570 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
2571 {
2572         dsl_dir_t *dd;
2573         dsl_dataset_t *ds;
2574         const char *tail;
2575         int err;
2576 
2577         err = dsl_dir_open(oldname, FTAG, &dd, &tail);
2578         if (err)
2579                 return (err);
2580 
2581         if (tail == NULL) {
2582                 int delta = strlen(newname) - strlen(oldname);
2583 
2584                 /* if we're growing, validate child name lengths */
2585                 if (delta > 0)
2586                         err = dmu_objset_find(oldname, dsl_valid_rename,
2587                             &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
2588 
2589                 if (err == 0)
2590                         err = dsl_dir_rename(dd, newname);
2591                 dsl_dir_close(dd, FTAG);
2592                 return (err);
2593         }
2594 
2595         if (tail[0] != '@') {
2596                 /* the name ended in a nonexistent component */
2597                 dsl_dir_close(dd, FTAG);
2598                 return (ENOENT);
2599         }
2600 
2601         dsl_dir_close(dd, FTAG);
2602 
2603         /* new name must be snapshot in same filesystem */
2604         tail = strchr(newname, '@');
2605         if (tail == NULL)
2606                 return (EINVAL);
2607         tail++;
2608         if (strncmp(oldname, newname, tail - newname) != 0)
2609                 return (EXDEV);
2610 
2611         if (recursive) {
2612                 err = dsl_recursive_rename(oldname, newname);
2613         } else {
2614                 err = dsl_dataset_hold(oldname, FTAG, &ds);
2615                 if (err)
2616                         return (err);
2617 
2618                 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
2619                     dsl_dataset_snapshot_rename_check,
2620                     dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
2621 
2622                 dsl_dataset_rele(ds, FTAG);
2623         }
2624 
2625         return (err);
2626 }
2627 
/*
 * Element of the snapshot lists built by snaplist_make(): one held (or
 * owned) dataset per node.
 */
struct promotenode {
        list_node_t link;       /* linkage within the containing list_t */
        dsl_dataset_t *ds;      /* the dataset this node refers to */
};
2632 
/*
 * Argument block shared by dsl_dataset_promote() and its check/sync
 * functions.
 */
struct promotearg {
        /*
         * Lists of struct promotenode built with snaplist_make():
         * shared_snaps is built back from the clone's origin,
         * clone_snaps back from the clone head, and origin_snaps from
         * the origin's head back to (excluding) the origin.
         */
        list_t shared_snaps, origin_snaps, clone_snaps;
        /* origin of the origin's dsl_dir; NULL if that dir has none */
        dsl_dataset_t *origin_origin;
        /* space figures computed by the check function for the sync function */
        uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
        /* snapshot name that caused the check to fail, if any */
        char *err_ds;
};
2639 
2640 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
2641 static boolean_t snaplist_unstable(list_t *l);
2642 
/*
 * Check function for the promote sync task.
 *
 * arg1 is the head dataset of the clone being promoted; arg2 is the
 * struct promotearg whose snapshot lists were filled in by
 * dsl_dataset_promote().
 *
 * Outside of syncing context only the "is it a clone" test is made.
 * In syncing context this also verifies that none of the snapshots to
 * be moved collides by name with a snapshot of the clone, that the
 * space can be transferred between the two dsl_dirs, and it computes
 * the space figures (pa->used/comp/uncomp/unique/cloneusedsnap/
 * originusedsnap) that dsl_dataset_promote_sync() consumes.
 *
 * Returns 0 on success; EINVAL if hds is not a clone; EXDEV if it is
 * flagged DS_FLAG_NOPROMOTE; EEXIST (or the lookup error) on a
 * snapshot name problem, in which case pa->err_ds points at the
 * offending snapshot name; otherwise the error from the space checks.
 */
static int
dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dataset_t *hds = arg1;
        struct promotearg *pa = arg2;
        /* The head of shared_snaps is the clone's origin snapshot. */
        struct promotenode *snap = list_head(&pa->shared_snaps);
        dsl_dataset_t *origin_ds = snap->ds;
        int err;
        uint64_t unused;

        /* Check that it is a real clone */
        if (!dsl_dir_is_clone(hds->ds_dir))
                return (EINVAL);

        /* Since this is so expensive, don't do the preliminary check */
        if (!dmu_tx_is_syncing(tx))
                return (0);

        if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
                return (EXDEV);

        /* compute origin's new unique space */
        snap = list_tail(&pa->clone_snaps);
        ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
        dsl_deadlist_space_range(&snap->ds->ds_deadlist,
            origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
            &pa->unique, &unused, &unused);

        /*
         * Walk the snapshots that we are moving
         *
         * Compute space to transfer.  Consider the incremental changes
         * to used for each snapshot:
         * (my used) = (prev's used) + (blocks born) - (blocks killed)
         * So each snapshot gave birth to:
         * (blocks born) = (my used) - (prev's used) + (blocks killed)
         * So a sequence would look like:
         * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
         * Which simplifies to:
         * uN + kN + kN-1 + ... + k1 + k0
         * Note however, if we stop before we reach the ORIGIN we get:
         * uN + kN + kN-1 + ... + kM - uM-1
         */
        pa->used = origin_ds->ds_phys->ds_referenced_bytes;
        pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
        pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
        for (snap = list_head(&pa->shared_snaps); snap;
            snap = list_next(&pa->shared_snaps, snap)) {
                uint64_t val, dlused, dlcomp, dluncomp;
                dsl_dataset_t *ds = snap->ds;

                /* Check that the snapshot name does not conflict */
                VERIFY(0 == dsl_dataset_get_snapname(ds));
                err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
                if (err == 0) {
                        err = EEXIST;
                        goto out;
                }
                if (err != ENOENT)
                        goto out;

                /* The very first snapshot does not have a deadlist */
                if (ds->ds_phys->ds_prev_snap_obj == 0)
                        continue;

                dsl_deadlist_space(&ds->ds_deadlist,
                    &dlused, &dlcomp, &dluncomp);
                pa->used += dlused;
                pa->comp += dlcomp;
                pa->uncomp += dluncomp;
        }

        /*
         * If we are a clone of a clone then we never reached ORIGIN,
         * so we need to subtract out the clone origin's used space.
         */
        if (pa->origin_origin) {
                pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
                pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
                pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
        }

        /* Check that there is enough space here */
        err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
            pa->used);
        if (err)
                return (err);

        /*
         * Compute the amounts of space that will be used by snapshots
         * after the promotion (for both origin and clone).  For each,
         * it is the amount of space that will be on all of their
         * deadlists (that was not born before their new origin).
         */
        if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                uint64_t space;

                /*
                 * Note, typically this will not be a clone of a clone,
                 * so dd_origin_txg will be < TXG_INITIAL, so
                 * these snaplist_space() -> dsl_deadlist_space_range()
                 * calls will be fast because they do not have to
                 * iterate over all bps.
                 */
                snap = list_head(&pa->origin_snaps);
                err = snaplist_space(&pa->shared_snaps,
                    snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
                if (err)
                        return (err);

                err = snaplist_space(&pa->clone_snaps,
                    snap->ds->ds_dir->dd_origin_txg, &space);
                if (err)
                        return (err);
                pa->cloneusedsnap += space;
        }
        if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
                err = snaplist_space(&pa->origin_snaps,
                    origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
                if (err)
                        return (err);
        }

        return (0);
out:
        /*
         * Only reached from the name-conflict loop, so snap is the
         * shared snapshot whose name lookup failed.  The pointer refers
         * to the dataset's own ds_snapname buffer, not a copy.
         */
        pa->err_ds =  snap->ds->ds_snapname;
        return (err);
}
2771 
/*
 * Sync function for the promote sync task.
 *
 * arg1 is the head dataset of the clone being promoted (hds); arg2 is
 * the struct promotearg prepared by dsl_dataset_promote() and
 * dsl_dataset_promote_check().
 *
 * In order, this: re-links the origin snapshot's next-snapshot and
 * next-clones entries, swaps the origin relationship between the
 * clone's dsl_dir (dd) and the origin's dsl_dir (odd), updates the
 * dd_clones ZAPs, moves every snapshot on pa->shared_snaps from odd to
 * dd, transfers the space accounting computed by the check function,
 * and logs a "promote" history record.
 */
static void
dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dataset_t *hds = arg1;
        struct promotearg *pa = arg2;
        /* The head of shared_snaps is the clone's origin snapshot. */
        struct promotenode *snap = list_head(&pa->shared_snaps);
        dsl_dataset_t *origin_ds = snap->ds;
        dsl_dataset_t *origin_head;
        dsl_dir_t *dd = hds->ds_dir;
        dsl_pool_t *dp = hds->ds_dir->dd_pool;
        dsl_dir_t *odd = NULL;
        uint64_t oldnext_obj;
        int64_t delta;

        ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));

        /* The head of origin_snaps is the origin's head dataset. */
        snap = list_head(&pa->origin_snaps);
        origin_head = snap->ds;

        /*
         * We need to explicitly open odd, since origin_ds's dd will be
         * changing.
         */
        VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
            NULL, FTAG, &odd));

        /* change origin's next snap */
        dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
        oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
        snap = list_tail(&pa->clone_snaps);
        ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
        origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;

        /* change the origin's next clone */
        if (origin_ds->ds_phys->ds_next_clones_obj) {
                remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
                VERIFY0(zap_add_int(dp->dp_meta_objset,
                    origin_ds->ds_phys->ds_next_clones_obj,
                    oldnext_obj, tx));
        }

        /* change origin */
        dmu_buf_will_dirty(dd->dd_dbuf, tx);
        ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
        dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
        dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
        dmu_buf_will_dirty(odd->dd_dbuf, tx);
        odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
        origin_head->ds_dir->dd_origin_txg =
            origin_ds->ds_phys->ds_creation_txg;

        /*
         * change dd_clone entries
         *
         * NOTE(review): pa->origin_origin is dereferenced here without
         * a NULL check; this assumes it is always set on pools at or
         * above SPA_VERSION_DIR_CLONES -- confirm.
         */
        if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
                VERIFY0(zap_remove_int(dp->dp_meta_objset,
                    odd->dd_phys->dd_clones, hds->ds_object, tx));
                VERIFY0(zap_add_int(dp->dp_meta_objset,
                    pa->origin_origin->ds_dir->dd_phys->dd_clones,
                    hds->ds_object, tx));

                VERIFY0(zap_remove_int(dp->dp_meta_objset,
                    pa->origin_origin->ds_dir->dd_phys->dd_clones,
                    origin_head->ds_object, tx));
                /* Create the clone's dd_clones ZAP on first use. */
                if (dd->dd_phys->dd_clones == 0) {
                        dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
                            DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
                }
                VERIFY0(zap_add_int(dp->dp_meta_objset,
                    dd->dd_phys->dd_clones, origin_head->ds_object, tx));

        }

        /* move snapshots to this dir */
        for (snap = list_head(&pa->shared_snaps); snap;
            snap = list_next(&pa->shared_snaps, snap)) {
                dsl_dataset_t *ds = snap->ds;

                /* unregister props as dsl_dir is changing */
                if (ds->ds_objset) {
                        dmu_objset_evict(ds->ds_objset);
                        ds->ds_objset = NULL;
                }
                /* move snap name entry */
                VERIFY(0 == dsl_dataset_get_snapname(ds));
                VERIFY(0 == dsl_dataset_snap_remove(origin_head,
                    ds->ds_snapname, tx));
                VERIFY(0 == zap_add(dp->dp_meta_objset,
                    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
                    8, 1, &ds->ds_object, tx));

                /* change containing dsl_dir */
                dmu_buf_will_dirty(ds->ds_dbuf, tx);
                ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
                ds->ds_phys->ds_dir_obj = dd->dd_object;
                ASSERT3P(ds->ds_dir, ==, odd);
                /* The dataset itself is the tag for its ds_dir hold. */
                dsl_dir_close(ds->ds_dir, ds);
                VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
                    NULL, ds, &ds->ds_dir));

                /* move any clone references */
                if (ds->ds_phys->ds_next_clones_obj &&
                    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
                        zap_cursor_t zc;
                        zap_attribute_t za;

                        for (zap_cursor_init(&zc, dp->dp_meta_objset,
                            ds->ds_phys->ds_next_clones_obj);
                            zap_cursor_retrieve(&zc, &za) == 0;
                            zap_cursor_advance(&zc)) {
                                dsl_dataset_t *cnds;
                                uint64_t o;

                                if (za.za_first_integer == oldnext_obj) {
                                        /*
                                         * We've already moved the
                                         * origin's reference.
                                         */
                                        continue;
                                }

                                VERIFY0(dsl_dataset_hold_obj(dp,
                                    za.za_first_integer, FTAG, &cnds));
                                o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;

                                VERIFY3U(zap_remove_int(dp->dp_meta_objset,
                                    odd->dd_phys->dd_clones, o, tx), ==, 0);
                                VERIFY3U(zap_add_int(dp->dp_meta_objset,
                                    dd->dd_phys->dd_clones, o, tx), ==, 0);
                                dsl_dataset_rele(cnds, FTAG);
                        }
                        zap_cursor_fini(&zc);
                }

                ASSERT0(dsl_prop_numcb(ds));
        }

        /*
         * Change space accounting.
         * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
         * both be valid, or both be 0 (resulting in delta == 0).  This
         * is true for each of {clone,origin} independently.
         */

        /* The clone's dir gains snapshot space and the transferred data. */
        delta = pa->cloneusedsnap -
            dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
        ASSERT3S(delta, >=, 0);
        ASSERT3U(pa->used, >=, delta);
        dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
        dsl_dir_diduse_space(dd, DD_USED_HEAD,
            pa->used - delta, pa->comp, pa->uncomp, tx);

        /* The origin's dir loses the same amounts. */
        delta = pa->originusedsnap -
            odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
        ASSERT3S(delta, <=, 0);
        ASSERT3U(pa->used, >=, -delta);
        dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
        dsl_dir_diduse_space(odd, DD_USED_HEAD,
            -pa->used - delta, -pa->comp, -pa->uncomp, tx);

        origin_ds->ds_phys->ds_unique_bytes = pa->unique;

        /* log history record */
        spa_history_log_internal_ds(hds, "promote", tx, "");

        dsl_dir_close(odd, FTAG);
}
2937 
2938 static char *snaplist_tag = "snaplist";
2939 /*
2940  * Make a list of dsl_dataset_t's for the snapshots between first_obj
2941  * (exclusive) and last_obj (inclusive).  The list will be in reverse
2942  * order (last_obj will be the list_head()).  If first_obj == 0, do all
2943  * snapshots back to this dataset's origin.
2944  */
2945 static int
2946 snaplist_make(dsl_pool_t *dp, boolean_t own,
2947     uint64_t first_obj, uint64_t last_obj, list_t *l)
2948 {
2949         uint64_t obj = last_obj;
2950 
2951         ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
2952 
2953         list_create(l, sizeof (struct promotenode),
2954             offsetof(struct promotenode, link));
2955 
2956         while (obj != first_obj) {
2957                 dsl_dataset_t *ds;
2958                 struct promotenode *snap;
2959                 int err;
2960 
2961                 if (own) {
2962                         err = dsl_dataset_own_obj(dp, obj,
2963                             0, snaplist_tag, &ds);
2964                         if (err == 0)
2965                                 dsl_dataset_make_exclusive(ds, snaplist_tag);
2966                 } else {
2967                         err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds);
2968                 }
2969                 if (err == ENOENT) {
2970                         /* lost race with snapshot destroy */
2971                         struct promotenode *last = list_tail(l);
2972                         ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj);
2973                         obj = last->ds->ds_phys->ds_prev_snap_obj;
2974                         continue;
2975                 } else if (err) {
2976                         return (err);
2977                 }
2978 
2979                 if (first_obj == 0)
2980                         first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
2981 
2982                 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
2983                 snap->ds = ds;
2984                 list_insert_tail(l, snap);
2985                 obj = ds->ds_phys->ds_prev_snap_obj;
2986         }
2987 
2988         return (0);
2989 }
2990 
2991 static int
2992 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2993 {
2994         struct promotenode *snap;
2995 
2996         *spacep = 0;
2997         for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2998                 uint64_t used, comp, uncomp;
2999                 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
3000                     mintxg, UINT64_MAX, &used, &comp, &uncomp);
3001                 *spacep += used;
3002         }
3003         return (0);
3004 }
3005 
3006 static void
3007 snaplist_destroy(list_t *l, boolean_t own)
3008 {
3009         struct promotenode *snap;
3010 
3011         if (!l || !list_link_active(&l->list_head))
3012                 return;
3013 
3014         while ((snap = list_tail(l)) != NULL) {
3015                 list_remove(l, snap);
3016                 if (own)
3017                         dsl_dataset_disown(snap->ds, snaplist_tag);
3018                 else
3019                         dsl_dataset_rele(snap->ds, snaplist_tag);
3020                 kmem_free(snap, sizeof (struct promotenode));
3021         }
3022         list_destroy(l);
3023 }
3024 
3025 /*
3026  * Promote a clone.  Nomenclature note:
3027  * "clone" or "cds": the original clone which is being promoted
3028  * "origin" or "ods": the snapshot which is originally clone's origin
3029  * "origin head" or "ohds": the dataset which is the head
3030  * (filesystem/volume) for the origin
3031  * "origin origin": the origin of the origin's filesystem (typically
3032  * NULL, indicating that the clone is not a clone of a clone).
3033  */
3034 int
3035 dsl_dataset_promote(const char *name, char *conflsnap)
3036 {
3037         dsl_dataset_t *ds;
3038         dsl_dir_t *dd;
3039         dsl_pool_t *dp;
3040         dmu_object_info_t doi;
3041         struct promotearg pa = { 0 };
3042         struct promotenode *snap;
3043         int err;
3044 
3045         err = dsl_dataset_hold(name, FTAG, &ds);
3046         if (err)
3047                 return (err);
3048         dd = ds->ds_dir;
3049         dp = dd->dd_pool;
3050 
3051         err = dmu_object_info(dp->dp_meta_objset,
3052             ds->ds_phys->ds_snapnames_zapobj, &doi);
3053         if (err) {
3054                 dsl_dataset_rele(ds, FTAG);
3055                 return (err);
3056         }
3057 
3058         if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
3059                 dsl_dataset_rele(ds, FTAG);
3060                 return (EINVAL);
3061         }
3062 
3063         /*
3064          * We are going to inherit all the snapshots taken before our
3065          * origin (i.e., our new origin will be our parent's origin).
3066          * Take ownership of them so that we can rename them into our
3067          * namespace.
3068          */
3069         rw_enter(&dp->dp_config_rwlock, RW_READER);
3070 
3071         err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
3072             &pa.shared_snaps);
3073         if (err != 0)
3074                 goto out;
3075 
3076         err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
3077         if (err != 0)
3078                 goto out;
3079 
3080         snap = list_head(&pa.shared_snaps);
3081         ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
3082         err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
3083             snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
3084         if (err != 0)
3085                 goto out;
3086 
3087         if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
3088                 err = dsl_dataset_hold_obj(dp,
3089                     snap->ds->ds_dir->dd_phys->dd_origin_obj,
3090                     FTAG, &pa.origin_origin);
3091                 if (err != 0)
3092                         goto out;
3093         }
3094 
3095 out:
3096         rw_exit(&dp->dp_config_rwlock);
3097 
3098         /*
3099          * Add in 128x the snapnames zapobj size, since we will be moving
3100          * a bunch of snapnames to the promoted ds, and dirtying their
3101          * bonus buffers.
3102          */
3103         if (err == 0) {
3104                 err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
3105                     dsl_dataset_promote_sync, ds, &pa,
3106                     2 + 2 * doi.doi_physical_blocks_512);
3107                 if (err && pa.err_ds && conflsnap)
3108                         (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
3109         }
3110 
3111         snaplist_destroy(&pa.shared_snaps, B_TRUE);
3112         snaplist_destroy(&pa.clone_snaps, B_FALSE);
3113         snaplist_destroy(&pa.origin_snaps, B_FALSE);
3114         if (pa.origin_origin)
3115                 dsl_dataset_rele(pa.origin_origin, FTAG);
3116         dsl_dataset_rele(ds, FTAG);
3117         return (err);
3118 }
3119 
/*
 * Argument bundle for the clone-swap sync task.  It is filled in by
 * dsl_dataset_clone_swap() and passed as arg1 to
 * dsl_dataset_clone_swap_check() and dsl_dataset_clone_swap_sync().
 */
struct cloneswaparg {
	dsl_dataset_t *cds; /* clone dataset */
	dsl_dataset_t *ohds; /* origin's head dataset */
	boolean_t force; /* skip the "ohds modified since last snap" check */
	/*
	 * Change in unconsumed refreservation.  Computed by the check
	 * function and charged to ohds's dsl_dir by the sync function.
	 */
	int64_t unused_refres_delta;
};
3126 
3127 /* ARGSUSED */
3128 static int
3129 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
3130 {
3131         struct cloneswaparg *csa = arg1;
3132 
3133         /* they should both be heads */
3134         if (dsl_dataset_is_snapshot(csa->cds) ||
3135             dsl_dataset_is_snapshot(csa->ohds))
3136                 return (EINVAL);
3137 
3138         /* the branch point should be just before them */
3139         if (csa->cds->ds_prev != csa->ohds->ds_prev)
3140                 return (EINVAL);
3141 
3142         /* cds should be the clone (unless they are unrelated) */
3143         if (csa->cds->ds_prev != NULL &&
3144             csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
3145             csa->ohds->ds_object !=
3146             csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
3147                 return (EINVAL);
3148 
3149         /* the clone should be a child of the origin */
3150         if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
3151                 return (EINVAL);
3152 
3153         /* ohds shouldn't be modified unless 'force' */
3154         if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
3155                 return (ETXTBSY);
3156 
3157         /* adjust amount of any unconsumed refreservation */
3158         csa->unused_refres_delta =
3159             (int64_t)MIN(csa->ohds->ds_reserved,
3160             csa->ohds->ds_phys->ds_unique_bytes) -
3161             (int64_t)MIN(csa->ohds->ds_reserved,
3162             csa->cds->ds_phys->ds_unique_bytes);
3163 
3164         if (csa->unused_refres_delta > 0 &&
3165             csa->unused_refres_delta >
3166             dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
3167                 return (ENOSPC);
3168 
3169         if (csa->ohds->ds_quota != 0 &&
3170             csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
3171                 return (EDQUOT);
3172 
3173         return (0);
3174 }
3175 
/*
 * Sync half of the clone-swap sync task: exchange the contents of the
 * clone (csa->cds) and the origin's head (csa->ohds).
 *
 * In order, this evicts any cached objsets, recomputes the shared
 * previous snapshot's unique bytes, swaps the two block pointers,
 * adjusts the dsl_dir space accounting, swaps the ds_*_bytes counters,
 * applies the refreservation delta computed by the check function,
 * swaps the deadlist objects, notifies the scan code and logs the
 * operation.
 *
 * arg1 is the struct cloneswaparg; arg2 is unused.
 */
/* ARGSUSED */
static void
dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	struct cloneswaparg *csa = arg1;
	dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;

	/*
	 * Preconditions: the clone carries no refreservation, and its
	 * unique bytes fit under the head's refquota (the latter is
	 * also enforced by the check function via EDQUOT).
	 */
	ASSERT(csa->cds->ds_reserved == 0);
	ASSERT(csa->ohds->ds_quota == 0 ||
	    csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);

	dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
	dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);

	/*
	 * Drop any cached objset for each dataset before its block
	 * pointer is replaced below.
	 */
	if (csa->cds->ds_objset != NULL) {
		dmu_objset_evict(csa->cds->ds_objset);
		csa->cds->ds_objset = NULL;
	}

	if (csa->ohds->ds_objset != NULL) {
		dmu_objset_evict(csa->ohds->ds_objset);
		csa->ohds->ds_objset = NULL;
	}

	/*
	 * Reset origin's unique bytes, if it exists.
	 * The new value is taken from the clone's deadlist, restricted
	 * to entries born after the origin's own previous snapshot; the
	 * compressed/uncompressed results are discarded.
	 */
	if (csa->cds->ds_prev) {
		dsl_dataset_t *origin = csa->cds->ds_prev;
		uint64_t comp, uncomp;

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		dsl_deadlist_space_range(&csa->cds->ds_deadlist,
		    origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
		    &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
	}

	/* swap blkptrs */
	{
		blkptr_t tmp;
		tmp = csa->ohds->ds_phys->ds_bp;
		csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
		csa->cds->ds_phys->ds_bp = tmp;
	}

	/*
	 * set dd_*_bytes
	 *
	 * These deltas are computed from the pre-swap ds_*_bytes
	 * counters and the pre-swap deadlists; the counters and
	 * deadlists themselves are exchanged further down.
	 */
	{
		int64_t dused, dcomp, duncomp;
		uint64_t cdl_used, cdl_comp, cdl_uncomp;
		uint64_t odl_used, odl_comp, odl_uncomp;

		ASSERT3U(csa->cds->ds_dir->dd_phys->
		    dd_used_breakdown[DD_USED_SNAP], ==, 0);

		dsl_deadlist_space(&csa->cds->ds_deadlist,
		    &cdl_used, &cdl_comp, &cdl_uncomp);
		dsl_deadlist_space(&csa->ohds->ds_deadlist,
		    &odl_used, &odl_comp, &odl_uncomp);

		/*
		 * For each side, sum its referenced bytes and the space
		 * on its deadlist; the delta is clone minus head.
		 */
		dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used -
		    (csa->ohds->ds_phys->ds_referenced_bytes + odl_used);
		dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
		    (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
		duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
		    cdl_uncomp -
		    (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);

		/* Charge the delta to the head's dir, negate it for the clone. */
		dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
		    dused, dcomp, duncomp, tx);
		dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
		    -dused, -dcomp, -duncomp, tx);

		/*
		 * The difference in the space used by snapshots is the
		 * difference in snapshot space due to the head's
		 * deadlist (since that's the only thing that's
		 * changing that affects the snapused).
		 */
		dsl_deadlist_space_range(&csa->cds->ds_deadlist,
		    csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
		    &cdl_used, &cdl_comp, &cdl_uncomp);
		dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
		    csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
		    &odl_used, &odl_comp, &odl_uncomp);
		dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
		    DD_USED_HEAD, DD_USED_SNAP, tx);
	}

	/* swap ds_*_bytes */
	SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes,
	    csa->cds->ds_phys->ds_referenced_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
	    csa->cds->ds_phys->ds_compressed_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
	    csa->cds->ds_phys->ds_uncompressed_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
	    csa->cds->ds_phys->ds_unique_bytes);

	/* apply any parent delta for change in unconsumed refreservation */
	dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
	    csa->unused_refres_delta, 0, 0, tx);

	/*
	 * Swap deadlists.
	 * Close both in-core deadlists, exchange the on-disk object
	 * numbers, then reopen each in-core deadlist on its new object.
	 */
	dsl_deadlist_close(&csa->cds->ds_deadlist);
	dsl_deadlist_close(&csa->ohds->ds_deadlist);
	SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
	    csa->cds->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
	    csa->cds->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
	    csa->ohds->ds_phys->ds_deadlist_obj);

	/* Tell the scan code that the two datasets traded places. */
	dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);

	spa_history_log_internal_ds(csa->cds, "clone swap", tx,
	    "parent=%s", csa->ohds->ds_dir->dd_myname);
}
3295 
/*
 * Swap 'clone' with its origin head datasets.  Used at the end of "zfs
 * recv" into an existing fs to swizzle the file system to the new
 * version, and by "zfs rollback".  Can also be used to swap two
 * independent head datasets if neither has any snapshots.
 *
 * Both datasets must be owned by the caller (asserted below).  If
 * 'force' is set, the origin head may have been modified since its
 * last snapshot.  Returns 0 on success or the error produced by the
 * sync task (see dsl_dataset_clone_swap_check()).
 *
 * NOTE(review): the ds_rwlock write locks taken here are not dropped
 * before returning; presumably they are released when the owner
 * disowns the datasets -- confirm against the callers.
 */
int
dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
    boolean_t force)
{
	struct cloneswaparg csa;
	int error;

	ASSERT(clone->ds_owner);
	ASSERT(origin_head->ds_owner);
retry:
	/*
	 * Need exclusive access for the swap. If we're swapping these
	 * datasets back after an error, we already hold the locks.
	 */
	if (!RW_WRITE_HELD(&clone->ds_rwlock))
		rw_enter(&clone->ds_rwlock, RW_WRITER);
	/*
	 * Never block on the second lock while holding the first: if
	 * origin_head's lock cannot be taken immediately, drop clone's
	 * lock, block on origin_head's, and then only *try* clone's.
	 * If that try fails, release everything and start over.
	 */
	if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
	    !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
		rw_exit(&clone->ds_rwlock);
		rw_enter(&origin_head->ds_rwlock, RW_WRITER);
		if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
			rw_exit(&origin_head->ds_rwlock);
			goto retry;
		}
	}
	csa.cds = clone;
	csa.ohds = origin_head;
	csa.force = force;
	/* csa.unused_refres_delta is filled in by the check function. */
	error = dsl_sync_task_do(clone->ds_dir->dd_pool,
	    dsl_dataset_clone_swap_check,
	    dsl_dataset_clone_swap_sync, &csa, NULL, 9);
	return (error);
}
3335 
3336 /*
3337  * Given a pool name and a dataset object number in that pool,
3338  * return the name of that dataset.
3339  */
3340 int
3341 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
3342 {
3343         spa_t *spa;
3344         dsl_pool_t *dp;
3345         dsl_dataset_t *ds;
3346         int error;
3347 
3348         if ((error = spa_open(pname, &spa, FTAG)) != 0)
3349                 return (error);
3350         dp = spa_get_dsl(spa);
3351         rw_enter(&dp->dp_config_rwlock, RW_READER);
3352         if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
3353                 dsl_dataset_name(ds, buf);
3354                 dsl_dataset_rele(ds, FTAG);
3355         }
3356         rw_exit(&dp->dp_config_rwlock);
3357         spa_close(spa, FTAG);
3358 
3359         return (error);
3360 }
3361 
3362 int
3363 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
3364     uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
3365 {
3366         int error = 0;
3367 
3368         ASSERT3S(asize, >, 0);
3369 
3370         /*
3371          * *ref_rsrv is the portion of asize that will come from any
3372          * unconsumed refreservation space.
3373          */
3374         *ref_rsrv = 0;
3375 
3376         mutex_enter(&ds->ds_lock);
3377         /*
3378          * Make a space adjustment for reserved bytes.
3379          */
3380         if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
3381                 ASSERT3U(*used, >=,
3382                     ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3383                 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3384                 *ref_rsrv =
3385                     asize - MIN(asize, parent_delta(ds, asize + inflight));
3386         }
3387 
3388         if (!check_quota || ds->ds_quota == 0) {
3389                 mutex_exit(&ds->ds_lock);
3390                 return (0);
3391         }
3392         /*
3393          * If they are requesting more space, and our current estimate
3394          * is over quota, they get to try again unless the actual
3395          * on-disk is over quota and there are no pending changes (which
3396          * may free up space for us).
3397          */
3398         if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
3399                 if (inflight > 0 ||
3400                     ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
3401                         error = ERESTART;
3402                 else
3403                         error = EDQUOT;
3404         }
3405         mutex_exit(&ds->ds_lock);
3406 
3407         return (error);
3408 }
3409 
3410 /* ARGSUSED */
3411 static int
3412 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
3413 {
3414         dsl_dataset_t *ds = arg1;
3415         dsl_prop_setarg_t *psa = arg2;
3416         int err;
3417 
3418         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
3419                 return (ENOTSUP);
3420 
3421         if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3422                 return (err);
3423 
3424         if (psa->psa_effective_value == 0)
3425                 return (0);
3426 
3427         if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
3428             psa->psa_effective_value < ds->ds_reserved)
3429                 return (ENOSPC);
3430 
3431         return (0);
3432 }
3433 
3434 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
3435 
3436 void
3437 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3438 {
3439         dsl_dataset_t *ds = arg1;
3440         dsl_prop_setarg_t *psa = arg2;
3441         uint64_t effective_value = psa->psa_effective_value;
3442 
3443         dsl_prop_set_sync(ds, psa, tx);
3444         DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3445 
3446         if (ds->ds_quota != effective_value) {
3447                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3448                 ds->ds_quota = effective_value;
3449 
3450                 spa_history_log_internal_ds(ds, "set refquota", tx,
3451                     "refquota=%lld", (longlong_t)ds->ds_quota);
3452         }
3453 }
3454 
3455 int
3456 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
3457 {
3458         dsl_dataset_t *ds;
3459         dsl_prop_setarg_t psa;
3460         int err;
3461 
3462         dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
3463 
3464         err = dsl_dataset_hold(dsname, FTAG, &ds);
3465         if (err)
3466                 return (err);
3467 
3468         /*
3469          * If someone removes a file, then tries to set the quota, we
3470          * want to make sure the file freeing takes effect.
3471          */
3472         txg_wait_open(ds->ds_dir->dd_pool, 0);
3473 
3474         err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3475             dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
3476             ds, &psa, 0);
3477 
3478         dsl_dataset_rele(ds, FTAG);
3479         return (err);
3480 }
3481 
3482 static int
3483 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
3484 {
3485         dsl_dataset_t *ds = arg1;
3486         dsl_prop_setarg_t *psa = arg2;
3487         uint64_t effective_value;
3488         uint64_t unique;
3489         int err;
3490 
3491         if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
3492             SPA_VERSION_REFRESERVATION)
3493                 return (ENOTSUP);
3494 
3495         if (dsl_dataset_is_snapshot(ds))
3496                 return (EINVAL);
3497 
3498         if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3499                 return (err);
3500 
3501         effective_value = psa->psa_effective_value;
3502 
3503         /*
3504          * If we are doing the preliminary check in open context, the
3505          * space estimates may be inaccurate.
3506          */
3507         if (!dmu_tx_is_syncing(tx))
3508                 return (0);
3509 
3510         mutex_enter(&ds->ds_lock);
3511         if (!DS_UNIQUE_IS_ACCURATE(ds))
3512                 dsl_dataset_recalc_head_uniq(ds);
3513         unique = ds->ds_phys->ds_unique_bytes;
3514         mutex_exit(&ds->ds_lock);
3515 
3516         if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
3517                 uint64_t delta = MAX(unique, effective_value) -
3518                     MAX(unique, ds->ds_reserved);
3519 
3520                 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
3521                         return (ENOSPC);
3522                 if (ds->ds_quota > 0 &&
3523                     effective_value > ds->ds_quota)
3524                         return (ENOSPC);
3525         }
3526 
3527         return (0);
3528 }
3529 
3530 static void
3531 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3532 {
3533         dsl_dataset_t *ds = arg1;
3534         dsl_prop_setarg_t *psa = arg2;
3535         uint64_t effective_value = psa->psa_effective_value;
3536         uint64_t unique;
3537         int64_t delta;
3538 
3539         dsl_prop_set_sync(ds, psa, tx);
3540         DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3541 
3542         dmu_buf_will_dirty(ds->ds_dbuf, tx);
3543 
3544         mutex_enter(&ds->ds_dir->dd_lock);
3545         mutex_enter(&ds->ds_lock);
3546         ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
3547         unique = ds->ds_phys->ds_unique_bytes;
3548         delta = MAX(0, (int64_t)(effective_value - unique)) -
3549             MAX(0, (int64_t)(ds->ds_reserved - unique));
3550         ds->ds_reserved = effective_value;
3551         mutex_exit(&ds->ds_lock);
3552 
3553         dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
3554         mutex_exit(&ds->ds_dir->dd_lock);
3555 
3556         spa_history_log_internal_ds(ds, "set refreservation", tx,
3557             "refreservation=%lld", (longlong_t)effective_value);
3558 }
3559 
3560 int
3561 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
3562     uint64_t reservation)
3563 {
3564         dsl_dataset_t *ds;
3565         dsl_prop_setarg_t psa;
3566         int err;
3567 
3568         dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
3569             &reservation);
3570 
3571         err = dsl_dataset_hold(dsname, FTAG, &ds);
3572         if (err)
3573                 return (err);
3574 
3575         err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3576             dsl_dataset_set_reservation_check,
3577             dsl_dataset_set_reservation_sync, ds, &psa, 0);
3578 
3579         dsl_dataset_rele(ds, FTAG);
3580         return (err);
3581 }
3582 
/*
 * State handed to dsl_dataset_user_release_onexit().  Allocated and
 * filled in by dsl_register_onexit_hold_cleanup(), and freed by the
 * callback once it has run.
 */
typedef struct zfs_hold_cleanup_arg {
	dsl_pool_t *dp;		/* pool containing the held dataset */
	uint64_t dsobj;		/* object number of the held dataset */
	char htag[MAXNAMELEN];	/* private copy of the hold tag */
} zfs_hold_cleanup_arg_t;
3588 
3589 static void
3590 dsl_dataset_user_release_onexit(void *arg)
3591 {
3592         zfs_hold_cleanup_arg_t *ca = arg;
3593 
3594         (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag,
3595             B_TRUE);
3596         kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
3597 }
3598 
3599 void
3600 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
3601     minor_t minor)
3602 {
3603         zfs_hold_cleanup_arg_t *ca;
3604 
3605         ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP);
3606         ca->dp = ds->ds_dir->dd_pool;
3607         ca->dsobj = ds->ds_object;
3608         (void) strlcpy(ca->htag, htag, sizeof (ca->htag));
3609         VERIFY0(zfs_onexit_add_cb(minor,
3610             dsl_dataset_user_release_onexit, ca, NULL));
3611 }
3612 
3613 /*
3614  * If you add new checks here, you may need to add
3615  * additional checks to the "temporary" case in
3616  * snapshot_check() in dmu_objset.c.
3617  */
3618 static int
3619 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
3620 {
3621         dsl_dataset_t *ds = arg1;
3622         struct dsl_ds_holdarg *ha = arg2;
3623         const char *htag = ha->htag;
3624         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3625         int error = 0;
3626 
3627         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3628                 return (ENOTSUP);
3629 
3630         if (!dsl_dataset_is_snapshot(ds))
3631                 return (EINVAL);
3632 
3633         /* tags must be unique */
3634         mutex_enter(&ds->ds_lock);
3635         if (ds->ds_phys->ds_userrefs_obj) {
3636                 error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
3637                     8, 1, tx);
3638                 if (error == 0)
3639                         error = EEXIST;
3640                 else if (error == ENOENT)
3641                         error = 0;
3642         }
3643         mutex_exit(&ds->ds_lock);
3644 
3645         if (error == 0 && ha->temphold &&
3646             strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
3647                 error = E2BIG;
3648 
3649         return (error);
3650 }
3651 
3652 void
3653 dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3654 {
3655         dsl_dataset_t *ds = arg1;
3656         struct dsl_ds_holdarg *ha = arg2;
3657         const char *htag = ha->htag;
3658         dsl_pool_t *dp = ds->ds_dir->dd_pool;
3659         objset_t *mos = dp->dp_meta_objset;
3660         uint64_t now = gethrestime_sec();
3661         uint64_t zapobj;
3662 
3663         mutex_enter(&ds->ds_lock);
3664         if (ds->ds_phys->ds_userrefs_obj == 0) {
3665                 /*
3666                  * This is the first user hold for this dataset.  Create
3667                  * the userrefs zap object.
3668                  */
3669                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3670                 zapobj = ds->ds_phys->ds_userrefs_obj =
3671                     zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
3672         } else {
3673                 zapobj = ds->ds_phys->ds_userrefs_obj;
3674         }
3675         ds->ds_userrefs++;
3676         mutex_exit(&ds->ds_lock);
3677 
3678         VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
3679 
3680         if (ha->temphold) {
3681                 VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
3682                     htag, &now, tx));
3683         }
3684 
3685         spa_history_log_internal_ds(ds, "hold", tx,
3686             "tag = %s temp = %d holds now = %llu",
3687             htag, (int)ha->temphold, ds->ds_userrefs);
3688 }
3689 
3690 static int
3691 dsl_dataset_user_hold_one(const char *dsname, void *arg)
3692 {
3693         struct dsl_ds_holdarg *ha = arg;
3694         dsl_dataset_t *ds;
3695         int error;
3696         char *name;
3697 
3698         /* alloc a buffer to hold dsname@snapname plus terminating NULL */
3699         name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3700         error = dsl_dataset_hold(name, ha->dstg, &ds);
3701         strfree(name);
3702         if (error == 0) {
3703                 ha->gotone = B_TRUE;
3704                 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
3705                     dsl_dataset_user_hold_sync, ds, ha, 0);
3706         } else if (error == ENOENT && ha->recursive) {
3707                 error = 0;
3708         } else {
3709                 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3710         }
3711         return (error);
3712 }
3713 
3714 int
3715 dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
3716     boolean_t temphold)
3717 {
3718         struct dsl_ds_holdarg *ha;
3719         int error;
3720 
3721         ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3722         ha->htag = htag;
3723         ha->temphold = temphold;
3724         error = dsl_sync_task_do(ds->ds_dir->dd_pool,
3725             dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
3726             ds, ha, 0);
3727         kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3728 
3729         return (error);
3730 }
3731 
3732 int
3733 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
3734     boolean_t recursive, boolean_t temphold, int cleanup_fd)
3735 {
3736         struct dsl_ds_holdarg *ha;
3737         dsl_sync_task_t *dst;
3738         spa_t *spa;
3739         int error;
3740         minor_t minor = 0;
3741 
3742         if (cleanup_fd != -1) {
3743                 /* Currently we only support cleanup-on-exit of tempholds. */
3744                 if (!temphold)
3745                         return (EINVAL);
3746                 error = zfs_onexit_fd_hold(cleanup_fd, &minor);
3747                 if (error)
3748                         return (error);
3749         }
3750 
3751         ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3752 
3753         (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3754 
3755         error = spa_open(dsname, &spa, FTAG);
3756         if (error) {
3757                 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3758                 if (cleanup_fd != -1)
3759                         zfs_onexit_fd_rele(cleanup_fd);
3760                 return (error);
3761         }
3762 
3763         ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
3764         ha->htag = htag;
3765         ha->snapname = snapname;
3766         ha->recursive = recursive;
3767         ha->temphold = temphold;
3768 
3769         if (recursive) {
3770                 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
3771                     ha, DS_FIND_CHILDREN);
3772         } else {
3773                 error = dsl_dataset_user_hold_one(dsname, ha);
3774         }
3775         if (error == 0)
3776                 error = dsl_sync_task_group_wait(ha->dstg);
3777 
3778         for (dst = list_head(&ha->dstg->dstg_tasks); dst;
3779             dst = list_next(&ha->dstg->dstg_tasks, dst)) {
3780                 dsl_dataset_t *ds = dst->dst_arg1;
3781 
3782                 if (dst->dst_err) {
3783                         dsl_dataset_name(ds, ha->failed);
3784                         *strchr(ha->failed, '@') = '\0';
3785                 } else if (error == 0 && minor != 0 && temphold) {
3786                         /*
3787                          * If this hold is to be released upon process exit,
3788                          * register that action now.
3789                          */
3790                         dsl_register_onexit_hold_cleanup(ds, htag, minor);
3791                 }
3792                 dsl_dataset_rele(ds, ha->dstg);
3793         }
3794 
3795         if (error == 0 && recursive && !ha->gotone)
3796                 error = ENOENT;
3797 
3798         if (error)
3799                 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
3800 
3801         dsl_sync_task_group_destroy(ha->dstg);
3802 
3803         kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3804         spa_close(spa, FTAG);
3805         if (cleanup_fd != -1)
3806                 zfs_onexit_fd_rele(cleanup_fd);
3807         return (error);
3808 }
3809 
/*
 * Per-snapshot argument for the user-release sync task; passed as arg1
 * to dsl_dataset_user_release_check() and
 * dsl_dataset_user_release_sync().
 */
struct dsl_ds_releasearg {
	dsl_dataset_t *ds;	/* snapshot whose hold is being released */
	const char *htag;	/* tag of the hold to remove */
	boolean_t own;		/* do we own or just hold ds? */
};
3815 
3816 static int
3817 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
3818     boolean_t *might_destroy)
3819 {
3820         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3821         uint64_t zapobj;
3822         uint64_t tmp;
3823         int error;
3824 
3825         *might_destroy = B_FALSE;
3826 
3827         mutex_enter(&ds->ds_lock);
3828         zapobj = ds->ds_phys->ds_userrefs_obj;
3829         if (zapobj == 0) {
3830                 /* The tag can't possibly exist */
3831                 mutex_exit(&ds->ds_lock);
3832                 return (ESRCH);
3833         }
3834 
3835         /* Make sure the tag exists */
3836         error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
3837         if (error) {
3838                 mutex_exit(&ds->ds_lock);
3839                 if (error == ENOENT)
3840                         error = ESRCH;
3841                 return (error);
3842         }
3843 
3844         if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
3845             DS_IS_DEFER_DESTROY(ds))
3846                 *might_destroy = B_TRUE;
3847 
3848         mutex_exit(&ds->ds_lock);
3849         return (0);
3850 }
3851 
3852 static int
3853 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
3854 {
3855         struct dsl_ds_releasearg *ra = arg1;
3856         dsl_dataset_t *ds = ra->ds;
3857         boolean_t might_destroy;
3858         int error;
3859 
3860         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3861                 return (ENOTSUP);
3862 
3863         error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
3864         if (error)
3865                 return (error);
3866 
3867         if (might_destroy) {
3868                 struct dsl_ds_destroyarg dsda = {0};
3869 
3870                 if (dmu_tx_is_syncing(tx)) {
3871                         /*
3872                          * If we're not prepared to remove the snapshot,
3873                          * we can't allow the release to happen right now.
3874                          */
3875                         if (!ra->own)
3876                                 return (EBUSY);
3877                 }
3878                 dsda.ds = ds;
3879                 dsda.releasing = B_TRUE;
3880                 return (dsl_dataset_destroy_check(&dsda, tag, tx));
3881         }
3882 
3883         return (0);
3884 }
3885 
3886 static void
3887 dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
3888 {
3889         struct dsl_ds_releasearg *ra = arg1;
3890         dsl_dataset_t *ds = ra->ds;
3891         dsl_pool_t *dp = ds->ds_dir->dd_pool;
3892         objset_t *mos = dp->dp_meta_objset;
3893         uint64_t zapobj;
3894         uint64_t refs;
3895         int error;
3896 
3897         mutex_enter(&ds->ds_lock);
3898         ds->ds_userrefs--;
3899         refs = ds->ds_userrefs;
3900         mutex_exit(&ds->ds_lock);
3901         error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
3902         VERIFY(error == 0 || error == ENOENT);
3903         zapobj = ds->ds_phys->ds_userrefs_obj;
3904         VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
3905         if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
3906             DS_IS_DEFER_DESTROY(ds)) {
3907                 struct dsl_ds_destroyarg dsda = {0};
3908 
3909                 ASSERT(ra->own);
3910                 dsda.ds = ds;
3911                 dsda.releasing = B_TRUE;
3912                 /* We already did the destroy_check */
3913                 dsl_dataset_destroy_sync(&dsda, tag, tx);
3914         }
3915 
3916         spa_history_log_internal_ds(ds, "release", tx,
3917             "tag = %s refs now = %lld", ra->htag, (longlong_t)refs);
3918 }
3919 
3920 static int
3921 dsl_dataset_user_release_one(const char *dsname, void *arg)
3922 {
3923         struct dsl_ds_holdarg *ha = arg;
3924         struct dsl_ds_releasearg *ra;
3925         dsl_dataset_t *ds;
3926         int error;
3927         void *dtag = ha->dstg;
3928         char *name;
3929         boolean_t own = B_FALSE;
3930         boolean_t might_destroy;
3931 
3932         /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
3933         name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3934         error = dsl_dataset_hold(name, dtag, &ds);
3935         strfree(name);
3936         if (error == ENOENT && ha->recursive)
3937                 return (0);
3938         (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3939         if (error)
3940                 return (error);
3941 
3942         ha->gotone = B_TRUE;
3943 
3944         ASSERT(dsl_dataset_is_snapshot(ds));
3945 
3946         error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
3947         if (error) {
3948                 dsl_dataset_rele(ds, dtag);
3949                 return (error);
3950         }
3951 
3952         if (might_destroy) {
3953 #ifdef _KERNEL
3954                 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3955                 error = zfs_unmount_snap(name, NULL);
3956                 strfree(name);
3957                 if (error) {
3958                         dsl_dataset_rele(ds, dtag);
3959                         return (error);
3960                 }
3961 #endif
3962                 if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
3963                         dsl_dataset_rele(ds, dtag);
3964                         return (EBUSY);
3965                 } else {
3966                         own = B_TRUE;
3967                         dsl_dataset_make_exclusive(ds, dtag);
3968                 }
3969         }
3970 
3971         ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
3972         ra->ds = ds;
3973         ra->htag = ha->htag;
3974         ra->own = own;
3975         dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
3976             dsl_dataset_user_release_sync, ra, dtag, 0);
3977 
3978         return (0);
3979 }
3980 
3981 int
3982 dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
3983     boolean_t recursive)
3984 {
3985         struct dsl_ds_holdarg *ha;
3986         dsl_sync_task_t *dst;
3987         spa_t *spa;
3988         int error;
3989 
3990 top:
3991         ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3992 
3993         (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3994 
3995         error = spa_open(dsname, &spa, FTAG);
3996         if (error) {
3997                 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3998                 return (error);
3999         }
4000 
4001         ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
4002         ha->htag = htag;
4003         ha->snapname = snapname;
4004         ha->recursive = recursive;
4005         if (recursive) {
4006                 error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
4007                     ha, DS_FIND_CHILDREN);
4008         } else {
4009                 error = dsl_dataset_user_release_one(dsname, ha);
4010         }
4011         if (error == 0)
4012                 error = dsl_sync_task_group_wait(ha->dstg);
4013 
4014         for (dst = list_head(&ha->dstg->dstg_tasks); dst;
4015             dst = list_next(&ha->dstg->dstg_tasks, dst)) {
4016                 struct dsl_ds_releasearg *ra = dst->dst_arg1;
4017                 dsl_dataset_t *ds = ra->ds;
4018 
4019                 if (dst->dst_err)
4020                         dsl_dataset_name(ds, ha->failed);
4021 
4022                 if (ra->own)
4023                         dsl_dataset_disown(ds, ha->dstg);
4024                 else
4025                         dsl_dataset_rele(ds, ha->dstg);
4026 
4027                 kmem_free(ra, sizeof (struct dsl_ds_releasearg));
4028         }
4029 
4030         if (error == 0 && recursive && !ha->gotone)
4031                 error = ENOENT;
4032 
4033         if (error && error != EBUSY)
4034                 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
4035 
4036         dsl_sync_task_group_destroy(ha->dstg);
4037         kmem_free(ha, sizeof (struct dsl_ds_holdarg));
4038         spa_close(spa, FTAG);
4039 
4040         /*
4041          * We can get EBUSY if we were racing with deferred destroy and
4042          * dsl_dataset_user_release_check() hadn't done the necessary
4043          * open context setup.  We can also get EBUSY if we're racing
4044          * with destroy and that thread is the ds_owner.  Either way
4045          * the busy condition should be transient, and we should retry
4046          * the release operation.
4047          */
4048         if (error == EBUSY)
4049                 goto top;
4050 
4051         return (error);
4052 }
4053 
4054 /*
4055  * Called at spa_load time (with retry == B_FALSE) to release a stale
4056  * temporary user hold. Also called by the onexit code (with retry == B_TRUE).
4057  */
4058 int
4059 dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
4060     boolean_t retry)
4061 {
4062         dsl_dataset_t *ds;
4063         char *snap;
4064         char *name;
4065         int namelen;
4066         int error;
4067 
4068         do {
4069                 rw_enter(&dp->dp_config_rwlock, RW_READER);
4070                 error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
4071                 rw_exit(&dp->dp_config_rwlock);
4072                 if (error)
4073                         return (error);
4074                 namelen = dsl_dataset_namelen(ds)+1;
4075                 name = kmem_alloc(namelen, KM_SLEEP);
4076                 dsl_dataset_name(ds, name);
4077                 dsl_dataset_rele(ds, FTAG);
4078 
4079                 snap = strchr(name, '@');
4080                 *snap = '\0';
4081                 ++snap;
4082                 error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
4083                 kmem_free(name, namelen);
4084 
4085                 /*
4086                  * The object can't have been destroyed because we have a hold,
4087                  * but it might have been renamed, resulting in ENOENT.  Retry
4088                  * if we've been requested to do so.
4089                  *
4090                  * It would be nice if we could use the dsobj all the way
4091                  * through and avoid ENOENT entirely.  But we might need to
4092                  * unmount the snapshot, and there's currently no way to lookup
4093                  * a vfsp using a ZFS object id.
4094                  */
4095         } while ((error == ENOENT) && retry);
4096 
4097         return (error);
4098 }
4099 
4100 int
4101 dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
4102 {
4103         dsl_dataset_t *ds;
4104         int err;
4105 
4106         err = dsl_dataset_hold(dsname, FTAG, &ds);
4107         if (err)
4108                 return (err);
4109 
4110         VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
4111         if (ds->ds_phys->ds_userrefs_obj != 0) {
4112                 zap_attribute_t *za;
4113                 zap_cursor_t zc;
4114 
4115                 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
4116                 for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
4117                     ds->ds_phys->ds_userrefs_obj);
4118                     zap_cursor_retrieve(&zc, za) == 0;
4119                     zap_cursor_advance(&zc)) {
4120                         VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
4121                             za->za_first_integer));
4122                 }
4123                 zap_cursor_fini(&zc);
4124                 kmem_free(za, sizeof (zap_attribute_t));
4125         }
4126         dsl_dataset_rele(ds, FTAG);
4127         return (0);
4128 }
4129 
4130 /*
4131  * Note, this function is used as the callback for dmu_objset_find().  We
4132  * always return 0 so that we will continue to find and process
4133  * inconsistent datasets, even if we encounter an error trying to
4134  * process one of them.
4135  */
4136 /* ARGSUSED */
4137 int
4138 dsl_destroy_inconsistent(const char *dsname, void *arg)
4139 {
4140         dsl_dataset_t *ds;
4141 
4142         if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
4143                 if (DS_IS_INCONSISTENT(ds))
4144                         (void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
4145                 else
4146                         dsl_dataset_disown(ds, FTAG);
4147         }
4148         return (0);
4149 }
4150 
4151 /*
4152  * Return (in *usedp) the amount of space written in new that is not
4153  * present in oldsnap.  New may be a snapshot or the head.  Old must be
4154  * a snapshot before new, in new's filesystem (or its origin).  If not then
4155  * fail and return EINVAL.
4156  *
4157  * The written space is calculated by considering two components:  First, we
4158  * ignore any freed space, and calculate the written as new's used space
4159  * minus old's used space.  Next, we add in the amount of space that was freed
4160  * between the two snapshots, thus reducing new's used space relative to old's.
4161  * Specifically, this is the space that was born before old->ds_creation_txg,
4162  * and freed before new (ie. on new's deadlist or a previous deadlist).
4163  *
4164  * space freed                         [---------------------]
4165  * snapshots                       ---O-------O--------O-------O------
4166  *                                         oldsnap            new
4167  */
4168 int
4169 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
4170     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4171 {
4172         int err = 0;
4173         uint64_t snapobj;
4174         dsl_pool_t *dp = new->ds_dir->dd_pool;
4175 
4176         *usedp = 0;
4177         *usedp += new->ds_phys->ds_referenced_bytes;
4178         *usedp -= oldsnap->ds_phys->ds_referenced_bytes;
4179 
4180         *compp = 0;
4181         *compp += new->ds_phys->ds_compressed_bytes;
4182         *compp -= oldsnap->ds_phys->ds_compressed_bytes;
4183 
4184         *uncompp = 0;
4185         *uncompp += new->ds_phys->ds_uncompressed_bytes;
4186         *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
4187 
4188         rw_enter(&dp->dp_config_rwlock, RW_READER);
4189         snapobj = new->ds_object;
4190         while (snapobj != oldsnap->ds_object) {
4191                 dsl_dataset_t *snap;
4192                 uint64_t used, comp, uncomp;
4193 
4194                 if (snapobj == new->ds_object) {
4195                         snap = new;
4196                 } else {
4197                         err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
4198                         if (err != 0)
4199                                 break;
4200                 }
4201 
4202                 if (snap->ds_phys->ds_prev_snap_txg ==
4203                     oldsnap->ds_phys->ds_creation_txg) {
4204                         /*
4205                          * The blocks in the deadlist can not be born after
4206                          * ds_prev_snap_txg, so get the whole deadlist space,
4207                          * which is more efficient (especially for old-format
4208                          * deadlists).  Unfortunately the deadlist code
4209                          * doesn't have enough information to make this
4210                          * optimization itself.
4211                          */
4212                         dsl_deadlist_space(&snap->ds_deadlist,
4213                             &used, &comp, &uncomp);
4214                 } else {
4215                         dsl_deadlist_space_range(&snap->ds_deadlist,
4216                             0, oldsnap->ds_phys->ds_creation_txg,
4217                             &used, &comp, &uncomp);
4218                 }
4219                 *usedp += used;
4220                 *compp += comp;
4221                 *uncompp += uncomp;
4222 
4223                 /*
4224                  * If we get to the beginning of the chain of snapshots
4225                  * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
4226                  * was not a snapshot of/before new.
4227                  */
4228                 snapobj = snap->ds_phys->ds_prev_snap_obj;
4229                 if (snap != new)
4230                         dsl_dataset_rele(snap, FTAG);
4231                 if (snapobj == 0) {
4232                         err = EINVAL;
4233                         break;
4234                 }
4235 
4236         }
4237         rw_exit(&dp->dp_config_rwlock);
4238         return (err);
4239 }
4240 
4241 /*
4242  * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
4243  * lastsnap, and all snapshots in between are deleted.
4244  *
4245  * blocks that would be freed            [---------------------------]
4246  * snapshots                       ---O-------O--------O-------O--------O
4247  *                                        firstsnap        lastsnap
4248  *
4249  * This is the set of blocks that were born after the snap before firstsnap,
4250  * (birth > firstsnap->prev_snap_txg) and died before the snap after the
4251  * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
4252  * We calculate this by iterating over the relevant deadlists (from the snap
4253  * after lastsnap, backward to the snap after firstsnap), summing up the
4254  * space on the deadlist that was born after the snap before firstsnap.
4255  */
4256 int
4257 dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
4258     dsl_dataset_t *lastsnap,
4259     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4260 {
4261         int err = 0;
4262         uint64_t snapobj;
4263         dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
4264 
4265         ASSERT(dsl_dataset_is_snapshot(firstsnap));
4266         ASSERT(dsl_dataset_is_snapshot(lastsnap));
4267 
4268         /*
4269          * Check that the snapshots are in the same dsl_dir, and firstsnap
4270          * is before lastsnap.
4271          */
4272         if (firstsnap->ds_dir != lastsnap->ds_dir ||
4273             firstsnap->ds_phys->ds_creation_txg >
4274             lastsnap->ds_phys->ds_creation_txg)
4275                 return (EINVAL);
4276 
4277         *usedp = *compp = *uncompp = 0;
4278 
4279         rw_enter(&dp->dp_config_rwlock, RW_READER);
4280         snapobj = lastsnap->ds_phys->ds_next_snap_obj;
4281         while (snapobj != firstsnap->ds_object) {
4282                 dsl_dataset_t *ds;
4283                 uint64_t used, comp, uncomp;
4284 
4285                 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
4286                 if (err != 0)
4287                         break;
4288 
4289                 dsl_deadlist_space_range(&ds->ds_deadlist,
4290                     firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
4291                     &used, &comp, &uncomp);
4292                 *usedp += used;
4293                 *compp += comp;
4294                 *uncompp += uncomp;
4295 
4296                 snapobj = ds->ds_phys->ds_prev_snap_obj;
4297                 ASSERT3U(snapobj, !=, 0);
4298                 dsl_dataset_rele(ds, FTAG);
4299         }
4300         rw_exit(&dp->dp_config_rwlock);
4301         return (err);
4302 }