1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/dsl_userhold.h>
  29 #include <sys/dsl_dataset.h>
  30 #include <sys/dsl_synctask.h>
  31 #include <sys/dmu_tx.h>
  32 #include <sys/dsl_pool.h>
  33 #include <sys/dsl_dir.h>
  34 #include <sys/dmu_traverse.h>
  35 #include <sys/dsl_scan.h>
  36 #include <sys/dmu_objset.h>
  37 #include <sys/zap.h>
  38 #include <sys/zfeature.h>
  39 #include <sys/zfs_ioctl.h>
  40 #include <sys/dsl_deleg.h>
  41 
/*
 * Argument handed to dsl_destroy_snapshot_check() and
 * dsl_destroy_snapshot_sync() by dsl_destroy_snapshots_nvl().
 */
typedef struct dmu_snapshots_destroy_arg {
        /* Names of the snapshots the caller asked to destroy. */
        nvlist_t *dsda_snaps;
        /* Names that passed the check; the sync function destroys these. */
        nvlist_t *dsda_successful_snaps;
        /* Passed through as the "defer" argument of the impl routines. */
        boolean_t dsda_defer;
        /* Snapshot name -> int32 error for each snapshot that failed. */
        nvlist_t *dsda_errlist;
} dmu_snapshots_destroy_arg_t;
  48 
  49 /*
  50  * ds must be owned.
  51  */
  52 static int
  53 dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
  54 {
  55         if (!dsl_dataset_is_snapshot(ds))
  56                 return (SET_ERROR(EINVAL));
  57 
  58         if (dsl_dataset_long_held(ds))
  59                 return (SET_ERROR(EBUSY));
  60 
  61         /*
  62          * Only allow deferred destroy on pools that support it.
  63          * NOTE: deferred destroy is only supported on snapshots.
  64          */
  65         if (defer) {
  66                 if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
  67                     SPA_VERSION_USERREFS)
  68                         return (SET_ERROR(ENOTSUP));
  69                 return (0);
  70         }
  71 
  72         /*
  73          * If this snapshot has an elevated user reference count,
  74          * we can't destroy it yet.
  75          */
  76         if (ds->ds_userrefs > 0)
  77                 return (SET_ERROR(EBUSY));
  78 
  79         /*
  80          * Can't delete a branch point.
  81          */
  82         if (ds->ds_phys->ds_num_children > 1)
  83                 return (SET_ERROR(EEXIST));
  84 
  85         return (0);
  86 }
  87 
  88 static int
  89 dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
  90 {
  91         dmu_snapshots_destroy_arg_t *dsda = arg;
  92         dsl_pool_t *dp = dmu_tx_pool(tx);
  93         nvpair_t *pair;
  94         int error = 0;
  95 
  96         if (!dmu_tx_is_syncing(tx))
  97                 return (0);
  98 
  99         for (pair = nvlist_next_nvpair(dsda->dsda_snaps, NULL);
 100             pair != NULL; pair = nvlist_next_nvpair(dsda->dsda_snaps, pair)) {
 101                 dsl_dataset_t *ds;
 102 
 103                 error = dsl_dataset_hold(dp, nvpair_name(pair),
 104                     FTAG, &ds);
 105 
 106                 /*
 107                  * If the snapshot does not exist, silently ignore it
 108                  * (it's "already destroyed").
 109                  */
 110                 if (error == ENOENT)
 111                         continue;
 112 
 113                 if (error == 0) {
 114                         error = dsl_destroy_snapshot_check_impl(ds,
 115                             dsda->dsda_defer);
 116                         dsl_dataset_rele(ds, FTAG);
 117                 }
 118 
 119                 if (error == 0) {
 120                         fnvlist_add_boolean(dsda->dsda_successful_snaps,
 121                             nvpair_name(pair));
 122                 } else {
 123                         fnvlist_add_int32(dsda->dsda_errlist,
 124                             nvpair_name(pair), error);
 125                 }
 126         }
 127 
 128         pair = nvlist_next_nvpair(dsda->dsda_errlist, NULL);
 129         if (pair != NULL)
 130                 return (fnvpair_value_int32(pair));
 131 
 132         return (0);
 133 }
 134 
/*
 * State shared between process_old_deadlist() and its per-block callback
 * process_old_cb().
 */
struct process_old_arg {
        /* Dataset whose deadlist receives the blocks that are kept. */
        dsl_dataset_t *ds;
        /* Previous snapshot, or NULL; its unique bytes may be increased. */
        dsl_dataset_t *ds_prev;
        /* When set, ds_prev's unique bytes are left untouched. */
        boolean_t after_branch_point;
        /* Root zio under which block frees are issued. */
        zio_t *pio;
        /* Running totals for the blocks that were freed. */
        uint64_t used, comp, uncomp;
};
 142 
 143 static int
 144 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 145 {
 146         struct process_old_arg *poa = arg;
 147         dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
 148 
 149         if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
 150                 dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
 151                 if (poa->ds_prev && !poa->after_branch_point &&
 152                     bp->blk_birth >
 153                     poa->ds_prev->ds_phys->ds_prev_snap_txg) {
 154                         poa->ds_prev->ds_phys->ds_unique_bytes +=
 155                             bp_get_dsize_sync(dp->dp_spa, bp);
 156                 }
 157         } else {
 158                 poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
 159                 poa->comp += BP_GET_PSIZE(bp);
 160                 poa->uncomp += BP_GET_UCSIZE(bp);
 161                 dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
 162         }
 163         return (0);
 164 }
 165 
 166 static void
 167 process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
 168     dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
 169 {
 170         struct process_old_arg poa = { 0 };
 171         dsl_pool_t *dp = ds->ds_dir->dd_pool;
 172         objset_t *mos = dp->dp_meta_objset;
 173         uint64_t deadlist_obj;
 174 
 175         ASSERT(ds->ds_deadlist.dl_oldfmt);
 176         ASSERT(ds_next->ds_deadlist.dl_oldfmt);
 177 
 178         poa.ds = ds;
 179         poa.ds_prev = ds_prev;
 180         poa.after_branch_point = after_branch_point;
 181         poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 182         VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
 183             process_old_cb, &poa, tx));
 184         VERIFY0(zio_wait(poa.pio));
 185         ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
 186 
 187         /* change snapused */
 188         dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
 189             -poa.used, -poa.comp, -poa.uncomp, tx);
 190 
 191         /* swap next's deadlist to our deadlist */
 192         dsl_deadlist_close(&ds->ds_deadlist);
 193         dsl_deadlist_close(&ds_next->ds_deadlist);
 194         deadlist_obj = ds->ds_phys->ds_deadlist_obj;
 195         ds->ds_phys->ds_deadlist_obj = ds_next->ds_phys->ds_deadlist_obj;
 196         ds_next->ds_phys->ds_deadlist_obj = deadlist_obj;
 197         dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
 198         dsl_deadlist_open(&ds_next->ds_deadlist, mos,
 199             ds_next->ds_phys->ds_deadlist_obj);
 200 }
 201 
 202 static void
 203 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
 204 {
 205         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 206         zap_cursor_t zc;
 207         zap_attribute_t za;
 208 
 209         /*
 210          * If it is the old version, dd_clones doesn't exist so we can't
 211          * find the clones, but dsl_deadlist_remove_key() is a no-op so it
 212          * doesn't matter.
 213          */
 214         if (ds->ds_dir->dd_phys->dd_clones == 0)
 215                 return;
 216 
 217         for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
 218             zap_cursor_retrieve(&zc, &za) == 0;
 219             zap_cursor_advance(&zc)) {
 220                 dsl_dataset_t *clone;
 221 
 222                 VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
 223                     za.za_first_integer, FTAG, &clone));
 224                 if (clone->ds_dir->dd_origin_txg > mintxg) {
 225                         dsl_deadlist_remove_key(&clone->ds_deadlist,
 226                             mintxg, tx);
 227                         dsl_dataset_remove_clones_key(clone, mintxg, tx);
 228                 }
 229                 dsl_dataset_rele(clone, FTAG);
 230         }
 231         zap_cursor_fini(&zc);
 232 }
 233 
 234 void
 235 dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
 236 {
 237         int err;
 238         int after_branch_point = FALSE;
 239         dsl_pool_t *dp = ds->ds_dir->dd_pool;
 240         objset_t *mos = dp->dp_meta_objset;
 241         dsl_dataset_t *ds_prev = NULL;
 242         uint64_t obj;
 243 
 244         ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 245         ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
 246         ASSERT(refcount_is_zero(&ds->ds_longholds));
 247 
 248         if (defer &&
 249             (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)) {
 250                 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 251                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
 252                 ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
 253                 spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
 254                 return;
 255         }
 256 
 257         ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
 258 
 259         /* We need to log before removing it from the namespace. */
 260         spa_history_log_internal_ds(ds, "destroy", tx, "");
 261 
 262         dsl_scan_ds_destroyed(ds, tx);
 263 
 264         obj = ds->ds_object;
 265 
 266         if (ds->ds_phys->ds_prev_snap_obj != 0) {
 267                 ASSERT3P(ds->ds_prev, ==, NULL);
 268                 VERIFY0(dsl_dataset_hold_obj(dp,
 269                     ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
 270                 after_branch_point =
 271                     (ds_prev->ds_phys->ds_next_snap_obj != obj);
 272 
 273                 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
 274                 if (after_branch_point &&
 275                     ds_prev->ds_phys->ds_next_clones_obj != 0) {
 276                         dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
 277                         if (ds->ds_phys->ds_next_snap_obj != 0) {
 278                                 VERIFY0(zap_add_int(mos,
 279                                     ds_prev->ds_phys->ds_next_clones_obj,
 280                                     ds->ds_phys->ds_next_snap_obj, tx));
 281                         }
 282                 }
 283                 if (!after_branch_point) {
 284                         ds_prev->ds_phys->ds_next_snap_obj =
 285                             ds->ds_phys->ds_next_snap_obj;
 286                 }
 287         }
 288 
 289         dsl_dataset_t *ds_next;
 290         uint64_t old_unique;
 291         uint64_t used = 0, comp = 0, uncomp = 0;
 292 
 293         VERIFY0(dsl_dataset_hold_obj(dp,
 294             ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
 295         ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
 296 
 297         old_unique = ds_next->ds_phys->ds_unique_bytes;
 298 
 299         dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
 300         ds_next->ds_phys->ds_prev_snap_obj =
 301             ds->ds_phys->ds_prev_snap_obj;
 302         ds_next->ds_phys->ds_prev_snap_txg =
 303             ds->ds_phys->ds_prev_snap_txg;
 304         ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
 305             ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
 306 
 307         if (ds_next->ds_deadlist.dl_oldfmt) {
 308                 process_old_deadlist(ds, ds_prev, ds_next,
 309                     after_branch_point, tx);
 310         } else {
 311                 /* Adjust prev's unique space. */
 312                 if (ds_prev && !after_branch_point) {
 313                         dsl_deadlist_space_range(&ds_next->ds_deadlist,
 314                             ds_prev->ds_phys->ds_prev_snap_txg,
 315                             ds->ds_phys->ds_prev_snap_txg,
 316                             &used, &comp, &uncomp);
 317                         ds_prev->ds_phys->ds_unique_bytes += used;
 318                 }
 319 
 320                 /* Adjust snapused. */
 321                 dsl_deadlist_space_range(&ds_next->ds_deadlist,
 322                     ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
 323                     &used, &comp, &uncomp);
 324                 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
 325                     -used, -comp, -uncomp, tx);
 326 
 327                 /* Move blocks to be freed to pool's free list. */
 328                 dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
 329                     &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
 330                     tx);
 331                 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
 332                     DD_USED_HEAD, used, comp, uncomp, tx);
 333 
 334                 /* Merge our deadlist into next's and free it. */
 335                 dsl_deadlist_merge(&ds_next->ds_deadlist,
 336                     ds->ds_phys->ds_deadlist_obj, tx);
 337         }
 338         dsl_deadlist_close(&ds->ds_deadlist);
 339         dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
 340         dmu_buf_will_dirty(ds->ds_dbuf, tx);
 341         ds->ds_phys->ds_deadlist_obj = 0;
 342 
 343         /* Collapse range in clone heads */
 344         dsl_dataset_remove_clones_key(ds,
 345             ds->ds_phys->ds_creation_txg, tx);
 346 
 347         if (dsl_dataset_is_snapshot(ds_next)) {
 348                 dsl_dataset_t *ds_nextnext;
 349 
 350                 /*
 351                  * Update next's unique to include blocks which
 352                  * were previously shared by only this snapshot
 353                  * and it.  Those blocks will be born after the
 354                  * prev snap and before this snap, and will have
 355                  * died after the next snap and before the one
 356                  * after that (ie. be on the snap after next's
 357                  * deadlist).
 358                  */
 359                 VERIFY0(dsl_dataset_hold_obj(dp,
 360                     ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext));
 361                 dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
 362                     ds->ds_phys->ds_prev_snap_txg,
 363                     ds->ds_phys->ds_creation_txg,
 364                     &used, &comp, &uncomp);
 365                 ds_next->ds_phys->ds_unique_bytes += used;
 366                 dsl_dataset_rele(ds_nextnext, FTAG);
 367                 ASSERT3P(ds_next->ds_prev, ==, NULL);
 368 
 369                 /* Collapse range in this head. */
 370                 dsl_dataset_t *hds;
 371                 VERIFY0(dsl_dataset_hold_obj(dp,
 372                     ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds));
 373                 dsl_deadlist_remove_key(&hds->ds_deadlist,
 374                     ds->ds_phys->ds_creation_txg, tx);
 375                 dsl_dataset_rele(hds, FTAG);
 376 
 377         } else {
 378                 ASSERT3P(ds_next->ds_prev, ==, ds);
 379                 dsl_dataset_rele(ds_next->ds_prev, ds_next);
 380                 ds_next->ds_prev = NULL;
 381                 if (ds_prev) {
 382                         VERIFY0(dsl_dataset_hold_obj(dp,
 383                             ds->ds_phys->ds_prev_snap_obj,
 384                             ds_next, &ds_next->ds_prev));
 385                 }
 386 
 387                 dsl_dataset_recalc_head_uniq(ds_next);
 388 
 389                 /*
 390                  * Reduce the amount of our unconsumed refreservation
 391                  * being charged to our parent by the amount of
 392                  * new unique data we have gained.
 393                  */
 394                 if (old_unique < ds_next->ds_reserved) {
 395                         int64_t mrsdelta;
 396                         uint64_t new_unique =
 397                             ds_next->ds_phys->ds_unique_bytes;
 398 
 399                         ASSERT(old_unique <= new_unique);
 400                         mrsdelta = MIN(new_unique - old_unique,
 401                             ds_next->ds_reserved - old_unique);
 402                         dsl_dir_diduse_space(ds->ds_dir,
 403                             DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
 404                 }
 405         }
 406         dsl_dataset_rele(ds_next, FTAG);
 407 
 408         /*
 409          * This must be done after the dsl_traverse(), because it will
 410          * re-open the objset.
 411          */
 412         if (ds->ds_objset) {
 413                 dmu_objset_evict(ds->ds_objset);
 414                 ds->ds_objset = NULL;
 415         }
 416 
 417         /* remove from snapshot namespace */
 418         dsl_dataset_t *ds_head;
 419         ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
 420         VERIFY0(dsl_dataset_hold_obj(dp,
 421             ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
 422         VERIFY0(dsl_dataset_get_snapname(ds));
 423 #ifdef ZFS_DEBUG
 424         {
 425                 uint64_t val;
 426 
 427                 err = dsl_dataset_snap_lookup(ds_head,
 428                     ds->ds_snapname, &val);
 429                 ASSERT0(err);
 430                 ASSERT3U(val, ==, obj);
 431         }
 432 #endif
 433         VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx));
 434         dsl_dataset_rele(ds_head, FTAG);
 435 
 436         if (ds_prev != NULL)
 437                 dsl_dataset_rele(ds_prev, FTAG);
 438 
 439         spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
 440 
 441         if (ds->ds_phys->ds_next_clones_obj != 0) {
 442                 uint64_t count;
 443                 ASSERT0(zap_count(mos,
 444                     ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
 445                 VERIFY0(dmu_object_free(mos,
 446                     ds->ds_phys->ds_next_clones_obj, tx));
 447         }
 448         if (ds->ds_phys->ds_props_obj != 0)
 449                 VERIFY0(zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
 450         if (ds->ds_phys->ds_userrefs_obj != 0)
 451                 VERIFY0(zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
 452         dsl_dir_rele(ds->ds_dir, ds);
 453         ds->ds_dir = NULL;
 454         VERIFY0(dmu_object_free(mos, obj, tx));
 455 }
 456 
 457 static void
 458 dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
 459 {
 460         dmu_snapshots_destroy_arg_t *dsda = arg;
 461         dsl_pool_t *dp = dmu_tx_pool(tx);
 462         nvpair_t *pair;
 463 
 464         for (pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, NULL);
 465             pair != NULL;
 466             pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, pair)) {
 467                 dsl_dataset_t *ds;
 468 
 469                 VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
 470 
 471                 dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx);
 472                 dsl_dataset_rele(ds, FTAG);
 473         }
 474 }
 475 
 476 /*
 477  * The semantics of this function are described in the comment above
 478  * lzc_destroy_snaps().  To summarize:
 479  *
 480  * The snapshots must all be in the same pool.
 481  *
 482  * Snapshots that don't exist will be silently ignored (considered to be
 483  * "already deleted").
 484  *
 485  * On success, all snaps will be destroyed and this will return 0.
 486  * On failure, no snaps will be destroyed, the errlist will be filled in,
 487  * and this will return an errno.
 488  */
 489 int
 490 dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
 491     nvlist_t *errlist)
 492 {
 493         dmu_snapshots_destroy_arg_t dsda;
 494         int error;
 495         nvpair_t *pair;
 496 
 497         pair = nvlist_next_nvpair(snaps, NULL);
 498         if (pair == NULL)
 499                 return (0);
 500 
 501         dsda.dsda_snaps = snaps;
 502         dsda.dsda_successful_snaps = fnvlist_alloc();
 503         dsda.dsda_defer = defer;
 504         dsda.dsda_errlist = errlist;
 505 
 506         error = dsl_sync_task(nvpair_name(pair),
 507             dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync,
 508             &dsda, 0);
 509         fnvlist_free(dsda.dsda_successful_snaps);
 510 
 511         return (error);
 512 }
 513 
 514 int
 515 dsl_destroy_snapshot(const char *name, boolean_t defer)
 516 {
 517         int error;
 518         nvlist_t *nvl = fnvlist_alloc();
 519         nvlist_t *errlist = fnvlist_alloc();
 520 
 521         fnvlist_add_boolean(nvl, name);
 522         error = dsl_destroy_snapshots_nvl(nvl, defer, errlist);
 523         fnvlist_free(errlist);
 524         fnvlist_free(nvl);
 525         return (error);
 526 }
 527 
/*
 * Argument passed to kill_blkptr() by old_synchronous_dataset_destroy().
 */
struct killarg {
        /* Dataset whose blocks are being killed. */
        dsl_dataset_t *ds;
        /* Transaction in which the blocks are freed. */
        dmu_tx_t *tx;
};
 532 
 533 /* ARGSUSED */
 534 static int
 535 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 536     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 537 {
 538         struct killarg *ka = arg;
 539         dmu_tx_t *tx = ka->tx;
 540 
 541         if (bp == NULL)
 542                 return (0);
 543 
 544         if (zb->zb_level == ZB_ZIL_LEVEL) {
 545                 ASSERT(zilog != NULL);
 546                 /*
 547                  * It's a block in the intent log.  It has no
 548                  * accounting, so just free it.
 549                  */
 550                 dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
 551         } else {
 552                 ASSERT(zilog == NULL);
 553                 ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
 554                 (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
 555         }
 556 
 557         return (0);
 558 }
 559 
 560 static void
 561 old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
 562 {
 563         struct killarg ka;
 564 
 565         /*
 566          * Free everything that we point to (that's born after
 567          * the previous snapshot, if we are a clone)
 568          *
 569          * NB: this should be very quick, because we already
 570          * freed all the objects in open context.
 571          */
 572         ka.ds = ds;
 573         ka.tx = tx;
 574         VERIFY0(traverse_dataset(ds,
 575             ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
 576             kill_blkptr, &ka));
 577         ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
 578 }
 579 
/*
 * Argument passed to dsl_destroy_head_check() and dsl_destroy_head_sync().
 */
typedef struct dsl_destroy_head_arg {
        /* Name of the head dataset to destroy. */
        const char *ddha_name;
} dsl_destroy_head_arg_t;
 583 
 584 int
 585 dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
 586 {
 587         int error;
 588         uint64_t count;
 589         objset_t *mos;
 590 
 591         if (dsl_dataset_is_snapshot(ds))
 592                 return (SET_ERROR(EINVAL));
 593 
 594         if (refcount_count(&ds->ds_longholds) != expected_holds)
 595                 return (SET_ERROR(EBUSY));
 596 
 597         mos = ds->ds_dir->dd_pool->dp_meta_objset;
 598 
 599         /*
 600          * Can't delete a head dataset if there are snapshots of it.
 601          * (Except if the only snapshots are from the branch we cloned
 602          * from.)
 603          */
 604         if (ds->ds_prev != NULL &&
 605             ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
 606                 return (SET_ERROR(EBUSY));
 607 
 608         /*
 609          * Can't delete if there are children of this fs.
 610          */
 611         error = zap_count(mos,
 612             ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
 613         if (error != 0)
 614                 return (error);
 615         if (count != 0)
 616                 return (SET_ERROR(EEXIST));
 617 
 618         if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
 619             ds->ds_prev->ds_phys->ds_num_children == 2 &&
 620             ds->ds_prev->ds_userrefs == 0) {
 621                 /* We need to remove the origin snapshot as well. */
 622                 if (!refcount_is_zero(&ds->ds_prev->ds_longholds))
 623                         return (SET_ERROR(EBUSY));
 624         }
 625         return (0);
 626 }
 627 
 628 static int
 629 dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
 630 {
 631         dsl_destroy_head_arg_t *ddha = arg;
 632         dsl_pool_t *dp = dmu_tx_pool(tx);
 633         dsl_dataset_t *ds;
 634         int error;
 635 
 636         error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
 637         if (error != 0)
 638                 return (error);
 639 
 640         error = dsl_destroy_head_check_impl(ds, 0);
 641         dsl_dataset_rele(ds, FTAG);
 642         return (error);
 643 }
 644 
 645 static void
 646 dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
 647 {
 648         dsl_dir_t *dd;
 649         dsl_pool_t *dp = dmu_tx_pool(tx);
 650         objset_t *mos = dp->dp_meta_objset;
 651         dd_used_t t;
 652 
 653         ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));
 654 
 655         VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
 656 
 657         ASSERT0(dd->dd_phys->dd_head_dataset_obj);
 658 
 659         /*
 660          * Remove our reservation. The impl() routine avoids setting the
 661          * actual property, which would require the (already destroyed) ds.
 662          */
 663         dsl_dir_set_reservation_sync_impl(dd, 0, tx);
 664 
 665         ASSERT0(dd->dd_phys->dd_used_bytes);
 666         ASSERT0(dd->dd_phys->dd_reserved);
 667         for (t = 0; t < DD_USED_NUM; t++)
 668                 ASSERT0(dd->dd_phys->dd_used_breakdown[t]);
 669 
 670         VERIFY0(zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
 671         VERIFY0(zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
 672         VERIFY0(dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
 673         VERIFY0(zap_remove(mos,
 674             dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
 675 
 676         dsl_dir_rele(dd, FTAG);
 677         VERIFY0(dmu_object_free(mos, ddobj, tx));
 678 }
 679 
/*
 * Destroy the head dataset ds, together with its dsl_dir, as part of
 * transaction tx.
 *
 * The dataset's blocks are either freed synchronously (when the
 * async_destroy feature is not enabled) or handed to the pool's bptree.
 * If ds is a clone whose origin snapshot is marked for deferred destroy,
 * has exactly two children and no user references, the origin snapshot
 * is destroyed as well once ds is gone.
 *
 * Asserted preconditions: ds has at most one child, ds has no snapshots
 * of its own, ds's block pointer was not born after tx's txg and the pool
 * config lock is write-held.
 */
void
dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
{
        dsl_pool_t *dp = dmu_tx_pool(tx);
        objset_t *mos = dp->dp_meta_objset;
        uint64_t obj, ddobj, prevobj = 0;
        boolean_t rmorigin;

        ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
        ASSERT(ds->ds_prev == NULL ||
            ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
        ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
        ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));

        /* We need to log before removing it from the namespace. */
        spa_history_log_internal_ds(ds, "destroy", tx, "");

        /*
         * Decide now whether the origin has to go too: the child count
         * tested here is decremented further down.
         */
        rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
            DS_IS_DEFER_DESTROY(ds->ds_prev) &&
            ds->ds_prev->ds_phys->ds_num_children == 2 &&
            ds->ds_prev->ds_userrefs == 0);

        /* Remove our reservation */
        if (ds->ds_reserved != 0) {
                dsl_dataset_set_refreservation_sync_impl(ds,
                    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
                    0, tx);
                ASSERT0(ds->ds_reserved);
        }

        dsl_scan_ds_destroyed(ds, tx);

        /* Saved because ds->ds_dir is released before the object is freed. */
        obj = ds->ds_object;

        if (ds->ds_phys->ds_prev_snap_obj != 0) {
                /* This is a clone */
                ASSERT(ds->ds_prev != NULL);
                ASSERT3U(ds->ds_prev->ds_phys->ds_next_snap_obj, !=, obj);
                ASSERT0(ds->ds_phys->ds_next_snap_obj);

                /* Drop ourselves from the origin's bookkeeping. */
                dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
                if (ds->ds_prev->ds_phys->ds_next_clones_obj != 0) {
                        dsl_dataset_remove_from_next_clones(ds->ds_prev,
                            obj, tx);
                }

                ASSERT3U(ds->ds_prev->ds_phys->ds_num_children, >, 1);
                ds->ds_prev->ds_phys->ds_num_children--;
        }

        zfeature_info_t *async_destroy =
            &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
        objset_t *os;

        /*
         * Destroy the deadlist.  Unless it's a clone, the
         * deadlist should be empty.  (If it's a clone, it's
         * safe to ignore the deadlist contents.)
         */
        dsl_deadlist_close(&ds->ds_deadlist);
        dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
        ds->ds_phys->ds_deadlist_obj = 0;

        VERIFY0(dmu_objset_from_ds(ds, &os));

        if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
                /* No async destroy: free every block right here. */
                old_synchronous_dataset_destroy(ds, tx);
        } else {
                /*
                 * Move the bptree into the pool's list of trees to
                 * clean up and update space accounting information.
                 */
                uint64_t used, comp, uncomp;

                zil_destroy_sync(dmu_objset_zil(os), tx);

                /*
                 * First use of the feature: activate it and create the
                 * pool-wide bptree object.
                 */
                if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
                        dsl_scan_t *scn = dp->dp_scan;

                        spa_feature_incr(dp->dp_spa, async_destroy, tx);
                        dp->dp_bptree_obj = bptree_alloc(mos, tx);
                        VERIFY0(zap_add(mos,
                            DMU_POOL_DIRECTORY_OBJECT,
                            DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
                            &dp->dp_bptree_obj, tx));
                        ASSERT(!scn->scn_async_destroying);
                        scn->scn_async_destroying = B_TRUE;
                }

                used = ds->ds_dir->dd_phys->dd_used_bytes;
                comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
                uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;

                ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
                    ds->ds_phys->ds_unique_bytes == used);

                /* Transfer the space from this dir to the free dir. */
                bptree_add(mos, dp->dp_bptree_obj,
                    &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
                    used, comp, uncomp, tx);
                dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
                    -used, -comp, -uncomp, tx);
                dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
                    used, comp, uncomp, tx);
        }

        if (ds->ds_prev != NULL) {
                if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
                        VERIFY0(zap_remove_int(mos,
                            ds->ds_prev->ds_dir->dd_phys->dd_clones,
                            ds->ds_object, tx));
                }
                /* Remember the origin's object number for rmorigin below. */
                prevobj = ds->ds_prev->ds_object;
                dsl_dataset_rele(ds->ds_prev, ds);
                ds->ds_prev = NULL;
        }

        /*
         * This must be done after the dsl_traverse(), because it will
         * re-open the objset.
         */
        if (ds->ds_objset) {
                dmu_objset_evict(ds->ds_objset);
                ds->ds_objset = NULL;
        }

        /* Erase the link in the dir */
        dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
        ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
        ddobj = ds->ds_dir->dd_object;
        ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
        VERIFY0(zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx));

        spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);

        ASSERT0(ds->ds_phys->ds_next_clones_obj);
        ASSERT0(ds->ds_phys->ds_props_obj);
        ASSERT0(ds->ds_phys->ds_userrefs_obj);
        dsl_dir_rele(ds->ds_dir, ds);
        ds->ds_dir = NULL;
        VERIFY0(dmu_object_free(mos, obj, tx));

        /* With the dataset gone, destroy the dir that contained it. */
        dsl_dir_destroy_sync(ddobj, tx);

        if (rmorigin) {
                /* Carry out the destroy that was deferred on the origin. */
                dsl_dataset_t *prev;
                VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
                dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
                dsl_dataset_rele(prev, FTAG);
        }
}
 831 
 832 static void
 833 dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
 834 {
 835         dsl_destroy_head_arg_t *ddha = arg;
 836         dsl_pool_t *dp = dmu_tx_pool(tx);
 837         dsl_dataset_t *ds;
 838 
 839         VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
 840         dsl_destroy_head_sync_impl(ds, tx);
 841         dsl_dataset_rele(ds, FTAG);
 842 }
 843 
 844 static void
 845 dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
 846 {
 847         dsl_destroy_head_arg_t *ddha = arg;
 848         dsl_pool_t *dp = dmu_tx_pool(tx);
 849         dsl_dataset_t *ds;
 850 
 851         VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
 852 
 853         /* Mark it as inconsistent on-disk, in case we crash */
 854         dmu_buf_will_dirty(ds->ds_dbuf, tx);
 855         ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
 856 
 857         spa_history_log_internal_ds(ds, "destroy begin", tx, "");
 858         dsl_dataset_rele(ds, FTAG);
 859 }
 860 
 861 int
 862 dsl_destroy_head(const char *name)
 863 {
 864         dsl_destroy_head_arg_t ddha;
 865         int error;
 866         spa_t *spa;
 867         boolean_t isenabled;
 868 
 869 #ifdef _KERNEL
 870         zfs_destroy_unmount_origin(name);
 871 #endif
 872 
 873         error = spa_open(name, &spa, FTAG);
 874         if (error != 0)
 875                 return (error);
 876         isenabled = spa_feature_is_enabled(spa,
 877             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
 878         spa_close(spa, FTAG);
 879 
 880         ddha.ddha_name = name;
 881 
 882         if (!isenabled) {
 883                 objset_t *os;
 884 
 885                 error = dsl_sync_task(name, dsl_destroy_head_check,
 886                     dsl_destroy_head_begin_sync, &ddha, 0);
 887                 if (error != 0)
 888                         return (error);
 889 
 890                 /*
 891                  * Head deletion is processed in one txg on old pools;
 892                  * remove the objects from open context so that the txg sync
 893                  * is not too long.
 894                  */
 895                 error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
 896                 if (error == 0) {
 897                         uint64_t prev_snap_txg =
 898                             dmu_objset_ds(os)->ds_phys->ds_prev_snap_txg;
 899                         for (uint64_t obj = 0; error == 0;
 900                             error = dmu_object_next(os, &obj, FALSE,
 901                             prev_snap_txg))
 902                                 (void) dmu_free_object(os, obj);
 903                         /* sync out all frees */
 904                         txg_wait_synced(dmu_objset_pool(os), 0);
 905                         dmu_objset_disown(os, FTAG);
 906                 }
 907         }
 908 
 909         return (dsl_sync_task(name, dsl_destroy_head_check,
 910             dsl_destroy_head_sync, &ddha, 0));
 911 }
 912 
 913 /*
 914  * Note, this function is used as the callback for dmu_objset_find().  We
 915  * always return 0 so that we will continue to find and process
 916  * inconsistent datasets, even if we encounter an error trying to
 917  * process one of them.
 918  */
 919 /* ARGSUSED */
 920 int
 921 dsl_destroy_inconsistent(const char *dsname, void *arg)
 922 {
 923         objset_t *os;
 924 
 925         if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
 926                 boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os));
 927                 dmu_objset_rele(os, FTAG);
 928                 if (inconsistent)
 929                         (void) dsl_destroy_head(dsname);
 930         }
 931         return (0);
 932 }