/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/unique.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_onexit.h>
#include <sys/zvol.h>
#include <sys/dsl_scan.h>
#include <sys/dsl_deadlist.h>
#include "zfs_prop.h"

static char *dsl_reaper = "the grim reaper";

static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;

#define	SWITCH64(x, y) \
{ \
	uint64_t __tmp = (x); \
	(x) = (y); \
	(y) = __tmp; \
}

#define	DS_REF_MAX	(1ULL << 62)

#define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE

#define	DSL_DATASET_IS_DESTROYED(ds)	((ds)->ds_owner == dsl_reaper)


/*
 * Figure out how much of this delta should be propagated to the
 * dsl_dir layer.  If there's a refreservation, that space has already
 * been partially accounted for in our ancestors.
 */
static int64_t
parent_delta(dsl_dataset_t *ds, int64_t delta)
{
	uint64_t old_bytes, new_bytes;

	if (ds->ds_reserved == 0)
		return (delta);

	old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
	new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);

	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
	return (new_bytes - old_bytes);
}
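
/*
 * parent_delta() example: with ds_reserved = 15M and ds_unique_bytes = 10M,
 * a delta of +2M yields MAX(12M, 15M) - MAX(10M, 15M) = 0, because the
 * growth is still covered by the refreservation that our ancestors have
 * already been charged for.  Only once unique space exceeds the
 * reservation does the full delta flow through to the dsl_dir layer.
 */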

void
dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);
	int64_t delta;

	dprintf_bp(bp, "ds=%p", ds);

	ASSERT(dmu_tx_is_syncing(tx));
	/* It could have been compressed away to nothing */
	if (BP_IS_HOLE(bp))
		return;
	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
	if (ds == NULL) {
		dsl_pool_mos_diduse_space(tx->tx_pool,
		    used, compressed, uncompressed);
		return;
	}
	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	mutex_enter(&ds->ds_dir->dd_lock);
	mutex_enter(&ds->ds_lock);
	delta = parent_delta(ds, used);
	ds->ds_phys->ds_referenced_bytes += used;
	ds->ds_phys->ds_compressed_bytes += compressed;
	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
	ds->ds_phys->ds_unique_bytes += used;
	mutex_exit(&ds->ds_lock);
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
	    compressed, uncompressed, tx);
	dsl_dir_transfer_space(ds->ds_dir, used - delta,
	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
	mutex_exit(&ds->ds_dir->dd_lock);
}

int
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
    boolean_t async)
{
	if (BP_IS_HOLE(bp))
		return (0);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(bp->blk_birth <= tx->tx_txg);

	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);

	ASSERT(used > 0);
	if (ds == NULL) {
		dsl_free(tx->tx_pool, tx->tx_txg, bp);
		dsl_pool_mos_diduse_space(tx->tx_pool,
		    -used, -compressed, -uncompressed);
		return (used);
	}
	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

	ASSERT(!dsl_dataset_is_snapshot(ds));
	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
		int64_t delta;

		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
		dsl_free(tx->tx_pool, tx->tx_txg, bp);

		mutex_enter(&ds->ds_dir->dd_lock);
		mutex_enter(&ds->ds_lock);
		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
		    !DS_UNIQUE_IS_ACCURATE(ds));
		delta = parent_delta(ds, -used);
		ds->ds_phys->ds_unique_bytes -= used;
		mutex_exit(&ds->ds_lock);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
		    delta, -compressed, -uncompressed, tx);
		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
		mutex_exit(&ds->ds_dir->dd_lock);
	} else {
		dprintf_bp(bp, "putting on dead list: %s", "");
		if (async) {
			/*
			 * We are here as part of zio's write done callback,
			 * which means we're a zio interrupt thread.  We can't
			 * call dsl_deadlist_insert() now because it may block
			 * waiting for I/O.  Instead, put bp on the deferred
			 * queue and let dsl_pool_sync() finish the job.
			 */
			bplist_append(&ds->ds_pending_deadlist, bp);
		} else {
			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
		}
		ASSERT3U(ds->ds_prev->ds_object, ==,
		    ds->ds_phys->ds_prev_snap_obj);
		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
		    ds->ds_object && bp->blk_birth >
		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			mutex_enter(&ds->ds_prev->ds_lock);
			ds->ds_prev->ds_phys->ds_unique_bytes += used;
			mutex_exit(&ds->ds_prev->ds_lock);
		}
		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
			dsl_dir_transfer_space(ds->ds_dir, used,
			    DD_USED_HEAD, DD_USED_SNAP, tx);
		}
	}
	mutex_enter(&ds->ds_lock);
	ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
	ds->ds_phys->ds_referenced_bytes -= used;
	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
	ds->ds_phys->ds_compressed_bytes -= compressed;
	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
	mutex_exit(&ds->ds_lock);

	return (used);
}
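
/*
 * Note that dsl_dataset_block_born() and dsl_dataset_block_kill() are
 * mirror images with respect to the on-disk accounting: a born block
 * increments referenced/compressed/uncompressed/unique, while killing a
 * block either reverses those increments (if the block was born after
 * the most recent snapshot and can be freed outright) or moves it onto
 * the dataset's deadlist for the snapshot machinery to reclaim later.
 */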

uint64_t
dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
{
	uint64_t trysnap = 0;

	if (ds == NULL)
		return (0);
	/*
	 * The snapshot creation could fail, but that would cause an
	 * incorrect FALSE return, which would only result in an
	 * overestimation of the amount of space that an operation would
	 * consume, which is OK.
	 *
	 * There's also a small window where we could miss a pending
	 * snapshot, because we could set the sync task in the quiescing
	 * phase.  So this should only be used as a guess.
	 */
	if (ds->ds_trysnap_txg >
	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
		trysnap = ds->ds_trysnap_txg;
	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
}

boolean_t
dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
    uint64_t blk_birth)
{
	if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
		return (B_FALSE);

	ddt_prefetch(dsl_dataset_get_spa(ds), bp);

	return (B_TRUE);
}
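
/*
 * The ddt_prefetch() above starts reading any dedup-table entries for a
 * block that we now know will be freed, so that the eventual free in
 * syncing context is less likely to stall on DDT I/O.
 */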

/* ARGSUSED */
static void
dsl_dataset_evict(dmu_buf_t *db, void *dsv)
{
	dsl_dataset_t *ds = dsv;

	ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));

	unique_remove(ds->ds_fsid_guid);

	if (ds->ds_objset != NULL)
		dmu_objset_evict(ds->ds_objset);

	if (ds->ds_prev) {
		dsl_dataset_drop_ref(ds->ds_prev, ds);
		ds->ds_prev = NULL;
	}

	bplist_destroy(&ds->ds_pending_deadlist);
	if (db != NULL) {
		dsl_deadlist_close(&ds->ds_deadlist);
	} else {
		ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
		ASSERT(!ds->ds_deadlist.dl_oldfmt);
	}
	if (ds->ds_dir)
		dsl_dir_close(ds->ds_dir, ds);

	ASSERT(!list_link_active(&ds->ds_synced_link));

	mutex_destroy(&ds->ds_lock);
	mutex_destroy(&ds->ds_recvlock);
	mutex_destroy(&ds->ds_opening_lock);
	rw_destroy(&ds->ds_rwlock);
	cv_destroy(&ds->ds_exclusive_cv);

	kmem_free(ds, sizeof (dsl_dataset_t));
}

static int
dsl_dataset_get_snapname(dsl_dataset_t *ds)
{
	dsl_dataset_phys_t *headphys;
	int err;
	dmu_buf_t *headdbuf;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;

	if (ds->ds_snapname[0])
		return (0);
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (0);

	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
	    FTAG, &headdbuf);
	if (err)
		return (err);
	headphys = headdbuf->db_data;
	err = zap_value_search(dp->dp_meta_objset,
	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
	dmu_buf_rele(headdbuf, FTAG);
	return (err);
}

static int
dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
	    value, mt, NULL, 0, NULL);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_lookup(mos, snapobj, name, 8, 1, value);
	return (err);
}

static int
dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	dsl_dir_snap_cmtime_update(ds->ds_dir);

	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_remove_norm(mos, snapobj, name, mt, tx);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_remove(mos, snapobj, name, tx);

	if (err == 0)
		dsl_snapcount_adjust(ds->ds_dir, tx, -1, B_TRUE);

	return (err);
}
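
/*
 * Both of the ZAP operations above follow the same pattern: on a
 * case-insensitive dataset (DS_FLAG_CI_DATASET) we try a normalized
 * MT_FIRST match first, and if the ZAP turns out not to support
 * normalized lookups (ENOTSUP) we fall back to a plain exact match.
 */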

static int
dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	objset_t *mos = dp->dp_meta_objset;
	dmu_buf_t *dbuf;
	dsl_dataset_t *ds;
	int err;
	dmu_object_info_t doi;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));

	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
	if (err)
		return (err);

	/* Make sure dsobj has the correct object type. */
	dmu_object_info_from_db(dbuf, &doi);
	if (doi.doi_type != DMU_OT_DSL_DATASET)
		return (EINVAL);

	ds = dmu_buf_get_user(dbuf);
	if (ds == NULL) {
		dsl_dataset_t *winner;

		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
		ds->ds_dbuf = dbuf;
		ds->ds_object = dsobj;
		ds->ds_phys = dbuf->db_data;

		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);

		rw_init(&ds->ds_rwlock, 0, 0, 0);
		cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);

		bplist_create(&ds->ds_pending_deadlist);
		dsl_deadlist_open(&ds->ds_deadlist,
		    mos, ds->ds_phys->ds_deadlist_obj);

		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
		    offsetof(dmu_sendarg_t, dsa_link));

		if (err == 0) {
			err = dsl_dir_open_obj(dp,
			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
		}
		if (err) {
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_recvlock);
			mutex_destroy(&ds->ds_opening_lock);
			rw_destroy(&ds->ds_rwlock);
			cv_destroy(&ds->ds_exclusive_cv);
			bplist_destroy(&ds->ds_pending_deadlist);
			dsl_deadlist_close(&ds->ds_deadlist);
			kmem_free(ds, sizeof (dsl_dataset_t));
			dmu_buf_rele(dbuf, tag);
			return (err);
		}

		if (!dsl_dataset_is_snapshot(ds)) {
			ds->ds_snapname[0] = '\0';
			if (ds->ds_phys->ds_prev_snap_obj) {
				err = dsl_dataset_get_ref(dp,
				    ds->ds_phys->ds_prev_snap_obj,
				    ds, &ds->ds_prev);
			}
		} else {
			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
				err = dsl_dataset_get_snapname(ds);
			if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
				err = zap_count(
				    ds->ds_dir->dd_pool->dp_meta_objset,
				    ds->ds_phys->ds_userrefs_obj,
				    &ds->ds_userrefs);
			}
		}

		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
			/*
			 * In sync context, we're called with either no lock
			 * or with the write lock.  If we're not syncing,
			 * we're always called with the read lock held.
			 */
			boolean_t need_lock =
			    !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
			    dsl_pool_sync_context(dp);

			if (need_lock)
				rw_enter(&dp->dp_config_rwlock, RW_READER);

			err = dsl_prop_get_ds(ds,
			    "refreservation", sizeof (uint64_t), 1,
			    &ds->ds_reserved, NULL);
			if (err == 0) {
				err = dsl_prop_get_ds(ds,
				    "refquota", sizeof (uint64_t), 1,
				    &ds->ds_quota, NULL);
			}

			if (need_lock)
				rw_exit(&dp->dp_config_rwlock);
		} else {
			ds->ds_reserved = ds->ds_quota = 0;
		}

		if (err == 0) {
			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
			    dsl_dataset_evict);
		}
		if (err || winner) {
			bplist_destroy(&ds->ds_pending_deadlist);
			dsl_deadlist_close(&ds->ds_deadlist);
			if (ds->ds_prev)
				dsl_dataset_drop_ref(ds->ds_prev, ds);
			dsl_dir_close(ds->ds_dir, ds);
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_recvlock);
			mutex_destroy(&ds->ds_opening_lock);
			rw_destroy(&ds->ds_rwlock);
			cv_destroy(&ds->ds_exclusive_cv);
			kmem_free(ds, sizeof (dsl_dataset_t));
			if (err) {
				dmu_buf_rele(dbuf, tag);
				return (err);
			}
			ds = winner;
		} else {
			ds->ds_fsid_guid =
			    unique_insert(ds->ds_phys->ds_fsid_guid);
		}
	}
	ASSERT3P(ds->ds_dbuf, ==, dbuf);
	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
	mutex_enter(&ds->ds_lock);
	if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
		mutex_exit(&ds->ds_lock);
		dmu_buf_rele(ds->ds_dbuf, tag);
		return (ENOENT);
	}
	mutex_exit(&ds->ds_lock);
	*dsp = ds;
	return (0);
}

static int
dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/*
	 * In syncing context we don't want the rwlock lock: there
	 * may be an existing writer waiting for sync phase to
	 * finish.  We don't need to worry about such writers, since
	 * sync phase is single-threaded, so the writer can't be
	 * doing anything while we are active.
	 */
	if (dsl_pool_sync_context(dp)) {
		ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
		return (0);
	}

	/*
	 * Normal users will hold the ds_rwlock as a READER until they
	 * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
	 * drop their READER lock after they set the ds_owner field.
	 *
	 * If the dataset is being destroyed, the destroy thread will
	 * obtain a WRITER lock for exclusive access after it's done its
	 * open-context work and then change the ds_owner to
	 * dsl_reaper once destruction is assured.  So threads
	 * may block here temporarily, until the "destructability" of
	 * the dataset is determined.
	 */
	ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
	mutex_enter(&ds->ds_lock);
	while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
		rw_exit(&dp->dp_config_rwlock);
		cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
		if (DSL_DATASET_IS_DESTROYED(ds)) {
			mutex_exit(&ds->ds_lock);
			dsl_dataset_drop_ref(ds, tag);
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			return (ENOENT);
		}
		/*
		 * The dp_config_rwlock lives above the ds_lock.  And
		 * we need to check DSL_DATASET_IS_DESTROYED() while
		 * holding the ds_lock, so we have to drop and reacquire
		 * the ds_lock here.
		 */
		mutex_exit(&ds->ds_lock);
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		mutex_enter(&ds->ds_lock);
	}
	mutex_exit(&ds->ds_lock);
	return (0);
}

int
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);

	if (err)
		return (err);
	return (dsl_dataset_hold_ref(*dsp, tag));
}

int
dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
    void *tag, dsl_dataset_t **dsp)
{
	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
	if (err)
		return (err);
	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
		dsl_dataset_rele(*dsp, tag);
		*dsp = NULL;
		return (EBUSY);
	}
	return (0);
}

int
dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	const char *snapname;
	uint64_t obj;
	int err = 0;

	err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
	if (err)
		return (err);

	dp = dd->dd_pool;
	obj = dd->dd_phys->dd_head_dataset_obj;
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	if (obj)
		err = dsl_dataset_get_ref(dp, obj, tag, dsp);
	else
		err = ENOENT;
	if (err)
		goto out;

	err = dsl_dataset_hold_ref(*dsp, tag);

	/* we may be looking for a snapshot */
	if (err == 0 && snapname != NULL) {
		dsl_dataset_t *ds = NULL;

		if (*snapname++ != '@') {
			dsl_dataset_rele(*dsp, tag);
			err = ENOENT;
			goto out;
		}

		dprintf("looking for snapshot '%s'\n", snapname);
		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
		if (err == 0)
			err = dsl_dataset_get_ref(dp, obj, tag, &ds);
		dsl_dataset_rele(*dsp, tag);

		ASSERT3U((err == 0), ==, (ds != NULL));

		if (ds) {
			mutex_enter(&ds->ds_lock);
			if (ds->ds_snapname[0] == 0)
				(void) strlcpy(ds->ds_snapname, snapname,
				    sizeof (ds->ds_snapname));
			mutex_exit(&ds->ds_lock);
			err = dsl_dataset_hold_ref(ds, tag);
			*dsp = err ? NULL : ds;
		}
	}
out:
	rw_exit(&dp->dp_config_rwlock);
	dsl_dir_close(dd, FTAG);
	return (err);
}

int
dsl_dataset_own(const char *name, boolean_t inconsistentok,
    void *tag, dsl_dataset_t **dsp)
{
	int err = dsl_dataset_hold(name, tag, dsp);
	if (err)
		return (err);
	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
		dsl_dataset_rele(*dsp, tag);
		return (EBUSY);
	}
	return (0);
}
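
/*
 * Illustrative usage (a sketch, not code from this file): short-lived
 * consumers pair dsl_dataset_hold() with dsl_dataset_rele(), e.g.
 *
 *	dsl_dataset_t *ds;
 *	if (dsl_dataset_hold("pool/fs@snap", FTAG, &ds) == 0) {
 *		... inspect ds under the hold ...
 *		dsl_dataset_rele(ds, FTAG);
 *	}
 *
 * Consumers that need the dataset exclusively (e.g. destroy or receive)
 * use dsl_dataset_own()/dsl_dataset_disown() instead, which additionally
 * set and clear ds_owner via dsl_dataset_tryown().
 */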

void
dsl_dataset_name(dsl_dataset_t *ds, char *name)
{
	if (ds == NULL) {
		(void) strcpy(name, "mos");
	} else {
		dsl_dir_name(ds->ds_dir, name);
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		if (ds->ds_snapname[0]) {
			(void) strcat(name, "@");
			/*
			 * We use a "recursive" mutex so that we
			 * can call dprintf_ds() with ds_lock held.
			 */
			if (!MUTEX_HELD(&ds->ds_lock)) {
				mutex_enter(&ds->ds_lock);
				(void) strcat(name, ds->ds_snapname);
				mutex_exit(&ds->ds_lock);
			} else {
				(void) strcat(name, ds->ds_snapname);
			}
		}
	}
}

static int
dsl_dataset_namelen(dsl_dataset_t *ds)
{
	int result;

	if (ds == NULL) {
		result = 3;	/* "mos" */
	} else {
		result = dsl_dir_namelen(ds->ds_dir);
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		if (ds->ds_snapname[0]) {
			++result;	/* adding one for the @-sign */
			if (!MUTEX_HELD(&ds->ds_lock)) {
				mutex_enter(&ds->ds_lock);
				result += strlen(ds->ds_snapname);
				mutex_exit(&ds->ds_lock);
			} else {
				result += strlen(ds->ds_snapname);
			}
		}
	}

	return (result);
}

void
dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
{
	dmu_buf_rele(ds->ds_dbuf, tag);
}

void
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
{
	if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
		rw_exit(&ds->ds_rwlock);
	}
	dsl_dataset_drop_ref(ds, tag);
}

void
dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
{
	ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
	    (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));

	mutex_enter(&ds->ds_lock);
	ds->ds_owner = NULL;
	if (RW_WRITE_HELD(&ds->ds_rwlock)) {
		rw_exit(&ds->ds_rwlock);
		cv_broadcast(&ds->ds_exclusive_cv);
	}
	mutex_exit(&ds->ds_lock);
	if (ds->ds_dbuf)
		dsl_dataset_drop_ref(ds, tag);
	else
		dsl_dataset_evict(NULL, ds);
}

boolean_t
dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
{
	boolean_t gotit = FALSE;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_owner == NULL &&
	    (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
		ds->ds_owner = tag;
		if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
			rw_exit(&ds->ds_rwlock);
		gotit = TRUE;
	}
	mutex_exit(&ds->ds_lock);
	return (gotit);
}

void
dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
{
	ASSERT3P(owner, ==, ds->ds_owner);
	if (!RW_WRITE_HELD(&ds->ds_rwlock))
		rw_enter(&ds->ds_rwlock, RW_WRITER);
}

uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
    uint64_t flags, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dd->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj;
	objset_t *mos = dp->dp_meta_objset;

	if (origin == NULL)
		origin = dp->dp_origin_snap;

	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	bzero(dsphys, sizeof (dsl_dataset_phys_t));
	dsphys->ds_dir_obj = dd->dd_object;
	dsphys->ds_flags = flags;
	dsphys->ds_fsid_guid = unique_create();
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	dsphys->ds_snapnames_zapobj =
	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
	    DMU_OT_NONE, 0, tx);
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ?
	    1 : tx->tx_txg;

	if (origin == NULL) {
		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
	} else {
		dsl_dataset_t *ohds;

		dsphys->ds_prev_snap_obj = origin->ds_object;
		dsphys->ds_prev_snap_txg =
		    origin->ds_phys->ds_creation_txg;
		dsphys->ds_referenced_bytes =
		    origin->ds_phys->ds_referenced_bytes;
		dsphys->ds_compressed_bytes =
		    origin->ds_phys->ds_compressed_bytes;
		dsphys->ds_uncompressed_bytes =
		    origin->ds_phys->ds_uncompressed_bytes;
		dsphys->ds_bp = origin->ds_phys->ds_bp;
		dsphys->ds_flags |= origin->ds_phys->ds_flags;

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		origin->ds_phys->ds_num_children++;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
		    origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
		dsl_dataset_rele(ohds, FTAG);

		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
			if (origin->ds_phys->ds_next_clones_obj == 0) {
				origin->ds_phys->ds_next_clones_obj =
				    zap_create(mos,
				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
			}
			VERIFY(0 == zap_add_int(mos,
			    origin->ds_phys->ds_next_clones_obj,
			    dsobj, tx));
		}

		dmu_buf_will_dirty(dd->dd_dbuf, tx);
		dd->dd_phys->dd_origin_obj = origin->ds_object;
		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
			if (origin->ds_dir->dd_phys->dd_clones == 0) {
				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
				origin->ds_dir->dd_phys->dd_clones =
				    zap_create(mos,
				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
			}
			VERIFY3U(0, ==, zap_add_int(mos,
			    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
		}
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

	dmu_buf_rele(dbuf, FTAG);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	dd->dd_phys->dd_head_dataset_obj = dsobj;

	return (dsobj);
}

uint64_t
dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
{
	dsl_pool_t *dp = pdd->dd_pool;
	uint64_t dsobj, ddobj;
	dsl_dir_t *dd;

	ASSERT(lastname[0] != '@');

	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));

	dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);

	dsl_deleg_set_create_perms(dd, tx, cr);

	dsl_dir_close(dd, FTAG);

	/*
	 * If we are creating a clone, make sure we zero out any stale
	 * data from the origin snapshot's zil header.
	 */
	if (origin != NULL) {
		dsl_dataset_t *ds;
		objset_t *os;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
		VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
		bzero(&os->os_zil_header, sizeof (os->os_zil_header));
		dsl_dataset_dirty(ds, tx);
		dsl_dataset_rele(ds, FTAG);
	}

	return (dsobj);
}

/*
 * The snapshots must all be in the same pool.
 */
int
dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer,
    nvlist_t *errlist)
{
	int err;
	dsl_sync_task_t *dst;
	spa_t *spa;
	nvpair_t *pair;
	dsl_sync_task_group_t *dstg;

	pair = nvlist_next_nvpair(snaps, NULL);
	if (pair == NULL)
		return (0);

	err = spa_open(nvpair_name(pair), &spa, FTAG);
	if (err)
		return (err);
	dstg = dsl_sync_task_group_create(spa_get_dsl(spa));

	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(snaps, pair)) {
		dsl_dataset_t *ds;

		err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
		if (err == 0) {
			struct dsl_ds_destroyarg *dsda;

			dsl_dataset_make_exclusive(ds, dstg);
			dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
			    KM_SLEEP);
			dsda->ds = ds;
			dsda->defer = defer;
			dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
			    dsl_dataset_destroy_sync, dsda, dstg, 0);
		} else if (err == ENOENT) {
			err = 0;
		} else {
			fnvlist_add_int32(errlist, nvpair_name(pair), err);
			break;
		}
	}

	if (err == 0)
		err = dsl_sync_task_group_wait(dstg);

	for (dst = list_head(&dstg->dstg_tasks); dst;
	    dst = list_next(&dstg->dstg_tasks, dst)) {
		struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
		dsl_dataset_t *ds = dsda->ds;

		/*
		 * Return the snapshots that triggered the error.
		 */
		if (dst->dst_err != 0) {
			char name[ZFS_MAXNAMELEN];
			dsl_dataset_name(ds, name);
			fnvlist_add_int32(errlist, name, dst->dst_err);
		}
		ASSERT3P(dsda->rm_origin, ==, NULL);
		dsl_dataset_disown(ds, dstg);
		kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
	}

	dsl_sync_task_group_destroy(dstg);
	spa_close(spa, FTAG);
	return (err);
}

static boolean_t
dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
{
	boolean_t might_destroy = B_FALSE;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
	    DS_IS_DEFER_DESTROY(ds))
		might_destroy = B_TRUE;
	mutex_exit(&ds->ds_lock);

	return (might_destroy);
}

/*
 * If we're removing a clone, and these three conditions are true:
 *	1) the clone's origin has no other children
 *	2) the clone's origin has no user references
 *	3) the clone's origin has been marked for deferred destruction
 * Then, prepare to remove the origin as part of this sync task group.
 */
static int
dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
{
	dsl_dataset_t *ds = dsda->ds;
	dsl_dataset_t *origin = ds->ds_prev;

	if (dsl_dataset_might_destroy_origin(origin)) {
		char *name;
		int namelen;
		int error;

		namelen = dsl_dataset_namelen(origin) + 1;
		name = kmem_alloc(namelen, KM_SLEEP);
		dsl_dataset_name(origin, name);
#ifdef _KERNEL
		error = zfs_unmount_snap(name, NULL);
		if (error) {
			kmem_free(name, namelen);
			return (error);
		}
#endif
		error = dsl_dataset_own(name, B_TRUE, tag, &origin);
		kmem_free(name, namelen);
		if (error)
			return (error);
		dsda->rm_origin = origin;
		dsl_dataset_make_exclusive(origin, tag);
	}

	return (0);
}

/*
 * ds must be opened as OWNER.  On return (whether successful or not),
 * ds will be closed and caller can no longer dereference it.
 */
int
dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
{
	int err;
	dsl_sync_task_group_t *dstg;
	objset_t *os;
	dsl_dir_t *dd;
	uint64_t obj;
	struct dsl_ds_destroyarg dsda = { 0 };

	dsda.ds = ds;

	if (dsl_dataset_is_snapshot(ds)) {
		/* Destroying a snapshot is simpler */
		dsl_dataset_make_exclusive(ds, tag);

		dsda.defer = defer;
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
		    &dsda, tag, 0);
		ASSERT3P(dsda.rm_origin, ==, NULL);
		goto out;
	} else if (defer) {
		err = EINVAL;
		goto out;
	}

	dd = ds->ds_dir;

	if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
		/*
		 * Check for errors and mark this ds as inconsistent, in
		 * case we crash while freeing the objects.
		 */
		err = dsl_sync_task_do(dd->dd_pool,
		    dsl_dataset_destroy_begin_check,
		    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
		if (err)
			goto out;

		err = dmu_objset_from_ds(ds, &os);
		if (err)
			goto out;

		/*
		 * Remove all objects while in the open context so that
		 * there is less work to do in the syncing context.
		 */
		for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
		    ds->ds_phys->ds_prev_snap_txg)) {
			/*
			 * Ignore errors, if there is not enough disk space
			 * we will deal with it in dsl_dataset_destroy_sync().
			 */
			(void) dmu_free_object(os, obj);
		}
		if (err != ESRCH)
			goto out;

		/*
		 * Sync out all in-flight IO.
		 */
		txg_wait_synced(dd->dd_pool, 0);

		/*
		 * If we managed to free all the objects in open
		 * context, the user space accounting should be zero.
		 */
		if (ds->ds_phys->ds_bp.blk_fill == 0 &&
		    dmu_objset_userused_enabled(os)) {
			uint64_t count;

			ASSERT(zap_count(os, DMU_USERUSED_OBJECT,
			    &count) != 0 || count == 0);
			ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT,
			    &count) != 0 || count == 0);
		}
	}

	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
	err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
	rw_exit(&dd->dd_pool->dp_config_rwlock);

	if (err)
		goto out;

	/*
	 * Blow away the dsl_dir + head dataset.
	 */
	dsl_dataset_make_exclusive(ds, tag);
	/*
	 * If we're removing a clone, we might also need to remove its
	 * origin.
	 */
	do {
		dsda.need_prep = B_FALSE;
		if (dsl_dir_is_clone(dd)) {
			err = dsl_dataset_origin_rm_prep(&dsda, tag);
			if (err) {
				dsl_dir_close(dd, FTAG);
				goto out;
			}
		}

		dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
		dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
		    dsl_dataset_destroy_sync, &dsda, tag, 0);
		dsl_sync_task_create(dstg, dsl_dir_destroy_check,
		    dsl_dir_destroy_sync, dd, tag, 0);
		err = dsl_sync_task_group_wait(dstg);
		dsl_sync_task_group_destroy(dstg);

		/*
		 * We could be racing against 'zfs release' or 'zfs destroy -d'
		 * on the origin snap, in which case we can get EBUSY if we
		 * needed to destroy the origin snap but were not ready to
		 * do so.
		 */
		if (dsda.need_prep) {
			ASSERT(err == EBUSY);
			ASSERT(dsl_dir_is_clone(dd));
			ASSERT(dsda.rm_origin == NULL);
		}
	} while (dsda.need_prep);

	if (dsda.rm_origin != NULL)
		dsl_dataset_disown(dsda.rm_origin, tag);

	/* if it is successful, dsl_dir_destroy_sync will close the dd */
	if (err)
		dsl_dir_close(dd, FTAG);
out:
	dsl_dataset_disown(ds, tag);
	return (err);
}

blkptr_t *
dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
	return (&ds->ds_phys->ds_bp);
}

void
dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	/* If it's the meta-objset, set dp_meta_rootbp */
	if (ds == NULL) {
		tx->tx_pool->dp_meta_rootbp = *bp;
	} else {
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_bp = *bp;
	}
}

spa_t *
dsl_dataset_get_spa(dsl_dataset_t *ds)
{
	return (ds->ds_dir->dd_pool->dp_spa);
}

void
dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp;

	if (ds == NULL) /* this is the meta-objset */
		return;

	ASSERT(ds->ds_objset != NULL);

	if (ds->ds_phys->ds_next_snap_obj != 0)
		panic("dirtying snapshot!");

	dp = ds->ds_dir->dd_pool;

	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(ds->ds_dbuf, ds);
	}
}

boolean_t
dsl_dataset_is_dirty(dsl_dataset_t *ds)
{
	for (int t = 0; t < TXG_SIZE; t++) {
		if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
		    ds, t))
			return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * The unique space in the head dataset can be calculated by subtracting
 * the space used in the most recent snapshot, that is still being used
 * in this file system, from the space currently in use.  To figure out
 * the space in the most recent snapshot still in use, we need to take
 * the total space used in the snapshot and subtract out the space that
 * has been freed up since the snapshot was taken.
 */
static void
dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
{
	uint64_t mrs_used;
	uint64_t dlused, dlcomp, dluncomp;

	ASSERT(!dsl_dataset_is_snapshot(ds));

	if (ds->ds_phys->ds_prev_snap_obj != 0)
		mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
	else
		mrs_used = 0;

	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);

	ASSERT3U(dlused, <=, mrs_used);
	ds->ds_phys->ds_unique_bytes =
	    ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);

	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
	    SPA_VERSION_UNIQUE_ACCURATE)
		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}
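
/*
 * Worked example for dsl_dataset_recalc_head_uniq(): if the head
 * references 12G, the most recent snapshot references 10G (mrs_used),
 * and 2G of the snapshot's blocks have since been freed from the head
 * (dlused, on the head's deadlist), then the snapshot still shares
 * 10G - 2G = 8G with the head, leaving 12G - 8G = 4G unique to the head.
 */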

struct killarg {
	dsl_dataset_t *ds;
	dmu_tx_t *tx;
};

/* ARGSUSED */
static int
kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct killarg *ka = arg;
	dmu_tx_t *tx = ka->tx;

	if (bp == NULL)
		return (0);

	if (zb->zb_level == ZB_ZIL_LEVEL) {
		ASSERT(zilog != NULL);
		/*
		 * It's a block in the intent log.  It has no
		 * accounting, so just free it.
		 */
		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
	} else {
		ASSERT(zilog == NULL);
		ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
	}

	return (0);
}

/* ARGSUSED */
static int
dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count;
	int err;

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (EBUSY);

	/*
	 * This is really a dsl_dir thing, but check it here so that
	 * we'll be less likely to leave this dataset inconsistent &
	 * nearly destroyed.
	 */
	err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
	if (err)
		return (err);
	if (count != 0)
		return (EEXIST);

	return (0);
}

/* ARGSUSED */
static void
dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;

	/* Mark it as inconsistent on-disk, in case we crash */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	spa_history_log_internal_ds(ds, "destroy begin", tx, "");
}

static int
dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
    dmu_tx_t *tx)
{
	dsl_dataset_t *ds = dsda->ds;
	dsl_dataset_t *ds_prev = ds->ds_prev;

	if (dsl_dataset_might_destroy_origin(ds_prev)) {
		struct dsl_ds_destroyarg ndsda = {0};

		/*
		 * If we're not prepared to remove the origin, don't remove
		 * the clone either.
		 */
		if (dsda->rm_origin == NULL) {
			dsda->need_prep = B_TRUE;
			return (EBUSY);
		}

		ndsda.ds = ds_prev;
		ndsda.is_origin_rm = B_TRUE;
		return (dsl_dataset_destroy_check(&ndsda, tag, tx));
	}

	/*
	 * If we're not going to remove the origin after all,
	 * undo the open context setup.
	 */
	if (dsda->rm_origin != NULL) {
		dsl_dataset_disown(dsda->rm_origin, tag);
		dsda->rm_origin = NULL;
	}

	return (0);
}

/*
 * If you add new checks here, you may need to add
 * additional checks to the "temporary" case in
 * snapshot_check() in dmu_objset.c.
 */
/* ARGSUSED */
int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	struct dsl_ds_destroyarg *dsda = arg1;
	dsl_dataset_t *ds = dsda->ds;

	/* we have an owner hold, so no one else can destroy us */
	ASSERT(!DSL_DATASET_IS_DESTROYED(ds));

	/*
	 * Only allow deferred destroy on pools that support it.
	 * NOTE: deferred destroy is only supported on snapshots.
	 */
	if (dsda->defer) {
		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
		    SPA_VERSION_USERREFS)
			return (ENOTSUP);
		ASSERT(dsl_dataset_is_snapshot(ds));
		return (0);
	}

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (EBUSY);

	/*
	 * If we made changes this txg, traverse_dsl_dataset won't find
	 * them.  Try again.
	 */
	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
		return (EAGAIN);

	if (dsl_dataset_is_snapshot(ds)) {
		/*
		 * If this snapshot has an elevated user reference count,
		 * we can't destroy it yet.
		 */
		if (ds->ds_userrefs > 0 && !dsda->releasing)
			return (EBUSY);

		mutex_enter(&ds->ds_lock);
		/*
		 * Can't delete a branch point.  However, if we're destroying
		 * a clone and removing its origin due to it having a user
		 * hold count of 0 and having been marked for deferred destroy,
		 * it's OK for the origin to have a single clone.
		 */
		if (ds->ds_phys->ds_num_children >
		    (dsda->is_origin_rm ? 2 : 1)) {
			mutex_exit(&ds->ds_lock);
			return (EEXIST);
		}
		mutex_exit(&ds->ds_lock);
	} else if (dsl_dir_is_clone(ds->ds_dir)) {
		return (dsl_dataset_origin_check(dsda, arg2, tx));
	}

	/* XXX we should do some i/o error checking... */
	return (0);
}

struct refsarg {
	kmutex_t lock;
	boolean_t gone;
	kcondvar_t cv;
};

/* ARGSUSED */
static void
dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
{
	struct refsarg *arg = argv;

	mutex_enter(&arg->lock);
	arg->gone = TRUE;
	cv_signal(&arg->cv);
	mutex_exit(&arg->lock);
}

static void
dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
{
	struct refsarg arg;

	mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
	arg.gone = FALSE;
	(void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
	    dsl_dataset_refs_gone);
	dmu_buf_rele(ds->ds_dbuf, tag);
	mutex_enter(&arg.lock);
	while (!arg.gone)
		cv_wait(&arg.cv, &arg.lock);
	ASSERT(arg.gone);
	mutex_exit(&arg.lock);
	ds->ds_dbuf = NULL;
	ds->ds_phys = NULL;
	mutex_destroy(&arg.lock);
	cv_destroy(&arg.cv);
}

static void
remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count;
	int err;

	ASSERT(ds->ds_phys->ds_num_children >= 2);
	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
	/*
	 * The err should not be ENOENT, but a bug in a previous version
	 * of the code could cause upgrade_clones_cb() to not set
	 * ds_next_snap_obj when it should, leading to a missing entry.
	 * If we knew that the pool was created after
	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
	 * ENOENT.  However, at least we can check that we don't have
	 * too many entries in the next_clones_obj even after failing to
	 * remove this one.
	 */
	if (err != ENOENT) {
		VERIFY0(err);
	}
	ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
	    &count));
	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
}

static void
dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;

	/*
	 * If it is the old version, dd_clones doesn't exist so we can't
	 * find the clones, but deadlist_remove_key() is a no-op so it
	 * doesn't matter.
	 */
	if (ds->ds_dir->dd_phys->dd_clones == 0)
		return;

	for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		dsl_dataset_t *clone;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
		    za.za_first_integer, FTAG, &clone));
		if (clone->ds_dir->dd_origin_txg > mintxg) {
			dsl_deadlist_remove_key(&clone->ds_deadlist,
			    mintxg, tx);
			dsl_dataset_remove_clones_key(clone, mintxg, tx);
		}
		dsl_dataset_rele(clone, FTAG);
	}
	zap_cursor_fini(&zc);
}

struct process_old_arg {
	dsl_dataset_t *ds;
	dsl_dataset_t *ds_prev;
	boolean_t after_branch_point;
	zio_t *pio;
	uint64_t used, comp, uncomp;
};

static int
process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	struct process_old_arg *poa = arg;
	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;

	if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
		if (poa->ds_prev && !poa->after_branch_point &&
		    bp->blk_birth >
		    poa->ds_prev->ds_phys->ds_prev_snap_txg) {
			poa->ds_prev->ds_phys->ds_unique_bytes +=
			    bp_get_dsize_sync(dp->dp_spa, bp);
		}
	} else {
		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
		poa->comp += BP_GET_PSIZE(bp);
		poa->uncomp += BP_GET_UCSIZE(bp);
		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
	}
	return (0);
}

static void
process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
{
	struct process_old_arg poa = { 0 };
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;

	ASSERT(ds->ds_deadlist.dl_oldfmt);
	ASSERT(ds_next->ds_deadlist.dl_oldfmt);

	poa.ds = ds;
	poa.ds_prev = ds_prev;
	poa.after_branch_point = after_branch_point;
	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
	    process_old_cb, &poa, tx));
	VERIFY0(zio_wait(poa.pio));
	ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);

	/* change snapused */
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
	    -poa.used, -poa.comp, -poa.uncomp, tx);

	/* swap next's deadlist to our deadlist */
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_close(&ds_next->ds_deadlist);
	SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
	    ds->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
	    ds_next->ds_phys->ds_deadlist_obj);
}

static int
old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	int err;
	struct killarg ka;

	/*
	 * Free everything that we point to (that's born after
	 * the previous snapshot, if we are a clone)
	 *
	 * NB: this should be very quick, because we already
	 * freed all the objects in open context.
	 */
	ka.ds = ds;
	ka.tx = tx;
	err = traverse_dataset(ds,
	    ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
	    kill_blkptr, &ka);
	ASSERT0(err);
	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);

	return (err);
}
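
/*
 * When the async_destroy pool feature is enabled, this synchronous
 * traversal is skipped: dsl_dataset_destroy_sync() below instead hands
 * the dataset's block tree to a pool-wide bptree (dp_bptree_obj) so the
 * blocks can be freed incrementally over many txgs.
 */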

void
dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
	struct dsl_ds_destroyarg *dsda = arg1;
	dsl_dataset_t *ds = dsda->ds;
	int err;
	int after_branch_point = FALSE;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	dsl_dataset_t *ds_prev = NULL;
	boolean_t wont_destroy;
	uint64_t obj;

	wont_destroy = (dsda->defer &&
	    (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));

	ASSERT(ds->ds_owner || wont_destroy);
	ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
	ASSERT(ds->ds_prev == NULL ||
	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);

	if (wont_destroy) {
		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
		spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
		return;
	}

	/* We need to log before removing it from the namespace. */
	spa_history_log_internal_ds(ds, "destroy", tx, "");

	/* signal any waiters that this dataset is going away */
	mutex_enter(&ds->ds_lock);
	ds->ds_owner = dsl_reaper;
	cv_broadcast(&ds->ds_exclusive_cv);
	mutex_exit(&ds->ds_lock);

	/* Remove our reservation */
	if (ds->ds_reserved != 0) {
		dsl_prop_setarg_t psa;
		uint64_t value = 0;

		dsl_prop_setarg_init_uint64(&psa, "refreservation",
		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
		    &value);
		psa.psa_effective_value = 0;	/* predict default value */

		dsl_dataset_set_reservation_sync(ds, &psa, tx);
		ASSERT0(ds->ds_reserved);
	}

	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));

	dsl_scan_ds_destroyed(ds, tx);

	obj = ds->ds_object;

	if (ds->ds_phys->ds_prev_snap_obj != 0) {
		if (ds->ds_prev) {
			ds_prev = ds->ds_prev;
		} else {
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
		}
		after_branch_point =
		    (ds_prev->ds_phys->ds_next_snap_obj != obj);

		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
		if (after_branch_point &&
		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
			remove_from_next_clones(ds_prev, obj, tx);
			if (ds->ds_phys->ds_next_snap_obj != 0) {
				VERIFY(0 == zap_add_int(mos,
				    ds_prev->ds_phys->ds_next_clones_obj,
				    ds->ds_phys->ds_next_snap_obj, tx));
			}
		}
		if (after_branch_point &&
		    ds->ds_phys->ds_next_snap_obj == 0) {
			/* This clone is toast. */
			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
			ds_prev->ds_phys->ds_num_children--;

			/*
			 * If the clone's origin has no other clones, no
			 * user holds, and has been marked for deferred
			 * deletion, then we should have done the necessary
			 * destroy setup for it.
			 */
			if (ds_prev->ds_phys->ds_num_children == 1 &&
			    ds_prev->ds_userrefs == 0 &&
			    DS_IS_DEFER_DESTROY(ds_prev)) {
				ASSERT3P(dsda->rm_origin, !=, NULL);
			} else {
				ASSERT3P(dsda->rm_origin, ==, NULL);
			}
		} else if (!after_branch_point) {
			ds_prev->ds_phys->ds_next_snap_obj =
			    ds->ds_phys->ds_next_snap_obj;
		}
	}

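	/*
	 * From here the work splits: destroying a snapshot folds its
	 * deadlist into the next snapshot's and fixes up unique/snapused
	 * accounting, while destroying a head dataset frees (or queues
	 * for async destruction) everything the dataset references.
	 */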
	if (dsl_dataset_is_snapshot(ds)) {
		dsl_dataset_t *ds_next;
		uint64_t old_unique;
		uint64_t used = 0, comp = 0, uncomp = 0;

		VERIFY(0 == dsl_dataset_hold_obj(dp,
		    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);

		old_unique = ds_next->ds_phys->ds_unique_bytes;

		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
		ds_next->ds_phys->ds_prev_snap_obj =
		    ds->ds_phys->ds_prev_snap_obj;
		ds_next->ds_phys->ds_prev_snap_txg =
		    ds->ds_phys->ds_prev_snap_txg;
		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);

		if (ds_next->ds_deadlist.dl_oldfmt) {
			process_old_deadlist(ds, ds_prev, ds_next,
			    after_branch_point, tx);
		} else {
			/* Adjust prev's unique space. */
			if (ds_prev && !after_branch_point) {
				dsl_deadlist_space_range(&ds_next->ds_deadlist,
				    ds_prev->ds_phys->ds_prev_snap_txg,
				    ds->ds_phys->ds_prev_snap_txg,
				    &used, &comp, &uncomp);
				ds_prev->ds_phys->ds_unique_bytes += used;
			}

			/* Adjust snapused. */
			dsl_deadlist_space_range(&ds_next->ds_deadlist,
			    ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
			    &used, &comp, &uncomp);
			dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
			    -used, -comp, -uncomp, tx);

			/* Move blocks to be freed to pool's free list. */
			dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
			    &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
			    tx);
			dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
			    DD_USED_HEAD, used, comp, uncomp, tx);

			/* Merge our deadlist into next's and free it. */
			dsl_deadlist_merge(&ds_next->ds_deadlist,
			    ds->ds_phys->ds_deadlist_obj, tx);
		}
		dsl_deadlist_close(&ds->ds_deadlist);
		dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);

		/* Collapse range in clone heads */
		dsl_dataset_remove_clones_key(ds,
		    ds->ds_phys->ds_creation_txg, tx);

		if (dsl_dataset_is_snapshot(ds_next)) {
			dsl_dataset_t *ds_nextnext;

			/*
			 * Update next's unique to include blocks which
			 * were previously shared by only this snapshot
			 * and it.  Those blocks will be born after the
			 * prev snap and before this snap, and will have
			 * died after the next snap and before the one
			 * after that (ie. be on the snap after next's
			 * deadlist).
			 */
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds_next->ds_phys->ds_next_snap_obj,
			    FTAG, &ds_nextnext));
			dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
			    ds->ds_phys->ds_prev_snap_txg,
			    ds->ds_phys->ds_creation_txg,
			    &used, &comp, &uncomp);
			ds_next->ds_phys->ds_unique_bytes += used;
			dsl_dataset_rele(ds_nextnext, FTAG);
			ASSERT3P(ds_next->ds_prev, ==, NULL);

			/* Collapse range in this head. */
			dsl_dataset_t *hds;
			VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
			    ds->ds_dir->dd_phys->dd_head_dataset_obj,
			    FTAG, &hds));
			dsl_deadlist_remove_key(&hds->ds_deadlist,
			    ds->ds_phys->ds_creation_txg, tx);
			dsl_dataset_rele(hds, FTAG);

		} else {
			ASSERT3P(ds_next->ds_prev, ==, ds);
			dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
			ds_next->ds_prev = NULL;
			if (ds_prev) {
				VERIFY(0 == dsl_dataset_get_ref(dp,
				    ds->ds_phys->ds_prev_snap_obj,
				    ds_next, &ds_next->ds_prev));
			}

			dsl_dataset_recalc_head_uniq(ds_next);

			/*
			 * Reduce the amount of our unconsumed refreservation
			 * being charged to our parent by the amount of
			 * new unique data we have gained.
			 */
			if (old_unique < ds_next->ds_reserved) {
				int64_t mrsdelta;
				uint64_t new_unique =
				    ds_next->ds_phys->ds_unique_bytes;

				ASSERT(old_unique <= new_unique);
				mrsdelta = MIN(new_unique - old_unique,
				    ds_next->ds_reserved - old_unique);
				dsl_dir_diduse_space(ds->ds_dir,
				    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
			}
		}
		dsl_dataset_rele(ds_next, FTAG);
	} else {
		zfeature_info_t *async_destroy =
		    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
		objset_t *os;

		/*
		 * There's no next snapshot, so this is a head dataset.
		 * Destroy the deadlist.  Unless it's a clone, the
		 * deadlist should be empty.  (If it's a clone, it's
		 * safe to ignore the deadlist contents.)
		 */
		dsl_deadlist_close(&ds->ds_deadlist);
		dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
		ds->ds_phys->ds_deadlist_obj = 0;

		VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));

		if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
			err = old_synchronous_dataset_destroy(ds, tx);
		} else {
			/*
			 * Move the bptree into the pool's list of trees to
			 * clean up and update space accounting information.
			 */
			uint64_t used, comp, uncomp;

			zil_destroy_sync(dmu_objset_zil(os), tx);

			if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
				spa_feature_incr(dp->dp_spa, async_destroy, tx);
				dp->dp_bptree_obj = bptree_alloc(mos, tx);
				VERIFY(zap_add(mos,
				    DMU_POOL_DIRECTORY_OBJECT,
				    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
				    &dp->dp_bptree_obj, tx) == 0);
			}

			used = ds->ds_dir->dd_phys->dd_used_bytes;
			comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
			uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;

			ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
			    ds->ds_phys->ds_unique_bytes == used);

			bptree_add(mos, dp->dp_bptree_obj,
			    &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
			    used, comp, uncomp, tx);
			dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
			    -used, -comp, -uncomp, tx);
			dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
			    used, comp, uncomp, tx);
		}

		if (ds->ds_prev != NULL) {
			if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
				VERIFY3U(0, ==, zap_remove_int(mos,
				    ds->ds_prev->ds_dir->dd_phys->dd_clones,
				    ds->ds_object, tx));
			}
			dsl_dataset_rele(ds->ds_prev, ds);
			ds->ds_prev = ds_prev = NULL;
		}
	}

	/*
	 * This must be done after the dsl_traverse(), because it will
	 * re-open the objset.
	 */
	if (ds->ds_objset) {
		dmu_objset_evict(ds->ds_objset);
		ds->ds_objset = NULL;
	}

	if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
		/* Erase the link in the dir */
		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
		ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
		err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
		ASSERT(err == 0);
	} else {
		/* remove from snapshot namespace */
		dsl_dataset_t *ds_head;
		ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
		VERIFY(0 == dsl_dataset_hold_obj(dp,
		    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
		VERIFY(0 == dsl_dataset_get_snapname(ds));
#ifdef ZFS_DEBUG
		{
			uint64_t val;

			err = dsl_dataset_snap_lookup(ds_head,
			    ds->ds_snapname, &val);
			ASSERT0(err);
			ASSERT3U(val, ==, obj);
		}
#endif
		err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
		ASSERT(err == 0);
		dsl_dataset_rele(ds_head, FTAG);
	}

	if (ds_prev && ds->ds_prev != ds_prev)
		dsl_dataset_rele(ds_prev, FTAG);

	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);

	if (ds->ds_phys->ds_next_clones_obj != 0) {
		uint64_t count;
		ASSERT(0 == zap_count(mos,
		    ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
		VERIFY(0 == dmu_object_free(mos,
		    ds->ds_phys->ds_next_clones_obj, tx));
	}
	if (ds->ds_phys->ds_props_obj != 0)
		VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
	if (ds->ds_phys->ds_userrefs_obj != 0)
		VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
	dsl_dir_close(ds->ds_dir, ds);
	ds->ds_dir = NULL;
	dsl_dataset_drain_refs(ds, tag);
	VERIFY(0 == dmu_object_free(mos, obj, tx));

	if (dsda->rm_origin) {
		/*
		 * Remove the origin of the clone we just destroyed.
		 */
		struct dsl_ds_destroyarg ndsda = {0};

		ndsda.ds = dsda->rm_origin;
		dsl_dataset_destroy_sync(&ndsda, tag, tx);
	}
}
1925 */ 1926 if (ds->ds_objset) { 1927 dmu_objset_evict(ds->ds_objset); 1928 ds->ds_objset = NULL; 1929 } 1930 1931 if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { 1932 /* Erase the link in the dir */ 1933 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); 1934 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; 1935 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); 1936 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); 1937 ASSERT(err == 0); 1938 } else { 1939 /* remove from snapshot namespace */ 1940 dsl_dataset_t *ds_head; 1941 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); 1942 VERIFY(0 == dsl_dataset_hold_obj(dp, 1943 ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); 1944 VERIFY(0 == dsl_dataset_get_snapname(ds)); 1945 #ifdef ZFS_DEBUG 1946 { 1947 uint64_t val; 1948 1949 err = dsl_dataset_snap_lookup(ds_head, 1950 ds->ds_snapname, &val); 1951 ASSERT0(err); 1952 ASSERT3U(val, ==, obj); 1953 } 1954 #endif 1955 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); 1956 ASSERT(err == 0); 1957 dsl_dataset_rele(ds_head, FTAG); 1958 } 1959 1960 if (ds_prev && ds->ds_prev != ds_prev) 1961 dsl_dataset_rele(ds_prev, FTAG); 1962 1963 spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); 1964 1965 if (ds->ds_phys->ds_next_clones_obj != 0) { 1966 uint64_t count; 1967 ASSERT(0 == zap_count(mos, 1968 ds->ds_phys->ds_next_clones_obj, &count) && count == 0); 1969 VERIFY(0 == dmu_object_free(mos, 1970 ds->ds_phys->ds_next_clones_obj, tx)); 1971 } 1972 if (ds->ds_phys->ds_props_obj != 0) 1973 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); 1974 if (ds->ds_phys->ds_userrefs_obj != 0) 1975 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); 1976 dsl_dir_close(ds->ds_dir, ds); 1977 ds->ds_dir = NULL; 1978 dsl_dataset_drain_refs(ds, tag); 1979 VERIFY(0 == dmu_object_free(mos, obj, tx)); 1980 1981 if (dsda->rm_origin) { 1982 /* 1983 * Remove the origin of the clone we just destroyed. 1984 */ 1985 struct dsl_ds_destroyarg ndsda = {0}; 1986 1987 ndsda.ds = dsda->rm_origin; 1988 dsl_dataset_destroy_sync(&ndsda, tag, tx); 1989 } 1990 } 1991 1992 static int 1993 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) 1994 { 1995 uint64_t asize; 1996 1997 if (!dmu_tx_is_syncing(tx)) 1998 return (0); 1999 2000 /* 2001 * If there's an fs-only reservation, any blocks that might become 2002 * owned by the snapshot dataset must be accommodated by space 2003 * outside of the reservation. 2004 */ 2005 ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); 2006 asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); 2007 if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) 2008 return (ENOSPC); 2009 2010 /* 2011 * Propagate any reserved space for this snapshot to other 2012 * snapshot checks in this sync group. 2013 */ 2014 if (asize > 0) 2015 dsl_dir_willuse_space(ds->ds_dir, asize, tx); 2016 2017 return (0); 2018 } 2019 2020 /* 2021 * Check if adding additional snapshot(s) would exceed any snapshot limits. 2022 * Note that all snapshot limits up to the root dataset (i.e. the pool itself) 2023 * or the given ancestor must be satisfied. Note that it is valid for the 2024 * count to exceed the limit. This can happen if a snapshot is taken by an 2025 * administrative user in the global zone (e.g. a recursive snapshot by root). 
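 *
 * For example (illustrative figures): with snapshot_limit=10 set on
 * pool/a and dd_snapshot_count already 8 there, a request with
 * cnt == 3 fails at pool/a with EDQUOT, while cnt == 2 passes that
 * node and the walk continues up toward the pool's root dir.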
2026 */
2027 int
2028 dsl_snapcount_check(dsl_dir_t *dd, uint64_t cnt, dsl_dir_t *ancestor)
2029 {
2030 uint64_t limit;
2031 int err = 0;
2032
2033 /*
2034 * The limit is never enforced for the admin user in the global zone.
2035 * If we're not in the global zone then we need to run this check in
2036 * open context, since that's when we know what zone we're in and
2037 * syncing is only performed in the global zone.
2038 */
2039 if (INGLOBALZONE(curproc))
2040 return (0);
2041
2042 /*
2043 * If renaming a dataset with no snapshots, count adjustment is 0.
2044 */
2045 if (cnt == 0)
2046 return (0);
2047
2048 /*
2049 * If an ancestor has been provided, stop checking the limit once we
2050 * hit that dir. We need this during rename so that we don't overcount
2051 * the check once we recurse up to the common ancestor.
2052 */
2053 if (ancestor == dd)
2054 return (0);
2055
2056 /*
2057 * If we hit an uninitialized node while recursing up the tree, we can
2058 * stop since we know the counts are not valid on this node and we
2059 * know we won't touch this node's counts.
2060 */
2061 if (dd->dd_phys->dd_filesystem_count == 0)
2062 return (0);
2063
2064 /*
2065 * If there's no value for this property, there's no need to enforce a
2066 * snapshot limit.
2067 */
2068 err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_SNAPSHOT_LIMIT),
2069 8, 1, &limit, NULL, B_FALSE);
2070 if (err == ENOENT)
2071 return (0);
2072 else if (err != 0)
2073 return (err);
2074
2075 #ifdef _KERNEL
2076 extern void __dtrace_probe_zfs__ss__limit(uint64_t, uint64_t, char *);
2077 __dtrace_probe_zfs__ss__limit(
2078 (uint64_t)dd->dd_phys->dd_snapshot_count, (uint64_t)limit,
2079 dd->dd_myname);
2080 #endif
2081
2082 if (limit != MAXLIMIT &&
2083 (dd->dd_phys->dd_snapshot_count + cnt) > limit)
2084 return (EDQUOT);
2085
2086 if (dd->dd_parent != NULL)
2087 err = dsl_snapcount_check(dd->dd_parent, cnt, ancestor);
2088
2089 return (err);
2090 }
2091
2092 /*
2093 * Adjust the snapshot count for the specified dsl_dir_t and all parents.
2094 * When a new snapshot is created, increment the count on all parents, and when
2095 * a snapshot is destroyed, decrement the count.
2096 */
2097 void
2098 dsl_snapcount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
2099 boolean_t first)
2100 {
2101 /*
2102 * If we hit an uninitialized node while recursing up the tree, we can
2103 * stop since we know the counts are not valid on this node and we
2104 * know we shouldn't touch this node's counts. An uninitialized count
2105 * on the node indicates that either the feature has not yet been
2106 * activated or there are no limits on this part of the tree.
2107 */
2108 if (dd->dd_phys->dd_filesystem_count == 0)
2109 return;
2110
2111 /*
2112 * The feature might have previously been active, so there could be
2113 * non-0 counts on the nodes, but it might now be inactive.
2114 *
2115 * On initial entry we need to check if this feature is active, but
2116 * we don't want to re-check this on each recursive call. Note: the
2117 * feature cannot be active if it's not enabled. If the feature is not
2118 * active, don't touch the on-disk count fields.
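 *
 * For example (illustrative): snapshotting pool/a/b with delta == 1
 * increments dd_snapshot_count on pool/a/b, then pool/a, then pool,
 * stopping early at any dir whose dd_filesystem_count is still 0.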
2119 */
2120 if (first) {
2121 dsl_dataset_t *ds = NULL;
2122 spa_t *spa;
2123 zfeature_info_t *quota_feat =
2124 &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
2125
2126 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
2127 dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
2128 spa = dsl_dataset_get_spa(ds);
2129 dsl_dataset_rele(ds, FTAG);
2130 if (!spa_feature_is_active(spa, quota_feat))
2131 return;
2132 }
2133
2134 /*
2135 * As with dsl_dataset_set_reservation_check(), we don't want to run
2136 * this check in open context.
2137 */
2138 if (!dmu_tx_is_syncing(tx))
2139 return;
2140
2141 /* if renaming a dataset with no snapshots, count adjustment is 0 */
2142 if (delta == 0)
2143 return;
2144
2145 /*
2146 * If we hit an uninitialized node while recursing up the tree, we can
2147 * stop since we know the counts are not valid on this node and we
2148 * know we shouldn't touch this node's counts.
2149 */
2150 if (dd->dd_phys->dd_filesystem_count == 0)
2151 return;
2152
2153 /* Increment count for parent */
2154 dmu_buf_will_dirty(dd->dd_dbuf, tx);
2155
2156 mutex_enter(&dd->dd_lock);
2157
2158 dd->dd_phys->dd_snapshot_count += delta;
2159
2160 /* Roll up this additional count into our ancestors */
2161 if (dd->dd_parent != NULL)
2162 dsl_snapcount_adjust(dd->dd_parent, tx, delta, B_FALSE);
2163
2164 mutex_exit(&dd->dd_lock);
2165 }
2166
2167 int
2168 dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname,
2169 uint64_t cnt, dmu_tx_t *tx)
2170 {
2171 int err;
2172 uint64_t value;
2173
2174 /*
2175 * We don't allow multiple snapshots of the same txg. If there
2176 * is already one, try again.
2177 */
2178 if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
2179 return (EAGAIN);
2180
2181 /*
2182 * Check for conflicting snapshot name.
2183 */
2184 err = dsl_dataset_snap_lookup(ds, snapname, &value);
2185 if (err == 0)
2186 return (EEXIST);
2187 if (err != ENOENT)
2188 return (err);
2189
2190 /*
2191 * Check that the dataset's name is not too long.
Name consists
2192 * of the dataset name's length + 1 for the @-sign + the snapshot name's length
2193 */
2194 if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
2195 return (ENAMETOOLONG);
2196
2197 err = dsl_snapcount_check(ds->ds_dir, cnt, NULL);
2198 if (err)
2199 return (err);
2200
2201 err = dsl_dataset_snapshot_reserve_space(ds, tx);
2202 if (err)
2203 return (err);
2204
2205 ds->ds_trysnap_txg = tx->tx_txg;
2206 return (0);
2207 }
2208
2209 void
2210 dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
2211 dmu_tx_t *tx)
2212 {
2213 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2214 dmu_buf_t *dbuf;
2215 dsl_dataset_phys_t *dsphys;
2216 uint64_t dsobj, crtxg;
2217 objset_t *mos = dp->dp_meta_objset;
2218 int err;
2219
2220 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
2221
2222 dsl_snapcount_adjust(ds->ds_dir, tx, 1, B_TRUE);
2223
2224 /*
2225 * The origin's ds_creation_txg has to be < TXG_INITIAL
2226 */
2227 if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
2228 crtxg = 1;
2229 else
2230 crtxg = tx->tx_txg;
2231
2232 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
2233 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
2234 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
2235 dmu_buf_will_dirty(dbuf, tx);
2236 dsphys = dbuf->db_data;
2237 bzero(dsphys, sizeof (dsl_dataset_phys_t));
2238 dsphys->ds_dir_obj = ds->ds_dir->dd_object;
2239 dsphys->ds_fsid_guid = unique_create();
2240 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
2241 sizeof (dsphys->ds_guid));
2242 dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
2243 dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
2244 dsphys->ds_next_snap_obj = ds->ds_object;
2245 dsphys->ds_num_children = 1;
2246 dsphys->ds_creation_time = gethrestime_sec();
2247 dsphys->ds_creation_txg = crtxg;
2248 dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
2249 dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
2250 dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
2251 dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
2252 dsphys->ds_flags = ds->ds_phys->ds_flags;
2253 dsphys->ds_bp = ds->ds_phys->ds_bp;
2254 dmu_buf_rele(dbuf, FTAG);
2255
2256 ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
2257 if (ds->ds_prev) {
2258 uint64_t next_clones_obj =
2259 ds->ds_prev->ds_phys->ds_next_clones_obj;
2260 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
2261 ds->ds_object ||
2262 ds->ds_prev->ds_phys->ds_num_children > 1);
2263 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
2264 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
2265 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
2266 ds->ds_prev->ds_phys->ds_creation_txg);
2267 ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
2268 } else if (next_clones_obj != 0) {
2269 remove_from_next_clones(ds->ds_prev,
2270 dsphys->ds_next_snap_obj, tx);
2271 VERIFY3U(0, ==, zap_add_int(mos,
2272 next_clones_obj, dsobj, tx));
2273 }
2274 }
2275
2276 /*
2277 * If we have a reference-reservation on this dataset, we will
2278 * need to increase the amount of refreservation being charged
2279 * since our unique space is going to zero.
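 *
 * For example (illustrative figures): with refreservation=10G and
 * ds_unique_bytes=4G, those 4G stop being unique once the snapshot
 * exists, so MIN(4G, 10G) = 4G more is charged to DD_USED_REFRSRV
 * below.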
2280 */
2281 if (ds->ds_reserved) {
2282 int64_t delta;
2283 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
2284 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2285 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
2286 delta, 0, 0, tx);
2287 }
2288
2289 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2290 zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
2291 ds->ds_dir->dd_myname, snapname, dsobj,
2292 ds->ds_phys->ds_prev_snap_txg);
2293 ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
2294 UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
2295 dsl_deadlist_close(&ds->ds_deadlist);
2296 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
2297 dsl_deadlist_add_key(&ds->ds_deadlist,
2298 ds->ds_phys->ds_prev_snap_txg, tx);
2299
2300 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
2301 ds->ds_phys->ds_prev_snap_obj = dsobj;
2302 ds->ds_phys->ds_prev_snap_txg = crtxg;
2303 ds->ds_phys->ds_unique_bytes = 0;
2304 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
2305 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
2306
2307 err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
2308 snapname, 8, 1, &dsobj, tx);
2309 ASSERT(err == 0);
2310
2311 if (ds->ds_prev)
2312 dsl_dataset_drop_ref(ds->ds_prev, ds);
2313 VERIFY(0 == dsl_dataset_get_ref(dp,
2314 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
2315
2316 dsl_scan_ds_snapshotted(ds, tx);
2317
2318 dsl_dir_snap_cmtime_update(ds->ds_dir);
2319
2320 spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
2321 }
2322
2323 void
2324 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
2325 {
2326 ASSERT(dmu_tx_is_syncing(tx));
2327 ASSERT(ds->ds_objset != NULL);
2328 ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
2329
2330 /*
2331 * in case we had to change ds_fsid_guid when we opened it,
2332 * sync it out now.
2333 */
2334 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2335 ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
2336
2337 dmu_objset_sync(ds->ds_objset, zio, tx);
2338 }
2339
2340 static void
2341 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
2342 {
2343 uint64_t count = 0;
2344 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
2345 zap_cursor_t zc;
2346 zap_attribute_t za;
2347 nvlist_t *propval;
2348 nvlist_t *val;
2349
2350 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2351 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2352 VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2353
2354 /*
2355 * There may be missing entries in ds_next_clones_obj
2356 * due to a bug in a previous version of the code.
2357 * Only trust it if it has the right number of entries.
2358 */
2359 if (ds->ds_phys->ds_next_clones_obj != 0) {
2360 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
2361 &count));
2362 }
2363 if (count != ds->ds_phys->ds_num_children - 1) {
2364 goto fail;
2365 }
2366 for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
2367 zap_cursor_retrieve(&zc, &za) == 0;
2368 zap_cursor_advance(&zc)) {
2369 dsl_dataset_t *clone;
2370 char buf[ZFS_MAXNAMELEN];
2371 /*
2372 * Even though we hold the dp_config_rwlock, the dataset
2373 * may fail to open, returning ENOENT. If there is a
2374 * thread concurrently attempting to destroy this
2375 * dataset, it will have the ds_rwlock held for
2376 * RW_WRITER.
Our call to dsl_dataset_hold_obj() -> 2377 * dsl_dataset_hold_ref() will fail its 2378 * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the 2379 * dp_config_rwlock, and wait for the destroy progress 2380 * and signal ds_exclusive_cv. If the destroy was 2381 * successful, we will see that 2382 * DSL_DATASET_IS_DESTROYED(), and return ENOENT. 2383 */ 2384 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, 2385 za.za_first_integer, FTAG, &clone) != 0) 2386 continue; 2387 dsl_dir_name(clone->ds_dir, buf); 2388 VERIFY(nvlist_add_boolean(val, buf) == 0); 2389 dsl_dataset_rele(clone, FTAG); 2390 } 2391 zap_cursor_fini(&zc); 2392 VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); 2393 VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), 2394 propval) == 0); 2395 fail: 2396 nvlist_free(val); 2397 nvlist_free(propval); 2398 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); 2399 } 2400 2401 void 2402 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) 2403 { 2404 uint64_t refd, avail, uobjs, aobjs, ratio; 2405 2406 ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : 2407 (ds->ds_phys->ds_uncompressed_bytes * 100 / 2408 ds->ds_phys->ds_compressed_bytes); 2409 2410 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); 2411 2412 if (dsl_dataset_is_snapshot(ds)) { 2413 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); 2414 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, 2415 ds->ds_phys->ds_unique_bytes); 2416 get_clones_stat(ds, nv); 2417 } else { 2418 dsl_dir_stats(ds->ds_dir, nv); 2419 } 2420 2421 dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); 2422 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); 2423 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); 2424 2425 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, 2426 ds->ds_phys->ds_creation_time); 2427 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, 2428 ds->ds_phys->ds_creation_txg); 2429 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, 2430 ds->ds_quota); 2431 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, 2432 ds->ds_reserved); 2433 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, 2434 ds->ds_phys->ds_guid); 2435 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, 2436 ds->ds_phys->ds_unique_bytes); 2437 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, 2438 ds->ds_object); 2439 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, 2440 ds->ds_userrefs); 2441 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, 2442 DS_IS_DEFER_DESTROY(ds) ? 
1 : 0); 2443 2444 if (ds->ds_phys->ds_prev_snap_obj != 0) { 2445 uint64_t written, comp, uncomp; 2446 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2447 dsl_dataset_t *prev; 2448 2449 rw_enter(&dp->dp_config_rwlock, RW_READER); 2450 int err = dsl_dataset_hold_obj(dp, 2451 ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); 2452 rw_exit(&dp->dp_config_rwlock); 2453 if (err == 0) { 2454 err = dsl_dataset_space_written(prev, ds, &written, 2455 &comp, &uncomp); 2456 dsl_dataset_rele(prev, FTAG); 2457 if (err == 0) { 2458 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, 2459 written); 2460 } 2461 } 2462 } 2463 } 2464 2465 void 2466 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) 2467 { 2468 stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; 2469 stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; 2470 stat->dds_guid = ds->ds_phys->ds_guid; 2471 stat->dds_origin[0] = '\0'; 2472 if (dsl_dataset_is_snapshot(ds)) { 2473 stat->dds_is_snapshot = B_TRUE; 2474 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; 2475 } else { 2476 stat->dds_is_snapshot = B_FALSE; 2477 stat->dds_num_clones = 0; 2478 2479 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); 2480 if (dsl_dir_is_clone(ds->ds_dir)) { 2481 dsl_dataset_t *ods; 2482 2483 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, 2484 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); 2485 dsl_dataset_name(ods, stat->dds_origin); 2486 dsl_dataset_drop_ref(ods, FTAG); 2487 } 2488 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); 2489 } 2490 } 2491 2492 uint64_t 2493 dsl_dataset_fsid_guid(dsl_dataset_t *ds) 2494 { 2495 return (ds->ds_fsid_guid); 2496 } 2497 2498 void 2499 dsl_dataset_space(dsl_dataset_t *ds, 2500 uint64_t *refdbytesp, uint64_t *availbytesp, 2501 uint64_t *usedobjsp, uint64_t *availobjsp) 2502 { 2503 *refdbytesp = ds->ds_phys->ds_referenced_bytes; 2504 *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); 2505 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) 2506 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; 2507 if (ds->ds_quota != 0) { 2508 /* 2509 * Adjust available bytes according to refquota 2510 */ 2511 if (*refdbytesp < ds->ds_quota) 2512 *availbytesp = MIN(*availbytesp, 2513 ds->ds_quota - *refdbytesp); 2514 else 2515 *availbytesp = 0; 2516 } 2517 *usedobjsp = ds->ds_phys->ds_bp.blk_fill; 2518 *availobjsp = DN_MAX_OBJECT - *usedobjsp; 2519 } 2520 2521 boolean_t 2522 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) 2523 { 2524 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2525 2526 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 2527 dsl_pool_sync_context(dp)); 2528 if (ds->ds_prev == NULL) 2529 return (B_FALSE); 2530 if (ds->ds_phys->ds_bp.blk_birth > 2531 ds->ds_prev->ds_phys->ds_creation_txg) { 2532 objset_t *os, *os_prev; 2533 /* 2534 * It may be that only the ZIL differs, because it was 2535 * reset in the head. Don't count that as being 2536 * modified. 
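 * Only the on-disk meta-dnode is compared below; a ZIL-only
 * change does not alter it, so the dataset still reports as
 * unmodified.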
2537 */
2538 if (dmu_objset_from_ds(ds, &os) != 0)
2539 return (B_TRUE);
2540 if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
2541 return (B_TRUE);
2542 return (bcmp(&os->os_phys->os_meta_dnode,
2543 &os_prev->os_phys->os_meta_dnode,
2544 sizeof (os->os_phys->os_meta_dnode)) != 0);
2545 }
2546 return (B_FALSE);
2547 }
2548
2549 /* ARGSUSED */
2550 static int
2551 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
2552 {
2553 dsl_dataset_t *ds = arg1;
2554 char *newsnapname = arg2;
2555 dsl_dir_t *dd = ds->ds_dir;
2556 dsl_dataset_t *hds;
2557 uint64_t val;
2558 int err;
2559
2560 err = dsl_dataset_hold_obj(dd->dd_pool,
2561 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
2562 if (err)
2563 return (err);
2564
2565 /* new name better not be in use */
2566 err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
2567 dsl_dataset_rele(hds, FTAG);
2568
2569 if (err == 0)
2570 err = EEXIST;
2571 else if (err == ENOENT)
2572 err = 0;
2573
2574 /* dataset name + 1 for the "@" + the new snapshot name must fit */
2575 if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
2576 err = ENAMETOOLONG;
2577
2578 return (err);
2579 }
2580
2581 static void
2582 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
2583 {
2584 dsl_dataset_t *ds = arg1;
2585 const char *newsnapname = arg2;
2586 dsl_dir_t *dd = ds->ds_dir;
2587 objset_t *mos = dd->dd_pool->dp_meta_objset;
2588 dsl_dataset_t *hds;
2589 int err;
2590
2591 ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
2592
2593 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
2594 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));
2595
2596 VERIFY(0 == dsl_dataset_get_snapname(ds));
2597 err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
2598 ASSERT0(err);
2599 mutex_enter(&ds->ds_lock);
2600 (void) strcpy(ds->ds_snapname, newsnapname);
2601 mutex_exit(&ds->ds_lock);
2602 err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
2603 ds->ds_snapname, 8, 1, &ds->ds_object, tx);
2604 ASSERT0(err);
2605
2606 spa_history_log_internal_ds(ds, "rename", tx,
2607 "-> @%s", newsnapname);
2608 dsl_dataset_rele(hds, FTAG);
2609 }
2610
2611 struct renamesnaparg {
2612 dsl_sync_task_group_t *dstg;
2613 char failed[MAXPATHLEN];
2614 char *oldsnap;
2615 char *newsnap;
2616 };
2617
2618 static int
2619 dsl_snapshot_rename_one(const char *name, void *arg)
2620 {
2621 struct renamesnaparg *ra = arg;
2622 dsl_dataset_t *ds = NULL;
2623 char *snapname;
2624 int err;
2625
2626 snapname = kmem_asprintf("%s@%s", name, ra->oldsnap);
2627 (void) strlcpy(ra->failed, snapname, sizeof (ra->failed));
2628
2629 /*
2630 * For recursive snapshot renames the parent won't be changing
2631 * so we just pass name for both the to/from arguments.
2632 */
2633 err = zfs_secpolicy_rename_perms(snapname, snapname, CRED());
2634 if (err != 0) {
2635 strfree(snapname);
2636 return (err == ENOENT ? 0 : err);
2637 }
2638
2639 #ifdef _KERNEL
2640 /*
2641 * Each snapshot undergoing the rename must be unmounted first.
2642 */
2643 (void) zfs_unmount_snap(snapname, NULL);
2644 #endif
2645 err = dsl_dataset_hold(snapname, ra->dstg, &ds);
2646 strfree(snapname);
2647 if (err != 0)
2648 return (err == ENOENT ?
0 : err); 2649 2650 dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, 2651 dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); 2652 2653 return (0); 2654 } 2655 2656 static int 2657 dsl_recursive_rename(char *oldname, const char *newname) 2658 { 2659 int err; 2660 struct renamesnaparg *ra; 2661 dsl_sync_task_t *dst; 2662 spa_t *spa; 2663 char *cp, *fsname = spa_strdup(oldname); 2664 int len = strlen(oldname) + 1; 2665 2666 /* truncate the snapshot name to get the fsname */ 2667 cp = strchr(fsname, '@'); 2668 *cp = '\0'; 2669 2670 err = spa_open(fsname, &spa, FTAG); 2671 if (err) { 2672 kmem_free(fsname, len); 2673 return (err); 2674 } 2675 ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); 2676 ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 2677 2678 ra->oldsnap = strchr(oldname, '@') + 1; 2679 ra->newsnap = strchr(newname, '@') + 1; 2680 *ra->failed = '\0'; 2681 2682 err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, 2683 DS_FIND_CHILDREN); 2684 kmem_free(fsname, len); 2685 2686 if (err == 0) { 2687 err = dsl_sync_task_group_wait(ra->dstg); 2688 } 2689 2690 for (dst = list_head(&ra->dstg->dstg_tasks); dst; 2691 dst = list_next(&ra->dstg->dstg_tasks, dst)) { 2692 dsl_dataset_t *ds = dst->dst_arg1; 2693 if (dst->dst_err) { 2694 dsl_dir_name(ds->ds_dir, ra->failed); 2695 (void) strlcat(ra->failed, "@", sizeof (ra->failed)); 2696 (void) strlcat(ra->failed, ra->newsnap, 2697 sizeof (ra->failed)); 2698 } 2699 dsl_dataset_rele(ds, ra->dstg); 2700 } 2701 2702 if (err) 2703 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); 2704 2705 dsl_sync_task_group_destroy(ra->dstg); 2706 kmem_free(ra, sizeof (struct renamesnaparg)); 2707 spa_close(spa, FTAG); 2708 return (err); 2709 } 2710 2711 static int 2712 dsl_valid_rename(const char *oldname, void *arg) 2713 { 2714 int delta = *(int *)arg; 2715 2716 if (strlen(oldname) + delta >= MAXNAMELEN) 2717 return (ENAMETOOLONG); 2718 2719 return (0); 2720 } 2721 2722 #pragma weak dmu_objset_rename = dsl_dataset_rename 2723 int 2724 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) 2725 { 2726 dsl_dir_t *dd; 2727 dsl_dataset_t *ds; 2728 const char *tail; 2729 int err; 2730 2731 err = dsl_dir_open(oldname, FTAG, &dd, &tail); 2732 if (err) 2733 return (err); 2734 2735 if (tail == NULL) { 2736 int delta = strlen(newname) - strlen(oldname); 2737 2738 /* if we're growing, validate child name lengths */ 2739 if (delta > 0) 2740 err = dmu_objset_find(oldname, dsl_valid_rename, 2741 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 2742 2743 if (err == 0) 2744 err = dsl_dir_rename(dd, newname); 2745 dsl_dir_close(dd, FTAG); 2746 return (err); 2747 } 2748 2749 if (tail[0] != '@') { 2750 /* the name ended in a nonexistent component */ 2751 dsl_dir_close(dd, FTAG); 2752 return (ENOENT); 2753 } 2754 2755 dsl_dir_close(dd, FTAG); 2756 2757 /* new name must be snapshot in same filesystem */ 2758 tail = strchr(newname, '@'); 2759 if (tail == NULL) 2760 return (EINVAL); 2761 tail++; 2762 if (strncmp(oldname, newname, tail - newname) != 0) 2763 return (EXDEV); 2764 2765 if (recursive) { 2766 err = dsl_recursive_rename(oldname, newname); 2767 } else { 2768 err = dsl_dataset_hold(oldname, FTAG, &ds); 2769 if (err) 2770 return (err); 2771 2772 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 2773 dsl_dataset_snapshot_rename_check, 2774 dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); 2775 2776 dsl_dataset_rele(ds, FTAG); 2777 } 2778 2779 return (err); 2780 } 2781 2782 struct promotenode { 2783 list_node_t 
link; 2784 dsl_dataset_t *ds; 2785 }; 2786 2787 struct promotearg { 2788 list_t shared_snaps, origin_snaps, clone_snaps; 2789 dsl_dataset_t *origin_origin; 2790 uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; 2791 char *err_ds; 2792 }; 2793 2794 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); 2795 static boolean_t snaplist_unstable(list_t *l); 2796 2797 static int 2798 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) 2799 { 2800 dsl_dataset_t *hds = arg1; 2801 struct promotearg *pa = arg2; 2802 struct promotenode *snap = list_head(&pa->shared_snaps); 2803 dsl_dataset_t *origin_ds = snap->ds; 2804 int err; 2805 uint64_t unused; 2806 2807 /* Check that it is a real clone */ 2808 if (!dsl_dir_is_clone(hds->ds_dir)) 2809 return (EINVAL); 2810 2811 /* Since this is so expensive, don't do the preliminary check */ 2812 if (!dmu_tx_is_syncing(tx)) 2813 return (0); 2814 2815 if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) 2816 return (EXDEV); 2817 2818 /* compute origin's new unique space */ 2819 snap = list_tail(&pa->clone_snaps); 2820 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2821 dsl_deadlist_space_range(&snap->ds->ds_deadlist, 2822 origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, 2823 &pa->unique, &unused, &unused); 2824 2825 /* 2826 * Walk the snapshots that we are moving 2827 * 2828 * Compute space to transfer. Consider the incremental changes 2829 * to used for each snapshot: 2830 * (my used) = (prev's used) + (blocks born) - (blocks killed) 2831 * So each snapshot gave birth to: 2832 * (blocks born) = (my used) - (prev's used) + (blocks killed) 2833 * So a sequence would look like: 2834 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) 2835 * Which simplifies to: 2836 * uN + kN + kN-1 + ... + k1 + k0 2837 * Note however, if we stop before we reach the ORIGIN we get: 2838 * uN + kN + kN-1 + ... + kM - uM-1 2839 */ 2840 pa->used = origin_ds->ds_phys->ds_referenced_bytes; 2841 pa->comp = origin_ds->ds_phys->ds_compressed_bytes; 2842 pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; 2843 for (snap = list_head(&pa->shared_snaps); snap; 2844 snap = list_next(&pa->shared_snaps, snap)) { 2845 uint64_t val, dlused, dlcomp, dluncomp; 2846 dsl_dataset_t *ds = snap->ds; 2847 2848 /* Check that the snapshot name does not conflict */ 2849 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2850 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); 2851 if (err == 0) { 2852 err = EEXIST; 2853 goto out; 2854 } 2855 if (err != ENOENT) 2856 goto out; 2857 2858 /* The very first snapshot does not have a deadlist */ 2859 if (ds->ds_phys->ds_prev_snap_obj == 0) 2860 continue; 2861 2862 dsl_deadlist_space(&ds->ds_deadlist, 2863 &dlused, &dlcomp, &dluncomp); 2864 pa->used += dlused; 2865 pa->comp += dlcomp; 2866 pa->uncomp += dluncomp; 2867 } 2868 2869 /* 2870 * If we are a clone of a clone then we never reached ORIGIN, 2871 * so we need to subtract out the clone origin's used space. 
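 *
 * Worked example (hypothetical figures): if the origin snapshot
 * references 3G (uN) and the shared snapshots' deadlists hold
 * 100M + 200M + 300M of killed blocks, the loop below computes
 * pa->used = 3G + 600M, i.e. uN + kN + ... + k0 above.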
2872 */ 2873 if (pa->origin_origin) { 2874 pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes; 2875 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; 2876 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; 2877 } 2878 2879 /* Check that there is enough space and limit headroom here */ 2880 err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, 2881 origin_ds->ds_dir, pa->used, tx); 2882 if (err) 2883 return (err); 2884 2885 /* 2886 * Compute the amounts of space that will be used by snapshots 2887 * after the promotion (for both origin and clone). For each, 2888 * it is the amount of space that will be on all of their 2889 * deadlists (that was not born before their new origin). 2890 */ 2891 if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2892 uint64_t space; 2893 2894 /* 2895 * Note, typically this will not be a clone of a clone, 2896 * so dd_origin_txg will be < TXG_INITIAL, so 2897 * these snaplist_space() -> dsl_deadlist_space_range() 2898 * calls will be fast because they do not have to 2899 * iterate over all bps. 2900 */ 2901 snap = list_head(&pa->origin_snaps); 2902 err = snaplist_space(&pa->shared_snaps, 2903 snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); 2904 if (err) 2905 return (err); 2906 2907 err = snaplist_space(&pa->clone_snaps, 2908 snap->ds->ds_dir->dd_origin_txg, &space); 2909 if (err) 2910 return (err); 2911 pa->cloneusedsnap += space; 2912 } 2913 if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2914 err = snaplist_space(&pa->origin_snaps, 2915 origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); 2916 if (err) 2917 return (err); 2918 } 2919 2920 return (0); 2921 out: 2922 pa->err_ds = snap->ds->ds_snapname; 2923 return (err); 2924 } 2925 2926 static void 2927 dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2928 { 2929 dsl_dataset_t *hds = arg1; 2930 struct promotearg *pa = arg2; 2931 struct promotenode *snap = list_head(&pa->shared_snaps); 2932 dsl_dataset_t *origin_ds = snap->ds; 2933 dsl_dataset_t *origin_head; 2934 dsl_dir_t *dd = hds->ds_dir; 2935 dsl_pool_t *dp = hds->ds_dir->dd_pool; 2936 dsl_dir_t *odd = NULL; 2937 uint64_t oldnext_obj; 2938 int64_t delta; 2939 2940 ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); 2941 2942 snap = list_head(&pa->origin_snaps); 2943 origin_head = snap->ds; 2944 2945 /* 2946 * We need to explicitly open odd, since origin_ds's dd will be 2947 * changing. 
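 * (Each snapshot moved below drops its hold on the old dsl_dir and
 * re-opens dd, so origin_ds->ds_dir no longer names the old parent
 * once that loop has run.)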
2948 */ 2949 VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, 2950 NULL, FTAG, &odd)); 2951 2952 /* change origin's next snap */ 2953 dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); 2954 oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; 2955 snap = list_tail(&pa->clone_snaps); 2956 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2957 origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; 2958 2959 /* change the origin's next clone */ 2960 if (origin_ds->ds_phys->ds_next_clones_obj) { 2961 remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); 2962 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2963 origin_ds->ds_phys->ds_next_clones_obj, 2964 oldnext_obj, tx)); 2965 } 2966 2967 /* change origin */ 2968 dmu_buf_will_dirty(dd->dd_dbuf, tx); 2969 ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); 2970 dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; 2971 dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; 2972 dmu_buf_will_dirty(odd->dd_dbuf, tx); 2973 odd->dd_phys->dd_origin_obj = origin_ds->ds_object; 2974 origin_head->ds_dir->dd_origin_txg = 2975 origin_ds->ds_phys->ds_creation_txg; 2976 2977 /* change dd_clone entries */ 2978 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 2979 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2980 odd->dd_phys->dd_clones, hds->ds_object, tx)); 2981 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2982 pa->origin_origin->ds_dir->dd_phys->dd_clones, 2983 hds->ds_object, tx)); 2984 2985 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2986 pa->origin_origin->ds_dir->dd_phys->dd_clones, 2987 origin_head->ds_object, tx)); 2988 if (dd->dd_phys->dd_clones == 0) { 2989 dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, 2990 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); 2991 } 2992 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2993 dd->dd_phys->dd_clones, origin_head->ds_object, tx)); 2994 2995 } 2996 2997 /* move snapshots to this dir */ 2998 for (snap = list_head(&pa->shared_snaps); snap; 2999 snap = list_next(&pa->shared_snaps, snap)) { 3000 dsl_dataset_t *ds = snap->ds; 3001 3002 /* unregister props as dsl_dir is changing */ 3003 if (ds->ds_objset) { 3004 dmu_objset_evict(ds->ds_objset); 3005 ds->ds_objset = NULL; 3006 } 3007 /* move snap name entry */ 3008 VERIFY(0 == dsl_dataset_get_snapname(ds)); 3009 VERIFY(0 == dsl_dataset_snap_remove(origin_head, 3010 ds->ds_snapname, tx)); 3011 VERIFY(0 == zap_add(dp->dp_meta_objset, 3012 hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 3013 8, 1, &ds->ds_object, tx)); 3014 dsl_snapcount_adjust(hds->ds_dir, tx, 1, B_TRUE); 3015 3016 /* change containing dsl_dir */ 3017 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3018 ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); 3019 ds->ds_phys->ds_dir_obj = dd->dd_object; 3020 ASSERT3P(ds->ds_dir, ==, odd); 3021 dsl_dir_close(ds->ds_dir, ds); 3022 VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, 3023 NULL, ds, &ds->ds_dir)); 3024 3025 /* move any clone references */ 3026 if (ds->ds_phys->ds_next_clones_obj && 3027 spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 3028 zap_cursor_t zc; 3029 zap_attribute_t za; 3030 3031 for (zap_cursor_init(&zc, dp->dp_meta_objset, 3032 ds->ds_phys->ds_next_clones_obj); 3033 zap_cursor_retrieve(&zc, &za) == 0; 3034 zap_cursor_advance(&zc)) { 3035 dsl_dataset_t *cnds; 3036 uint64_t o; 3037 3038 if (za.za_first_integer == oldnext_obj) { 3039 /* 3040 * We've already moved the 3041 * origin's reference. 
3042 */ 3043 continue; 3044 } 3045 3046 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, 3047 za.za_first_integer, FTAG, &cnds)); 3048 o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; 3049 3050 VERIFY3U(zap_remove_int(dp->dp_meta_objset, 3051 odd->dd_phys->dd_clones, o, tx), ==, 0); 3052 VERIFY3U(zap_add_int(dp->dp_meta_objset, 3053 dd->dd_phys->dd_clones, o, tx), ==, 0); 3054 dsl_dataset_rele(cnds, FTAG); 3055 } 3056 zap_cursor_fini(&zc); 3057 } 3058 3059 ASSERT0(dsl_prop_numcb(ds)); 3060 } 3061 3062 /* 3063 * Change space accounting. 3064 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either 3065 * both be valid, or both be 0 (resulting in delta == 0). This 3066 * is true for each of {clone,origin} independently. 3067 */ 3068 3069 delta = pa->cloneusedsnap - 3070 dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 3071 ASSERT3S(delta, >=, 0); 3072 ASSERT3U(pa->used, >=, delta); 3073 dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); 3074 dsl_dir_diduse_space(dd, DD_USED_HEAD, 3075 pa->used - delta, pa->comp, pa->uncomp, tx); 3076 3077 delta = pa->originusedsnap - 3078 odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 3079 ASSERT3S(delta, <=, 0); 3080 ASSERT3U(pa->used, >=, -delta); 3081 dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); 3082 dsl_dir_diduse_space(odd, DD_USED_HEAD, 3083 -pa->used - delta, -pa->comp, -pa->uncomp, tx); 3084 3085 origin_ds->ds_phys->ds_unique_bytes = pa->unique; 3086 3087 /* log history record */ 3088 spa_history_log_internal_ds(hds, "promote", tx, ""); 3089 3090 dsl_dir_close(odd, FTAG); 3091 } 3092 3093 static char *snaplist_tag = "snaplist"; 3094 /* 3095 * Make a list of dsl_dataset_t's for the snapshots between first_obj 3096 * (exclusive) and last_obj (inclusive). The list will be in reverse 3097 * order (last_obj will be the list_head()). If first_obj == 0, do all 3098 * snapshots back to this dataset's origin. 
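 *
 * For example (illustrative): with snapshots A -> B -> C (A oldest)
 * and first_obj == A, last_obj == C, the resulting list is C, B;
 * the walk stops when B's ds_prev_snap_obj equals first_obj.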
3099 */ 3100 static int 3101 snaplist_make(dsl_pool_t *dp, boolean_t own, 3102 uint64_t first_obj, uint64_t last_obj, list_t *l) 3103 { 3104 uint64_t obj = last_obj; 3105 3106 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); 3107 3108 list_create(l, sizeof (struct promotenode), 3109 offsetof(struct promotenode, link)); 3110 3111 while (obj != first_obj) { 3112 dsl_dataset_t *ds; 3113 struct promotenode *snap; 3114 int err; 3115 3116 if (own) { 3117 err = dsl_dataset_own_obj(dp, obj, 3118 0, snaplist_tag, &ds); 3119 if (err == 0) 3120 dsl_dataset_make_exclusive(ds, snaplist_tag); 3121 } else { 3122 err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); 3123 } 3124 if (err == ENOENT) { 3125 /* lost race with snapshot destroy */ 3126 struct promotenode *last = list_tail(l); 3127 ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); 3128 obj = last->ds->ds_phys->ds_prev_snap_obj; 3129 continue; 3130 } else if (err) { 3131 return (err); 3132 } 3133 3134 if (first_obj == 0) 3135 first_obj = ds->ds_dir->dd_phys->dd_origin_obj; 3136 3137 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); 3138 snap->ds = ds; 3139 list_insert_tail(l, snap); 3140 obj = ds->ds_phys->ds_prev_snap_obj; 3141 } 3142 3143 return (0); 3144 } 3145 3146 static int 3147 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) 3148 { 3149 struct promotenode *snap; 3150 3151 *spacep = 0; 3152 for (snap = list_head(l); snap; snap = list_next(l, snap)) { 3153 uint64_t used, comp, uncomp; 3154 dsl_deadlist_space_range(&snap->ds->ds_deadlist, 3155 mintxg, UINT64_MAX, &used, &comp, &uncomp); 3156 *spacep += used; 3157 } 3158 return (0); 3159 } 3160 3161 static void 3162 snaplist_destroy(list_t *l, boolean_t own) 3163 { 3164 struct promotenode *snap; 3165 3166 if (!l || !list_link_active(&l->list_head)) 3167 return; 3168 3169 while ((snap = list_tail(l)) != NULL) { 3170 list_remove(l, snap); 3171 if (own) 3172 dsl_dataset_disown(snap->ds, snaplist_tag); 3173 else 3174 dsl_dataset_rele(snap->ds, snaplist_tag); 3175 kmem_free(snap, sizeof (struct promotenode)); 3176 } 3177 list_destroy(l); 3178 } 3179 3180 /* 3181 * Promote a clone. Nomenclature note: 3182 * "clone" or "cds": the original clone which is being promoted 3183 * "origin" or "ods": the snapshot which is originally clone's origin 3184 * "origin head" or "ohds": the dataset which is the head 3185 * (filesystem/volume) for the origin 3186 * "origin origin": the origin of the origin's filesystem (typically 3187 * NULL, indicating that the clone is not a clone of a clone). 3188 */ 3189 int 3190 dsl_dataset_promote(const char *name, char *conflsnap) 3191 { 3192 dsl_dataset_t *ds; 3193 dsl_dir_t *dd; 3194 dsl_pool_t *dp; 3195 dmu_object_info_t doi; 3196 struct promotearg pa = { 0 }; 3197 struct promotenode *snap; 3198 int err; 3199 3200 err = dsl_dataset_hold(name, FTAG, &ds); 3201 if (err) 3202 return (err); 3203 dd = ds->ds_dir; 3204 dp = dd->dd_pool; 3205 3206 err = dmu_object_info(dp->dp_meta_objset, 3207 ds->ds_phys->ds_snapnames_zapobj, &doi); 3208 if (err) { 3209 dsl_dataset_rele(ds, FTAG); 3210 return (err); 3211 } 3212 3213 if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { 3214 dsl_dataset_rele(ds, FTAG); 3215 return (EINVAL); 3216 } 3217 3218 /* 3219 * We are going to inherit all the snapshots taken before our 3220 * origin (i.e., our new origin will be our parent's origin). 3221 * Take ownership of them so that we can rename them into our 3222 * namespace. 
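 *
 * For example (hypothetical layout): promoting pool/clone whose
 * origin is pool/fs@s2, where pool/fs has snapshots s1 and s2,
 * moves s1 and s2 under pool/clone and leaves pool/fs as a clone
 * of pool/clone@s2.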
3223 */ 3224 rw_enter(&dp->dp_config_rwlock, RW_READER); 3225 3226 err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, 3227 &pa.shared_snaps); 3228 if (err != 0) 3229 goto out; 3230 3231 err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); 3232 if (err != 0) 3233 goto out; 3234 3235 snap = list_head(&pa.shared_snaps); 3236 ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); 3237 err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, 3238 snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); 3239 if (err != 0) 3240 goto out; 3241 3242 if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { 3243 err = dsl_dataset_hold_obj(dp, 3244 snap->ds->ds_dir->dd_phys->dd_origin_obj, 3245 FTAG, &pa.origin_origin); 3246 if (err != 0) 3247 goto out; 3248 } 3249 3250 out: 3251 rw_exit(&dp->dp_config_rwlock); 3252 3253 /* 3254 * Add in 128x the snapnames zapobj size, since we will be moving 3255 * a bunch of snapnames to the promoted ds, and dirtying their 3256 * bonus buffers. 3257 */ 3258 if (err == 0) { 3259 err = dsl_sync_task_do(dp, dsl_dataset_promote_check, 3260 dsl_dataset_promote_sync, ds, &pa, 3261 2 + 2 * doi.doi_physical_blocks_512); 3262 if (err && pa.err_ds && conflsnap) 3263 (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); 3264 } 3265 3266 snaplist_destroy(&pa.shared_snaps, B_TRUE); 3267 snaplist_destroy(&pa.clone_snaps, B_FALSE); 3268 snaplist_destroy(&pa.origin_snaps, B_FALSE); 3269 if (pa.origin_origin) 3270 dsl_dataset_rele(pa.origin_origin, FTAG); 3271 dsl_dataset_rele(ds, FTAG); 3272 return (err); 3273 } 3274 3275 struct cloneswaparg { 3276 dsl_dataset_t *cds; /* clone dataset */ 3277 dsl_dataset_t *ohds; /* origin's head dataset */ 3278 boolean_t force; 3279 int64_t unused_refres_delta; /* change in unconsumed refreservation */ 3280 }; 3281 3282 /* ARGSUSED */ 3283 static int 3284 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) 3285 { 3286 struct cloneswaparg *csa = arg1; 3287 3288 /* they should both be heads */ 3289 if (dsl_dataset_is_snapshot(csa->cds) || 3290 dsl_dataset_is_snapshot(csa->ohds)) 3291 return (EINVAL); 3292 3293 /* the branch point should be just before them */ 3294 if (csa->cds->ds_prev != csa->ohds->ds_prev) 3295 return (EINVAL); 3296 3297 /* cds should be the clone (unless they are unrelated) */ 3298 if (csa->cds->ds_prev != NULL && 3299 csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && 3300 csa->ohds->ds_object != 3301 csa->cds->ds_prev->ds_phys->ds_next_snap_obj) 3302 return (EINVAL); 3303 3304 /* the clone should be a child of the origin */ 3305 if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) 3306 return (EINVAL); 3307 3308 /* ohds shouldn't be modified unless 'force' */ 3309 if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) 3310 return (ETXTBSY); 3311 3312 /* adjust amount of any unconsumed refreservation */ 3313 csa->unused_refres_delta = 3314 (int64_t)MIN(csa->ohds->ds_reserved, 3315 csa->ohds->ds_phys->ds_unique_bytes) - 3316 (int64_t)MIN(csa->ohds->ds_reserved, 3317 csa->cds->ds_phys->ds_unique_bytes); 3318 3319 if (csa->unused_refres_delta > 0 && 3320 csa->unused_refres_delta > 3321 dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) 3322 return (ENOSPC); 3323 3324 if (csa->ohds->ds_quota != 0 && 3325 csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) 3326 return (EDQUOT); 3327 3328 return (0); 3329 } 3330 3331 /* ARGSUSED */ 3332 static void 3333 dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3334 { 3335 struct 
cloneswaparg *csa = arg1; 3336 dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; 3337 3338 ASSERT(csa->cds->ds_reserved == 0); 3339 ASSERT(csa->ohds->ds_quota == 0 || 3340 csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); 3341 3342 dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); 3343 dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); 3344 3345 if (csa->cds->ds_objset != NULL) { 3346 dmu_objset_evict(csa->cds->ds_objset); 3347 csa->cds->ds_objset = NULL; 3348 } 3349 3350 if (csa->ohds->ds_objset != NULL) { 3351 dmu_objset_evict(csa->ohds->ds_objset); 3352 csa->ohds->ds_objset = NULL; 3353 } 3354 3355 /* 3356 * Reset origin's unique bytes, if it exists. 3357 */ 3358 if (csa->cds->ds_prev) { 3359 dsl_dataset_t *origin = csa->cds->ds_prev; 3360 uint64_t comp, uncomp; 3361 3362 dmu_buf_will_dirty(origin->ds_dbuf, tx); 3363 dsl_deadlist_space_range(&csa->cds->ds_deadlist, 3364 origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, 3365 &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); 3366 } 3367 3368 /* swap blkptrs */ 3369 { 3370 blkptr_t tmp; 3371 tmp = csa->ohds->ds_phys->ds_bp; 3372 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; 3373 csa->cds->ds_phys->ds_bp = tmp; 3374 } 3375 3376 /* set dd_*_bytes */ 3377 { 3378 int64_t dused, dcomp, duncomp; 3379 uint64_t cdl_used, cdl_comp, cdl_uncomp; 3380 uint64_t odl_used, odl_comp, odl_uncomp; 3381 3382 ASSERT3U(csa->cds->ds_dir->dd_phys-> 3383 dd_used_breakdown[DD_USED_SNAP], ==, 0); 3384 3385 dsl_deadlist_space(&csa->cds->ds_deadlist, 3386 &cdl_used, &cdl_comp, &cdl_uncomp); 3387 dsl_deadlist_space(&csa->ohds->ds_deadlist, 3388 &odl_used, &odl_comp, &odl_uncomp); 3389 3390 dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used - 3391 (csa->ohds->ds_phys->ds_referenced_bytes + odl_used); 3392 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - 3393 (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); 3394 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + 3395 cdl_uncomp - 3396 (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); 3397 3398 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, 3399 dused, dcomp, duncomp, tx); 3400 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, 3401 -dused, -dcomp, -duncomp, tx); 3402 3403 /* 3404 * The difference in the space used by snapshots is the 3405 * difference in snapshot space due to the head's 3406 * deadlist (since that's the only thing that's 3407 * changing that affects the snapused). 3408 */ 3409 dsl_deadlist_space_range(&csa->cds->ds_deadlist, 3410 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, 3411 &cdl_used, &cdl_comp, &cdl_uncomp); 3412 dsl_deadlist_space_range(&csa->ohds->ds_deadlist, 3413 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, 3414 &odl_used, &odl_comp, &odl_uncomp); 3415 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, 3416 DD_USED_HEAD, DD_USED_SNAP, tx); 3417 } 3418 3419 /* swap ds_*_bytes */ 3420 SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes, 3421 csa->cds->ds_phys->ds_referenced_bytes); 3422 SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, 3423 csa->cds->ds_phys->ds_compressed_bytes); 3424 SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, 3425 csa->cds->ds_phys->ds_uncompressed_bytes); 3426 SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, 3427 csa->cds->ds_phys->ds_unique_bytes); 3428 3429 /* apply any parent delta for change in unconsumed refreservation */ 3430 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, 3431 csa->unused_refres_delta, 0, 0, tx); 3432 3433 /* 3434 * Swap deadlists. 
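 * The in-core deadlists cache their object numbers, so both are
 * closed first, the on-disk object numbers are exchanged with
 * SWITCH64, and each side is re-opened on the object it now owns.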
3435 */ 3436 dsl_deadlist_close(&csa->cds->ds_deadlist); 3437 dsl_deadlist_close(&csa->ohds->ds_deadlist); 3438 SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, 3439 csa->cds->ds_phys->ds_deadlist_obj); 3440 dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, 3441 csa->cds->ds_phys->ds_deadlist_obj); 3442 dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, 3443 csa->ohds->ds_phys->ds_deadlist_obj); 3444 3445 dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); 3446 3447 spa_history_log_internal_ds(csa->cds, "clone swap", tx, 3448 "parent=%s", csa->ohds->ds_dir->dd_myname); 3449 } 3450 3451 /* 3452 * Swap 'clone' with its origin head datasets. Used at the end of "zfs 3453 * recv" into an existing fs to swizzle the file system to the new 3454 * version, and by "zfs rollback". Can also be used to swap two 3455 * independent head datasets if neither has any snapshots. 3456 */ 3457 int 3458 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, 3459 boolean_t force) 3460 { 3461 struct cloneswaparg csa; 3462 int error; 3463 3464 ASSERT(clone->ds_owner); 3465 ASSERT(origin_head->ds_owner); 3466 retry: 3467 /* 3468 * Need exclusive access for the swap. If we're swapping these 3469 * datasets back after an error, we already hold the locks. 3470 */ 3471 if (!RW_WRITE_HELD(&clone->ds_rwlock)) 3472 rw_enter(&clone->ds_rwlock, RW_WRITER); 3473 if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && 3474 !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { 3475 rw_exit(&clone->ds_rwlock); 3476 rw_enter(&origin_head->ds_rwlock, RW_WRITER); 3477 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { 3478 rw_exit(&origin_head->ds_rwlock); 3479 goto retry; 3480 } 3481 } 3482 csa.cds = clone; 3483 csa.ohds = origin_head; 3484 csa.force = force; 3485 error = dsl_sync_task_do(clone->ds_dir->dd_pool, 3486 dsl_dataset_clone_swap_check, 3487 dsl_dataset_clone_swap_sync, &csa, NULL, 9); 3488 return (error); 3489 } 3490 3491 /* 3492 * Given a pool name and a dataset object number in that pool, 3493 * return the name of that dataset. 3494 */ 3495 int 3496 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) 3497 { 3498 spa_t *spa; 3499 dsl_pool_t *dp; 3500 dsl_dataset_t *ds; 3501 int error; 3502 3503 if ((error = spa_open(pname, &spa, FTAG)) != 0) 3504 return (error); 3505 dp = spa_get_dsl(spa); 3506 rw_enter(&dp->dp_config_rwlock, RW_READER); 3507 if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { 3508 dsl_dataset_name(ds, buf); 3509 dsl_dataset_rele(ds, FTAG); 3510 } 3511 rw_exit(&dp->dp_config_rwlock); 3512 spa_close(spa, FTAG); 3513 3514 return (error); 3515 } 3516 3517 int 3518 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, 3519 uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) 3520 { 3521 int error = 0; 3522 3523 ASSERT3S(asize, >, 0); 3524 3525 /* 3526 * *ref_rsrv is the portion of asize that will come from any 3527 * unconsumed refreservation space. 3528 */ 3529 *ref_rsrv = 0; 3530 3531 mutex_enter(&ds->ds_lock); 3532 /* 3533 * Make a space adjustment for reserved bytes. 
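 * For example (illustrative figures): with refreservation=10G and
 * only 4G unique, 6G of the charge was already prepaid by the
 * reservation, so 6G is subtracted from *used; *ref_rsrv then
 * reports how much of asize the unconsumed reservation absorbs.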
3534 */
3535 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
3536 ASSERT3U(*used, >=,
3537 ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3538 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3539 *ref_rsrv =
3540 asize - MIN(asize, parent_delta(ds, asize + inflight));
3541 }
3542
3543 if (!check_quota || ds->ds_quota == 0) {
3544 mutex_exit(&ds->ds_lock);
3545 return (0);
3546 }
3547 /*
3548 * If they are requesting more space, and our current estimate
3549 * is over quota, they get to try again unless the actual
3550 * on-disk is over quota and there are no pending changes (which
3551 * may free up space for us).
3552 */
3553 if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
3554 if (inflight > 0 ||
3555 ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
3556 error = ERESTART;
3557 else
3558 error = EDQUOT;
3559 }
3560 mutex_exit(&ds->ds_lock);
3561
3562 return (error);
3563 }
3564
3565 /* ARGSUSED */
3566 static int
3567 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
3568 {
3569 dsl_dataset_t *ds = arg1;
3570 dsl_prop_setarg_t *psa = arg2;
3571 int err;
3572
3573 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
3574 return (ENOTSUP);
3575
3576 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3577 return (err);
3578
3579 if (psa->psa_effective_value == 0)
3580 return (0);
3581
3582 if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
3583 psa->psa_effective_value < ds->ds_reserved)
3584 return (ENOSPC);
3585
3586 return (0);
3587 }
3588
3589 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
3590
3591 void
3592 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3593 {
3594 dsl_dataset_t *ds = arg1;
3595 dsl_prop_setarg_t *psa = arg2;
3596 uint64_t effective_value = psa->psa_effective_value;
3597
3598 dsl_prop_set_sync(ds, psa, tx);
3599 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3600
3601 if (ds->ds_quota != effective_value) {
3602 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3603 ds->ds_quota = effective_value;
3604 }
3605 }
3606
3607 int
3608 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
3609 {
3610 dsl_dataset_t *ds;
3611 dsl_prop_setarg_t psa;
3612 int err;
3613
3614 dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
3615
3616 err = dsl_dataset_hold(dsname, FTAG, &ds);
3617 if (err)
3618 return (err);
3619
3620 /*
3621 * If someone removes a file, then tries to set the quota, we
3622 * want to make sure the file freeing takes effect.
3623 */
3624 txg_wait_open(ds->ds_dir->dd_pool, 0);
3625
3626 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3627 dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
3628 ds, &psa, 0);
3629
3630 dsl_dataset_rele(ds, FTAG);
3631 return (err);
3632 }
3633
3634 static int
3635 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
3636 {
3637 dsl_dataset_t *ds = arg1;
3638 dsl_prop_setarg_t *psa = arg2;
3639 uint64_t effective_value;
3640 uint64_t unique;
3641 int err;
3642
3643 if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
3644 SPA_VERSION_REFRESERVATION)
3645 return (ENOTSUP);
3646
3647 if (dsl_dataset_is_snapshot(ds))
3648 return (EINVAL);
3649
3650 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3651 return (err);
3652
3653 effective_value = psa->psa_effective_value;
3654
3655 /*
3656 * If we are doing the preliminary check in open context, the
3657 * space estimates may be inaccurate.
3658 */ 3659 if (!dmu_tx_is_syncing(tx)) 3660 return (0); 3661 3662 mutex_enter(&ds->ds_lock); 3663 if (!DS_UNIQUE_IS_ACCURATE(ds)) 3664 dsl_dataset_recalc_head_uniq(ds); 3665 unique = ds->ds_phys->ds_unique_bytes; 3666 mutex_exit(&ds->ds_lock); 3667 3668 if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { 3669 uint64_t delta = MAX(unique, effective_value) - 3670 MAX(unique, ds->ds_reserved); 3671 3672 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) 3673 return (ENOSPC); 3674 if (ds->ds_quota > 0 && 3675 effective_value > ds->ds_quota) 3676 return (ENOSPC); 3677 } 3678 3679 return (0); 3680 } 3681 3682 static void 3683 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3684 { 3685 dsl_dataset_t *ds = arg1; 3686 dsl_prop_setarg_t *psa = arg2; 3687 uint64_t effective_value = psa->psa_effective_value; 3688 uint64_t unique; 3689 int64_t delta; 3690 3691 dsl_prop_set_sync(ds, psa, tx); 3692 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); 3693 3694 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3695 3696 mutex_enter(&ds->ds_dir->dd_lock); 3697 mutex_enter(&ds->ds_lock); 3698 ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); 3699 unique = ds->ds_phys->ds_unique_bytes; 3700 delta = MAX(0, (int64_t)(effective_value - unique)) - 3701 MAX(0, (int64_t)(ds->ds_reserved - unique)); 3702 ds->ds_reserved = effective_value; 3703 mutex_exit(&ds->ds_lock); 3704 3705 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); 3706 mutex_exit(&ds->ds_dir->dd_lock); 3707 } 3708 3709 int 3710 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, 3711 uint64_t reservation) 3712 { 3713 dsl_dataset_t *ds; 3714 dsl_prop_setarg_t psa; 3715 int err; 3716 3717 dsl_prop_setarg_init_uint64(&psa, "refreservation", source, 3718 &reservation); 3719 3720 err = dsl_dataset_hold(dsname, FTAG, &ds); 3721 if (err) 3722 return (err); 3723 3724 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3725 dsl_dataset_set_reservation_check, 3726 dsl_dataset_set_reservation_sync, ds, &psa, 0); 3727 3728 dsl_dataset_rele(ds, FTAG); 3729 return (err); 3730 } 3731 3732 typedef struct zfs_hold_cleanup_arg { 3733 dsl_pool_t *dp; 3734 uint64_t dsobj; 3735 char htag[MAXNAMELEN]; 3736 } zfs_hold_cleanup_arg_t; 3737 3738 static void 3739 dsl_dataset_user_release_onexit(void *arg) 3740 { 3741 zfs_hold_cleanup_arg_t *ca = arg; 3742 3743 (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, 3744 B_TRUE); 3745 kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); 3746 } 3747 3748 void 3749 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, 3750 minor_t minor) 3751 { 3752 zfs_hold_cleanup_arg_t *ca; 3753 3754 ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); 3755 ca->dp = ds->ds_dir->dd_pool; 3756 ca->dsobj = ds->ds_object; 3757 (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); 3758 VERIFY3U(0, ==, zfs_onexit_add_cb(minor, 3759 dsl_dataset_user_release_onexit, ca, NULL)); 3760 } 3761 3762 /* 3763 * If you add new checks here, you may need to add 3764 * additional checks to the "temporary" case in 3765 * snapshot_check() in dmu_objset.c. 
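 *
 * Illustrative usage (sketch; arguments hypothetical, error
 * handling elided):
 *
 *	error = dsl_dataset_user_hold("pool/fs", "snap", "mytag",
 *	    B_FALSE, B_FALSE, -1);
 *
 * Holds are keyed by tag in the snapshot's userrefs ZAP, so a
 * second hold with the same tag fails the check below with EEXIST.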

static int
dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_prop_setarg_t *psa = arg2;
	uint64_t effective_value;
	uint64_t unique;
	int err;

	if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
	    SPA_VERSION_REFRESERVATION)
		return (ENOTSUP);

	if (dsl_dataset_is_snapshot(ds))
		return (EINVAL);

	if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
		return (err);

	effective_value = psa->psa_effective_value;

	/*
	 * If we are doing the preliminary check in open context, the
	 * space estimates may be inaccurate.
	 */
	if (!dmu_tx_is_syncing(tx))
		return (0);

	mutex_enter(&ds->ds_lock);
	if (!DS_UNIQUE_IS_ACCURATE(ds))
		dsl_dataset_recalc_head_uniq(ds);
	unique = ds->ds_phys->ds_unique_bytes;
	mutex_exit(&ds->ds_lock);

	if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
		uint64_t delta = MAX(unique, effective_value) -
		    MAX(unique, ds->ds_reserved);

		if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
			return (ENOSPC);
		if (ds->ds_quota > 0 && effective_value > ds->ds_quota)
			return (ENOSPC);
	}

	return (0);
}

static void
dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dsl_prop_setarg_t *psa = arg2;
	uint64_t effective_value = psa->psa_effective_value;
	uint64_t unique;
	int64_t delta;

	dsl_prop_set_sync(ds, psa, tx);
	DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);

	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	mutex_enter(&ds->ds_dir->dd_lock);
	mutex_enter(&ds->ds_lock);
	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
	unique = ds->ds_phys->ds_unique_bytes;
	delta = MAX(0, (int64_t)(effective_value - unique)) -
	    MAX(0, (int64_t)(ds->ds_reserved - unique));
	ds->ds_reserved = effective_value;
	mutex_exit(&ds->ds_lock);

	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
	mutex_exit(&ds->ds_dir->dd_lock);
}

int
dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
    uint64_t reservation)
{
	dsl_dataset_t *ds;
	dsl_prop_setarg_t psa;
	int err;

	dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
	    &reservation);

	err = dsl_dataset_hold(dsname, FTAG, &ds);
	if (err)
		return (err);

	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    dsl_dataset_set_reservation_check,
	    dsl_dataset_set_reservation_sync, ds, &psa, 0);

	dsl_dataset_rele(ds, FTAG);
	return (err);
}
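
/*
 * Worked example for the delta computed in
 * dsl_dataset_set_reservation_sync() (hypothetical numbers): with
 * unique = 2G, raising refreservation from 3G to 5G yields
 * MAX(0, 5G - 2G) - MAX(0, 3G - 2G) = 2G of additional DD_USED_REFRSRV
 * space charged to the dsl_dir; only the part of the reservation not
 * already backed by the dataset's unique bytes consumes extra space.
 */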

typedef struct zfs_hold_cleanup_arg {
	dsl_pool_t *dp;
	uint64_t dsobj;
	char htag[MAXNAMELEN];
} zfs_hold_cleanup_arg_t;

static void
dsl_dataset_user_release_onexit(void *arg)
{
	zfs_hold_cleanup_arg_t *ca = arg;

	(void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag,
	    B_TRUE);
	kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
}

void
dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
    minor_t minor)
{
	zfs_hold_cleanup_arg_t *ca;

	ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP);
	ca->dp = ds->ds_dir->dd_pool;
	ca->dsobj = ds->ds_object;
	(void) strlcpy(ca->htag, htag, sizeof (ca->htag));
	VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
	    dsl_dataset_user_release_onexit, ca, NULL));
}
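
/*
 * Temporary holds are typically registered this way by long-running
 * operations such as zfs send: the hold is tied to a minor of the
 * /dev/zfs control device (via cleanup_fd), so if the process exits
 * or dies without releasing the hold, the onexit callback above
 * releases it automatically.
 */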
*/ 3963 }; 3964 3965 static int 3966 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, 3967 boolean_t *might_destroy) 3968 { 3969 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3970 uint64_t zapobj; 3971 uint64_t tmp; 3972 int error; 3973 3974 *might_destroy = B_FALSE; 3975 3976 mutex_enter(&ds->ds_lock); 3977 zapobj = ds->ds_phys->ds_userrefs_obj; 3978 if (zapobj == 0) { 3979 /* The tag can't possibly exist */ 3980 mutex_exit(&ds->ds_lock); 3981 return (ESRCH); 3982 } 3983 3984 /* Make sure the tag exists */ 3985 error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); 3986 if (error) { 3987 mutex_exit(&ds->ds_lock); 3988 if (error == ENOENT) 3989 error = ESRCH; 3990 return (error); 3991 } 3992 3993 if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && 3994 DS_IS_DEFER_DESTROY(ds)) 3995 *might_destroy = B_TRUE; 3996 3997 mutex_exit(&ds->ds_lock); 3998 return (0); 3999 } 4000 4001 static int 4002 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) 4003 { 4004 struct dsl_ds_releasearg *ra = arg1; 4005 dsl_dataset_t *ds = ra->ds; 4006 boolean_t might_destroy; 4007 int error; 4008 4009 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) 4010 return (ENOTSUP); 4011 4012 error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); 4013 if (error) 4014 return (error); 4015 4016 if (might_destroy) { 4017 struct dsl_ds_destroyarg dsda = {0}; 4018 4019 if (dmu_tx_is_syncing(tx)) { 4020 /* 4021 * If we're not prepared to remove the snapshot, 4022 * we can't allow the release to happen right now. 4023 */ 4024 if (!ra->own) 4025 return (EBUSY); 4026 } 4027 dsda.ds = ds; 4028 dsda.releasing = B_TRUE; 4029 return (dsl_dataset_destroy_check(&dsda, tag, tx)); 4030 } 4031 4032 return (0); 4033 } 4034 4035 static void 4036 dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) 4037 { 4038 struct dsl_ds_releasearg *ra = arg1; 4039 dsl_dataset_t *ds = ra->ds; 4040 dsl_pool_t *dp = ds->ds_dir->dd_pool; 4041 objset_t *mos = dp->dp_meta_objset; 4042 uint64_t zapobj; 4043 uint64_t refs; 4044 int error; 4045 4046 mutex_enter(&ds->ds_lock); 4047 ds->ds_userrefs--; 4048 refs = ds->ds_userrefs; 4049 mutex_exit(&ds->ds_lock); 4050 error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); 4051 VERIFY(error == 0 || error == ENOENT); 4052 zapobj = ds->ds_phys->ds_userrefs_obj; 4053 VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); 4054 4055 spa_history_log_internal_ds(ds, "release", tx, 4056 "tag = %s refs now = %lld", ra->htag, (longlong_t)refs); 4057 4058 if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && 4059 DS_IS_DEFER_DESTROY(ds)) { 4060 struct dsl_ds_destroyarg dsda = {0}; 4061 4062 ASSERT(ra->own); 4063 dsda.ds = ds; 4064 dsda.releasing = B_TRUE; 4065 /* We already did the destroy_check */ 4066 dsl_dataset_destroy_sync(&dsda, tag, tx); 4067 } 4068 } 4069 4070 static int 4071 dsl_dataset_user_release_one(const char *dsname, void *arg) 4072 { 4073 struct dsl_ds_holdarg *ha = arg; 4074 struct dsl_ds_releasearg *ra; 4075 dsl_dataset_t *ds; 4076 int error; 4077 void *dtag = ha->dstg; 4078 char *name; 4079 boolean_t own = B_FALSE; 4080 boolean_t might_destroy; 4081 4082 /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ 4083 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 4084 error = dsl_dataset_hold(name, dtag, &ds); 4085 strfree(name); 4086 if (error == ENOENT && ha->recursive) 4087 return (0); 4088 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 4089 if (error) 4090 

static int
dsl_dataset_user_hold_one(const char *dsname, void *arg)
{
	struct dsl_ds_holdarg *ha = arg;
	dsl_dataset_t *ds;
	int error;
	char *name;

	/* alloc a buffer to hold dsname@snapname plus terminating NUL */
	name = kmem_asprintf("%s@%s", dsname, ha->snapname);
	error = dsl_dataset_hold(name, ha->dstg, &ds);
	strfree(name);
	if (error == 0) {
		ha->gotone = B_TRUE;
		dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
		    dsl_dataset_user_hold_sync, ds, ha, 0);
	} else if (error == ENOENT && ha->recursive) {
		error = 0;
	} else {
		(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
	}
	return (error);
}

int
dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
    boolean_t temphold)
{
	struct dsl_ds_holdarg *ha;
	int error;

	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
	ha->htag = htag;
	ha->temphold = temphold;
	error = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
	    ds, ha, 0);
	kmem_free(ha, sizeof (struct dsl_ds_holdarg));

	return (error);
}

int
dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
    boolean_t recursive, boolean_t temphold, int cleanup_fd)
{
	struct dsl_ds_holdarg *ha;
	dsl_sync_task_t *dst;
	spa_t *spa;
	int error;
	minor_t minor = 0;

	if (cleanup_fd != -1) {
		/* Currently we only support cleanup-on-exit of tempholds. */
		if (!temphold)
			return (EINVAL);
		error = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (error)
			return (error);
	}

	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);

	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));

	error = spa_open(dsname, &spa, FTAG);
	if (error) {
		kmem_free(ha, sizeof (struct dsl_ds_holdarg));
		if (cleanup_fd != -1)
			zfs_onexit_fd_rele(cleanup_fd);
		return (error);
	}

	ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	ha->htag = htag;
	ha->snapname = snapname;
	ha->recursive = recursive;
	ha->temphold = temphold;

	if (recursive) {
		error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
		    ha, DS_FIND_CHILDREN);
	} else {
		error = dsl_dataset_user_hold_one(dsname, ha);
	}
	if (error == 0)
		error = dsl_sync_task_group_wait(ha->dstg);

	for (dst = list_head(&ha->dstg->dstg_tasks); dst;
	    dst = list_next(&ha->dstg->dstg_tasks, dst)) {
		dsl_dataset_t *ds = dst->dst_arg1;

		if (dst->dst_err) {
			dsl_dataset_name(ds, ha->failed);
			*strchr(ha->failed, '@') = '\0';
		} else if (error == 0 && minor != 0 && temphold) {
			/*
			 * If this hold is to be released upon process exit,
			 * register that action now.
			 */
			dsl_register_onexit_hold_cleanup(ds, htag, minor);
		}
		dsl_dataset_rele(ds, ha->dstg);
	}

	if (error == 0 && recursive && !ha->gotone)
		error = ENOENT;

	if (error)
		(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));

	dsl_sync_task_group_destroy(ha->dstg);

	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
	spa_close(spa, FTAG);
	if (cleanup_fd != -1)
		zfs_onexit_fd_rele(cleanup_fd);
	return (error);
}
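
/*
 * Usage sketch (hypothetical arguments): to place a temporary hold
 * named "mytag" on pool/fs@snap and on the same-named snapshot of
 * every descendant filesystem, released automatically when the
 * cleanup_fd minor goes away:
 *
 *	error = dsl_dataset_user_hold("pool/fs", "snap", "mytag",
 *	    B_TRUE, B_TRUE, cleanup_fd);
 */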

struct dsl_ds_releasearg {
	dsl_dataset_t *ds;
	const char *htag;
	boolean_t own;		/* do we own or just hold ds? */
};

static int
dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
    boolean_t *might_destroy)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t zapobj;
	uint64_t tmp;
	int error;

	*might_destroy = B_FALSE;

	mutex_enter(&ds->ds_lock);
	zapobj = ds->ds_phys->ds_userrefs_obj;
	if (zapobj == 0) {
		/* The tag can't possibly exist */
		mutex_exit(&ds->ds_lock);
		return (ESRCH);
	}

	/* Make sure the tag exists */
	error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
	if (error) {
		mutex_exit(&ds->ds_lock);
		if (error == ENOENT)
			error = ESRCH;
		return (error);
	}

	if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
	    DS_IS_DEFER_DESTROY(ds))
		*might_destroy = B_TRUE;

	mutex_exit(&ds->ds_lock);
	return (0);
}

static int
dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
{
	struct dsl_ds_releasearg *ra = arg1;
	dsl_dataset_t *ds = ra->ds;
	boolean_t might_destroy;
	int error;

	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
		return (ENOTSUP);

	error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
	if (error)
		return (error);

	if (might_destroy) {
		struct dsl_ds_destroyarg dsda = {0};

		if (dmu_tx_is_syncing(tx)) {
			/*
			 * If we're not prepared to remove the snapshot,
			 * we can't allow the release to happen right now.
			 */
			if (!ra->own)
				return (EBUSY);
		}
		dsda.ds = ds;
		dsda.releasing = B_TRUE;
		return (dsl_dataset_destroy_check(&dsda, tag, tx));
	}

	return (0);
}

static void
dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
	struct dsl_ds_releasearg *ra = arg1;
	dsl_dataset_t *ds = ra->ds;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj;
	uint64_t refs;
	int error;

	mutex_enter(&ds->ds_lock);
	ds->ds_userrefs--;
	refs = ds->ds_userrefs;
	mutex_exit(&ds->ds_lock);
	error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
	VERIFY(error == 0 || error == ENOENT);
	zapobj = ds->ds_phys->ds_userrefs_obj;
	VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));

	spa_history_log_internal_ds(ds, "release", tx,
	    "tag = %s refs now = %lld", ra->htag, (longlong_t)refs);

	if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
	    DS_IS_DEFER_DESTROY(ds)) {
		struct dsl_ds_destroyarg dsda = {0};

		ASSERT(ra->own);
		dsda.ds = ds;
		dsda.releasing = B_TRUE;
		/* We already did the destroy_check */
		dsl_dataset_destroy_sync(&dsda, tag, tx);
	}
}
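
/*
 * Example of the deferred-destroy interaction above: after
 * "zfs destroy -d pool/fs@snap", the snapshot lingers as long as user
 * holds remain (DS_IS_DEFER_DESTROY() is true).  When the release in
 * dsl_dataset_user_release_sync() drops the last hold (ds_userrefs
 * reaches 0) and the snapshot has no clones (ds_num_children == 1),
 * the destroy completes in the same syncing transaction.
 */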

static int
dsl_dataset_user_release_one(const char *dsname, void *arg)
{
	struct dsl_ds_holdarg *ha = arg;
	struct dsl_ds_releasearg *ra;
	dsl_dataset_t *ds;
	int error;
	void *dtag = ha->dstg;
	char *name;
	boolean_t own = B_FALSE;
	boolean_t might_destroy;

	/* alloc a buffer to hold dsname@snapname, plus the terminating NUL */
	name = kmem_asprintf("%s@%s", dsname, ha->snapname);
	error = dsl_dataset_hold(name, dtag, &ds);
	strfree(name);
	if (error == ENOENT && ha->recursive)
		return (0);
	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
	if (error)
		return (error);

	ha->gotone = B_TRUE;

	ASSERT(dsl_dataset_is_snapshot(ds));

	error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
	if (error) {
		dsl_dataset_rele(ds, dtag);
		return (error);
	}

	if (might_destroy) {
#ifdef _KERNEL
		name = kmem_asprintf("%s@%s", dsname, ha->snapname);
		error = zfs_unmount_snap(name, NULL);
		strfree(name);
		if (error) {
			dsl_dataset_rele(ds, dtag);
			return (error);
		}
#endif
		if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
			dsl_dataset_rele(ds, dtag);
			return (EBUSY);
		} else {
			own = B_TRUE;
			dsl_dataset_make_exclusive(ds, dtag);
		}
	}

	ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
	ra->ds = ds;
	ra->htag = ha->htag;
	ra->own = own;
	dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
	    dsl_dataset_user_release_sync, ra, dtag, 0);

	return (0);
}

int
dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
    boolean_t recursive)
{
	struct dsl_ds_holdarg *ha;
	dsl_sync_task_t *dst;
	spa_t *spa;
	int error;

top:
	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);

	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));

	error = spa_open(dsname, &spa, FTAG);
	if (error) {
		kmem_free(ha, sizeof (struct dsl_ds_holdarg));
		return (error);
	}

	ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	ha->htag = htag;
	ha->snapname = snapname;
	ha->recursive = recursive;
	if (recursive) {
		error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
		    ha, DS_FIND_CHILDREN);
	} else {
		error = dsl_dataset_user_release_one(dsname, ha);
	}
	if (error == 0)
		error = dsl_sync_task_group_wait(ha->dstg);

	for (dst = list_head(&ha->dstg->dstg_tasks); dst;
	    dst = list_next(&ha->dstg->dstg_tasks, dst)) {
		struct dsl_ds_releasearg *ra = dst->dst_arg1;
		dsl_dataset_t *ds = ra->ds;

		if (dst->dst_err)
			dsl_dataset_name(ds, ha->failed);

		if (ra->own)
			dsl_dataset_disown(ds, ha->dstg);
		else
			dsl_dataset_rele(ds, ha->dstg);

		kmem_free(ra, sizeof (struct dsl_ds_releasearg));
	}

	if (error == 0 && recursive && !ha->gotone)
		error = ENOENT;

	if (error && error != EBUSY)
		(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));

	dsl_sync_task_group_destroy(ha->dstg);
	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
	spa_close(spa, FTAG);

	/*
	 * We can get EBUSY if we were racing with deferred destroy and
	 * dsl_dataset_user_release_check() hadn't done the necessary
	 * open context setup.  We can also get EBUSY if we're racing
	 * with destroy and that thread is the ds_owner.  Either way
	 * the busy condition should be transient, and we should retry
	 * the release operation.
	 */
	if (error == EBUSY)
		goto top;

	return (error);
}

/*
 * Called at spa_load time (with retry == B_FALSE) to release a stale
 * temporary user hold.  Also called by the onexit code (with retry ==
 * B_TRUE).
 */
int
dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
    boolean_t retry)
{
	dsl_dataset_t *ds;
	char *snap;
	char *name;
	int namelen;
	int error;

	do {
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
		rw_exit(&dp->dp_config_rwlock);
		if (error)
			return (error);
		namelen = dsl_dataset_namelen(ds) + 1;
		name = kmem_alloc(namelen, KM_SLEEP);
		dsl_dataset_name(ds, name);
		dsl_dataset_rele(ds, FTAG);

		snap = strchr(name, '@');
		*snap = '\0';
		++snap;
		error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
		kmem_free(name, namelen);

		/*
		 * The object can't have been destroyed because we have a hold,
		 * but it might have been renamed, resulting in ENOENT.  Retry
		 * if we've been requested to do so.
		 *
		 * It would be nice if we could use the dsobj all the way
		 * through and avoid ENOENT entirely.  But we might need to
		 * unmount the snapshot, and there's currently no way to lookup
		 * a vfsp using a ZFS object id.
		 */
	} while ((error == ENOENT) && retry);

	return (error);
}
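
/*
 * Fill in *nvp with one entry per user hold on the given snapshot:
 * the hold's tag mapped to its creation time (seconds since the
 * epoch), as recorded by dsl_dataset_user_hold_sync().
 */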
int
dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
{
	dsl_dataset_t *ds;
	int err;

	err = dsl_dataset_hold(dsname, FTAG, &ds);
	if (err)
		return (err);

	VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
	if (ds->ds_phys->ds_userrefs_obj != 0) {
		zap_attribute_t *za;
		zap_cursor_t zc;

		za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
		for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
		    ds->ds_phys->ds_userrefs_obj);
		    zap_cursor_retrieve(&zc, za) == 0;
		    zap_cursor_advance(&zc)) {
			VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
			    za->za_first_integer));
		}
		zap_cursor_fini(&zc);
		kmem_free(za, sizeof (zap_attribute_t));
	}
	dsl_dataset_rele(ds, FTAG);
	return (0);
}

/*
 * Note, this function is used as the callback for dmu_objset_find().  We
 * always return 0 so that we will continue to find and process
 * inconsistent datasets, even if we encounter an error trying to
 * process one of them.
 */
/* ARGSUSED */
int
dsl_destroy_inconsistent(const char *dsname, void *arg)
{
	dsl_dataset_t *ds;

	if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
		if (DS_IS_INCONSISTENT(ds))
			(void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
		else
			dsl_dataset_disown(ds, FTAG);
	}
	return (0);
}

/*
 * Return (in *usedp) the amount of space written in new that is not
 * present in oldsnap.  New may be a snapshot or the head.  Old must be
 * a snapshot before new, in new's filesystem (or its origin).  If not then
 * fail and return EINVAL.
 *
 * The written space is calculated by considering two components:  First, we
 * ignore any freed space, and calculate the written as new's used space
 * minus old's used space.  Next, we add in the amount of space that was freed
 * between the two snapshots, thus reducing new's used space relative to old's.
 * Specifically, this is the space that was born before old->ds_creation_txg,
 * and freed before new (ie. on new's deadlist or a previous deadlist).
 *
 * space freed                        [---------------------]
 * snapshots       ---O-------O--------O-------O------
 *                         oldsnap            new
 */
int
dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	int err = 0;
	uint64_t snapobj;
	dsl_pool_t *dp = new->ds_dir->dd_pool;

	*usedp = 0;
	*usedp += new->ds_phys->ds_referenced_bytes;
	*usedp -= oldsnap->ds_phys->ds_referenced_bytes;

	*compp = 0;
	*compp += new->ds_phys->ds_compressed_bytes;
	*compp -= oldsnap->ds_phys->ds_compressed_bytes;

	*uncompp = 0;
	*uncompp += new->ds_phys->ds_uncompressed_bytes;
	*uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	snapobj = new->ds_object;
	while (snapobj != oldsnap->ds_object) {
		dsl_dataset_t *snap;
		uint64_t used, comp, uncomp;

		if (snapobj == new->ds_object) {
			snap = new;
		} else {
			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
			if (err != 0)
				break;
		}

		if (snap->ds_phys->ds_prev_snap_txg ==
		    oldsnap->ds_phys->ds_creation_txg) {
			/*
			 * The blocks in the deadlist cannot be born after
			 * ds_prev_snap_txg, so get the whole deadlist space,
			 * which is more efficient (especially for old-format
			 * deadlists).  Unfortunately the deadlist code
			 * doesn't have enough information to make this
			 * optimization itself.
			 */
			dsl_deadlist_space(&snap->ds_deadlist,
			    &used, &comp, &uncomp);
		} else {
			dsl_deadlist_space_range(&snap->ds_deadlist,
			    0, oldsnap->ds_phys->ds_creation_txg,
			    &used, &comp, &uncomp);
		}
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;

		/*
		 * If we get to the beginning of the chain of snapshots
		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
		 * was not a snapshot of/before new.
		 */
		snapobj = snap->ds_phys->ds_prev_snap_obj;
		if (snap != new)
			dsl_dataset_rele(snap, FTAG);
		if (snapobj == 0) {
			err = EINVAL;
			break;
		}
	}
	rw_exit(&dp->dp_config_rwlock);
	return (err);
}

/*
 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
 * lastsnap, and all snapshots in between are deleted.
 *
 * blocks that would be freed           [---------------------------]
 * snapshots       ---O-------O--------O-------O--------O
 *                       firstsnap        lastsnap
 *
 * This is the set of blocks that were born after the snap before firstsnap,
 * (birth > firstsnap->prev_snap_txg) and died before the snap after the
 * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
 * We calculate this by iterating over the relevant deadlists (from the snap
 * after lastsnap, backward to the snap after firstsnap), summing up the
 * space on the deadlist that was born after the snap before firstsnap.
 */
int
dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
    dsl_dataset_t *lastsnap,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	int err = 0;
	uint64_t snapobj;
	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;

	ASSERT(dsl_dataset_is_snapshot(firstsnap));
	ASSERT(dsl_dataset_is_snapshot(lastsnap));

	/*
	 * Check that the snapshots are in the same dsl_dir, and firstsnap
	 * is before lastsnap.
	 */
	if (firstsnap->ds_dir != lastsnap->ds_dir ||
	    firstsnap->ds_phys->ds_creation_txg >
	    lastsnap->ds_phys->ds_creation_txg)
		return (EINVAL);

	*usedp = *compp = *uncompp = 0;

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	snapobj = lastsnap->ds_phys->ds_next_snap_obj;
	while (snapobj != firstsnap->ds_object) {
		dsl_dataset_t *ds;
		uint64_t used, comp, uncomp;

		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
		if (err != 0)
			break;

		dsl_deadlist_space_range(&ds->ds_deadlist,
		    firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
		    &used, &comp, &uncomp);
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;

		snapobj = ds->ds_phys->ds_prev_snap_obj;
		ASSERT3U(snapobj, !=, 0);
		dsl_dataset_rele(ds, FTAG);
	}
	rw_exit(&dp->dp_config_rwlock);
	return (err);
}
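
/*
 * Worked example (hypothetical txgs): given snapshots A (txg 100),
 * B (200), C (300), D (400) with firstsnap = B and lastsnap = C, we
 * start at D (C's ds_next_snap_obj) and walk back through C, summing
 * from each deadlist the space born after txg 100 (the snap before
 * firstsnap).  Blocks still referenced by A were born at or before
 * txg 100 and are excluded; blocks still referenced by D are not on
 * these deadlists at all, so they are excluded as well.
 */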