/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/unique.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_onexit.h>
#include <sys/zvol.h>
#include <sys/dsl_scan.h>
#include <sys/dsl_deadlist.h>

static char *dsl_reaper = "the grim reaper";

static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;

#define	SWITCH64(x, y) \
{ \
	uint64_t __tmp = (x); \
	(x) = (y); \
	(y) = __tmp; \
}

#define	DS_REF_MAX	(1ULL << 62)

#define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE

#define	DSL_DATASET_IS_DESTROYED(ds)	((ds)->ds_owner == dsl_reaper)


/*
 * Figure out how much of this delta should be propagated to the dsl_dir
 * layer.  If there's a refreservation, that space has already been
 * partially accounted for in our ancestors.
 */
static int64_t
parent_delta(dsl_dataset_t *ds, int64_t delta)
{
	uint64_t old_bytes, new_bytes;

	if (ds->ds_reserved == 0)
		return (delta);

	old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
	new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);

	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
	return (new_bytes - old_bytes);
}
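/*
 * Worked example (illustrative, not from the original source): with
 * ds_reserved == 100M and ds_unique_bytes == 60M, the dataset is already
 * charged 100M to its ancestors.  A +30M delta moves unique_bytes to 90M,
 * still under the reservation, so MAX(90, 100) - MAX(60, 100) == 0 and
 * nothing propagates upward.  A +50M delta moves unique_bytes to 110M, so
 * MAX(110, 100) - MAX(60, 100) == 10M is all the dsl_dir layer still
 * needs to hear about.
 */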
void
dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);
	int64_t delta;

	dprintf_bp(bp, "ds=%p", ds);

	ASSERT(dmu_tx_is_syncing(tx));
	/* It could have been compressed away to nothing */
	if (BP_IS_HOLE(bp))
		return;
	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
	if (ds == NULL) {
		dsl_pool_mos_diduse_space(tx->tx_pool,
		    used, compressed, uncompressed);
		return;
	}
	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	mutex_enter(&ds->ds_dir->dd_lock);
	mutex_enter(&ds->ds_lock);
	delta = parent_delta(ds, used);
	ds->ds_phys->ds_referenced_bytes += used;
	ds->ds_phys->ds_compressed_bytes += compressed;
	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
	ds->ds_phys->ds_unique_bytes += used;
	mutex_exit(&ds->ds_lock);
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
	    compressed, uncompressed, tx);
	dsl_dir_transfer_space(ds->ds_dir, used - delta,
	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
	mutex_exit(&ds->ds_dir->dd_lock);
}

int
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
    boolean_t async)
{
	if (BP_IS_HOLE(bp))
		return (0);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(bp->blk_birth <= tx->tx_txg);

	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);

	ASSERT(used > 0);
	if (ds == NULL) {
		dsl_free(tx->tx_pool, tx->tx_txg, bp);
		dsl_pool_mos_diduse_space(tx->tx_pool,
		    -used, -compressed, -uncompressed);
		return (used);
	}
	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

	ASSERT(!dsl_dataset_is_snapshot(ds));
	dmu_buf_will_dirty(ds->ds_dbuf, tx);

	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
		int64_t delta;

		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
		dsl_free(tx->tx_pool, tx->tx_txg, bp);

		mutex_enter(&ds->ds_dir->dd_lock);
		mutex_enter(&ds->ds_lock);
		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
		    !DS_UNIQUE_IS_ACCURATE(ds));
		delta = parent_delta(ds, -used);
		ds->ds_phys->ds_unique_bytes -= used;
		mutex_exit(&ds->ds_lock);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
		    delta, -compressed, -uncompressed, tx);
		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
		mutex_exit(&ds->ds_dir->dd_lock);
	} else {
		dprintf_bp(bp, "putting on dead list: %s", "");
		if (async) {
			/*
			 * We are here as part of zio's write done callback,
			 * which means we're a zio interrupt thread.  We can't
			 * call dsl_deadlist_insert() now because it may block
			 * waiting for I/O.  Instead, put bp on the deferred
			 * queue and let dsl_pool_sync() finish the job.
			 */
			bplist_append(&ds->ds_pending_deadlist, bp);
		} else {
			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
		}
		ASSERT3U(ds->ds_prev->ds_object, ==,
		    ds->ds_phys->ds_prev_snap_obj);
		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
		    ds->ds_object && bp->blk_birth >
		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			mutex_enter(&ds->ds_prev->ds_lock);
			ds->ds_prev->ds_phys->ds_unique_bytes += used;
			mutex_exit(&ds->ds_prev->ds_lock);
		}
		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
			dsl_dir_transfer_space(ds->ds_dir, used,
			    DD_USED_HEAD, DD_USED_SNAP, tx);
		}
	}
	mutex_enter(&ds->ds_lock);
	ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
	ds->ds_phys->ds_referenced_bytes -= used;
	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
	ds->ds_phys->ds_compressed_bytes -= compressed;
	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
	mutex_exit(&ds->ds_lock);

	return (used);
}

uint64_t
dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
{
	uint64_t trysnap = 0;

	if (ds == NULL)
		return (0);
	/*
	 * The snapshot creation could fail, but that would cause an
	 * incorrect FALSE return, which would only result in an
	 * overestimation of the amount of space that an operation would
	 * consume, which is OK.
	 *
	 * There's also a small window where we could miss a pending
	 * snapshot, because we could set the sync task in the quiescing
	 * phase.  So this should only be used as a guess.
	 */
	if (ds->ds_trysnap_txg >
	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
		trysnap = ds->ds_trysnap_txg;
	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
}

boolean_t
dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
    uint64_t blk_birth)
{
	if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
		return (B_FALSE);

	ddt_prefetch(dsl_dataset_get_spa(ds), bp);

	return (B_TRUE);
}
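/*
 * Illustrative txg example (not from the original source): if the most
 * recent snapshot was taken at txg 100, a block born at txg 150 is unique
 * to the head, so dsl_dataset_block_kill() frees it immediately and
 * credits the space back.  A block born at txg 90 is still referenced by
 * the snapshot, so it cannot be freed yet; it goes on the deadlist and is
 * only deducted from what the head references.
 */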
/* ARGSUSED */
static void
dsl_dataset_evict(dmu_buf_t *db, void *dsv)
{
	dsl_dataset_t *ds = dsv;

	ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));

	unique_remove(ds->ds_fsid_guid);

	if (ds->ds_objset != NULL)
		dmu_objset_evict(ds->ds_objset);

	if (ds->ds_prev) {
		dsl_dataset_drop_ref(ds->ds_prev, ds);
		ds->ds_prev = NULL;
	}

	bplist_destroy(&ds->ds_pending_deadlist);
	if (db != NULL) {
		dsl_deadlist_close(&ds->ds_deadlist);
	} else {
		ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
		ASSERT(!ds->ds_deadlist.dl_oldfmt);
	}
	if (ds->ds_dir)
		dsl_dir_close(ds->ds_dir, ds);

	ASSERT(!list_link_active(&ds->ds_synced_link));

	mutex_destroy(&ds->ds_lock);
	mutex_destroy(&ds->ds_recvlock);
	mutex_destroy(&ds->ds_opening_lock);
	rw_destroy(&ds->ds_rwlock);
	cv_destroy(&ds->ds_exclusive_cv);

	kmem_free(ds, sizeof (dsl_dataset_t));
}

static int
dsl_dataset_get_snapname(dsl_dataset_t *ds)
{
	dsl_dataset_phys_t *headphys;
	int err;
	dmu_buf_t *headdbuf;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;

	if (ds->ds_snapname[0])
		return (0);
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (0);

	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
	    FTAG, &headdbuf);
	if (err)
		return (err);
	headphys = headdbuf->db_data;
	err = zap_value_search(dp->dp_meta_objset,
	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
	dmu_buf_rele(headdbuf, FTAG);
	return (err);
}

static int
dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
	    value, mt, NULL, 0, NULL);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_lookup(mos, snapobj, name, 8, 1, value);
	return (err);
}

static int
dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
	matchtype_t mt;
	int err;

	dsl_dir_snap_cmtime_update(ds->ds_dir);

	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_FIRST;
	else
		mt = MT_EXACT;

	err = zap_remove_norm(mos, snapobj, name, mt, tx);
	if (err == ENOTSUP && mt == MT_FIRST)
		err = zap_remove(mos, snapobj, name, tx);

	if (err == 0)
		dsl_snapcount_adjust(ds->ds_dir, tx, -1, B_TRUE);

	return (err);
}

static int
dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	objset_t *mos = dp->dp_meta_objset;
	dmu_buf_t *dbuf;
	dsl_dataset_t *ds;
	int err;
	dmu_object_info_t doi;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));

	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
	if (err)
		return (err);

	/* Make sure dsobj has the correct object type. */
	dmu_object_info_from_db(dbuf, &doi);
	if (doi.doi_type != DMU_OT_DSL_DATASET) {
		/* don't leak the bonus buffer hold on the error path */
		dmu_buf_rele(dbuf, tag);
		return (EINVAL);
	}

	ds = dmu_buf_get_user(dbuf);
	if (ds == NULL) {
		dsl_dataset_t *winner;

		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
		ds->ds_dbuf = dbuf;
		ds->ds_object = dsobj;
		ds->ds_phys = dbuf->db_data;

		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);

		rw_init(&ds->ds_rwlock, 0, 0, 0);
		cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);

		bplist_create(&ds->ds_pending_deadlist);
		dsl_deadlist_open(&ds->ds_deadlist,
		    mos, ds->ds_phys->ds_deadlist_obj);

		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
		    offsetof(dmu_sendarg_t, dsa_link));

		if (err == 0) {
			err = dsl_dir_open_obj(dp,
			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
		}
		if (err) {
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_recvlock);
			mutex_destroy(&ds->ds_opening_lock);
			rw_destroy(&ds->ds_rwlock);
			cv_destroy(&ds->ds_exclusive_cv);
			bplist_destroy(&ds->ds_pending_deadlist);
			dsl_deadlist_close(&ds->ds_deadlist);
			kmem_free(ds, sizeof (dsl_dataset_t));
			dmu_buf_rele(dbuf, tag);
			return (err);
		}

		if (!dsl_dataset_is_snapshot(ds)) {
			ds->ds_snapname[0] = '\0';
			if (ds->ds_phys->ds_prev_snap_obj) {
				err = dsl_dataset_get_ref(dp,
				    ds->ds_phys->ds_prev_snap_obj,
				    ds, &ds->ds_prev);
			}
		} else {
			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
				err = dsl_dataset_get_snapname(ds);
			if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
				err = zap_count(
				    ds->ds_dir->dd_pool->dp_meta_objset,
				    ds->ds_phys->ds_userrefs_obj,
				    &ds->ds_userrefs);
			}
		}

		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
			/*
			 * In sync context, we're called with either no lock
			 * or with the write lock.  If we're not syncing,
			 * we're always called with the read lock held.
			 */
			boolean_t need_lock =
			    !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
			    dsl_pool_sync_context(dp);

			if (need_lock)
				rw_enter(&dp->dp_config_rwlock, RW_READER);

			err = dsl_prop_get_ds(ds,
			    "refreservation", sizeof (uint64_t), 1,
			    &ds->ds_reserved, NULL);
			if (err == 0) {
				err = dsl_prop_get_ds(ds,
				    "refquota", sizeof (uint64_t), 1,
				    &ds->ds_quota, NULL);
			}

			if (need_lock)
				rw_exit(&dp->dp_config_rwlock);
		} else {
			ds->ds_reserved = ds->ds_quota = 0;
		}

		if (err == 0) {
			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
			    dsl_dataset_evict);
		}
		if (err || winner) {
			bplist_destroy(&ds->ds_pending_deadlist);
			dsl_deadlist_close(&ds->ds_deadlist);
			if (ds->ds_prev)
				dsl_dataset_drop_ref(ds->ds_prev, ds);
			dsl_dir_close(ds->ds_dir, ds);
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_recvlock);
			mutex_destroy(&ds->ds_opening_lock);
			rw_destroy(&ds->ds_rwlock);
			cv_destroy(&ds->ds_exclusive_cv);
			kmem_free(ds, sizeof (dsl_dataset_t));
			if (err) {
				dmu_buf_rele(dbuf, tag);
				return (err);
			}
			ds = winner;
		} else {
			ds->ds_fsid_guid =
			    unique_insert(ds->ds_phys->ds_fsid_guid);
		}
	}
	ASSERT3P(ds->ds_dbuf, ==, dbuf);
	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
	mutex_enter(&ds->ds_lock);
	if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
		mutex_exit(&ds->ds_lock);
		dmu_buf_rele(ds->ds_dbuf, tag);
		return (ENOENT);
	}
	mutex_exit(&ds->ds_lock);
	*dsp = ds;
	return (0);
}

static int
dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/*
	 * In syncing context we don't want the rwlock lock: there
	 * may be an existing writer waiting for sync phase to
	 * finish.  We don't need to worry about such writers, since
	 * sync phase is single-threaded, so the writer can't be
	 * doing anything while we are active.
	 */
	if (dsl_pool_sync_context(dp)) {
		ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
		return (0);
	}

	/*
	 * Normal users will hold the ds_rwlock as a READER until they
	 * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
	 * drop their READER lock after they set the ds_owner field.
	 *
	 * If the dataset is being destroyed, the destroy thread will
	 * obtain a WRITER lock for exclusive access after it's done its
	 * open-context work and then change the ds_owner to
	 * dsl_reaper once destruction is assured.  So threads
	 * may block here temporarily, until the "destructability" of
	 * the dataset is determined.
	 */
	ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
	mutex_enter(&ds->ds_lock);
	while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
		rw_exit(&dp->dp_config_rwlock);
		cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
		if (DSL_DATASET_IS_DESTROYED(ds)) {
			mutex_exit(&ds->ds_lock);
			dsl_dataset_drop_ref(ds, tag);
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			return (ENOENT);
		}
		/*
		 * The dp_config_rwlock lives above the ds_lock.  And
		 * we need to check DSL_DATASET_IS_DESTROYED() while
		 * holding the ds_lock, so we have to drop and reacquire
		 * the ds_lock here.
		 */
		mutex_exit(&ds->ds_lock);
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		mutex_enter(&ds->ds_lock);
	}
	mutex_exit(&ds->ds_lock);
	return (0);
}

int
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);

	if (err)
		return (err);
	return (dsl_dataset_hold_ref(*dsp, tag));
}

int
dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
    void *tag, dsl_dataset_t **dsp)
{
	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
	if (err)
		return (err);
	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
		dsl_dataset_rele(*dsp, tag);
		*dsp = NULL;
		return (EBUSY);
	}
	return (0);
}

int
dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	const char *snapname;
	uint64_t obj;
	int err = 0;

	err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
	if (err)
		return (err);

	dp = dd->dd_pool;
	obj = dd->dd_phys->dd_head_dataset_obj;
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	if (obj)
		err = dsl_dataset_get_ref(dp, obj, tag, dsp);
	else
		err = ENOENT;
	if (err)
		goto out;

	err = dsl_dataset_hold_ref(*dsp, tag);

	/* we may be looking for a snapshot */
	if (err == 0 && snapname != NULL) {
		dsl_dataset_t *ds = NULL;

		if (*snapname++ != '@') {
			dsl_dataset_rele(*dsp, tag);
			err = ENOENT;
			goto out;
		}

		dprintf("looking for snapshot '%s'\n", snapname);
		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
		if (err == 0)
			err = dsl_dataset_get_ref(dp, obj, tag, &ds);
		dsl_dataset_rele(*dsp, tag);

		ASSERT3U((err == 0), ==, (ds != NULL));

		if (ds) {
			mutex_enter(&ds->ds_lock);
			if (ds->ds_snapname[0] == 0)
				(void) strlcpy(ds->ds_snapname, snapname,
				    sizeof (ds->ds_snapname));
			mutex_exit(&ds->ds_lock);
			err = dsl_dataset_hold_ref(ds, tag);
			*dsp = err ? NULL : ds;
		}
	}
out:
	rw_exit(&dp->dp_config_rwlock);
	dsl_dir_close(dd, FTAG);
	return (err);
}

int
dsl_dataset_own(const char *name, boolean_t inconsistentok,
    void *tag, dsl_dataset_t **dsp)
{
	int err = dsl_dataset_hold(name, tag, dsp);
	if (err)
		return (err);
	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
		dsl_dataset_rele(*dsp, tag);
		return (EBUSY);
	}
	return (0);
}
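/*
 * Usage sketch (illustrative, not from the original source): a read-only
 * consumer pairs dsl_dataset_hold() with dsl_dataset_rele(), while a
 * consumer that needs exclusive use (e.g. destroy or receive) pairs
 * dsl_dataset_own() with dsl_dataset_disown():
 *
 *	dsl_dataset_t *ds;
 *	if (dsl_dataset_hold("pool/fs@snap", FTAG, &ds) == 0) {
 *		... inspect ds ...
 *		dsl_dataset_rele(ds, FTAG);
 *	}
 *
 *	if (dsl_dataset_own("pool/fs", B_FALSE, FTAG, &ds) == 0) {
 *		... ds->ds_owner == FTAG; other tryown callers get EBUSY ...
 *		dsl_dataset_disown(ds, FTAG);
 *	}
 */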
void
dsl_dataset_name(dsl_dataset_t *ds, char *name)
{
	if (ds == NULL) {
		(void) strcpy(name, "mos");
	} else {
		dsl_dir_name(ds->ds_dir, name);
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		if (ds->ds_snapname[0]) {
			(void) strcat(name, "@");
			/*
			 * We use a "recursive" mutex so that we
			 * can call dprintf_ds() with ds_lock held.
			 */
			if (!MUTEX_HELD(&ds->ds_lock)) {
				mutex_enter(&ds->ds_lock);
				(void) strcat(name, ds->ds_snapname);
				mutex_exit(&ds->ds_lock);
			} else {
				(void) strcat(name, ds->ds_snapname);
			}
		}
	}
}

static int
dsl_dataset_namelen(dsl_dataset_t *ds)
{
	int result;

	if (ds == NULL) {
		result = 3;	/* "mos" */
	} else {
		result = dsl_dir_namelen(ds->ds_dir);
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		if (ds->ds_snapname[0]) {
			++result;	/* adding one for the @-sign */
			if (!MUTEX_HELD(&ds->ds_lock)) {
				mutex_enter(&ds->ds_lock);
				result += strlen(ds->ds_snapname);
				mutex_exit(&ds->ds_lock);
			} else {
				result += strlen(ds->ds_snapname);
			}
		}
	}

	return (result);
}

void
dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
{
	dmu_buf_rele(ds->ds_dbuf, tag);
}

void
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
{
	if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
		rw_exit(&ds->ds_rwlock);
	}
	dsl_dataset_drop_ref(ds, tag);
}

void
dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
{
	ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
	    (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));

	mutex_enter(&ds->ds_lock);
	ds->ds_owner = NULL;
	if (RW_WRITE_HELD(&ds->ds_rwlock)) {
		rw_exit(&ds->ds_rwlock);
		cv_broadcast(&ds->ds_exclusive_cv);
	}
	mutex_exit(&ds->ds_lock);
	if (ds->ds_dbuf)
		dsl_dataset_drop_ref(ds, tag);
	else
		dsl_dataset_evict(NULL, ds);
}

boolean_t
dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
{
	boolean_t gotit = FALSE;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_owner == NULL &&
	    (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
		ds->ds_owner = tag;
		if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
			rw_exit(&ds->ds_rwlock);
		gotit = TRUE;
	}
	mutex_exit(&ds->ds_lock);
	return (gotit);
}

void
dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
{
	ASSERT3P(owner, ==, ds->ds_owner);
	if (!RW_WRITE_HELD(&ds->ds_rwlock))
		rw_enter(&ds->ds_rwlock, RW_WRITER);
}

uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
    uint64_t flags, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dd->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj;
	objset_t *mos = dp->dp_meta_objset;

	if (origin == NULL)
		origin = dp->dp_origin_snap;

	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	dsphys = dbuf->db_data;
	bzero(dsphys, sizeof (dsl_dataset_phys_t));
	dsphys->ds_dir_obj = dd->dd_object;
	dsphys->ds_flags = flags;
	dsphys->ds_fsid_guid = unique_create();
	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
	    sizeof (dsphys->ds_guid));
	dsphys->ds_snapnames_zapobj =
	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
	    DMU_OT_NONE, 0, tx);
	dsphys->ds_creation_time = gethrestime_sec();
	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;

	if (origin == NULL) {
		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
	} else {
		dsl_dataset_t *ohds;

		dsphys->ds_prev_snap_obj = origin->ds_object;
		dsphys->ds_prev_snap_txg =
		    origin->ds_phys->ds_creation_txg;
		dsphys->ds_referenced_bytes =
		    origin->ds_phys->ds_referenced_bytes;
		dsphys->ds_compressed_bytes =
		    origin->ds_phys->ds_compressed_bytes;
		dsphys->ds_uncompressed_bytes =
		    origin->ds_phys->ds_uncompressed_bytes;
		dsphys->ds_bp = origin->ds_phys->ds_bp;
		dsphys->ds_flags |= origin->ds_phys->ds_flags;

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		origin->ds_phys->ds_num_children++;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
		    origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
		dsl_dataset_rele(ohds, FTAG);

		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
			if (origin->ds_phys->ds_next_clones_obj == 0) {
				origin->ds_phys->ds_next_clones_obj =
				    zap_create(mos,
				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
			}
			VERIFY(0 == zap_add_int(mos,
			    origin->ds_phys->ds_next_clones_obj,
			    dsobj, tx));
		}

		dmu_buf_will_dirty(dd->dd_dbuf, tx);
		dd->dd_phys->dd_origin_obj = origin->ds_object;
		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
			if (origin->ds_dir->dd_phys->dd_clones == 0) {
				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
				origin->ds_dir->dd_phys->dd_clones =
				    zap_create(mos,
				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
			}
			VERIFY3U(0, ==, zap_add_int(mos,
			    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
		}
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

	dmu_buf_rele(dbuf, FTAG);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	dd->dd_phys->dd_head_dataset_obj = dsobj;

	return (dsobj);
}

uint64_t
dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
{
	dsl_pool_t *dp = pdd->dd_pool;
	uint64_t dsobj, ddobj;
	dsl_dir_t *dd;

	ASSERT(lastname[0] != '@');

	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));

	dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);

	dsl_deleg_set_create_perms(dd, tx, cr);

	dsl_dir_close(dd, FTAG);

	/*
	 * If we are creating a clone, make sure we zero out any stale
	 * data from the origin snapshot's zil header.
	 */
	if (origin != NULL) {
		dsl_dataset_t *ds;
		objset_t *os;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
		VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
		bzero(&os->os_zil_header, sizeof (os->os_zil_header));
		dsl_dataset_dirty(ds, tx);
		dsl_dataset_rele(ds, FTAG);
	}

	return (dsobj);
}

/*
 * The snapshots must all be in the same pool.
 */
int
dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer,
    nvlist_t *errlist)
{
	int err;
	dsl_sync_task_t *dst;
	spa_t *spa;
	nvpair_t *pair;
	dsl_sync_task_group_t *dstg;

	pair = nvlist_next_nvpair(snaps, NULL);
	if (pair == NULL)
		return (0);

	err = spa_open(nvpair_name(pair), &spa, FTAG);
	if (err)
		return (err);
	dstg = dsl_sync_task_group_create(spa_get_dsl(spa));

	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(snaps, pair)) {
		dsl_dataset_t *ds;

		err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
		if (err == 0) {
			struct dsl_ds_destroyarg *dsda;

			dsl_dataset_make_exclusive(ds, dstg);
			dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
			    KM_SLEEP);
			dsda->ds = ds;
			dsda->defer = defer;
			dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
			    dsl_dataset_destroy_sync, dsda, dstg, 0);
		} else if (err == ENOENT) {
			err = 0;
		} else {
			fnvlist_add_int32(errlist, nvpair_name(pair), err);
			break;
		}
	}

	if (err == 0)
		err = dsl_sync_task_group_wait(dstg);

	for (dst = list_head(&dstg->dstg_tasks); dst;
	    dst = list_next(&dstg->dstg_tasks, dst)) {
		struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
		dsl_dataset_t *ds = dsda->ds;

		/*
		 * Return the snapshots that triggered the error.
		 */
		if (dst->dst_err != 0) {
			char name[ZFS_MAXNAMELEN];
			dsl_dataset_name(ds, name);
			fnvlist_add_int32(errlist, name, dst->dst_err);
		}
		ASSERT3P(dsda->rm_origin, ==, NULL);
		dsl_dataset_disown(ds, dstg);
		kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
	}

	dsl_sync_task_group_destroy(dstg);
	spa_close(spa, FTAG);
	return (err);
}
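/*
 * Caller sketch (illustrative, not from the original source): a caller
 * builds an nvlist whose pair names are full snapshot names, all in one
 * pool, and collects per-snapshot errors in errlist:
 *
 *	nvlist_t *snaps = fnvlist_alloc();
 *	nvlist_t *errlist = fnvlist_alloc();
 *	fnvlist_add_boolean(snaps, "tank/fs@monday");
 *	fnvlist_add_boolean(snaps, "tank/fs@tuesday");
 *	err = dmu_snapshots_destroy_nvl(snaps, B_FALSE, errlist);
 *	fnvlist_free(snaps);
 *	fnvlist_free(errlist);
 *
 * Only the pair names are read, so the value type does not matter.
 */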
static boolean_t
dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
{
	boolean_t might_destroy = B_FALSE;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
	    DS_IS_DEFER_DESTROY(ds))
		might_destroy = B_TRUE;
	mutex_exit(&ds->ds_lock);

	return (might_destroy);
}

/*
 * If we're removing a clone, and these three conditions are true:
 *	1) the clone's origin has no other children
 *	2) the clone's origin has no user references
 *	3) the clone's origin has been marked for deferred destruction
 * Then, prepare to remove the origin as part of this sync task group.
 */
static int
dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
{
	dsl_dataset_t *ds = dsda->ds;
	dsl_dataset_t *origin = ds->ds_prev;

	if (dsl_dataset_might_destroy_origin(origin)) {
		char *name;
		int namelen;
		int error;

		namelen = dsl_dataset_namelen(origin) + 1;
		name = kmem_alloc(namelen, KM_SLEEP);
		dsl_dataset_name(origin, name);
#ifdef _KERNEL
		error = zfs_unmount_snap(name, NULL);
		if (error) {
			kmem_free(name, namelen);
			return (error);
		}
#endif
		error = dsl_dataset_own(name, B_TRUE, tag, &origin);
		kmem_free(name, namelen);
		if (error)
			return (error);
		dsda->rm_origin = origin;
		dsl_dataset_make_exclusive(origin, tag);
	}

	return (0);
}

/*
 * ds must be opened as OWNER.  On return (whether successful or not),
 * ds will be closed and caller can no longer dereference it.
 */
int
dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
{
	int err;
	dsl_sync_task_group_t *dstg;
	objset_t *os;
	dsl_dir_t *dd;
	uint64_t obj;
	struct dsl_ds_destroyarg dsda = { 0 };

	dsda.ds = ds;

	if (dsl_dataset_is_snapshot(ds)) {
		/* Destroying a snapshot is simpler */
		dsl_dataset_make_exclusive(ds, tag);

		dsda.defer = defer;
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
		    &dsda, tag, 0);
		ASSERT3P(dsda.rm_origin, ==, NULL);
		goto out;
	} else if (defer) {
		err = EINVAL;
		goto out;
	}

	dd = ds->ds_dir;

	if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
		/*
		 * Check for errors and mark this ds as inconsistent, in
		 * case we crash while freeing the objects.
		 */
		err = dsl_sync_task_do(dd->dd_pool,
		    dsl_dataset_destroy_begin_check,
		    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
		if (err)
			goto out;

		err = dmu_objset_from_ds(ds, &os);
		if (err)
			goto out;

		/*
		 * Remove all objects while in the open context so that
		 * there is less work to do in the syncing context.
		 */
		for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
		    ds->ds_phys->ds_prev_snap_txg)) {
			/*
			 * Ignore errors, if there is not enough disk space
			 * we will deal with it in dsl_dataset_destroy_sync().
			 */
			(void) dmu_free_object(os, obj);
		}
		if (err != ESRCH)
			goto out;

		/*
		 * Sync out all in-flight IO.
		 */
		txg_wait_synced(dd->dd_pool, 0);

		/*
		 * If we managed to free all the objects in open
		 * context, the user space accounting should be zero.
		 */
		if (ds->ds_phys->ds_bp.blk_fill == 0 &&
		    dmu_objset_userused_enabled(os)) {
			uint64_t count;

			ASSERT(zap_count(os, DMU_USERUSED_OBJECT,
			    &count) != 0 || count == 0);
			ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT,
			    &count) != 0 || count == 0);
		}
	}

	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
	err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
	rw_exit(&dd->dd_pool->dp_config_rwlock);

	if (err)
		goto out;

	/*
	 * Blow away the dsl_dir + head dataset.
	 */
	dsl_dataset_make_exclusive(ds, tag);
	/*
	 * If we're removing a clone, we might also need to remove its
	 * origin.
	 */
	do {
		dsda.need_prep = B_FALSE;
		if (dsl_dir_is_clone(dd)) {
			err = dsl_dataset_origin_rm_prep(&dsda, tag);
			if (err) {
				dsl_dir_close(dd, FTAG);
				goto out;
			}
		}

		dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
		dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
		    dsl_dataset_destroy_sync, &dsda, tag, 0);
		dsl_sync_task_create(dstg, dsl_dir_destroy_check,
		    dsl_dir_destroy_sync, dd, tag, 0);
		err = dsl_sync_task_group_wait(dstg);
		dsl_sync_task_group_destroy(dstg);

		/*
		 * We could be racing against 'zfs release' or 'zfs destroy -d'
		 * on the origin snap, in which case we can get EBUSY if we
		 * needed to destroy the origin snap but were not ready to
		 * do so.
		 */
		if (dsda.need_prep) {
			ASSERT(err == EBUSY);
			ASSERT(dsl_dir_is_clone(dd));
			ASSERT(dsda.rm_origin == NULL);
		}
	} while (dsda.need_prep);

	if (dsda.rm_origin != NULL)
		dsl_dataset_disown(dsda.rm_origin, tag);

	/* if it is successful, dsl_dir_destroy_sync will close the dd */
	if (err)
		dsl_dir_close(dd, FTAG);
out:
	dsl_dataset_disown(ds, tag);
	return (err);
}

blkptr_t *
dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
	return (&ds->ds_phys->ds_bp);
}

void
dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	/* If it's the meta-objset, set dp_meta_rootbp */
	if (ds == NULL) {
		tx->tx_pool->dp_meta_rootbp = *bp;
	} else {
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_bp = *bp;
	}
}

spa_t *
dsl_dataset_get_spa(dsl_dataset_t *ds)
{
	return (ds->ds_dir->dd_pool->dp_spa);
}

void
dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp;

	if (ds == NULL) /* this is the meta-objset */
		return;

	ASSERT(ds->ds_objset != NULL);

	if (ds->ds_phys->ds_next_snap_obj != 0)
		panic("dirtying snapshot!");

	dp = ds->ds_dir->dd_pool;

	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
		/* up the hold count until we can be written out */
		dmu_buf_add_ref(ds->ds_dbuf, ds);
	}
}

boolean_t
dsl_dataset_is_dirty(dsl_dataset_t *ds)
{
	for (int t = 0; t < TXG_SIZE; t++) {
		if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
		    ds, t))
			return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * The unique space in the head dataset can be calculated by subtracting
 * the space used in the most recent snapshot, that is still being used
 * in this file system, from the space currently in use.  To figure out
 * the space in the most recent snapshot still in use, we need to take
 * the total space used in the snapshot and subtract out the space that
 * has been freed up since the snapshot was taken.
 */
static void
dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
{
	uint64_t mrs_used;
	uint64_t dlused, dlcomp, dluncomp;

	ASSERT(!dsl_dataset_is_snapshot(ds));

	if (ds->ds_phys->ds_prev_snap_obj != 0)
		mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
	else
		mrs_used = 0;

	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);

	ASSERT3U(dlused, <=, mrs_used);
	ds->ds_phys->ds_unique_bytes =
	    ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);

	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
	    SPA_VERSION_UNIQUE_ACCURATE)
		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}
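/*
 * Numeric example (illustrative, not from the original source): if the
 * head references 10G, the most recent snapshot references 6G, and 2G of
 * the snapshot's blocks have since been freed from the head (i.e. they
 * sit on the head's deadlist), then the snapshot still shares
 * 6G - 2G = 4G with the head, and unique_bytes = 10G - 4G = 6G.
 */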
struct killarg {
	dsl_dataset_t *ds;
	dmu_tx_t *tx;
};

/* ARGSUSED */
static int
kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct killarg *ka = arg;
	dmu_tx_t *tx = ka->tx;

	if (bp == NULL)
		return (0);

	if (zb->zb_level == ZB_ZIL_LEVEL) {
		ASSERT(zilog != NULL);
		/*
		 * It's a block in the intent log.  It has no
		 * accounting, so just free it.
		 */
		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
	} else {
		ASSERT(zilog == NULL);
		ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
	}

	return (0);
}

/* ARGSUSED */
static int
dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count;
	int err;

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (EBUSY);

	/*
	 * This is really a dsl_dir thing, but check it here so that
	 * we'll be less likely to leave this dataset inconsistent &
	 * nearly destroyed.
	 */
	err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
	if (err)
		return (err);
	if (count != 0)
		return (EEXIST);

	return (0);
}

/* ARGSUSED */
static void
dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;

	/* Mark it as inconsistent on-disk, in case we crash */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	spa_history_log_internal_ds(ds, "destroy begin", tx, "");
}

static int
dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
    dmu_tx_t *tx)
{
	dsl_dataset_t *ds = dsda->ds;
	dsl_dataset_t *ds_prev = ds->ds_prev;

	if (dsl_dataset_might_destroy_origin(ds_prev)) {
		struct dsl_ds_destroyarg ndsda = {0};

		/*
		 * If we're not prepared to remove the origin, don't remove
		 * the clone either.
		 */
		if (dsda->rm_origin == NULL) {
			dsda->need_prep = B_TRUE;
			return (EBUSY);
		}

		ndsda.ds = ds_prev;
		ndsda.is_origin_rm = B_TRUE;
		return (dsl_dataset_destroy_check(&ndsda, tag, tx));
	}

	/*
	 * If we're not going to remove the origin after all,
	 * undo the open context setup.
	 */
	if (dsda->rm_origin != NULL) {
		dsl_dataset_disown(dsda->rm_origin, tag);
		dsda->rm_origin = NULL;
	}

	return (0);
}

/*
 * If you add new checks here, you may need to add
 * additional checks to the "temporary" case in
 * snapshot_check() in dmu_objset.c.
 */
/* ARGSUSED */
int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	struct dsl_ds_destroyarg *dsda = arg1;
	dsl_dataset_t *ds = dsda->ds;

	/* we have an owner hold, so no one else can destroy us */
	ASSERT(!DSL_DATASET_IS_DESTROYED(ds));

	/*
	 * Only allow deferred destroy on pools that support it.
	 * NOTE: deferred destroy is only supported on snapshots.
	 */
	if (dsda->defer) {
		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
		    SPA_VERSION_USERREFS)
			return (ENOTSUP);
		ASSERT(dsl_dataset_is_snapshot(ds));
		return (0);
	}

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (EBUSY);

	/*
	 * If we made changes this txg, traverse_dsl_dataset won't find
	 * them.  Try again.
	 */
	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
		return (EAGAIN);

	if (dsl_dataset_is_snapshot(ds)) {
		/*
		 * If this snapshot has an elevated user reference count,
		 * we can't destroy it yet.
		 */
		if (ds->ds_userrefs > 0 && !dsda->releasing)
			return (EBUSY);

		mutex_enter(&ds->ds_lock);
		/*
		 * Can't delete a branch point.  However, if we're destroying
		 * a clone and removing its origin due to it having a user
		 * hold count of 0 and having been marked for deferred destroy,
		 * it's OK for the origin to have a single clone.
		 */
		if (ds->ds_phys->ds_num_children >
		    (dsda->is_origin_rm ? 2 : 1)) {
			mutex_exit(&ds->ds_lock);
			return (EEXIST);
		}
		mutex_exit(&ds->ds_lock);
	} else if (dsl_dir_is_clone(ds->ds_dir)) {
		return (dsl_dataset_origin_check(dsda, arg2, tx));
	}

	/* XXX we should do some i/o error checking... */
	return (0);
}

struct refsarg {
	kmutex_t lock;
	boolean_t gone;
	kcondvar_t cv;
};

/* ARGSUSED */
static void
dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
{
	struct refsarg *arg = argv;

	mutex_enter(&arg->lock);
	arg->gone = TRUE;
	cv_signal(&arg->cv);
	mutex_exit(&arg->lock);
}

static void
dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
{
	struct refsarg arg;

	mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
	arg.gone = FALSE;
	(void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
	    dsl_dataset_refs_gone);
	dmu_buf_rele(ds->ds_dbuf, tag);
	mutex_enter(&arg.lock);
	while (!arg.gone)
		cv_wait(&arg.cv, &arg.lock);
	ASSERT(arg.gone);
	mutex_exit(&arg.lock);
	ds->ds_dbuf = NULL;
	ds->ds_phys = NULL;
	mutex_destroy(&arg.lock);
	cv_destroy(&arg.cv);
}

static void
remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count;
	int err;

	ASSERT(ds->ds_phys->ds_num_children >= 2);
	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
	/*
	 * The err should not be ENOENT, but a bug in a previous version
	 * of the code could cause upgrade_clones_cb() to not set
	 * ds_next_snap_obj when it should, leading to a missing entry.
	 * If we knew that the pool was created after
	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
	 * ENOENT.  However, at least we can check that we don't have
	 * too many entries in the next_clones_obj even after failing to
	 * remove this one.
	 */
	if (err != ENOENT) {
		VERIFY0(err);
	}
	ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
	    &count));
	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
}

static void
dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;

	/*
	 * If it is the old version, dd_clones doesn't exist so we can't
	 * find the clones, but deadlist_remove_key() is a no-op so it
	 * doesn't matter.
	 */
	if (ds->ds_dir->dd_phys->dd_clones == 0)
		return;

	for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		dsl_dataset_t *clone;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
		    za.za_first_integer, FTAG, &clone));
		if (clone->ds_dir->dd_origin_txg > mintxg) {
			dsl_deadlist_remove_key(&clone->ds_deadlist,
			    mintxg, tx);
			dsl_dataset_remove_clones_key(clone, mintxg, tx);
		}
		dsl_dataset_rele(clone, FTAG);
	}
	zap_cursor_fini(&zc);
}

struct process_old_arg {
	dsl_dataset_t *ds;
	dsl_dataset_t *ds_prev;
	boolean_t after_branch_point;
	zio_t *pio;
	uint64_t used, comp, uncomp;
};

static int
process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	struct process_old_arg *poa = arg;
	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;

	if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
		if (poa->ds_prev && !poa->after_branch_point &&
		    bp->blk_birth >
		    poa->ds_prev->ds_phys->ds_prev_snap_txg) {
			poa->ds_prev->ds_phys->ds_unique_bytes +=
			    bp_get_dsize_sync(dp->dp_spa, bp);
		}
	} else {
		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
		poa->comp += BP_GET_PSIZE(bp);
		poa->uncomp += BP_GET_UCSIZE(bp);
		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
	}
	return (0);
}

static void
process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
{
	struct process_old_arg poa = { 0 };
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;

	ASSERT(ds->ds_deadlist.dl_oldfmt);
	ASSERT(ds_next->ds_deadlist.dl_oldfmt);

	poa.ds = ds;
	poa.ds_prev = ds_prev;
	poa.after_branch_point = after_branch_point;
	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
	    process_old_cb, &poa, tx));
	VERIFY0(zio_wait(poa.pio));
	ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);

	/* change snapused */
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
	    -poa.used, -poa.comp, -poa.uncomp, tx);

	/* swap next's deadlist to our deadlist */
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_close(&ds_next->ds_deadlist);
	SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
	    ds->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
	    ds_next->ds_phys->ds_deadlist_obj);
}
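/*
 * Sketch of the old-format deadlist handling above (illustrative, not
 * from the original source): let the dying snapshot S own deadlist D1
 * and the next snapshot own D2.  Every entry of D2 born at or before S's
 * previous snapshot is still referenced and is re-inserted into D1 by
 * process_old_cb(); entries born later were unique to S and are freed
 * outright.  SWITCH64 then trades the object numbers, so the next
 * snapshot keeps the merged D1 and S is left holding the fully-drained
 * D2, which the caller frees together with S.
 */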
ka; 1614 1615 /* 1616 * Free everything that we point to (that's born after 1617 * the previous snapshot, if we are a clone) 1618 * 1619 * NB: this should be very quick, because we already 1620 * freed all the objects in open context. 1621 */ 1622 ka.ds = ds; 1623 ka.tx = tx; 1624 err = traverse_dataset(ds, 1625 ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, 1626 kill_blkptr, &ka); 1627 ASSERT0(err); 1628 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); 1629 1630 return (err); 1631 } 1632 1633 void 1634 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) 1635 { 1636 struct dsl_ds_destroyarg *dsda = arg1; 1637 dsl_dataset_t *ds = dsda->ds; 1638 int err; 1639 int after_branch_point = FALSE; 1640 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1641 objset_t *mos = dp->dp_meta_objset; 1642 dsl_dataset_t *ds_prev = NULL; 1643 boolean_t wont_destroy; 1644 uint64_t obj; 1645 1646 wont_destroy = (dsda->defer && 1647 (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)); 1648 1649 ASSERT(ds->ds_owner || wont_destroy); 1650 ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); 1651 ASSERT(ds->ds_prev == NULL || 1652 ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); 1653 ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); 1654 1655 if (wont_destroy) { 1656 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 1657 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1658 ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; 1659 spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); 1660 return; 1661 } 1662 1663 /* We need to log before removing it from the namespace. */ 1664 spa_history_log_internal_ds(ds, "destroy", tx, ""); 1665 1666 /* signal any waiters that this dataset is going away */ 1667 mutex_enter(&ds->ds_lock); 1668 ds->ds_owner = dsl_reaper; 1669 cv_broadcast(&ds->ds_exclusive_cv); 1670 mutex_exit(&ds->ds_lock); 1671 1672 /* Remove our reservation */ 1673 if (ds->ds_reserved != 0) { 1674 dsl_prop_setarg_t psa; 1675 uint64_t value = 0; 1676 1677 dsl_prop_setarg_init_uint64(&psa, "refreservation", 1678 (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), 1679 &value); 1680 psa.psa_effective_value = 0; /* predict default value */ 1681 1682 dsl_dataset_set_reservation_sync(ds, &psa, tx); 1683 ASSERT0(ds->ds_reserved); 1684 } 1685 1686 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); 1687 1688 dsl_scan_ds_destroyed(ds, tx); 1689 1690 obj = ds->ds_object; 1691 1692 if (ds->ds_phys->ds_prev_snap_obj != 0) { 1693 if (ds->ds_prev) { 1694 ds_prev = ds->ds_prev; 1695 } else { 1696 VERIFY(0 == dsl_dataset_hold_obj(dp, 1697 ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); 1698 } 1699 after_branch_point = 1700 (ds_prev->ds_phys->ds_next_snap_obj != obj); 1701 1702 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); 1703 if (after_branch_point && 1704 ds_prev->ds_phys->ds_next_clones_obj != 0) { 1705 remove_from_next_clones(ds_prev, obj, tx); 1706 if (ds->ds_phys->ds_next_snap_obj != 0) { 1707 VERIFY(0 == zap_add_int(mos, 1708 ds_prev->ds_phys->ds_next_clones_obj, 1709 ds->ds_phys->ds_next_snap_obj, tx)); 1710 } 1711 } 1712 if (after_branch_point && 1713 ds->ds_phys->ds_next_snap_obj == 0) { 1714 /* This clone is toast. */ 1715 ASSERT(ds_prev->ds_phys->ds_num_children > 1); 1716 ds_prev->ds_phys->ds_num_children--; 1717 1718 /* 1719 * If the clone's origin has no other clones, no 1720 * user holds, and has been marked for deferred 1721 * deletion, then we should have done the necessary 1722 * destroy setup for it. 
1723 */ 1724 if (ds_prev->ds_phys->ds_num_children == 1 && 1725 ds_prev->ds_userrefs == 0 && 1726 DS_IS_DEFER_DESTROY(ds_prev)) { 1727 ASSERT3P(dsda->rm_origin, !=, NULL); 1728 } else { 1729 ASSERT3P(dsda->rm_origin, ==, NULL); 1730 } 1731 } else if (!after_branch_point) { 1732 ds_prev->ds_phys->ds_next_snap_obj = 1733 ds->ds_phys->ds_next_snap_obj; 1734 } 1735 } 1736 1737 if (dsl_dataset_is_snapshot(ds)) { 1738 dsl_dataset_t *ds_next; 1739 uint64_t old_unique; 1740 uint64_t used = 0, comp = 0, uncomp = 0; 1741 1742 VERIFY(0 == dsl_dataset_hold_obj(dp, 1743 ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); 1744 ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); 1745 1746 old_unique = ds_next->ds_phys->ds_unique_bytes; 1747 1748 dmu_buf_will_dirty(ds_next->ds_dbuf, tx); 1749 ds_next->ds_phys->ds_prev_snap_obj = 1750 ds->ds_phys->ds_prev_snap_obj; 1751 ds_next->ds_phys->ds_prev_snap_txg = 1752 ds->ds_phys->ds_prev_snap_txg; 1753 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 1754 ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); 1755 1756 1757 if (ds_next->ds_deadlist.dl_oldfmt) { 1758 process_old_deadlist(ds, ds_prev, ds_next, 1759 after_branch_point, tx); 1760 } else { 1761 /* Adjust prev's unique space. */ 1762 if (ds_prev && !after_branch_point) { 1763 dsl_deadlist_space_range(&ds_next->ds_deadlist, 1764 ds_prev->ds_phys->ds_prev_snap_txg, 1765 ds->ds_phys->ds_prev_snap_txg, 1766 &used, &comp, &uncomp); 1767 ds_prev->ds_phys->ds_unique_bytes += used; 1768 } 1769 1770 /* Adjust snapused. */ 1771 dsl_deadlist_space_range(&ds_next->ds_deadlist, 1772 ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, 1773 &used, &comp, &uncomp); 1774 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, 1775 -used, -comp, -uncomp, tx); 1776 1777 /* Move blocks to be freed to pool's free list. */ 1778 dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, 1779 &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, 1780 tx); 1781 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, 1782 DD_USED_HEAD, used, comp, uncomp, tx); 1783 1784 /* Merge our deadlist into next's and free it. */ 1785 dsl_deadlist_merge(&ds_next->ds_deadlist, 1786 ds->ds_phys->ds_deadlist_obj, tx); 1787 } 1788 dsl_deadlist_close(&ds->ds_deadlist); 1789 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); 1790 1791 /* Collapse range in clone heads */ 1792 dsl_dataset_remove_clones_key(ds, 1793 ds->ds_phys->ds_creation_txg, tx); 1794 1795 if (dsl_dataset_is_snapshot(ds_next)) { 1796 dsl_dataset_t *ds_nextnext; 1797 1798 /* 1799 * Update next's unique to include blocks which 1800 * were previously shared by only this snapshot 1801 * and it. Those blocks will be born after the 1802 * prev snap and before this snap, and will have 1803 * died after the next snap and before the one 1804 * after that (ie. be on the snap after next's 1805 * deadlist). 1806 */ 1807 VERIFY(0 == dsl_dataset_hold_obj(dp, 1808 ds_next->ds_phys->ds_next_snap_obj, 1809 FTAG, &ds_nextnext)); 1810 dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, 1811 ds->ds_phys->ds_prev_snap_txg, 1812 ds->ds_phys->ds_creation_txg, 1813 &used, &comp, &uncomp); 1814 ds_next->ds_phys->ds_unique_bytes += used; 1815 dsl_dataset_rele(ds_nextnext, FTAG); 1816 ASSERT3P(ds_next->ds_prev, ==, NULL); 1817 1818 /* Collapse range in this head. 
*/ 1819 dsl_dataset_t *hds; 1820 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, 1821 ds->ds_dir->dd_phys->dd_head_dataset_obj, 1822 FTAG, &hds)); 1823 dsl_deadlist_remove_key(&hds->ds_deadlist, 1824 ds->ds_phys->ds_creation_txg, tx); 1825 dsl_dataset_rele(hds, FTAG); 1826 1827 } else { 1828 ASSERT3P(ds_next->ds_prev, ==, ds); 1829 dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); 1830 ds_next->ds_prev = NULL; 1831 if (ds_prev) { 1832 VERIFY(0 == dsl_dataset_get_ref(dp, 1833 ds->ds_phys->ds_prev_snap_obj, 1834 ds_next, &ds_next->ds_prev)); 1835 } 1836 1837 dsl_dataset_recalc_head_uniq(ds_next); 1838 1839 /* 1840 * Reduce the amount of our unconsmed refreservation 1841 * being charged to our parent by the amount of 1842 * new unique data we have gained. 1843 */ 1844 if (old_unique < ds_next->ds_reserved) { 1845 int64_t mrsdelta; 1846 uint64_t new_unique = 1847 ds_next->ds_phys->ds_unique_bytes; 1848 1849 ASSERT(old_unique <= new_unique); 1850 mrsdelta = MIN(new_unique - old_unique, 1851 ds_next->ds_reserved - old_unique); 1852 dsl_dir_diduse_space(ds->ds_dir, 1853 DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); 1854 } 1855 } 1856 dsl_dataset_rele(ds_next, FTAG); 1857 } else { 1858 zfeature_info_t *async_destroy = 1859 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; 1860 objset_t *os; 1861 1862 /* 1863 * There's no next snapshot, so this is a head dataset. 1864 * Destroy the deadlist. Unless it's a clone, the 1865 * deadlist should be empty. (If it's a clone, it's 1866 * safe to ignore the deadlist contents.) 1867 */ 1868 dsl_deadlist_close(&ds->ds_deadlist); 1869 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); 1870 ds->ds_phys->ds_deadlist_obj = 0; 1871 1872 VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); 1873 1874 if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { 1875 err = old_synchronous_dataset_destroy(ds, tx); 1876 } else { 1877 /* 1878 * Move the bptree into the pool's list of trees to 1879 * clean up and update space accounting information. 1880 */ 1881 uint64_t used, comp, uncomp; 1882 1883 zil_destroy_sync(dmu_objset_zil(os), tx); 1884 1885 if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { 1886 spa_feature_incr(dp->dp_spa, async_destroy, tx); 1887 dp->dp_bptree_obj = bptree_alloc(mos, tx); 1888 VERIFY(zap_add(mos, 1889 DMU_POOL_DIRECTORY_OBJECT, 1890 DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, 1891 &dp->dp_bptree_obj, tx) == 0); 1892 } 1893 1894 used = ds->ds_dir->dd_phys->dd_used_bytes; 1895 comp = ds->ds_dir->dd_phys->dd_compressed_bytes; 1896 uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; 1897 1898 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || 1899 ds->ds_phys->ds_unique_bytes == used); 1900 1901 bptree_add(mos, dp->dp_bptree_obj, 1902 &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, 1903 used, comp, uncomp, tx); 1904 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, 1905 -used, -comp, -uncomp, tx); 1906 dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, 1907 used, comp, uncomp, tx); 1908 } 1909 1910 if (ds->ds_prev != NULL) { 1911 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 1912 VERIFY3U(0, ==, zap_remove_int(mos, 1913 ds->ds_prev->ds_dir->dd_phys->dd_clones, 1914 ds->ds_object, tx)); 1915 } 1916 dsl_dataset_rele(ds->ds_prev, ds); 1917 ds->ds_prev = ds_prev = NULL; 1918 } 1919 } 1920 1921 /* 1922 * This must be done after the dsl_traverse(), because it will 1923 * re-open the objset. 
1924 */ 1925 if (ds->ds_objset) { 1926 dmu_objset_evict(ds->ds_objset); 1927 ds->ds_objset = NULL; 1928 } 1929 1930 if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { 1931 /* Erase the link in the dir */ 1932 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); 1933 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; 1934 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); 1935 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); 1936 ASSERT(err == 0); 1937 } else { 1938 /* remove from snapshot namespace */ 1939 dsl_dataset_t *ds_head; 1940 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); 1941 VERIFY(0 == dsl_dataset_hold_obj(dp, 1942 ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); 1943 VERIFY(0 == dsl_dataset_get_snapname(ds)); 1944 #ifdef ZFS_DEBUG 1945 { 1946 uint64_t val; 1947 1948 err = dsl_dataset_snap_lookup(ds_head, 1949 ds->ds_snapname, &val); 1950 ASSERT0(err); 1951 ASSERT3U(val, ==, obj); 1952 } 1953 #endif 1954 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); 1955 ASSERT(err == 0); 1956 dsl_dataset_rele(ds_head, FTAG); 1957 } 1958 1959 if (ds_prev && ds->ds_prev != ds_prev) 1960 dsl_dataset_rele(ds_prev, FTAG); 1961 1962 spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); 1963 1964 if (ds->ds_phys->ds_next_clones_obj != 0) { 1965 uint64_t count; 1966 ASSERT(0 == zap_count(mos, 1967 ds->ds_phys->ds_next_clones_obj, &count) && count == 0); 1968 VERIFY(0 == dmu_object_free(mos, 1969 ds->ds_phys->ds_next_clones_obj, tx)); 1970 } 1971 if (ds->ds_phys->ds_props_obj != 0) 1972 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); 1973 if (ds->ds_phys->ds_userrefs_obj != 0) 1974 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); 1975 dsl_dir_close(ds->ds_dir, ds); 1976 ds->ds_dir = NULL; 1977 dsl_dataset_drain_refs(ds, tag); 1978 VERIFY(0 == dmu_object_free(mos, obj, tx)); 1979 1980 if (dsda->rm_origin) { 1981 /* 1982 * Remove the origin of the clone we just destroyed. 1983 */ 1984 struct dsl_ds_destroyarg ndsda = {0}; 1985 1986 ndsda.ds = dsda->rm_origin; 1987 dsl_dataset_destroy_sync(&ndsda, tag, tx); 1988 } 1989 } 1990 1991 static int 1992 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) 1993 { 1994 uint64_t asize; 1995 1996 if (!dmu_tx_is_syncing(tx)) 1997 return (0); 1998 1999 /* 2000 * If there's an fs-only reservation, any blocks that might become 2001 * owned by the snapshot dataset must be accommodated by space 2002 * outside of the reservation. 2003 */ 2004 ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); 2005 asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); 2006 if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) 2007 return (ENOSPC); 2008 2009 /* 2010 * Propagate any reserved space for this snapshot to other 2011 * snapshot checks in this sync group. 2012 */ 2013 if (asize > 0) 2014 dsl_dir_willuse_space(ds->ds_dir, asize, tx); 2015 2016 return (0); 2017 } 2018 2019 /* 2020 * Check if adding additional snapshot(s) would exceed any snapshot quotas. 2021 * Note that all snapshot quotas up to the root dataset (i.e. the pool itself) 2022 * or the given ancestor must be satisfied. Note that it is valid for the 2023 * count to exceed the quota. This can happen if a recursive snapshot is taken 2024 * from a dataset above this one. 
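 * For example, a "zfs snapshot -r" issued on a parent is checked and
 * charged at that parent, so the resulting snapshots may push a child's
 * own count past the child's quota without returning an error.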
2025 */
2026 int
2027 dsl_snapcount_check(dsl_dir_t *dd, dmu_tx_t *tx, uint64_t cnt,
2028 dsl_dir_t *ancestor)
2029 {
2030 uint64_t quota;
2031 int err = 0;
2032
2033 /*
2034 * As with dsl_dataset_set_reservation_check(), don't run this check in
2035 * open context.
2036 */
2037 if (!dmu_tx_is_syncing(tx))
2038 return (0);
2039
2040 /*
2041 * If renaming a dataset with no snapshots, count adjustment is 0.
2042 * Likewise when taking a recursive snapshot below the top-level (see
2043 * the comment in snapshot_check() for more details).
2044 */
2045 if (cnt == 0)
2046 return (0);
2047
2048 /*
2049 * If an ancestor has been provided, stop checking the quota once we
2050 * hit that dir. We need this during rename so that we don't overcount
2051 * the check once we recurse up to the common ancestor.
2052 */
2053 if (ancestor == dd)
2054 return (0);
2055
2056 /*
2057 * If there's no value for this property, there's no need to enforce a
2058 * snapshot quota.
2059 */
2060 err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_SNAPSHOT_QUOTA),
2061 8, 1, &quota, NULL, B_FALSE);
2062 if (err == ENOENT)
2063 return (0);
2064 else if (err != 0)
2065 return (err);
2066
2067 #ifdef _KERNEL
2068 extern void __dtrace_probe_zfs__ss__quota(uint64_t, uint64_t, char *);
2069 __dtrace_probe_zfs__ss__quota(
2070 (uint64_t)dd->dd_phys->dd_snapshot_count, (uint64_t)quota,
2071 dd->dd_myname);
2072 #endif
2073
2074 if (quota > 0 && (dd->dd_phys->dd_snapshot_count + cnt) > quota)
2075 return (EDQUOT);
2076
2077 if (dd->dd_parent != NULL)
2078 err = dsl_snapcount_check(dd->dd_parent, tx, cnt, ancestor);
2079
2080 return (err);
2081 }
2082
2083 /*
2084 * Adjust the snapshot count for the specified dsl_dir_t and all parents.
2085 * When a new snapshot is created, increment the count on all parents, and when
2086 * a snapshot is destroyed, decrement the count.
2087 */
2088 void
2089 dsl_snapcount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
2090 boolean_t first)
2091 {
2092 /*
2093 * On initial entry we need to check if this feature is active, but
2094 * we don't want to re-check this on each recursive call. Note: the
2095 * feature cannot be active if it's not enabled. If the feature is not
2096 * active, don't touch the on-disk count fields.
2097 */
2098 if (first) {
2099 dsl_dataset_t *ds = NULL;
2100 spa_t *spa;
2101 zfeature_info_t *quota_feat =
2102 &spa_feature_table[SPA_FEATURE_DS_SS_QUOTA];
2103
2104 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
2105 dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
2106 spa = dsl_dataset_get_spa(ds);
2107 dsl_dataset_rele(ds, FTAG);
2108 if (!spa_feature_is_active(spa, quota_feat))
2109 return;
2110 }
2111
2112 /*
2113 * As with dsl_dataset_set_reservation_check(), we don't want to run
2114 * this check in open context.
2115 */
2116 if (!dmu_tx_is_syncing(tx))
2117 return;
2118
2119 /* if renaming a dataset with no snapshots, count adjustment is 0 */
2120 if (delta == 0)
2121 return;
2122
2123 /* Increment count for parent */
2124 dmu_buf_will_dirty(dd->dd_dbuf, tx);
2125
2126 mutex_enter(&dd->dd_lock);
2127
2128 /*
2129 * Counts may be incorrect if dealing with an existing pool and
2130 * there has never been a quota set in the dataset hierarchy.
2131 * This is not an error.
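 * The count is only maintained once the feature has been activated by
 * setting a quota somewhere in the pool, so a decrement may find a
 * stale zero here; in that case leave the count untouched.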
2132 */ 2133 if (delta < 0 && dd->dd_phys->dd_snapshot_count < (delta * -1)) { 2134 #ifdef _KERNEL 2135 extern void __dtrace_probe_zfs__sscnt__adj__neg(char *); 2136 __dtrace_probe_zfs__sscnt__adj__neg(dd->dd_myname); 2137 #endif 2138 mutex_exit(&dd->dd_lock); 2139 return; 2140 } 2141 2142 dd->dd_phys->dd_snapshot_count += delta; 2143 2144 /* Roll up this additional count into our ancestors */ 2145 2146 if (dd->dd_parent != NULL) 2147 dsl_snapcount_adjust(dd->dd_parent, tx, delta, B_FALSE); 2148 2149 mutex_exit(&dd->dd_lock); 2150 } 2151 2152 int 2153 dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname, 2154 uint64_t cnt, dmu_tx_t *tx) 2155 { 2156 int err; 2157 uint64_t value; 2158 2159 /* 2160 * We don't allow multiple snapshots of the same txg. If there 2161 * is already one, try again. 2162 */ 2163 if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) 2164 return (EAGAIN); 2165 2166 /* 2167 * Check for conflicting snapshot name. 2168 */ 2169 err = dsl_dataset_snap_lookup(ds, snapname, &value); 2170 if (err == 0) 2171 return (EEXIST); 2172 if (err != ENOENT) 2173 return (err); 2174 2175 /* 2176 * Check that the dataset's name is not too long. Name consists 2177 * of the dataset's length + 1 for the @-sign + snapshot name's length 2178 */ 2179 if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) 2180 return (ENAMETOOLONG); 2181 2182 err = dsl_snapcount_check(ds->ds_dir, tx, cnt, NULL); 2183 if (err) 2184 return (err); 2185 2186 err = dsl_dataset_snapshot_reserve_space(ds, tx); 2187 if (err) 2188 return (err); 2189 2190 ds->ds_trysnap_txg = tx->tx_txg; 2191 return (0); 2192 } 2193 2194 void 2195 dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname, 2196 dmu_tx_t *tx) 2197 { 2198 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2199 dmu_buf_t *dbuf; 2200 dsl_dataset_phys_t *dsphys; 2201 uint64_t dsobj, crtxg; 2202 objset_t *mos = dp->dp_meta_objset; 2203 int err; 2204 2205 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); 2206 2207 dsl_snapcount_adjust(ds->ds_dir, tx, 1, B_TRUE); 2208 2209 /* 2210 * The origin's ds_creation_txg has to be < TXG_INITIAL 2211 */ 2212 if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) 2213 crtxg = 1; 2214 else 2215 crtxg = tx->tx_txg; 2216 2217 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, 2218 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); 2219 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); 2220 dmu_buf_will_dirty(dbuf, tx); 2221 dsphys = dbuf->db_data; 2222 bzero(dsphys, sizeof (dsl_dataset_phys_t)); 2223 dsphys->ds_dir_obj = ds->ds_dir->dd_object; 2224 dsphys->ds_fsid_guid = unique_create(); 2225 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, 2226 sizeof (dsphys->ds_guid)); 2227 dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; 2228 dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; 2229 dsphys->ds_next_snap_obj = ds->ds_object; 2230 dsphys->ds_num_children = 1; 2231 dsphys->ds_creation_time = gethrestime_sec(); 2232 dsphys->ds_creation_txg = crtxg; 2233 dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; 2234 dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes; 2235 dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; 2236 dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; 2237 dsphys->ds_flags = ds->ds_phys->ds_flags; 2238 dsphys->ds_bp = ds->ds_phys->ds_bp; 2239 dmu_buf_rele(dbuf, FTAG); 2240 2241 ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); 2242 if (ds->ds_prev) { 2243 uint64_t next_clones_obj = 2244 
ds->ds_prev->ds_phys->ds_next_clones_obj;
2245 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
2246 ds->ds_object ||
2247 ds->ds_prev->ds_phys->ds_num_children > 1);
2248 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
2249 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
2250 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
2251 ds->ds_prev->ds_phys->ds_creation_txg);
2252 ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
2253 } else if (next_clones_obj != 0) {
2254 remove_from_next_clones(ds->ds_prev,
2255 dsphys->ds_next_snap_obj, tx);
2256 VERIFY3U(0, ==, zap_add_int(mos,
2257 next_clones_obj, dsobj, tx));
2258 }
2259 }
2260
2261 /*
2262 * If we have a reference-reservation on this dataset, we will
2263 * need to increase the amount of refreservation being charged
2264 * since our unique space is going to zero.
2265 */
2266 if (ds->ds_reserved) {
2267 int64_t delta;
2268 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
2269 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2270 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
2271 delta, 0, 0, tx);
2272 }
2273
2274 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2275 zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
2276 ds->ds_dir->dd_myname, snapname, dsobj,
2277 ds->ds_phys->ds_prev_snap_txg);
2278 ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
2279 UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
2280 dsl_deadlist_close(&ds->ds_deadlist);
2281 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
2282 dsl_deadlist_add_key(&ds->ds_deadlist,
2283 ds->ds_phys->ds_prev_snap_txg, tx);
2284
2285 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
2286 ds->ds_phys->ds_prev_snap_obj = dsobj;
2287 ds->ds_phys->ds_prev_snap_txg = crtxg;
2288 ds->ds_phys->ds_unique_bytes = 0;
2289 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
2290 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
2291
2292 err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
2293 snapname, 8, 1, &dsobj, tx);
2294 ASSERT(err == 0);
2295
2296 if (ds->ds_prev)
2297 dsl_dataset_drop_ref(ds->ds_prev, ds);
2298 VERIFY(0 == dsl_dataset_get_ref(dp,
2299 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
2300
2301 dsl_scan_ds_snapshotted(ds, tx);
2302
2303 dsl_dir_snap_cmtime_update(ds->ds_dir);
2304
2305 spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
2306 }
2307
2308 void
2309 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
2310 {
2311 ASSERT(dmu_tx_is_syncing(tx));
2312 ASSERT(ds->ds_objset != NULL);
2313 ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
2314
2315 /*
2316 * in case we had to change ds_fsid_guid when we opened it,
2317 * sync it out now.
2318 */
2319 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2320 ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
2321
2322 dmu_objset_sync(ds->ds_objset, zio, tx);
2323 }
2324
2325 static void
2326 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
2327 {
2328 uint64_t count = 0;
2329 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
2330 zap_cursor_t zc;
2331 zap_attribute_t za;
2332 nvlist_t *propval;
2333 nvlist_t *val;
2334
2335 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2336 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2337 VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2338
2339 /*
2340 * There may be missing entries in ds_next_clones_obj
2341 * due to a bug in a previous version of the code.
2342 * Only trust it if it has the right number of entries.
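 * A snapshot's next snapshot (or the head) accounts for one child, so
 * a trustworthy next_clones_obj must hold exactly ds_num_children - 1
 * entries, one per clone.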
2343 */ 2344 if (ds->ds_phys->ds_next_clones_obj != 0) { 2345 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, 2346 &count)); 2347 } 2348 if (count != ds->ds_phys->ds_num_children - 1) { 2349 goto fail; 2350 } 2351 for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); 2352 zap_cursor_retrieve(&zc, &za) == 0; 2353 zap_cursor_advance(&zc)) { 2354 dsl_dataset_t *clone; 2355 char buf[ZFS_MAXNAMELEN]; 2356 /* 2357 * Even though we hold the dp_config_rwlock, the dataset 2358 * may fail to open, returning ENOENT. If there is a 2359 * thread concurrently attempting to destroy this 2360 * dataset, it will have the ds_rwlock held for 2361 * RW_WRITER. Our call to dsl_dataset_hold_obj() -> 2362 * dsl_dataset_hold_ref() will fail its 2363 * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the 2364 * dp_config_rwlock, and wait for the destroy progress 2365 * and signal ds_exclusive_cv. If the destroy was 2366 * successful, we will see that 2367 * DSL_DATASET_IS_DESTROYED(), and return ENOENT. 2368 */ 2369 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, 2370 za.za_first_integer, FTAG, &clone) != 0) 2371 continue; 2372 dsl_dir_name(clone->ds_dir, buf); 2373 VERIFY(nvlist_add_boolean(val, buf) == 0); 2374 dsl_dataset_rele(clone, FTAG); 2375 } 2376 zap_cursor_fini(&zc); 2377 VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); 2378 VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), 2379 propval) == 0); 2380 fail: 2381 nvlist_free(val); 2382 nvlist_free(propval); 2383 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); 2384 } 2385 2386 void 2387 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) 2388 { 2389 uint64_t refd, avail, uobjs, aobjs, ratio; 2390 2391 ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : 2392 (ds->ds_phys->ds_uncompressed_bytes * 100 / 2393 ds->ds_phys->ds_compressed_bytes); 2394 2395 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); 2396 2397 if (dsl_dataset_is_snapshot(ds)) { 2398 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); 2399 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, 2400 ds->ds_phys->ds_unique_bytes); 2401 get_clones_stat(ds, nv); 2402 } else { 2403 dsl_dir_stats(ds->ds_dir, nv); 2404 } 2405 2406 dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); 2407 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); 2408 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); 2409 2410 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, 2411 ds->ds_phys->ds_creation_time); 2412 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, 2413 ds->ds_phys->ds_creation_txg); 2414 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, 2415 ds->ds_quota); 2416 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, 2417 ds->ds_reserved); 2418 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, 2419 ds->ds_phys->ds_guid); 2420 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, 2421 ds->ds_phys->ds_unique_bytes); 2422 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, 2423 ds->ds_object); 2424 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, 2425 ds->ds_userrefs); 2426 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, 2427 DS_IS_DEFER_DESTROY(ds) ? 
1 : 0); 2428 2429 if (ds->ds_phys->ds_prev_snap_obj != 0) { 2430 uint64_t written, comp, uncomp; 2431 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2432 dsl_dataset_t *prev; 2433 2434 rw_enter(&dp->dp_config_rwlock, RW_READER); 2435 int err = dsl_dataset_hold_obj(dp, 2436 ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); 2437 rw_exit(&dp->dp_config_rwlock); 2438 if (err == 0) { 2439 err = dsl_dataset_space_written(prev, ds, &written, 2440 &comp, &uncomp); 2441 dsl_dataset_rele(prev, FTAG); 2442 if (err == 0) { 2443 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, 2444 written); 2445 } 2446 } 2447 } 2448 } 2449 2450 void 2451 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) 2452 { 2453 stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; 2454 stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; 2455 stat->dds_guid = ds->ds_phys->ds_guid; 2456 stat->dds_origin[0] = '\0'; 2457 if (dsl_dataset_is_snapshot(ds)) { 2458 stat->dds_is_snapshot = B_TRUE; 2459 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; 2460 } else { 2461 stat->dds_is_snapshot = B_FALSE; 2462 stat->dds_num_clones = 0; 2463 2464 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); 2465 if (dsl_dir_is_clone(ds->ds_dir)) { 2466 dsl_dataset_t *ods; 2467 2468 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, 2469 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); 2470 dsl_dataset_name(ods, stat->dds_origin); 2471 dsl_dataset_drop_ref(ods, FTAG); 2472 } 2473 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); 2474 } 2475 } 2476 2477 uint64_t 2478 dsl_dataset_fsid_guid(dsl_dataset_t *ds) 2479 { 2480 return (ds->ds_fsid_guid); 2481 } 2482 2483 void 2484 dsl_dataset_space(dsl_dataset_t *ds, 2485 uint64_t *refdbytesp, uint64_t *availbytesp, 2486 uint64_t *usedobjsp, uint64_t *availobjsp) 2487 { 2488 *refdbytesp = ds->ds_phys->ds_referenced_bytes; 2489 *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); 2490 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) 2491 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; 2492 if (ds->ds_quota != 0) { 2493 /* 2494 * Adjust available bytes according to refquota 2495 */ 2496 if (*refdbytesp < ds->ds_quota) 2497 *availbytesp = MIN(*availbytesp, 2498 ds->ds_quota - *refdbytesp); 2499 else 2500 *availbytesp = 0; 2501 } 2502 *usedobjsp = ds->ds_phys->ds_bp.blk_fill; 2503 *availobjsp = DN_MAX_OBJECT - *usedobjsp; 2504 } 2505 2506 boolean_t 2507 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) 2508 { 2509 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2510 2511 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 2512 dsl_pool_sync_context(dp)); 2513 if (ds->ds_prev == NULL) 2514 return (B_FALSE); 2515 if (ds->ds_phys->ds_bp.blk_birth > 2516 ds->ds_prev->ds_phys->ds_creation_txg) { 2517 objset_t *os, *os_prev; 2518 /* 2519 * It may be that only the ZIL differs, because it was 2520 * reset in the head. Don't count that as being 2521 * modified. 
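 * Comparing just the meta dnodes suffices because any real change to
 * the objset dirties its meta dnode, while a ZIL reset only touches the
 * ZIL header, which is kept separately in objset_phys.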
2522 */ 2523 if (dmu_objset_from_ds(ds, &os) != 0) 2524 return (B_TRUE); 2525 if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0) 2526 return (B_TRUE); 2527 return (bcmp(&os->os_phys->os_meta_dnode, 2528 &os_prev->os_phys->os_meta_dnode, 2529 sizeof (os->os_phys->os_meta_dnode)) != 0); 2530 } 2531 return (B_FALSE); 2532 } 2533 2534 /* ARGSUSED */ 2535 static int 2536 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) 2537 { 2538 dsl_dataset_t *ds = arg1; 2539 char *newsnapname = arg2; 2540 dsl_dir_t *dd = ds->ds_dir; 2541 dsl_dataset_t *hds; 2542 uint64_t val; 2543 int err; 2544 2545 err = dsl_dataset_hold_obj(dd->dd_pool, 2546 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); 2547 if (err) 2548 return (err); 2549 2550 /* new name better not be in use */ 2551 err = dsl_dataset_snap_lookup(hds, newsnapname, &val); 2552 dsl_dataset_rele(hds, FTAG); 2553 2554 if (err == 0) 2555 err = EEXIST; 2556 else if (err == ENOENT) 2557 err = 0; 2558 2559 /* dataset name + 1 for the "@" + the new snapshot name must fit */ 2560 if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) 2561 err = ENAMETOOLONG; 2562 2563 return (err); 2564 } 2565 2566 static void 2567 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2568 { 2569 dsl_dataset_t *ds = arg1; 2570 const char *newsnapname = arg2; 2571 dsl_dir_t *dd = ds->ds_dir; 2572 objset_t *mos = dd->dd_pool->dp_meta_objset; 2573 dsl_dataset_t *hds; 2574 int err; 2575 2576 ASSERT(ds->ds_phys->ds_next_snap_obj != 0); 2577 2578 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, 2579 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); 2580 2581 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2582 err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); 2583 ASSERT0(err); 2584 mutex_enter(&ds->ds_lock); 2585 (void) strcpy(ds->ds_snapname, newsnapname); 2586 mutex_exit(&ds->ds_lock); 2587 err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, 2588 ds->ds_snapname, 8, 1, &ds->ds_object, tx); 2589 ASSERT0(err); 2590 2591 spa_history_log_internal_ds(ds, "rename", tx, 2592 "-> @%s", newsnapname); 2593 dsl_dataset_rele(hds, FTAG); 2594 } 2595 2596 struct renamesnaparg { 2597 dsl_sync_task_group_t *dstg; 2598 char failed[MAXPATHLEN]; 2599 char *oldsnap; 2600 char *newsnap; 2601 }; 2602 2603 static int 2604 dsl_snapshot_rename_one(const char *name, void *arg) 2605 { 2606 struct renamesnaparg *ra = arg; 2607 dsl_dataset_t *ds = NULL; 2608 char *snapname; 2609 int err; 2610 2611 snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); 2612 (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); 2613 2614 /* 2615 * For recursive snapshot renames the parent won't be changing 2616 * so we just pass name for both the to/from argument. 2617 */ 2618 err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); 2619 if (err != 0) { 2620 strfree(snapname); 2621 return (err == ENOENT ? 0 : err); 2622 } 2623 2624 #ifdef _KERNEL 2625 /* 2626 * For all filesystems undergoing rename, we'll need to unmount it. 2627 */ 2628 (void) zfs_unmount_snap(snapname, NULL); 2629 #endif 2630 err = dsl_dataset_hold(snapname, ra->dstg, &ds); 2631 strfree(snapname); 2632 if (err != 0) 2633 return (err == ENOENT ? 
0 : err); 2634 2635 dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, 2636 dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); 2637 2638 return (0); 2639 } 2640 2641 static int 2642 dsl_recursive_rename(char *oldname, const char *newname) 2643 { 2644 int err; 2645 struct renamesnaparg *ra; 2646 dsl_sync_task_t *dst; 2647 spa_t *spa; 2648 char *cp, *fsname = spa_strdup(oldname); 2649 int len = strlen(oldname) + 1; 2650 2651 /* truncate the snapshot name to get the fsname */ 2652 cp = strchr(fsname, '@'); 2653 *cp = '\0'; 2654 2655 err = spa_open(fsname, &spa, FTAG); 2656 if (err) { 2657 kmem_free(fsname, len); 2658 return (err); 2659 } 2660 ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); 2661 ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 2662 2663 ra->oldsnap = strchr(oldname, '@') + 1; 2664 ra->newsnap = strchr(newname, '@') + 1; 2665 *ra->failed = '\0'; 2666 2667 err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, 2668 DS_FIND_CHILDREN); 2669 kmem_free(fsname, len); 2670 2671 if (err == 0) { 2672 err = dsl_sync_task_group_wait(ra->dstg); 2673 } 2674 2675 for (dst = list_head(&ra->dstg->dstg_tasks); dst; 2676 dst = list_next(&ra->dstg->dstg_tasks, dst)) { 2677 dsl_dataset_t *ds = dst->dst_arg1; 2678 if (dst->dst_err) { 2679 dsl_dir_name(ds->ds_dir, ra->failed); 2680 (void) strlcat(ra->failed, "@", sizeof (ra->failed)); 2681 (void) strlcat(ra->failed, ra->newsnap, 2682 sizeof (ra->failed)); 2683 } 2684 dsl_dataset_rele(ds, ra->dstg); 2685 } 2686 2687 if (err) 2688 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); 2689 2690 dsl_sync_task_group_destroy(ra->dstg); 2691 kmem_free(ra, sizeof (struct renamesnaparg)); 2692 spa_close(spa, FTAG); 2693 return (err); 2694 } 2695 2696 static int 2697 dsl_valid_rename(const char *oldname, void *arg) 2698 { 2699 int delta = *(int *)arg; 2700 2701 if (strlen(oldname) + delta >= MAXNAMELEN) 2702 return (ENAMETOOLONG); 2703 2704 return (0); 2705 } 2706 2707 #pragma weak dmu_objset_rename = dsl_dataset_rename 2708 int 2709 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) 2710 { 2711 dsl_dir_t *dd; 2712 dsl_dataset_t *ds; 2713 const char *tail; 2714 int err; 2715 2716 err = dsl_dir_open(oldname, FTAG, &dd, &tail); 2717 if (err) 2718 return (err); 2719 2720 if (tail == NULL) { 2721 int delta = strlen(newname) - strlen(oldname); 2722 2723 /* if we're growing, validate child name lengths */ 2724 if (delta > 0) 2725 err = dmu_objset_find(oldname, dsl_valid_rename, 2726 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 2727 2728 if (err == 0) 2729 err = dsl_dir_rename(dd, newname); 2730 dsl_dir_close(dd, FTAG); 2731 return (err); 2732 } 2733 2734 if (tail[0] != '@') { 2735 /* the name ended in a nonexistent component */ 2736 dsl_dir_close(dd, FTAG); 2737 return (ENOENT); 2738 } 2739 2740 dsl_dir_close(dd, FTAG); 2741 2742 /* new name must be snapshot in same filesystem */ 2743 tail = strchr(newname, '@'); 2744 if (tail == NULL) 2745 return (EINVAL); 2746 tail++; 2747 if (strncmp(oldname, newname, tail - newname) != 0) 2748 return (EXDEV); 2749 2750 if (recursive) { 2751 err = dsl_recursive_rename(oldname, newname); 2752 } else { 2753 err = dsl_dataset_hold(oldname, FTAG, &ds); 2754 if (err) 2755 return (err); 2756 2757 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 2758 dsl_dataset_snapshot_rename_check, 2759 dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); 2760 2761 dsl_dataset_rele(ds, FTAG); 2762 } 2763 2764 return (err); 2765 } 2766 2767 struct promotenode { 2768 list_node_t 
link; 2769 dsl_dataset_t *ds; 2770 }; 2771 2772 struct promotearg { 2773 list_t shared_snaps, origin_snaps, clone_snaps; 2774 dsl_dataset_t *origin_origin; 2775 uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; 2776 char *err_ds; 2777 }; 2778 2779 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); 2780 static boolean_t snaplist_unstable(list_t *l); 2781 2782 static int 2783 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) 2784 { 2785 dsl_dataset_t *hds = arg1; 2786 struct promotearg *pa = arg2; 2787 struct promotenode *snap = list_head(&pa->shared_snaps); 2788 dsl_dataset_t *origin_ds = snap->ds; 2789 int err; 2790 uint64_t unused; 2791 2792 /* Check that it is a real clone */ 2793 if (!dsl_dir_is_clone(hds->ds_dir)) 2794 return (EINVAL); 2795 2796 /* Since this is so expensive, don't do the preliminary check */ 2797 if (!dmu_tx_is_syncing(tx)) 2798 return (0); 2799 2800 if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) 2801 return (EXDEV); 2802 2803 /* compute origin's new unique space */ 2804 snap = list_tail(&pa->clone_snaps); 2805 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2806 dsl_deadlist_space_range(&snap->ds->ds_deadlist, 2807 origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, 2808 &pa->unique, &unused, &unused); 2809 2810 /* 2811 * Walk the snapshots that we are moving 2812 * 2813 * Compute space to transfer. Consider the incremental changes 2814 * to used for each snapshot: 2815 * (my used) = (prev's used) + (blocks born) - (blocks killed) 2816 * So each snapshot gave birth to: 2817 * (blocks born) = (my used) - (prev's used) + (blocks killed) 2818 * So a sequence would look like: 2819 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) 2820 * Which simplifies to: 2821 * uN + kN + kN-1 + ... + k1 + k0 2822 * Note however, if we stop before we reach the ORIGIN we get: 2823 * uN + kN + kN-1 + ... + kM - uM-1 2824 */ 2825 pa->used = origin_ds->ds_phys->ds_referenced_bytes; 2826 pa->comp = origin_ds->ds_phys->ds_compressed_bytes; 2827 pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; 2828 for (snap = list_head(&pa->shared_snaps); snap; 2829 snap = list_next(&pa->shared_snaps, snap)) { 2830 uint64_t val, dlused, dlcomp, dluncomp; 2831 dsl_dataset_t *ds = snap->ds; 2832 2833 /* Check that the snapshot name does not conflict */ 2834 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2835 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); 2836 if (err == 0) { 2837 err = EEXIST; 2838 goto out; 2839 } 2840 if (err != ENOENT) 2841 goto out; 2842 2843 /* The very first snapshot does not have a deadlist */ 2844 if (ds->ds_phys->ds_prev_snap_obj == 0) 2845 continue; 2846 2847 dsl_deadlist_space(&ds->ds_deadlist, 2848 &dlused, &dlcomp, &dluncomp); 2849 pa->used += dlused; 2850 pa->comp += dlcomp; 2851 pa->uncomp += dluncomp; 2852 } 2853 2854 /* 2855 * If we are a clone of a clone then we never reached ORIGIN, 2856 * so we need to subtract out the clone origin's used space. 
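 * This is the -uM-1 term from the expression above: the origin
 * origin's referenced bytes were folded into our starting value, but
 * they do not transfer to the promoted clone.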
2857 */ 2858 if (pa->origin_origin) { 2859 pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes; 2860 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; 2861 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; 2862 } 2863 2864 /* Check that there is enough space and quota headroom here */ 2865 err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, 2866 origin_ds->ds_dir, pa->used, tx); 2867 if (err) 2868 return (err); 2869 2870 /* 2871 * Compute the amounts of space that will be used by snapshots 2872 * after the promotion (for both origin and clone). For each, 2873 * it is the amount of space that will be on all of their 2874 * deadlists (that was not born before their new origin). 2875 */ 2876 if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2877 uint64_t space; 2878 2879 /* 2880 * Note, typically this will not be a clone of a clone, 2881 * so dd_origin_txg will be < TXG_INITIAL, so 2882 * these snaplist_space() -> dsl_deadlist_space_range() 2883 * calls will be fast because they do not have to 2884 * iterate over all bps. 2885 */ 2886 snap = list_head(&pa->origin_snaps); 2887 err = snaplist_space(&pa->shared_snaps, 2888 snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); 2889 if (err) 2890 return (err); 2891 2892 err = snaplist_space(&pa->clone_snaps, 2893 snap->ds->ds_dir->dd_origin_txg, &space); 2894 if (err) 2895 return (err); 2896 pa->cloneusedsnap += space; 2897 } 2898 if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2899 err = snaplist_space(&pa->origin_snaps, 2900 origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); 2901 if (err) 2902 return (err); 2903 } 2904 2905 return (0); 2906 out: 2907 pa->err_ds = snap->ds->ds_snapname; 2908 return (err); 2909 } 2910 2911 static void 2912 dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2913 { 2914 dsl_dataset_t *hds = arg1; 2915 struct promotearg *pa = arg2; 2916 struct promotenode *snap = list_head(&pa->shared_snaps); 2917 dsl_dataset_t *origin_ds = snap->ds; 2918 dsl_dataset_t *origin_head; 2919 dsl_dir_t *dd = hds->ds_dir; 2920 dsl_pool_t *dp = hds->ds_dir->dd_pool; 2921 dsl_dir_t *odd = NULL; 2922 uint64_t oldnext_obj; 2923 int64_t delta; 2924 2925 ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); 2926 2927 snap = list_head(&pa->origin_snaps); 2928 origin_head = snap->ds; 2929 2930 /* 2931 * We need to explicitly open odd, since origin_ds's dd will be 2932 * changing. 
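 * After the snapshots are moved below, origin_ds->ds_dir will point at
 * the promoted clone's dir, so take an independent hold on the old dir
 * now.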
2933 */ 2934 VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, 2935 NULL, FTAG, &odd)); 2936 2937 /* change origin's next snap */ 2938 dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); 2939 oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; 2940 snap = list_tail(&pa->clone_snaps); 2941 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2942 origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; 2943 2944 /* change the origin's next clone */ 2945 if (origin_ds->ds_phys->ds_next_clones_obj) { 2946 remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); 2947 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2948 origin_ds->ds_phys->ds_next_clones_obj, 2949 oldnext_obj, tx)); 2950 } 2951 2952 /* change origin */ 2953 dmu_buf_will_dirty(dd->dd_dbuf, tx); 2954 ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); 2955 dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; 2956 dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; 2957 dmu_buf_will_dirty(odd->dd_dbuf, tx); 2958 odd->dd_phys->dd_origin_obj = origin_ds->ds_object; 2959 origin_head->ds_dir->dd_origin_txg = 2960 origin_ds->ds_phys->ds_creation_txg; 2961 2962 /* change dd_clone entries */ 2963 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 2964 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2965 odd->dd_phys->dd_clones, hds->ds_object, tx)); 2966 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2967 pa->origin_origin->ds_dir->dd_phys->dd_clones, 2968 hds->ds_object, tx)); 2969 2970 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2971 pa->origin_origin->ds_dir->dd_phys->dd_clones, 2972 origin_head->ds_object, tx)); 2973 if (dd->dd_phys->dd_clones == 0) { 2974 dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, 2975 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); 2976 } 2977 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2978 dd->dd_phys->dd_clones, origin_head->ds_object, tx)); 2979 2980 } 2981 2982 /* move snapshots to this dir */ 2983 for (snap = list_head(&pa->shared_snaps); snap; 2984 snap = list_next(&pa->shared_snaps, snap)) { 2985 dsl_dataset_t *ds = snap->ds; 2986 2987 /* unregister props as dsl_dir is changing */ 2988 if (ds->ds_objset) { 2989 dmu_objset_evict(ds->ds_objset); 2990 ds->ds_objset = NULL; 2991 } 2992 /* move snap name entry */ 2993 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2994 VERIFY(0 == dsl_dataset_snap_remove(origin_head, 2995 ds->ds_snapname, tx)); 2996 VERIFY(0 == zap_add(dp->dp_meta_objset, 2997 hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 2998 8, 1, &ds->ds_object, tx)); 2999 dsl_snapcount_adjust(hds->ds_dir, tx, 1, B_TRUE); 3000 3001 /* change containing dsl_dir */ 3002 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3003 ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); 3004 ds->ds_phys->ds_dir_obj = dd->dd_object; 3005 ASSERT3P(ds->ds_dir, ==, odd); 3006 dsl_dir_close(ds->ds_dir, ds); 3007 VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, 3008 NULL, ds, &ds->ds_dir)); 3009 3010 /* move any clone references */ 3011 if (ds->ds_phys->ds_next_clones_obj && 3012 spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 3013 zap_cursor_t zc; 3014 zap_attribute_t za; 3015 3016 for (zap_cursor_init(&zc, dp->dp_meta_objset, 3017 ds->ds_phys->ds_next_clones_obj); 3018 zap_cursor_retrieve(&zc, &za) == 0; 3019 zap_cursor_advance(&zc)) { 3020 dsl_dataset_t *cnds; 3021 uint64_t o; 3022 3023 if (za.za_first_integer == oldnext_obj) { 3024 /* 3025 * We've already moved the 3026 * origin's reference. 
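 * Every other entry is some other clone of this
 * snapshot; move that clone's head dataset from
 * the old dir's dd_clones to the promoted dir's.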
3027 */ 3028 continue; 3029 } 3030 3031 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, 3032 za.za_first_integer, FTAG, &cnds)); 3033 o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; 3034 3035 VERIFY3U(zap_remove_int(dp->dp_meta_objset, 3036 odd->dd_phys->dd_clones, o, tx), ==, 0); 3037 VERIFY3U(zap_add_int(dp->dp_meta_objset, 3038 dd->dd_phys->dd_clones, o, tx), ==, 0); 3039 dsl_dataset_rele(cnds, FTAG); 3040 } 3041 zap_cursor_fini(&zc); 3042 } 3043 3044 ASSERT0(dsl_prop_numcb(ds)); 3045 } 3046 3047 /* 3048 * Change space accounting. 3049 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either 3050 * both be valid, or both be 0 (resulting in delta == 0). This 3051 * is true for each of {clone,origin} independently. 3052 */ 3053 3054 delta = pa->cloneusedsnap - 3055 dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 3056 ASSERT3S(delta, >=, 0); 3057 ASSERT3U(pa->used, >=, delta); 3058 dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); 3059 dsl_dir_diduse_space(dd, DD_USED_HEAD, 3060 pa->used - delta, pa->comp, pa->uncomp, tx); 3061 3062 delta = pa->originusedsnap - 3063 odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 3064 ASSERT3S(delta, <=, 0); 3065 ASSERT3U(pa->used, >=, -delta); 3066 dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); 3067 dsl_dir_diduse_space(odd, DD_USED_HEAD, 3068 -pa->used - delta, -pa->comp, -pa->uncomp, tx); 3069 3070 origin_ds->ds_phys->ds_unique_bytes = pa->unique; 3071 3072 /* log history record */ 3073 spa_history_log_internal_ds(hds, "promote", tx, ""); 3074 3075 dsl_dir_close(odd, FTAG); 3076 } 3077 3078 static char *snaplist_tag = "snaplist"; 3079 /* 3080 * Make a list of dsl_dataset_t's for the snapshots between first_obj 3081 * (exclusive) and last_obj (inclusive). The list will be in reverse 3082 * order (last_obj will be the list_head()). If first_obj == 0, do all 3083 * snapshots back to this dataset's origin. 
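 * For example, with snapshots A -> B -> C -> D (oldest first),
 * first_obj = B and last_obj = D yields the list D, C; B itself is
 * excluded.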
3084 */ 3085 static int 3086 snaplist_make(dsl_pool_t *dp, boolean_t own, 3087 uint64_t first_obj, uint64_t last_obj, list_t *l) 3088 { 3089 uint64_t obj = last_obj; 3090 3091 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); 3092 3093 list_create(l, sizeof (struct promotenode), 3094 offsetof(struct promotenode, link)); 3095 3096 while (obj != first_obj) { 3097 dsl_dataset_t *ds; 3098 struct promotenode *snap; 3099 int err; 3100 3101 if (own) { 3102 err = dsl_dataset_own_obj(dp, obj, 3103 0, snaplist_tag, &ds); 3104 if (err == 0) 3105 dsl_dataset_make_exclusive(ds, snaplist_tag); 3106 } else { 3107 err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); 3108 } 3109 if (err == ENOENT) { 3110 /* lost race with snapshot destroy */ 3111 struct promotenode *last = list_tail(l); 3112 ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); 3113 obj = last->ds->ds_phys->ds_prev_snap_obj; 3114 continue; 3115 } else if (err) { 3116 return (err); 3117 } 3118 3119 if (first_obj == 0) 3120 first_obj = ds->ds_dir->dd_phys->dd_origin_obj; 3121 3122 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); 3123 snap->ds = ds; 3124 list_insert_tail(l, snap); 3125 obj = ds->ds_phys->ds_prev_snap_obj; 3126 } 3127 3128 return (0); 3129 } 3130 3131 static int 3132 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) 3133 { 3134 struct promotenode *snap; 3135 3136 *spacep = 0; 3137 for (snap = list_head(l); snap; snap = list_next(l, snap)) { 3138 uint64_t used, comp, uncomp; 3139 dsl_deadlist_space_range(&snap->ds->ds_deadlist, 3140 mintxg, UINT64_MAX, &used, &comp, &uncomp); 3141 *spacep += used; 3142 } 3143 return (0); 3144 } 3145 3146 static void 3147 snaplist_destroy(list_t *l, boolean_t own) 3148 { 3149 struct promotenode *snap; 3150 3151 if (!l || !list_link_active(&l->list_head)) 3152 return; 3153 3154 while ((snap = list_tail(l)) != NULL) { 3155 list_remove(l, snap); 3156 if (own) 3157 dsl_dataset_disown(snap->ds, snaplist_tag); 3158 else 3159 dsl_dataset_rele(snap->ds, snaplist_tag); 3160 kmem_free(snap, sizeof (struct promotenode)); 3161 } 3162 list_destroy(l); 3163 } 3164 3165 /* 3166 * Promote a clone. Nomenclature note: 3167 * "clone" or "cds": the original clone which is being promoted 3168 * "origin" or "ods": the snapshot which is originally clone's origin 3169 * "origin head" or "ohds": the dataset which is the head 3170 * (filesystem/volume) for the origin 3171 * "origin origin": the origin of the origin's filesystem (typically 3172 * NULL, indicating that the clone is not a clone of a clone). 3173 */ 3174 int 3175 dsl_dataset_promote(const char *name, char *conflsnap) 3176 { 3177 dsl_dataset_t *ds; 3178 dsl_dir_t *dd; 3179 dsl_pool_t *dp; 3180 dmu_object_info_t doi; 3181 struct promotearg pa = { 0 }; 3182 struct promotenode *snap; 3183 int err; 3184 3185 err = dsl_dataset_hold(name, FTAG, &ds); 3186 if (err) 3187 return (err); 3188 dd = ds->ds_dir; 3189 dp = dd->dd_pool; 3190 3191 err = dmu_object_info(dp->dp_meta_objset, 3192 ds->ds_phys->ds_snapnames_zapobj, &doi); 3193 if (err) { 3194 dsl_dataset_rele(ds, FTAG); 3195 return (err); 3196 } 3197 3198 if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { 3199 dsl_dataset_rele(ds, FTAG); 3200 return (EINVAL); 3201 } 3202 3203 /* 3204 * We are going to inherit all the snapshots taken before our 3205 * origin (i.e., our new origin will be our parent's origin). 3206 * Take ownership of them so that we can rename them into our 3207 * namespace. 
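 * Owning them (rather than merely holding them) prevents anyone else
 * from destroying or renaming those snapshots while the promote sync
 * task is pending.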
3208 */ 3209 rw_enter(&dp->dp_config_rwlock, RW_READER); 3210 3211 err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, 3212 &pa.shared_snaps); 3213 if (err != 0) 3214 goto out; 3215 3216 err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); 3217 if (err != 0) 3218 goto out; 3219 3220 snap = list_head(&pa.shared_snaps); 3221 ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); 3222 err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, 3223 snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); 3224 if (err != 0) 3225 goto out; 3226 3227 if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { 3228 err = dsl_dataset_hold_obj(dp, 3229 snap->ds->ds_dir->dd_phys->dd_origin_obj, 3230 FTAG, &pa.origin_origin); 3231 if (err != 0) 3232 goto out; 3233 } 3234 3235 out: 3236 rw_exit(&dp->dp_config_rwlock); 3237 3238 /* 3239 * Add in 128x the snapnames zapobj size, since we will be moving 3240 * a bunch of snapnames to the promoted ds, and dirtying their 3241 * bonus buffers. 3242 */ 3243 if (err == 0) { 3244 err = dsl_sync_task_do(dp, dsl_dataset_promote_check, 3245 dsl_dataset_promote_sync, ds, &pa, 3246 2 + 2 * doi.doi_physical_blocks_512); 3247 if (err && pa.err_ds && conflsnap) 3248 (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); 3249 } 3250 3251 snaplist_destroy(&pa.shared_snaps, B_TRUE); 3252 snaplist_destroy(&pa.clone_snaps, B_FALSE); 3253 snaplist_destroy(&pa.origin_snaps, B_FALSE); 3254 if (pa.origin_origin) 3255 dsl_dataset_rele(pa.origin_origin, FTAG); 3256 dsl_dataset_rele(ds, FTAG); 3257 return (err); 3258 } 3259 3260 struct cloneswaparg { 3261 dsl_dataset_t *cds; /* clone dataset */ 3262 dsl_dataset_t *ohds; /* origin's head dataset */ 3263 boolean_t force; 3264 int64_t unused_refres_delta; /* change in unconsumed refreservation */ 3265 }; 3266 3267 /* ARGSUSED */ 3268 static int 3269 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) 3270 { 3271 struct cloneswaparg *csa = arg1; 3272 3273 /* they should both be heads */ 3274 if (dsl_dataset_is_snapshot(csa->cds) || 3275 dsl_dataset_is_snapshot(csa->ohds)) 3276 return (EINVAL); 3277 3278 /* the branch point should be just before them */ 3279 if (csa->cds->ds_prev != csa->ohds->ds_prev) 3280 return (EINVAL); 3281 3282 /* cds should be the clone (unless they are unrelated) */ 3283 if (csa->cds->ds_prev != NULL && 3284 csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && 3285 csa->ohds->ds_object != 3286 csa->cds->ds_prev->ds_phys->ds_next_snap_obj) 3287 return (EINVAL); 3288 3289 /* the clone should be a child of the origin */ 3290 if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) 3291 return (EINVAL); 3292 3293 /* ohds shouldn't be modified unless 'force' */ 3294 if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) 3295 return (ETXTBSY); 3296 3297 /* adjust amount of any unconsumed refreservation */ 3298 csa->unused_refres_delta = 3299 (int64_t)MIN(csa->ohds->ds_reserved, 3300 csa->ohds->ds_phys->ds_unique_bytes) - 3301 (int64_t)MIN(csa->ohds->ds_reserved, 3302 csa->cds->ds_phys->ds_unique_bytes); 3303 3304 if (csa->unused_refres_delta > 0 && 3305 csa->unused_refres_delta > 3306 dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) 3307 return (ENOSPC); 3308 3309 if (csa->ohds->ds_quota != 0 && 3310 csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) 3311 return (EDQUOT); 3312 3313 return (0); 3314 } 3315 3316 /* ARGSUSED */ 3317 static void 3318 dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3319 { 3320 struct 
cloneswaparg *csa = arg1; 3321 dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; 3322 3323 ASSERT(csa->cds->ds_reserved == 0); 3324 ASSERT(csa->ohds->ds_quota == 0 || 3325 csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); 3326 3327 dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); 3328 dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); 3329 3330 if (csa->cds->ds_objset != NULL) { 3331 dmu_objset_evict(csa->cds->ds_objset); 3332 csa->cds->ds_objset = NULL; 3333 } 3334 3335 if (csa->ohds->ds_objset != NULL) { 3336 dmu_objset_evict(csa->ohds->ds_objset); 3337 csa->ohds->ds_objset = NULL; 3338 } 3339 3340 /* 3341 * Reset origin's unique bytes, if it exists. 3342 */ 3343 if (csa->cds->ds_prev) { 3344 dsl_dataset_t *origin = csa->cds->ds_prev; 3345 uint64_t comp, uncomp; 3346 3347 dmu_buf_will_dirty(origin->ds_dbuf, tx); 3348 dsl_deadlist_space_range(&csa->cds->ds_deadlist, 3349 origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, 3350 &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); 3351 } 3352 3353 /* swap blkptrs */ 3354 { 3355 blkptr_t tmp; 3356 tmp = csa->ohds->ds_phys->ds_bp; 3357 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; 3358 csa->cds->ds_phys->ds_bp = tmp; 3359 } 3360 3361 /* set dd_*_bytes */ 3362 { 3363 int64_t dused, dcomp, duncomp; 3364 uint64_t cdl_used, cdl_comp, cdl_uncomp; 3365 uint64_t odl_used, odl_comp, odl_uncomp; 3366 3367 ASSERT3U(csa->cds->ds_dir->dd_phys-> 3368 dd_used_breakdown[DD_USED_SNAP], ==, 0); 3369 3370 dsl_deadlist_space(&csa->cds->ds_deadlist, 3371 &cdl_used, &cdl_comp, &cdl_uncomp); 3372 dsl_deadlist_space(&csa->ohds->ds_deadlist, 3373 &odl_used, &odl_comp, &odl_uncomp); 3374 3375 dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used - 3376 (csa->ohds->ds_phys->ds_referenced_bytes + odl_used); 3377 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - 3378 (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); 3379 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + 3380 cdl_uncomp - 3381 (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); 3382 3383 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, 3384 dused, dcomp, duncomp, tx); 3385 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, 3386 -dused, -dcomp, -duncomp, tx); 3387 3388 /* 3389 * The difference in the space used by snapshots is the 3390 * difference in snapshot space due to the head's 3391 * deadlist (since that's the only thing that's 3392 * changing that affects the snapused). 3393 */ 3394 dsl_deadlist_space_range(&csa->cds->ds_deadlist, 3395 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, 3396 &cdl_used, &cdl_comp, &cdl_uncomp); 3397 dsl_deadlist_space_range(&csa->ohds->ds_deadlist, 3398 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, 3399 &odl_used, &odl_comp, &odl_uncomp); 3400 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, 3401 DD_USED_HEAD, DD_USED_SNAP, tx); 3402 } 3403 3404 /* swap ds_*_bytes */ 3405 SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes, 3406 csa->cds->ds_phys->ds_referenced_bytes); 3407 SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, 3408 csa->cds->ds_phys->ds_compressed_bytes); 3409 SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, 3410 csa->cds->ds_phys->ds_uncompressed_bytes); 3411 SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, 3412 csa->cds->ds_phys->ds_unique_bytes); 3413 3414 /* apply any parent delta for change in unconsumed refreservation */ 3415 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, 3416 csa->unused_refres_delta, 0, 0, tx); 3417 3418 /* 3419 * Swap deadlists. 
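 * The in-core deadlist caches the object it was opened on, so close
 * both, swap the on-disk object numbers, and reopen each side on its
 * new object.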
3420 */ 3421 dsl_deadlist_close(&csa->cds->ds_deadlist); 3422 dsl_deadlist_close(&csa->ohds->ds_deadlist); 3423 SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, 3424 csa->cds->ds_phys->ds_deadlist_obj); 3425 dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, 3426 csa->cds->ds_phys->ds_deadlist_obj); 3427 dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, 3428 csa->ohds->ds_phys->ds_deadlist_obj); 3429 3430 dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); 3431 3432 spa_history_log_internal_ds(csa->cds, "clone swap", tx, 3433 "parent=%s", csa->ohds->ds_dir->dd_myname); 3434 } 3435 3436 /* 3437 * Swap 'clone' with its origin head datasets. Used at the end of "zfs 3438 * recv" into an existing fs to swizzle the file system to the new 3439 * version, and by "zfs rollback". Can also be used to swap two 3440 * independent head datasets if neither has any snapshots. 3441 */ 3442 int 3443 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, 3444 boolean_t force) 3445 { 3446 struct cloneswaparg csa; 3447 int error; 3448 3449 ASSERT(clone->ds_owner); 3450 ASSERT(origin_head->ds_owner); 3451 retry: 3452 /* 3453 * Need exclusive access for the swap. If we're swapping these 3454 * datasets back after an error, we already hold the locks. 3455 */ 3456 if (!RW_WRITE_HELD(&clone->ds_rwlock)) 3457 rw_enter(&clone->ds_rwlock, RW_WRITER); 3458 if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && 3459 !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { 3460 rw_exit(&clone->ds_rwlock); 3461 rw_enter(&origin_head->ds_rwlock, RW_WRITER); 3462 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { 3463 rw_exit(&origin_head->ds_rwlock); 3464 goto retry; 3465 } 3466 } 3467 csa.cds = clone; 3468 csa.ohds = origin_head; 3469 csa.force = force; 3470 error = dsl_sync_task_do(clone->ds_dir->dd_pool, 3471 dsl_dataset_clone_swap_check, 3472 dsl_dataset_clone_swap_sync, &csa, NULL, 9); 3473 return (error); 3474 } 3475 3476 /* 3477 * Given a pool name and a dataset object number in that pool, 3478 * return the name of that dataset. 3479 */ 3480 int 3481 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) 3482 { 3483 spa_t *spa; 3484 dsl_pool_t *dp; 3485 dsl_dataset_t *ds; 3486 int error; 3487 3488 if ((error = spa_open(pname, &spa, FTAG)) != 0) 3489 return (error); 3490 dp = spa_get_dsl(spa); 3491 rw_enter(&dp->dp_config_rwlock, RW_READER); 3492 if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { 3493 dsl_dataset_name(ds, buf); 3494 dsl_dataset_rele(ds, FTAG); 3495 } 3496 rw_exit(&dp->dp_config_rwlock); 3497 spa_close(spa, FTAG); 3498 3499 return (error); 3500 } 3501 3502 int 3503 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, 3504 uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) 3505 { 3506 int error = 0; 3507 3508 ASSERT3S(asize, >, 0); 3509 3510 /* 3511 * *ref_rsrv is the portion of asize that will come from any 3512 * unconsumed refreservation space. 3513 */ 3514 *ref_rsrv = 0; 3515 3516 mutex_enter(&ds->ds_lock); 3517 /* 3518 * Make a space adjustment for reserved bytes. 
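 * Space up to the refreservation has already been charged to the
 * parent, so back it out of *used and report in *ref_rsrv how much of
 * asize the unconsumed reservation will absorb.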
3519 */
3520 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
3521 ASSERT3U(*used, >=,
3522 ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3523 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3524 *ref_rsrv =
3525 asize - MIN(asize, parent_delta(ds, asize + inflight));
3526 }
3527
3528 if (!check_quota || ds->ds_quota == 0) {
3529 mutex_exit(&ds->ds_lock);
3530 return (0);
3531 }
3532 /*
3533 * If they are requesting more space, and our current estimate
3534 * is over quota, they get to try again unless the actual
3535 * on-disk is over quota and there are no pending changes (which
3536 * may free up space for us).
3537 */
3538 if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
3539 if (inflight > 0 ||
3540 ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
3541 error = ERESTART;
3542 else
3543 error = EDQUOT;
3544 }
3545 mutex_exit(&ds->ds_lock);
3546
3547 return (error);
3548 }
3549
3550 /* ARGSUSED */
3551 static int
3552 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
3553 {
3554 dsl_dataset_t *ds = arg1;
3555 dsl_prop_setarg_t *psa = arg2;
3556 int err;
3557
3558 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
3559 return (ENOTSUP);
3560
3561 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3562 return (err);
3563
3564 if (psa->psa_effective_value == 0)
3565 return (0);
3566
3567 if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
3568 psa->psa_effective_value < ds->ds_reserved)
3569 return (ENOSPC);
3570
3571 return (0);
3572 }
3573
3574 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
3575
3576 void
3577 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3578 {
3579 dsl_dataset_t *ds = arg1;
3580 dsl_prop_setarg_t *psa = arg2;
3581 uint64_t effective_value = psa->psa_effective_value;
3582
3583 dsl_prop_set_sync(ds, psa, tx);
3584 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3585
3586 if (ds->ds_quota != effective_value) {
3587 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3588 ds->ds_quota = effective_value;
3589 }
3590 }
3591
3592 int
3593 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
3594 {
3595 dsl_dataset_t *ds;
3596 dsl_prop_setarg_t psa;
3597 int err;
3598
3599 dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
3600
3601 err = dsl_dataset_hold(dsname, FTAG, &ds);
3602 if (err)
3603 return (err);
3604
3605 /*
3606 * If someone removes a file, then tries to set the quota, we
3607 * want to make sure the file freeing takes effect.
3608 */
3609 txg_wait_open(ds->ds_dir->dd_pool, 0);
3610
3611 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3612 dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
3613 ds, &psa, 0);
3614
3615 dsl_dataset_rele(ds, FTAG);
3616 return (err);
3617 }
3618
3619 static int
3620 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
3621 {
3622 dsl_dataset_t *ds = arg1;
3623 dsl_prop_setarg_t *psa = arg2;
3624 uint64_t effective_value;
3625 uint64_t unique;
3626 int err;
3627
3628 if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
3629 SPA_VERSION_REFRESERVATION)
3630 return (ENOTSUP);
3631
3632 if (dsl_dataset_is_snapshot(ds))
3633 return (EINVAL);
3634
3635 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3636 return (err);
3637
3638 effective_value = psa->psa_effective_value;
3639
3640 /*
3641 * If we are doing the preliminary check in open context, the
3642 * space estimates may be inaccurate.
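 * In that case simply let the request through; the authoritative check
 * runs again in syncing context, where the numbers are stable.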
3643 */ 3644 if (!dmu_tx_is_syncing(tx)) 3645 return (0); 3646 3647 mutex_enter(&ds->ds_lock); 3648 if (!DS_UNIQUE_IS_ACCURATE(ds)) 3649 dsl_dataset_recalc_head_uniq(ds); 3650 unique = ds->ds_phys->ds_unique_bytes; 3651 mutex_exit(&ds->ds_lock); 3652 3653 if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { 3654 uint64_t delta = MAX(unique, effective_value) - 3655 MAX(unique, ds->ds_reserved); 3656 3657 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) 3658 return (ENOSPC); 3659 if (ds->ds_quota > 0 && 3660 effective_value > ds->ds_quota) 3661 return (ENOSPC); 3662 } 3663 3664 return (0); 3665 } 3666 3667 static void 3668 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3669 { 3670 dsl_dataset_t *ds = arg1; 3671 dsl_prop_setarg_t *psa = arg2; 3672 uint64_t effective_value = psa->psa_effective_value; 3673 uint64_t unique; 3674 int64_t delta; 3675 3676 dsl_prop_set_sync(ds, psa, tx); 3677 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); 3678 3679 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3680 3681 mutex_enter(&ds->ds_dir->dd_lock); 3682 mutex_enter(&ds->ds_lock); 3683 ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); 3684 unique = ds->ds_phys->ds_unique_bytes; 3685 delta = MAX(0, (int64_t)(effective_value - unique)) - 3686 MAX(0, (int64_t)(ds->ds_reserved - unique)); 3687 ds->ds_reserved = effective_value; 3688 mutex_exit(&ds->ds_lock); 3689 3690 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); 3691 mutex_exit(&ds->ds_dir->dd_lock); 3692 } 3693 3694 int 3695 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, 3696 uint64_t reservation) 3697 { 3698 dsl_dataset_t *ds; 3699 dsl_prop_setarg_t psa; 3700 int err; 3701 3702 dsl_prop_setarg_init_uint64(&psa, "refreservation", source, 3703 &reservation); 3704 3705 err = dsl_dataset_hold(dsname, FTAG, &ds); 3706 if (err) 3707 return (err); 3708 3709 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3710 dsl_dataset_set_reservation_check, 3711 dsl_dataset_set_reservation_sync, ds, &psa, 0); 3712 3713 dsl_dataset_rele(ds, FTAG); 3714 return (err); 3715 } 3716 3717 typedef struct zfs_hold_cleanup_arg { 3718 dsl_pool_t *dp; 3719 uint64_t dsobj; 3720 char htag[MAXNAMELEN]; 3721 } zfs_hold_cleanup_arg_t; 3722 3723 static void 3724 dsl_dataset_user_release_onexit(void *arg) 3725 { 3726 zfs_hold_cleanup_arg_t *ca = arg; 3727 3728 (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, 3729 B_TRUE); 3730 kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); 3731 } 3732 3733 void 3734 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, 3735 minor_t minor) 3736 { 3737 zfs_hold_cleanup_arg_t *ca; 3738 3739 ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); 3740 ca->dp = ds->ds_dir->dd_pool; 3741 ca->dsobj = ds->ds_object; 3742 (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); 3743 VERIFY3U(0, ==, zfs_onexit_add_cb(minor, 3744 dsl_dataset_user_release_onexit, ca, NULL)); 3745 } 3746 3747 /* 3748 * If you add new checks here, you may need to add 3749 * additional checks to the "temporary" case in 3750 * snapshot_check() in dmu_objset.c. 
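 * (That path takes a hold as part of creating the snapshot in a single
 * sync task, so it duplicates these checks rather than calling this
 * function directly.)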
/*
 * If you add new checks here, you may need to add additional checks to the
 * "temporary" case in snapshot_check() in dmu_objset.c.
 */
static int
dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct dsl_ds_holdarg *ha = arg2;
	const char *htag = ha->htag;
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	int error = 0;

	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
		return (ENOTSUP);

	if (!dsl_dataset_is_snapshot(ds))
		return (EINVAL);

	/* tags must be unique */
	mutex_enter(&ds->ds_lock);
	if (ds->ds_phys->ds_userrefs_obj) {
		error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
		    8, 1, tx);
		if (error == 0)
			error = EEXIST;
		else if (error == ENOENT)
			error = 0;
	}
	mutex_exit(&ds->ds_lock);

	if (error == 0 && ha->temphold &&
	    strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
		error = E2BIG;

	return (error);
}

void
dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct dsl_ds_holdarg *ha = arg2;
	const char *htag = ha->htag;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t now = gethrestime_sec();
	uint64_t zapobj;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_phys->ds_userrefs_obj == 0) {
		/*
		 * This is the first user hold for this dataset.  Create
		 * the userrefs zap object.
		 */
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		zapobj = ds->ds_phys->ds_userrefs_obj =
		    zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
	} else {
		zapobj = ds->ds_phys->ds_userrefs_obj;
	}
	ds->ds_userrefs++;
	mutex_exit(&ds->ds_lock);

	VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));

	if (ha->temphold) {
		VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
		    htag, &now, tx));
	}

	spa_history_log_internal_ds(ds, "hold", tx,
	    "tag = %s temp = %d holds now = %llu",
	    htag, (int)ha->temphold, ds->ds_userrefs);
}

static int
dsl_dataset_user_hold_one(const char *dsname, void *arg)
{
	struct dsl_ds_holdarg *ha = arg;
	dsl_dataset_t *ds;
	int error;
	char *name;

	/* alloc a buffer to hold dsname@snapname plus terminating NULL */
	name = kmem_asprintf("%s@%s", dsname, ha->snapname);
	error = dsl_dataset_hold(name, ha->dstg, &ds);
	strfree(name);
	if (error == 0) {
		ha->gotone = B_TRUE;
		dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
		    dsl_dataset_user_hold_sync, ds, ha, 0);
	} else if (error == ENOENT && ha->recursive) {
		error = 0;
	} else {
		(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
	}
	return (error);
}

int
dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
    boolean_t temphold)
{
	struct dsl_ds_holdarg *ha;
	int error;

	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
	ha->htag = htag;
	ha->temphold = temphold;
	error = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
	    ds, ha, 0);
	kmem_free(ha, sizeof (struct dsl_ds_holdarg));

	return (error);
}
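/*
 * Editorial note (not part of the original source):
 * dsl_dataset_user_hold_for_send() above pins a snapshot the caller already
 * has a dsl_dataset_t for (e.g. the send code), running the same check/sync
 * pair as a single sync task.  A hypothetical caller might do the
 * following; the tag name is illustrative only:
 */
#if 0
	/* Keep the snapshot from being destroyed while we stream it. */
	error = dsl_dataset_user_hold_for_send(ds, ".send-hold", B_TRUE);
#endif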
int
dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
    boolean_t recursive, boolean_t temphold, int cleanup_fd)
{
	struct dsl_ds_holdarg *ha;
	dsl_sync_task_t *dst;
	spa_t *spa;
	int error;
	minor_t minor = 0;

	if (cleanup_fd != -1) {
		/* Currently we only support cleanup-on-exit of tempholds. */
		if (!temphold)
			return (EINVAL);
		error = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (error)
			return (error);
	}

	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);

	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));

	error = spa_open(dsname, &spa, FTAG);
	if (error) {
		kmem_free(ha, sizeof (struct dsl_ds_holdarg));
		if (cleanup_fd != -1)
			zfs_onexit_fd_rele(cleanup_fd);
		return (error);
	}

	ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	ha->htag = htag;
	ha->snapname = snapname;
	ha->recursive = recursive;
	ha->temphold = temphold;

	if (recursive) {
		error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
		    ha, DS_FIND_CHILDREN);
	} else {
		error = dsl_dataset_user_hold_one(dsname, ha);
	}
	if (error == 0)
		error = dsl_sync_task_group_wait(ha->dstg);

	for (dst = list_head(&ha->dstg->dstg_tasks); dst;
	    dst = list_next(&ha->dstg->dstg_tasks, dst)) {
		dsl_dataset_t *ds = dst->dst_arg1;

		if (dst->dst_err) {
			dsl_dataset_name(ds, ha->failed);
			*strchr(ha->failed, '@') = '\0';
		} else if (error == 0 && minor != 0 && temphold) {
			/*
			 * If this hold is to be released upon process exit,
			 * register that action now.
			 */
			dsl_register_onexit_hold_cleanup(ds, htag, minor);
		}
		dsl_dataset_rele(ds, ha->dstg);
	}

	if (error == 0 && recursive && !ha->gotone)
		error = ENOENT;

	if (error)
		(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));

	dsl_sync_task_group_destroy(ha->dstg);

	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
	spa_close(spa, FTAG);
	if (cleanup_fd != -1)
		zfs_onexit_fd_rele(cleanup_fd);
	return (error);
}

struct dsl_ds_releasearg {
	dsl_dataset_t *ds;
	const char *htag;
	boolean_t own;		/* do we own or just hold ds? */
};
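/*
 * Editorial note (not part of the original source): ra->own is B_TRUE only
 * when the release may turn into a deferred destroy.  Destroying requires
 * owning the dataset (see dsl_dataset_tryown() in
 * dsl_dataset_user_release_one() below), whereas an ordinary release only
 * needs the hold taken at lookup time.
 */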
static int
dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
    boolean_t *might_destroy)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t zapobj;
	uint64_t tmp;
	int error;

	*might_destroy = B_FALSE;

	mutex_enter(&ds->ds_lock);
	zapobj = ds->ds_phys->ds_userrefs_obj;
	if (zapobj == 0) {
		/* The tag can't possibly exist */
		mutex_exit(&ds->ds_lock);
		return (ESRCH);
	}

	/* Make sure the tag exists */
	error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
	if (error) {
		mutex_exit(&ds->ds_lock);
		if (error == ENOENT)
			error = ESRCH;
		return (error);
	}

	if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
	    DS_IS_DEFER_DESTROY(ds))
		*might_destroy = B_TRUE;

	mutex_exit(&ds->ds_lock);
	return (0);
}

static int
dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
{
	struct dsl_ds_releasearg *ra = arg1;
	dsl_dataset_t *ds = ra->ds;
	boolean_t might_destroy;
	int error;

	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
		return (ENOTSUP);

	error = dsl_dataset_release_might_destroy(ds, ra->htag,
	    &might_destroy);
	if (error)
		return (error);

	if (might_destroy) {
		struct dsl_ds_destroyarg dsda = {0};

		if (dmu_tx_is_syncing(tx)) {
			/*
			 * If we're not prepared to remove the snapshot,
			 * we can't allow the release to happen right now.
			 */
			if (!ra->own)
				return (EBUSY);
		}
		dsda.ds = ds;
		dsda.releasing = B_TRUE;
		return (dsl_dataset_destroy_check(&dsda, tag, tx));
	}

	return (0);
}

static void
dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
	struct dsl_ds_releasearg *ra = arg1;
	dsl_dataset_t *ds = ra->ds;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj;
	uint64_t refs;
	int error;

	mutex_enter(&ds->ds_lock);
	ds->ds_userrefs--;
	refs = ds->ds_userrefs;
	mutex_exit(&ds->ds_lock);
	error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
	VERIFY(error == 0 || error == ENOENT);
	zapobj = ds->ds_phys->ds_userrefs_obj;
	VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));

	spa_history_log_internal_ds(ds, "release", tx,
	    "tag = %s refs now = %lld", ra->htag, (longlong_t)refs);

	if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
	    DS_IS_DEFER_DESTROY(ds)) {
		struct dsl_ds_destroyarg dsda = {0};

		ASSERT(ra->own);
		dsda.ds = ds;
		dsda.releasing = B_TRUE;
		/* We already did the destroy_check */
		dsl_dataset_destroy_sync(&dsda, tag, tx);
	}
}
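/*
 * Editorial note (not part of the original source): the pairing above is
 * what makes deferred destroy ("zfs destroy -d") interact with holds.  A
 * defer-destroyed snapshot with a single user hold and no clones
 * (ds_num_children == 1) reports might_destroy, so when that last hold is
 * released, dsl_dataset_user_release_sync() completes the destroy in the
 * same txg using the destroy check/sync functions.
 */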
static int
dsl_dataset_user_release_one(const char *dsname, void *arg)
{
	struct dsl_ds_holdarg *ha = arg;
	struct dsl_ds_releasearg *ra;
	dsl_dataset_t *ds;
	int error;
	void *dtag = ha->dstg;
	char *name;
	boolean_t own = B_FALSE;
	boolean_t might_destroy;

	/* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
	name = kmem_asprintf("%s@%s", dsname, ha->snapname);
	error = dsl_dataset_hold(name, dtag, &ds);
	strfree(name);
	if (error == ENOENT && ha->recursive)
		return (0);
	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
	if (error)
		return (error);

	ha->gotone = B_TRUE;

	ASSERT(dsl_dataset_is_snapshot(ds));

	error = dsl_dataset_release_might_destroy(ds, ha->htag,
	    &might_destroy);
	if (error) {
		dsl_dataset_rele(ds, dtag);
		return (error);
	}

	if (might_destroy) {
#ifdef _KERNEL
		name = kmem_asprintf("%s@%s", dsname, ha->snapname);
		error = zfs_unmount_snap(name, NULL);
		strfree(name);
		if (error) {
			dsl_dataset_rele(ds, dtag);
			return (error);
		}
#endif
		if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
			dsl_dataset_rele(ds, dtag);
			return (EBUSY);
		} else {
			own = B_TRUE;
			dsl_dataset_make_exclusive(ds, dtag);
		}
	}

	ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
	ra->ds = ds;
	ra->htag = ha->htag;
	ra->own = own;
	dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
	    dsl_dataset_user_release_sync, ra, dtag, 0);

	return (0);
}

int
dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
    boolean_t recursive)
{
	struct dsl_ds_holdarg *ha;
	dsl_sync_task_t *dst;
	spa_t *spa;
	int error;

top:
	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);

	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));

	error = spa_open(dsname, &spa, FTAG);
	if (error) {
		kmem_free(ha, sizeof (struct dsl_ds_holdarg));
		return (error);
	}

	ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	ha->htag = htag;
	ha->snapname = snapname;
	ha->recursive = recursive;
	if (recursive) {
		error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
		    ha, DS_FIND_CHILDREN);
	} else {
		error = dsl_dataset_user_release_one(dsname, ha);
	}
	if (error == 0)
		error = dsl_sync_task_group_wait(ha->dstg);

	for (dst = list_head(&ha->dstg->dstg_tasks); dst;
	    dst = list_next(&ha->dstg->dstg_tasks, dst)) {
		struct dsl_ds_releasearg *ra = dst->dst_arg1;
		dsl_dataset_t *ds = ra->ds;

		if (dst->dst_err)
			dsl_dataset_name(ds, ha->failed);

		if (ra->own)
			dsl_dataset_disown(ds, ha->dstg);
		else
			dsl_dataset_rele(ds, ha->dstg);

		kmem_free(ra, sizeof (struct dsl_ds_releasearg));
	}

	if (error == 0 && recursive && !ha->gotone)
		error = ENOENT;

	if (error && error != EBUSY)
		(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));

	dsl_sync_task_group_destroy(ha->dstg);
	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
	spa_close(spa, FTAG);

	/*
	 * We can get EBUSY if we were racing with deferred destroy and
	 * dsl_dataset_user_release_check() hadn't done the necessary
	 * open context setup.  We can also get EBUSY if we're racing
	 * with destroy and that thread is the ds_owner.  Either way
	 * the busy condition should be transient, and we should retry
	 * the release operation.
	 */
	if (error == EBUSY)
		goto top;

	return (error);
}
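/*
 * Editorial sketch (not part of the original source): a minimal hold/release
 * round trip through the two public entry points above, assuming the
 * snapshot tank/fs@snap exists and "my-tag" is not already held.  Note the
 * name buffers are writable arrays because both functions copy the failing
 * dataset name back into dsname on error.
 */
#if 0
	char ds[MAXNAMELEN] = "tank/fs";
	char snap[MAXNAMELEN] = "snap";
	char tag[MAXNAMELEN] = "my-tag";

	error = dsl_dataset_user_hold(ds, snap, tag, B_FALSE, B_FALSE, -1);
	if (error == 0)
		error = dsl_dataset_user_release(ds, snap, tag, B_FALSE);
#endif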
/*
 * Called at spa_load time (with retry == B_FALSE) to release a stale
 * temporary user hold.  Also called by the onexit code (with retry ==
 * B_TRUE).
 */
int
dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
    boolean_t retry)
{
	dsl_dataset_t *ds;
	char *snap;
	char *name;
	int namelen;
	int error;

	do {
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
		rw_exit(&dp->dp_config_rwlock);
		if (error)
			return (error);
		namelen = dsl_dataset_namelen(ds) + 1;
		name = kmem_alloc(namelen, KM_SLEEP);
		dsl_dataset_name(ds, name);
		dsl_dataset_rele(ds, FTAG);

		snap = strchr(name, '@');
		*snap = '\0';
		++snap;
		error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
		kmem_free(name, namelen);

		/*
		 * The object can't have been destroyed because we have a hold,
		 * but it might have been renamed, resulting in ENOENT.  Retry
		 * if we've been requested to do so.
		 *
		 * It would be nice if we could use the dsobj all the way
		 * through and avoid ENOENT entirely.  But we might need to
		 * unmount the snapshot, and there's currently no way to look
		 * up a vfsp using a ZFS object id.
		 */
	} while ((error == ENOENT) && retry);

	return (error);
}

int
dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
{
	dsl_dataset_t *ds;
	int err;

	err = dsl_dataset_hold(dsname, FTAG, &ds);
	if (err)
		return (err);

	VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
	if (ds->ds_phys->ds_userrefs_obj != 0) {
		zap_attribute_t *za;
		zap_cursor_t zc;

		za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
		for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
		    ds->ds_phys->ds_userrefs_obj);
		    zap_cursor_retrieve(&zc, za) == 0;
		    zap_cursor_advance(&zc)) {
			VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
			    za->za_first_integer));
		}
		zap_cursor_fini(&zc);
		kmem_free(za, sizeof (zap_attribute_t));
	}
	dsl_dataset_rele(ds, FTAG);
	return (0);
}

/*
 * Note, this function is used as the callback for dmu_objset_find().  We
 * always return 0 so that we will continue to find and process
 * inconsistent datasets, even if we encounter an error trying to
 * process one of them.
 */
/* ARGSUSED */
int
dsl_destroy_inconsistent(const char *dsname, void *arg)
{
	dsl_dataset_t *ds;

	if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
		if (DS_IS_INCONSISTENT(ds))
			(void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
		else
			dsl_dataset_disown(ds, FTAG);
	}
	return (0);
}
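/*
 * Editorial note (not part of the original source): to our knowledge,
 * dsl_dataset_space_written() below is what backs the "written" and
 * "written@<snap>" dataset properties.  A worked example of the
 * calculation described in the comment that follows: if new references
 * 10G, oldsnap referenced 8G, and 3G of data born before oldsnap has since
 * been freed (so it sits on the deadlists walked below), then
 * written = 10G - 8G + 3G = 5G.
 */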
/*
 * Return (in *usedp) the amount of space written in new that is not
 * present in oldsnap.  New may be a snapshot or the head.  Old must be
 * a snapshot before new, in new's filesystem (or its origin).  If not then
 * fail and return EINVAL.
 *
 * The written space is calculated by considering two components: First, we
 * ignore any freed space, and calculate the written as new's used space
 * minus old's used space.  Next, we add in the amount of space that was freed
 * between the two snapshots, thus reducing new's used space relative to old's.
 * Specifically, this is the space that was born before old->ds_creation_txg,
 * and freed before new (ie. on new's deadlist or a previous deadlist).
 *
 * space freed                        [---------------------]
 * snapshots                      ---O-------O--------O-------O------
 *                                        oldsnap            new
 */
int
dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	int err = 0;
	uint64_t snapobj;
	dsl_pool_t *dp = new->ds_dir->dd_pool;

	*usedp = 0;
	*usedp += new->ds_phys->ds_referenced_bytes;
	*usedp -= oldsnap->ds_phys->ds_referenced_bytes;

	*compp = 0;
	*compp += new->ds_phys->ds_compressed_bytes;
	*compp -= oldsnap->ds_phys->ds_compressed_bytes;

	*uncompp = 0;
	*uncompp += new->ds_phys->ds_uncompressed_bytes;
	*uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	snapobj = new->ds_object;
	while (snapobj != oldsnap->ds_object) {
		dsl_dataset_t *snap;
		uint64_t used, comp, uncomp;

		if (snapobj == new->ds_object) {
			snap = new;
		} else {
			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
			if (err != 0)
				break;
		}

		if (snap->ds_phys->ds_prev_snap_txg ==
		    oldsnap->ds_phys->ds_creation_txg) {
			/*
			 * The blocks in the deadlist can not be born after
			 * ds_prev_snap_txg, so get the whole deadlist space,
			 * which is more efficient (especially for old-format
			 * deadlists).  Unfortunately the deadlist code
			 * doesn't have enough information to make this
			 * optimization itself.
			 */
			dsl_deadlist_space(&snap->ds_deadlist,
			    &used, &comp, &uncomp);
		} else {
			dsl_deadlist_space_range(&snap->ds_deadlist,
			    0, oldsnap->ds_phys->ds_creation_txg,
			    &used, &comp, &uncomp);
		}
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;

		/*
		 * If we get to the beginning of the chain of snapshots
		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
		 * was not a snapshot of/before new.
		 */
		snapobj = snap->ds_phys->ds_prev_snap_obj;
		if (snap != new)
			dsl_dataset_rele(snap, FTAG);
		if (snapobj == 0) {
			err = EINVAL;
			break;
		}
	}
	rw_exit(&dp->dp_config_rwlock);
	return (err);
}
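/*
 * Editorial note (not part of the original source): a dry-run destroy of a
 * snapshot range, such as "zfs destroy -nv tank/fs@a%c", needs the estimate
 * computed by dsl_dataset_space_wouldfree() below.  Each block in the range
 * is counted exactly once: we walk the deadlist of every snapshot from the
 * one after lastsnap back to the one after firstsnap, and in each deadlist
 * count only blocks born after the snapshot preceding firstsnap (the
 * dsl_deadlist_space_range(prev_snap_txg, UINT64_MAX) call).
 */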
/*
 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
 * lastsnap, and all snapshots in between are deleted.
 *
 * blocks that would be freed         [---------------------------]
 * snapshots                      ---O-------O--------O-------O--------O
 *                                      firstsnap            lastsnap
 *
 * This is the set of blocks that were born after the snap before firstsnap,
 * (birth > firstsnap->prev_snap_txg) and died before the snap after the
 * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
 * We calculate this by iterating over the relevant deadlists (from the snap
 * after lastsnap, backward to the snap after firstsnap), summing up the
 * space on the deadlist that was born after the snap before firstsnap.
 */
int
dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
    dsl_dataset_t *lastsnap,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	int err = 0;
	uint64_t snapobj;
	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;

	ASSERT(dsl_dataset_is_snapshot(firstsnap));
	ASSERT(dsl_dataset_is_snapshot(lastsnap));

	/*
	 * Check that the snapshots are in the same dsl_dir, and firstsnap
	 * is before lastsnap.
	 */
	if (firstsnap->ds_dir != lastsnap->ds_dir ||
	    firstsnap->ds_phys->ds_creation_txg >
	    lastsnap->ds_phys->ds_creation_txg)
		return (EINVAL);

	*usedp = *compp = *uncompp = 0;

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	snapobj = lastsnap->ds_phys->ds_next_snap_obj;
	while (snapobj != firstsnap->ds_object) {
		dsl_dataset_t *ds;
		uint64_t used, comp, uncomp;

		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
		if (err != 0)
			break;

		dsl_deadlist_space_range(&ds->ds_deadlist,
		    firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
		    &used, &comp, &uncomp);
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;

		snapobj = ds->ds_phys->ds_prev_snap_obj;
		ASSERT3U(snapobj, !=, 0);
		dsl_dataset_rele(ds, FTAG);
	}
	rw_exit(&dp->dp_config_rwlock);
	return (err);
}