/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/unique.h>
#include <sys/zfs_context.h>
#include <sys/zfs_ioctl.h>
#include <sys/spa.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_onexit.h>
#include <sys/zvol.h>
#include <sys/dsl_scan.h>
#include <sys/dsl_deadlist.h>

static char *dsl_reaper = "the grim reaper";

static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;

#define SWITCH64(x, y) \
{ \
    uint64_t __tmp = (x); \
    (x) = (y); \
    (y) = __tmp; \
}

#define DS_REF_MAX (1ULL << 62)

#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE

#define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper)


/*
 * Figure out how much of this delta should be propagated to the dsl_dir
 * layer.  If there's a refreservation, that space has already been
 * partially accounted for in our ancestors.
 */
static int64_t
parent_delta(dsl_dataset_t *ds, int64_t delta)
{
    uint64_t old_bytes, new_bytes;

    if (ds->ds_reserved == 0)
        return (delta);

    old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
    new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);

    ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
    return (new_bytes - old_bytes);
}
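
/*
 * Called in syncing context when a new block is allocated on behalf of
 * this dataset: charges the block's on-disk (dsize), physical, and
 * logical sizes to the dataset and its dsl_dir.  The matching deduction
 * happens in dsl_dataset_block_kill() below.  A NULL 'ds' denotes the
 * meta-objset, whose space is tracked in the pool's placeholder dsl_dir.
 */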
void
dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
    int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
    int compressed = BP_GET_PSIZE(bp);
    int uncompressed = BP_GET_UCSIZE(bp);
    int64_t delta;

    dprintf_bp(bp, "ds=%p", ds);

    ASSERT(dmu_tx_is_syncing(tx));
    /* It could have been compressed away to nothing */
    if (BP_IS_HOLE(bp))
        return;
    ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
    ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
    if (ds == NULL) {
        /*
         * Account for the meta-objset space in its placeholder
         * dsl_dir.
         */
        ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
        dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
            used, compressed, uncompressed, tx);
        dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
        return;
    }
    dmu_buf_will_dirty(ds->ds_dbuf, tx);

    mutex_enter(&ds->ds_dir->dd_lock);
    mutex_enter(&ds->ds_lock);
    delta = parent_delta(ds, used);
    ds->ds_phys->ds_referenced_bytes += used;
    ds->ds_phys->ds_compressed_bytes += compressed;
    ds->ds_phys->ds_uncompressed_bytes += uncompressed;
    ds->ds_phys->ds_unique_bytes += used;
    mutex_exit(&ds->ds_lock);
    dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
        compressed, uncompressed, tx);
    dsl_dir_transfer_space(ds->ds_dir, used - delta,
        DD_USED_REFRSRV, DD_USED_HEAD, tx);
    mutex_exit(&ds->ds_dir->dd_lock);
}
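
/*
 * Called in syncing context when a block belonging to this dataset is
 * freed or overwritten.  If the block was born after the most recent
 * snapshot it can be freed immediately; otherwise it is recorded on the
 * dataset's deadlist (deferred to dsl_pool_sync() when called from a
 * zio interrupt thread).  Returns the space (dsize) that was charged to
 * the block, or 0 for a hole.
 */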
int
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
    boolean_t async)
{
    if (BP_IS_HOLE(bp))
        return (0);

    ASSERT(dmu_tx_is_syncing(tx));
    ASSERT(bp->blk_birth <= tx->tx_txg);

    int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
    int compressed = BP_GET_PSIZE(bp);
    int uncompressed = BP_GET_UCSIZE(bp);

    ASSERT(used > 0);
    if (ds == NULL) {
        /*
         * Account for the meta-objset space in its placeholder
         * dataset.
         */
        dsl_free(tx->tx_pool, tx->tx_txg, bp);

        dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
            -used, -compressed, -uncompressed, tx);
        dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
        return (used);
    }
    ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);

    ASSERT(!dsl_dataset_is_snapshot(ds));
    dmu_buf_will_dirty(ds->ds_dbuf, tx);

    if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
        int64_t delta;

        dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
        dsl_free(tx->tx_pool, tx->tx_txg, bp);

        mutex_enter(&ds->ds_dir->dd_lock);
        mutex_enter(&ds->ds_lock);
        ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
            !DS_UNIQUE_IS_ACCURATE(ds));
        delta = parent_delta(ds, -used);
        ds->ds_phys->ds_unique_bytes -= used;
        mutex_exit(&ds->ds_lock);
        dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
            delta, -compressed, -uncompressed, tx);
        dsl_dir_transfer_space(ds->ds_dir, -used - delta,
            DD_USED_REFRSRV, DD_USED_HEAD, tx);
        mutex_exit(&ds->ds_dir->dd_lock);
    } else {
        dprintf_bp(bp, "putting on dead list: %s", "");
        if (async) {
            /*
             * We are here as part of zio's write done callback,
             * which means we're a zio interrupt thread.  We can't
             * call dsl_deadlist_insert() now because it may block
             * waiting for I/O.  Instead, put bp on the deferred
             * queue and let dsl_pool_sync() finish the job.
             */
            bplist_append(&ds->ds_pending_deadlist, bp);
        } else {
            dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
        }
        ASSERT3U(ds->ds_prev->ds_object, ==,
            ds->ds_phys->ds_prev_snap_obj);
        ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
        /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
        if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
            ds->ds_object && bp->blk_birth >
            ds->ds_prev->ds_phys->ds_prev_snap_txg) {
            dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
            mutex_enter(&ds->ds_prev->ds_lock);
            ds->ds_prev->ds_phys->ds_unique_bytes += used;
            mutex_exit(&ds->ds_prev->ds_lock);
        }
        if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
            dsl_dir_transfer_space(ds->ds_dir, used,
                DD_USED_HEAD, DD_USED_SNAP, tx);
        }
    }
    mutex_enter(&ds->ds_lock);
    ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
    ds->ds_phys->ds_referenced_bytes -= used;
    ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
    ds->ds_phys->ds_compressed_bytes -= compressed;
    ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
    ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
    mutex_exit(&ds->ds_lock);

    return (used);
}
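
/*
 * Return the txg of this dataset's most recent snapshot, or, if a
 * snapshot may be created later in this same sync (ds_trysnap_txg),
 * that pending txg.  Used by dsl_dataset_block_freeable() to decide
 * whether a freed block must go on the deadlist.
 */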
uint64_t
dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
{
    uint64_t trysnap = 0;

    if (ds == NULL)
        return (0);
    /*
     * The snapshot creation could fail, but that would cause an
     * incorrect FALSE return, which would only result in an
     * overestimation of the amount of space that an operation would
     * consume, which is OK.
     *
     * There's also a small window where we could miss a pending
     * snapshot, because we could set the sync task in the quiescing
     * phase.  So this should only be used as a guess.
     */
    if (ds->ds_trysnap_txg >
        spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
        trysnap = ds->ds_trysnap_txg;
    return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
}

boolean_t
dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
    uint64_t blk_birth)
{
    if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
        return (B_FALSE);

    ddt_prefetch(dsl_dataset_get_spa(ds), bp);

    return (B_TRUE);
}
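
/*
 * dmu_buf user-eviction callback: tears down the in-core dsl_dataset_t
 * once the last hold on its MOS bonus buffer goes away.  'db' is NULL
 * when this is called directly from dsl_dataset_disown() for a dataset
 * that has already been destroyed.
 */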
/* ARGSUSED */
static void
dsl_dataset_evict(dmu_buf_t *db, void *dsv)
{
    dsl_dataset_t *ds = dsv;

    ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));

    unique_remove(ds->ds_fsid_guid);

    if (ds->ds_objset != NULL)
        dmu_objset_evict(ds->ds_objset);

    if (ds->ds_prev) {
        dsl_dataset_drop_ref(ds->ds_prev, ds);
        ds->ds_prev = NULL;
    }

    bplist_destroy(&ds->ds_pending_deadlist);
    if (db != NULL) {
        dsl_deadlist_close(&ds->ds_deadlist);
    } else {
        ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
        ASSERT(!ds->ds_deadlist.dl_oldfmt);
    }
    if (ds->ds_dir)
        dsl_dir_close(ds->ds_dir, ds);

    ASSERT(!list_link_active(&ds->ds_synced_link));

    mutex_destroy(&ds->ds_lock);
    mutex_destroy(&ds->ds_opening_lock);
    rw_destroy(&ds->ds_rwlock);
    cv_destroy(&ds->ds_exclusive_cv);

    kmem_free(ds, sizeof (dsl_dataset_t));
}

static int
dsl_dataset_get_snapname(dsl_dataset_t *ds)
{
    dsl_dataset_phys_t *headphys;
    int err;
    dmu_buf_t *headdbuf;
    dsl_pool_t *dp = ds->ds_dir->dd_pool;
    objset_t *mos = dp->dp_meta_objset;

    if (ds->ds_snapname[0])
        return (0);
    if (ds->ds_phys->ds_next_snap_obj == 0)
        return (0);

    err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
        FTAG, &headdbuf);
    if (err)
        return (err);
    headphys = headdbuf->db_data;
    err = zap_value_search(dp->dp_meta_objset,
        headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
    dmu_buf_rele(headdbuf, FTAG);
    return (err);
}

static int
dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
{
    objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
    uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
    matchtype_t mt;
    int err;

    if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
        mt = MT_FIRST;
    else
        mt = MT_EXACT;

    err = zap_lookup_norm(mos, snapobj, name, 8, 1,
        value, mt, NULL, 0, NULL);
    if (err == ENOTSUP && mt == MT_FIRST)
        err = zap_lookup(mos, snapobj, name, 8, 1, value);
    return (err);
}

static int
dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
{
    objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
    uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
    matchtype_t mt;
    int err;

    dsl_dir_snap_cmtime_update(ds->ds_dir);

    if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
        mt = MT_FIRST;
    else
        mt = MT_EXACT;

    err = zap_remove_norm(mos, snapobj, name, mt, tx);
    if (err == ENOTSUP && mt == MT_FIRST)
        err = zap_remove(mos, snapobj, name, tx);
    return (err);
}
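
/*
 * Look up (or instantiate) the in-core dsl_dataset_t for object 'dsobj'
 * and take a reference on it via its MOS bonus buffer.  On first open
 * this also wires up the dsl_dir, deadlist, previous snapshot, and the
 * cached refquota/refreservation values; a racing open is resolved by
 * dmu_buf_set_user_ie(), and the loser's partially built structure is
 * torn down again.
 */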
static int
dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
    objset_t *mos = dp->dp_meta_objset;
    dmu_buf_t *dbuf;
    dsl_dataset_t *ds;
    int err;
    dmu_object_info_t doi;

    ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
        dsl_pool_sync_context(dp));

    err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
    if (err)
        return (err);

    /* Make sure dsobj has the correct object type. */
    dmu_object_info_from_db(dbuf, &doi);
    if (doi.doi_type != DMU_OT_DSL_DATASET)
        return (EINVAL);

    ds = dmu_buf_get_user(dbuf);
    if (ds == NULL) {
        dsl_dataset_t *winner;

        ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
        ds->ds_dbuf = dbuf;
        ds->ds_object = dsobj;
        ds->ds_phys = dbuf->db_data;

        mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);

        rw_init(&ds->ds_rwlock, 0, 0, 0);
        cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);

        bplist_create(&ds->ds_pending_deadlist);
        dsl_deadlist_open(&ds->ds_deadlist,
            mos, ds->ds_phys->ds_deadlist_obj);

        list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
            offsetof(dmu_sendarg_t, dsa_link));

        if (err == 0) {
            err = dsl_dir_open_obj(dp,
                ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
        }
        if (err) {
            mutex_destroy(&ds->ds_lock);
            mutex_destroy(&ds->ds_opening_lock);
            rw_destroy(&ds->ds_rwlock);
            cv_destroy(&ds->ds_exclusive_cv);
            bplist_destroy(&ds->ds_pending_deadlist);
            dsl_deadlist_close(&ds->ds_deadlist);
            kmem_free(ds, sizeof (dsl_dataset_t));
            dmu_buf_rele(dbuf, tag);
            return (err);
        }

        if (!dsl_dataset_is_snapshot(ds)) {
            ds->ds_snapname[0] = '\0';
            if (ds->ds_phys->ds_prev_snap_obj) {
                err = dsl_dataset_get_ref(dp,
                    ds->ds_phys->ds_prev_snap_obj,
                    ds, &ds->ds_prev);
            }
        } else {
            if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
                err = dsl_dataset_get_snapname(ds);
            if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
                err = zap_count(
                    ds->ds_dir->dd_pool->dp_meta_objset,
                    ds->ds_phys->ds_userrefs_obj,
                    &ds->ds_userrefs);
            }
        }

        if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
            /*
             * In sync context, we're called with either no lock
             * or with the write lock.  If we're not syncing,
             * we're always called with the read lock held.
             */
            boolean_t need_lock =
                !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
                dsl_pool_sync_context(dp);

            if (need_lock)
                rw_enter(&dp->dp_config_rwlock, RW_READER);

            err = dsl_prop_get_ds(ds,
                "refreservation", sizeof (uint64_t), 1,
                &ds->ds_reserved, NULL);
            if (err == 0) {
                err = dsl_prop_get_ds(ds,
                    "refquota", sizeof (uint64_t), 1,
                    &ds->ds_quota, NULL);
            }

            if (need_lock)
                rw_exit(&dp->dp_config_rwlock);
        } else {
            ds->ds_reserved = ds->ds_quota = 0;
        }

        if (err == 0) {
            winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
                dsl_dataset_evict);
        }
        if (err || winner) {
            bplist_destroy(&ds->ds_pending_deadlist);
            dsl_deadlist_close(&ds->ds_deadlist);
            if (ds->ds_prev)
                dsl_dataset_drop_ref(ds->ds_prev, ds);
            dsl_dir_close(ds->ds_dir, ds);
            mutex_destroy(&ds->ds_lock);
            mutex_destroy(&ds->ds_opening_lock);
            rw_destroy(&ds->ds_rwlock);
            cv_destroy(&ds->ds_exclusive_cv);
            kmem_free(ds, sizeof (dsl_dataset_t));
            if (err) {
                dmu_buf_rele(dbuf, tag);
                return (err);
            }
            ds = winner;
        } else {
            ds->ds_fsid_guid =
                unique_insert(ds->ds_phys->ds_fsid_guid);
        }
    }
    ASSERT3P(ds->ds_dbuf, ==, dbuf);
    ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
    ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
        spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
        dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
    mutex_enter(&ds->ds_lock);
    if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
        mutex_exit(&ds->ds_lock);
        dmu_buf_rele(ds->ds_dbuf, tag);
        return (ENOENT);
    }
    mutex_exit(&ds->ds_lock);
    *dsp = ds;
    return (0);
}

static int
dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
{
    dsl_pool_t *dp = ds->ds_dir->dd_pool;

    /*
     * In syncing context we don't want the rwlock lock: there
     * may be an existing writer waiting for sync phase to
     * finish.  We don't need to worry about such writers, since
     * sync phase is single-threaded, so the writer can't be
     * doing anything while we are active.
     */
    if (dsl_pool_sync_context(dp)) {
        ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
        return (0);
    }

    /*
     * Normal users will hold the ds_rwlock as a READER until they
     * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
     * drop their READER lock after they set the ds_owner field.
     *
     * If the dataset is being destroyed, the destroy thread will
     * obtain a WRITER lock for exclusive access after it's done its
     * open-context work and then change the ds_owner to
     * dsl_reaper once destruction is assured.  So threads
     * may block here temporarily, until the "destructability" of
     * the dataset is determined.
     */
    ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
    mutex_enter(&ds->ds_lock);
    while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
        int rc;

        rw_exit(&dp->dp_config_rwlock);
        rc = cv_wait_sig(&ds->ds_exclusive_cv, &ds->ds_lock);
        if (!rc || DSL_DATASET_IS_DESTROYED(ds)) {
            mutex_exit(&ds->ds_lock);
            dsl_dataset_drop_ref(ds, tag);
            rw_enter(&dp->dp_config_rwlock, RW_READER);
            return (rc ? ENOENT : EINTR);
        }
        /*
         * The dp_config_rwlock lives above the ds_lock.  And
         * we need to check DSL_DATASET_IS_DESTROYED() while
         * holding the ds_lock, so we have to drop and reacquire
         * the ds_lock here.
         */
        mutex_exit(&ds->ds_lock);
        rw_enter(&dp->dp_config_rwlock, RW_READER);
        mutex_enter(&ds->ds_lock);
    }
    mutex_exit(&ds->ds_lock);
    return (0);
}

int
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
    int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);

    if (err)
        return (err);
    return (dsl_dataset_hold_ref(*dsp, tag));
}

int
dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
    void *tag, dsl_dataset_t **dsp)
{
    int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
    if (err)
        return (err);
    if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
        dsl_dataset_rele(*dsp, tag);
        *dsp = NULL;
        return (EBUSY);
    }
    return (0);
}

int
dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
{
    dsl_dir_t *dd;
    dsl_pool_t *dp;
    const char *snapname;
    uint64_t obj;
    int err = 0;

    err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
    if (err)
        return (err);

    dp = dd->dd_pool;
    obj = dd->dd_phys->dd_head_dataset_obj;
    rw_enter(&dp->dp_config_rwlock, RW_READER);
    if (obj)
        err = dsl_dataset_get_ref(dp, obj, tag, dsp);
    else
        err = ENOENT;
    if (err)
        goto out;

    err = dsl_dataset_hold_ref(*dsp, tag);

    /* we may be looking for a snapshot */
    if (err == 0 && snapname != NULL) {
        dsl_dataset_t *ds = NULL;

        if (*snapname++ != '@') {
            dsl_dataset_rele(*dsp, tag);
            err = ENOENT;
            goto out;
        }

        dprintf("looking for snapshot '%s'\n", snapname);
        err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
        if (err == 0)
            err = dsl_dataset_get_ref(dp, obj, tag, &ds);
        dsl_dataset_rele(*dsp, tag);

        ASSERT3U((err == 0), ==, (ds != NULL));

        if (ds) {
            mutex_enter(&ds->ds_lock);
            if (ds->ds_snapname[0] == 0)
                (void) strlcpy(ds->ds_snapname, snapname,
                    sizeof (ds->ds_snapname));
            mutex_exit(&ds->ds_lock);
            err = dsl_dataset_hold_ref(ds, tag);
            *dsp = err ? NULL : ds;
        }
    }
out:
    rw_exit(&dp->dp_config_rwlock);
    dsl_dir_close(dd, FTAG);
    return (err);
}

int
dsl_dataset_own(const char *name, boolean_t inconsistentok,
    void *tag, dsl_dataset_t **dsp)
{
    int err = dsl_dataset_hold(name, tag, dsp);
    if (err)
        return (err);
    if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
        dsl_dataset_rele(*dsp, tag);
        return (EBUSY);
    }
    return (0);
}
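
/*
 * Write the dataset's full name ("pool/fs[@snap]", or "mos" for the
 * meta-objset) into 'name'; callers are assumed to pass a buffer of at
 * least MAXNAMELEN bytes, per the usual convention in this file.
 */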
void
dsl_dataset_name(dsl_dataset_t *ds, char *name)
{
    if (ds == NULL) {
        (void) strcpy(name, "mos");
    } else {
        dsl_dir_name(ds->ds_dir, name);
        VERIFY(0 == dsl_dataset_get_snapname(ds));
        if (ds->ds_snapname[0]) {
            (void) strcat(name, "@");
            /*
             * We use a "recursive" mutex so that we
             * can call dprintf_ds() with ds_lock held.
             */
            if (!MUTEX_HELD(&ds->ds_lock)) {
                mutex_enter(&ds->ds_lock);
                (void) strcat(name, ds->ds_snapname);
                mutex_exit(&ds->ds_lock);
            } else {
                (void) strcat(name, ds->ds_snapname);
            }
        }
    }
}

static int
dsl_dataset_namelen(dsl_dataset_t *ds)
{
    int result;

    if (ds == NULL) {
        result = 3; /* "mos" */
    } else {
        result = dsl_dir_namelen(ds->ds_dir);
        VERIFY(0 == dsl_dataset_get_snapname(ds));
        if (ds->ds_snapname[0]) {
            ++result;   /* adding one for the @-sign */
            if (!MUTEX_HELD(&ds->ds_lock)) {
                mutex_enter(&ds->ds_lock);
                result += strlen(ds->ds_snapname);
                mutex_exit(&ds->ds_lock);
            } else {
                result += strlen(ds->ds_snapname);
            }
        }
    }

    return (result);
}

void
dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
{
    dmu_buf_rele(ds->ds_dbuf, tag);
}

void
dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
{
    if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
        rw_exit(&ds->ds_rwlock);
    }
    dsl_dataset_drop_ref(ds, tag);
}

void
dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
{
    ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
        (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));

    mutex_enter(&ds->ds_lock);
    ds->ds_owner = NULL;
    if (RW_WRITE_HELD(&ds->ds_rwlock)) {
        rw_exit(&ds->ds_rwlock);
        cv_broadcast(&ds->ds_exclusive_cv);
    }
    mutex_exit(&ds->ds_lock);
    if (ds->ds_dbuf)
        dsl_dataset_drop_ref(ds, tag);
    else
        dsl_dataset_evict(NULL, ds);
}

boolean_t
dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
{
    boolean_t gotit = FALSE;

    mutex_enter(&ds->ds_lock);
    if (ds->ds_owner == NULL &&
        (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
        ds->ds_owner = tag;
        if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
            rw_exit(&ds->ds_rwlock);
        gotit = TRUE;
    }
    mutex_exit(&ds->ds_lock);
    return (gotit);
}

void
dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
{
    ASSERT3P(owner, ==, ds->ds_owner);
    if (!RW_WRITE_HELD(&ds->ds_rwlock))
        rw_enter(&ds->ds_rwlock, RW_WRITER);
}
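
/*
 * Allocate and initialize the on-disk dsl_dataset_phys_t for a new head
 * dataset in directory 'dd', cloning from 'origin' if it is non-NULL
 * (or from the pool's $ORIGIN snapshot, if one exists).  Returns the
 * new dataset's object number; must be called in syncing context.
 */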
uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
    uint64_t flags, dmu_tx_t *tx)
{
    dsl_pool_t *dp = dd->dd_pool;
    dmu_buf_t *dbuf;
    dsl_dataset_phys_t *dsphys;
    uint64_t dsobj;
    objset_t *mos = dp->dp_meta_objset;

    if (origin == NULL)
        origin = dp->dp_origin_snap;

    ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
    ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
    ASSERT(dmu_tx_is_syncing(tx));
    ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

    dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
        DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
    VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
    dmu_buf_will_dirty(dbuf, tx);
    dsphys = dbuf->db_data;
    bzero(dsphys, sizeof (dsl_dataset_phys_t));
    dsphys->ds_dir_obj = dd->dd_object;
    dsphys->ds_flags = flags;
    dsphys->ds_fsid_guid = unique_create();
    (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
        sizeof (dsphys->ds_guid));
    dsphys->ds_snapnames_zapobj =
        zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
        DMU_OT_NONE, 0, tx);
    dsphys->ds_creation_time = gethrestime_sec();
    dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;

    if (origin == NULL) {
        dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
    } else {
        dsl_dataset_t *ohds;

        dsphys->ds_prev_snap_obj = origin->ds_object;
        dsphys->ds_prev_snap_txg =
            origin->ds_phys->ds_creation_txg;
        dsphys->ds_referenced_bytes =
            origin->ds_phys->ds_referenced_bytes;
        dsphys->ds_compressed_bytes =
            origin->ds_phys->ds_compressed_bytes;
        dsphys->ds_uncompressed_bytes =
            origin->ds_phys->ds_uncompressed_bytes;
        dsphys->ds_bp = origin->ds_phys->ds_bp;
        dsphys->ds_flags |= origin->ds_phys->ds_flags;

        dmu_buf_will_dirty(origin->ds_dbuf, tx);
        origin->ds_phys->ds_num_children++;

        VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
            origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
        dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
            dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
        dsl_dataset_rele(ohds, FTAG);

        if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
            if (origin->ds_phys->ds_next_clones_obj == 0) {
                origin->ds_phys->ds_next_clones_obj =
                    zap_create(mos,
                    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
            }
            VERIFY(0 == zap_add_int(mos,
                origin->ds_phys->ds_next_clones_obj,
                dsobj, tx));
        }

        dmu_buf_will_dirty(dd->dd_dbuf, tx);
        dd->dd_phys->dd_origin_obj = origin->ds_object;
        if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
            if (origin->ds_dir->dd_phys->dd_clones == 0) {
                dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
                origin->ds_dir->dd_phys->dd_clones =
                    zap_create(mos,
                    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
            }
            VERIFY3U(0, ==, zap_add_int(mos,
                origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
        }
    }

    if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
        dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;

    dmu_buf_rele(dbuf, FTAG);

    dmu_buf_will_dirty(dd->dd_dbuf, tx);
    dd->dd_phys->dd_head_dataset_obj = dsobj;

    return (dsobj);
}

uint64_t
dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
{
    dsl_pool_t *dp = pdd->dd_pool;
    uint64_t dsobj, ddobj;
    dsl_dir_t *dd;

    ASSERT(lastname[0] != '@');

    ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
    VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));

    dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);

    dsl_deleg_set_create_perms(dd, tx, cr);

    dsl_dir_close(dd, FTAG);

    /*
     * If we are creating a clone, make sure we zero out any stale
     * data from the origin snapshot's zil header.
     */
    if (origin != NULL) {
        dsl_dataset_t *ds;
        objset_t *os;

        VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
        VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
        bzero(&os->os_zil_header, sizeof (os->os_zil_header));
        dsl_dataset_dirty(ds, tx);
        dsl_dataset_rele(ds, FTAG);
    }

    return (dsobj);
}
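
/*
 * Destroy (or, with 'defer' set, mark for deferred destruction) every
 * snapshot named in 'snaps' as a single sync task group; on failure the
 * name of the offending snapshot is copied into 'failed'.
 */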
/*
 * The snapshots must all be in the same pool.
 */
int
dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed)
{
    int err;
    dsl_sync_task_t *dst;
    spa_t *spa;
    nvpair_t *pair;
    dsl_sync_task_group_t *dstg;

    pair = nvlist_next_nvpair(snaps, NULL);
    if (pair == NULL)
        return (0);

    err = spa_open(nvpair_name(pair), &spa, FTAG);
    if (err)
        return (err);
    dstg = dsl_sync_task_group_create(spa_get_dsl(spa));

    for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
        pair = nvlist_next_nvpair(snaps, pair)) {
        dsl_dataset_t *ds;

        err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
        if (err == 0) {
            struct dsl_ds_destroyarg *dsda;

            dsl_dataset_make_exclusive(ds, dstg);
            dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
                KM_SLEEP);
            dsda->ds = ds;
            dsda->defer = defer;
            dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
                dsl_dataset_destroy_sync, dsda, dstg, 0);
        } else if (err == ENOENT) {
            err = 0;
        } else {
            (void) strcpy(failed, nvpair_name(pair));
            break;
        }
    }

    if (err == 0)
        err = dsl_sync_task_group_wait(dstg);

    for (dst = list_head(&dstg->dstg_tasks); dst;
        dst = list_next(&dstg->dstg_tasks, dst)) {
        struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
        dsl_dataset_t *ds = dsda->ds;

        /*
         * Return the file system name that triggered the error
         */
        if (dst->dst_err) {
            dsl_dataset_name(ds, failed);
        }
        ASSERT3P(dsda->rm_origin, ==, NULL);
        dsl_dataset_disown(ds, dstg);
        kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
    }

    dsl_sync_task_group_destroy(dstg);
    spa_close(spa, FTAG);
    return (err);
}

static boolean_t
dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
{
    boolean_t might_destroy = B_FALSE;

    mutex_enter(&ds->ds_lock);
    if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
        DS_IS_DEFER_DESTROY(ds))
        might_destroy = B_TRUE;
    mutex_exit(&ds->ds_lock);

    return (might_destroy);
}

/*
 * If we're removing a clone, and these three conditions are true:
 *  1) the clone's origin has no other children
 *  2) the clone's origin has no user references
 *  3) the clone's origin has been marked for deferred destruction
 * Then, prepare to remove the origin as part of this sync task group.
 */
static int
dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
{
    dsl_dataset_t *ds = dsda->ds;
    dsl_dataset_t *origin = ds->ds_prev;

    if (dsl_dataset_might_destroy_origin(origin)) {
        char *name;
        int namelen;
        int error;

        namelen = dsl_dataset_namelen(origin) + 1;
        name = kmem_alloc(namelen, KM_SLEEP);
        dsl_dataset_name(origin, name);
#ifdef _KERNEL
        error = zfs_unmount_snap(name, NULL);
        if (error) {
            kmem_free(name, namelen);
            return (error);
        }
#endif
        error = dsl_dataset_own(name, B_TRUE, tag, &origin);
        kmem_free(name, namelen);
        if (error)
            return (error);
        dsda->rm_origin = origin;
        dsl_dataset_make_exclusive(origin, tag);
    }

    return (0);
}

/*
 * ds must be opened as OWNER.  On return (whether successful or not),
 * ds will be closed and caller can no longer dereference it.
 */
int
dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
{
    int err;
    dsl_sync_task_group_t *dstg;
    objset_t *os;
    dsl_dir_t *dd;
    uint64_t obj;
    struct dsl_ds_destroyarg dsda = { 0 };
    dsl_dataset_t dummy_ds = { 0 };

    dsda.ds = ds;

    if (dsl_dataset_is_snapshot(ds)) {
        /* Destroying a snapshot is simpler */
        dsl_dataset_make_exclusive(ds, tag);

        dsda.defer = defer;
        err = dsl_sync_task_do(ds->ds_dir->dd_pool,
            dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
            &dsda, tag, 0);
        ASSERT3P(dsda.rm_origin, ==, NULL);
        goto out;
    } else if (defer) {
        err = EINVAL;
        goto out;
    }

    dd = ds->ds_dir;
    dummy_ds.ds_dir = dd;
    dummy_ds.ds_object = ds->ds_object;

    /*
     * Check for errors and mark this ds as inconsistent, in
     * case we crash while freeing the objects.
     */
    err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
        dsl_dataset_destroy_begin_sync, ds, NULL, 0);
    if (err)
        goto out;

    err = dmu_objset_from_ds(ds, &os);
    if (err)
        goto out;

    /*
     * If async destruction is not enabled try to remove all objects
     * while in the open context so that there is less work to do in
     * the syncing context.
     */
    if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
        &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
        for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
            ds->ds_phys->ds_prev_snap_txg)) {
            /*
             * Ignore errors, if there is not enough disk space
             * we will deal with it in dsl_dataset_destroy_sync().
             */
            (void) dmu_free_object(os, obj);
        }
        if (err != ESRCH)
            goto out;
    }

    /*
     * Only the ZIL knows how to free log blocks.
     */
    zil_destroy(dmu_objset_zil(os), B_FALSE);

    /*
     * Sync out all in-flight IO.
     */
    txg_wait_synced(dd->dd_pool, 0);

    /*
     * If we managed to free all the objects in open
     * context, the user space accounting should be zero.
     */
    if (ds->ds_phys->ds_bp.blk_fill == 0 &&
        dmu_objset_userused_enabled(os)) {
        uint64_t count;

        ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
            count == 0);
        ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 ||
            count == 0);
    }

    rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
    err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
    rw_exit(&dd->dd_pool->dp_config_rwlock);

    if (err)
        goto out;

    /*
     * Blow away the dsl_dir + head dataset.
     */
    dsl_dataset_make_exclusive(ds, tag);
    /*
     * If we're removing a clone, we might also need to remove its
     * origin.
     */
    do {
        dsda.need_prep = B_FALSE;
        if (dsl_dir_is_clone(dd)) {
            err = dsl_dataset_origin_rm_prep(&dsda, tag);
            if (err) {
                dsl_dir_close(dd, FTAG);
                goto out;
            }
        }

        dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
        dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
            dsl_dataset_destroy_sync, &dsda, tag, 0);
        dsl_sync_task_create(dstg, dsl_dir_destroy_check,
            dsl_dir_destroy_sync, &dummy_ds, FTAG, 0);
        err = dsl_sync_task_group_wait(dstg);
        dsl_sync_task_group_destroy(dstg);

        /*
         * We could be racing against 'zfs release' or 'zfs destroy -d'
         * on the origin snap, in which case we can get EBUSY if we
         * needed to destroy the origin snap but were not ready to
         * do so.
         */
        if (dsda.need_prep) {
            ASSERT(err == EBUSY);
            ASSERT(dsl_dir_is_clone(dd));
            ASSERT(dsda.rm_origin == NULL);
        }
    } while (dsda.need_prep);

    if (dsda.rm_origin != NULL)
        dsl_dataset_disown(dsda.rm_origin, tag);

    /* if it is successful, dsl_dir_destroy_sync will close the dd */
    if (err)
        dsl_dir_close(dd, FTAG);
out:
    dsl_dataset_disown(ds, tag);
    return (err);
}

blkptr_t *
dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
    return (&ds->ds_phys->ds_bp);
}

void
dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
{
    ASSERT(dmu_tx_is_syncing(tx));
    /* If it's the meta-objset, set dp_meta_rootbp */
    if (ds == NULL) {
        tx->tx_pool->dp_meta_rootbp = *bp;
    } else {
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
        ds->ds_phys->ds_bp = *bp;
    }
}

spa_t *
dsl_dataset_get_spa(dsl_dataset_t *ds)
{
    return (ds->ds_dir->dd_pool->dp_spa);
}

void
dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
{
    dsl_pool_t *dp;

    if (ds == NULL) /* this is the meta-objset */
        return;

    ASSERT(ds->ds_objset != NULL);

    if (ds->ds_phys->ds_next_snap_obj != 0)
        panic("dirtying snapshot!");

    dp = ds->ds_dir->dd_pool;

    if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
        /* up the hold count until we can be written out */
        dmu_buf_add_ref(ds->ds_dbuf, ds);
    }
}

/*
 * The unique space in the head dataset can be calculated by subtracting
 * the space used in the most recent snapshot, that is still being used
 * in this file system, from the space currently in use.  To figure out
 * the space in the most recent snapshot still in use, we need to take
 * the total space used in the snapshot and subtract out the space that
 * has been freed up since the snapshot was taken.
 */
static void
dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
{
    uint64_t mrs_used;
    uint64_t dlused, dlcomp, dluncomp;

    ASSERT(!dsl_dataset_is_snapshot(ds));

    if (ds->ds_phys->ds_prev_snap_obj != 0)
        mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
    else
        mrs_used = 0;

    dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);

    ASSERT3U(dlused, <=, mrs_used);
    ds->ds_phys->ds_unique_bytes =
        ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);

    if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
        SPA_VERSION_UNIQUE_ACCURATE)
        ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}
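
/*
 * Callback state for kill_blkptr(), the traverse_dataset() callback
 * used by old_synchronous_dataset_destroy() below to free every block
 * the dataset points to (i.e., every block born after the previous
 * snapshot).
 */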
struct killarg {
    dsl_dataset_t *ds;
    dmu_tx_t *tx;
};

/* ARGSUSED */
static int
kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
    struct killarg *ka = arg;
    dmu_tx_t *tx = ka->tx;

    if (bp == NULL)
        return (0);

    if (zb->zb_level == ZB_ZIL_LEVEL) {
        ASSERT(zilog != NULL);
        /*
         * It's a block in the intent log.  It has no
         * accounting, so just free it.
         */
        dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
    } else {
        ASSERT(zilog == NULL);
        ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
        (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
    }

    return (0);
}

/* ARGSUSED */
static int
dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
    dsl_dataset_t *ds = arg1;
    objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
    uint64_t count;
    int err;

    /*
     * Can't delete a head dataset if there are snapshots of it.
     * (Except if the only snapshots are from the branch we cloned
     * from.)
     */
    if (ds->ds_prev != NULL &&
        ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
        return (EBUSY);

    /*
     * This is really a dsl_dir thing, but check it here so that
     * we'll be less likely to leave this dataset inconsistent &
     * nearly destroyed.
     */
    err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
    if (err)
        return (err);
    if (count != 0)
        return (EEXIST);

    return (0);
}

/* ARGSUSED */
static void
dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
    dsl_dataset_t *ds = arg1;
    dsl_pool_t *dp = ds->ds_dir->dd_pool;

    /* Mark it as inconsistent on-disk, in case we crash */
    dmu_buf_will_dirty(ds->ds_dbuf, tx);
    ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

    spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
        "dataset = %llu", ds->ds_object);
}

static int
dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
    dmu_tx_t *tx)
{
    dsl_dataset_t *ds = dsda->ds;
    dsl_dataset_t *ds_prev = ds->ds_prev;

    if (dsl_dataset_might_destroy_origin(ds_prev)) {
        struct dsl_ds_destroyarg ndsda = {0};

        /*
         * If we're not prepared to remove the origin, don't remove
         * the clone either.
         */
        if (dsda->rm_origin == NULL) {
            dsda->need_prep = B_TRUE;
            return (EBUSY);
        }

        ndsda.ds = ds_prev;
        ndsda.is_origin_rm = B_TRUE;
        return (dsl_dataset_destroy_check(&ndsda, tag, tx));
    }

    /*
     * If we're not going to remove the origin after all,
     * undo the open context setup.
     */
    if (dsda->rm_origin != NULL) {
        dsl_dataset_disown(dsda->rm_origin, tag);
        dsda->rm_origin = NULL;
    }

    return (0);
}

/*
 * If you add new checks here, you may need to add
 * additional checks to the "temporary" case in
 * snapshot_check() in dmu_objset.c.
 */
/* ARGSUSED */
int
dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
    struct dsl_ds_destroyarg *dsda = arg1;
    dsl_dataset_t *ds = dsda->ds;

    /* we have an owner hold, so no one else can destroy us */
    ASSERT(!DSL_DATASET_IS_DESTROYED(ds));

    /*
     * Only allow deferred destroy on pools that support it.
     * NOTE: deferred destroy is only supported on snapshots.
     */
    if (dsda->defer) {
        if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
            SPA_VERSION_USERREFS)
            return (ENOTSUP);
        ASSERT(dsl_dataset_is_snapshot(ds));
        return (0);
    }

    /*
     * Can't delete a head dataset if there are snapshots of it.
     * (Except if the only snapshots are from the branch we cloned
     * from.)
     */
    if (ds->ds_prev != NULL &&
        ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
        return (EBUSY);

    /*
     * If we made changes this txg, traverse_dsl_dataset won't find
     * them.  Try again.
     */
    if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
        return (EAGAIN);

    if (dsl_dataset_is_snapshot(ds)) {
        /*
         * If this snapshot has an elevated user reference count,
         * we can't destroy it yet.
         */
        if (ds->ds_userrefs > 0 && !dsda->releasing)
            return (EBUSY);

        mutex_enter(&ds->ds_lock);
        /*
         * Can't delete a branch point.  However, if we're destroying
         * a clone and removing its origin due to it having a user
         * hold count of 0 and having been marked for deferred destroy,
         * it's OK for the origin to have a single clone.
         */
        if (ds->ds_phys->ds_num_children >
            (dsda->is_origin_rm ? 2 : 1)) {
            mutex_exit(&ds->ds_lock);
            return (EEXIST);
        }
        mutex_exit(&ds->ds_lock);
    } else if (dsl_dir_is_clone(ds->ds_dir)) {
        return (dsl_dataset_origin_check(dsda, arg2, tx));
    }

    /* XXX we should do some i/o error checking... */
    return (0);
}
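
/*
 * dsl_dataset_drain_refs() swaps in dsl_dataset_refs_gone() as the
 * dbuf eviction callback, drops the caller's hold, and then sleeps
 * until the last reference to the dataset's bonus buffer is gone.
 */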
struct refsarg {
    kmutex_t lock;
    boolean_t gone;
    kcondvar_t cv;
};

/* ARGSUSED */
static void
dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
{
    struct refsarg *arg = argv;

    mutex_enter(&arg->lock);
    arg->gone = TRUE;
    cv_signal(&arg->cv);
    mutex_exit(&arg->lock);
}

static void
dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
{
    struct refsarg arg;

    mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
    cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
    arg.gone = FALSE;
    (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
        dsl_dataset_refs_gone);
    dmu_buf_rele(ds->ds_dbuf, tag);
    mutex_enter(&arg.lock);
    while (!arg.gone)
        cv_wait(&arg.cv, &arg.lock);
    ASSERT(arg.gone);
    mutex_exit(&arg.lock);
    ds->ds_dbuf = NULL;
    ds->ds_phys = NULL;
    mutex_destroy(&arg.lock);
    cv_destroy(&arg.cv);
}

static void
remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
{
    objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
    uint64_t count;
    int err;

    ASSERT(ds->ds_phys->ds_num_children >= 2);
    err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
    /*
     * The err should not be ENOENT, but a bug in a previous version
     * of the code could cause upgrade_clones_cb() to not set
     * ds_next_snap_obj when it should, leading to a missing entry.
     * If we knew that the pool was created after
     * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
     * ENOENT.  However, at least we can check that we don't have
     * too many entries in the next_clones_obj even after failing to
     * remove this one.
     */
    if (err != ENOENT) {
        VERIFY3U(err, ==, 0);
    }
    ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
        &count));
    ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
}
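
/*
 * Walk this dataset's dd_clones and, for each clone that branched
 * after 'mintxg', remove that key from the clone's deadlist and then
 * recurse into the clone's own clones.
 */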
static void
dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
{
    objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
    zap_cursor_t zc;
    zap_attribute_t za;

    /*
     * If it is the old version, dd_clones doesn't exist so we can't
     * find the clones, but deadlist_remove_key() is a no-op so it
     * doesn't matter.
     */
    if (ds->ds_dir->dd_phys->dd_clones == 0)
        return;

    for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
        zap_cursor_retrieve(&zc, &za) == 0;
        zap_cursor_advance(&zc)) {
        dsl_dataset_t *clone;

        VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
            za.za_first_integer, FTAG, &clone));
        if (clone->ds_dir->dd_origin_txg > mintxg) {
            dsl_deadlist_remove_key(&clone->ds_deadlist,
                mintxg, tx);
            dsl_dataset_remove_clones_key(clone, mintxg, tx);
        }
        dsl_dataset_rele(clone, FTAG);
    }
    zap_cursor_fini(&zc);
}

struct process_old_arg {
    dsl_dataset_t *ds;
    dsl_dataset_t *ds_prev;
    boolean_t after_branch_point;
    zio_t *pio;
    uint64_t used, comp, uncomp;
};
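
/*
 * Old-format deadlist handling for snapshot destroy: iterate the next
 * snapshot's legacy (dl_oldfmt) deadlist, re-inserting blocks that
 * predate the previous snapshot onto this dataset's deadlist and
 * freeing the rest, then swap the two deadlist objects.
 */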
static int
process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
    struct process_old_arg *poa = arg;
    dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;

    if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
        dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
        if (poa->ds_prev && !poa->after_branch_point &&
            bp->blk_birth >
            poa->ds_prev->ds_phys->ds_prev_snap_txg) {
            poa->ds_prev->ds_phys->ds_unique_bytes +=
                bp_get_dsize_sync(dp->dp_spa, bp);
        }
    } else {
        poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
        poa->comp += BP_GET_PSIZE(bp);
        poa->uncomp += BP_GET_UCSIZE(bp);
        dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
    }
    return (0);
}

static void
process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
{
    struct process_old_arg poa = { 0 };
    dsl_pool_t *dp = ds->ds_dir->dd_pool;
    objset_t *mos = dp->dp_meta_objset;

    ASSERT(ds->ds_deadlist.dl_oldfmt);
    ASSERT(ds_next->ds_deadlist.dl_oldfmt);

    poa.ds = ds;
    poa.ds_prev = ds_prev;
    poa.after_branch_point = after_branch_point;
    poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
    VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
        process_old_cb, &poa, tx));
    VERIFY3U(zio_wait(poa.pio), ==, 0);
    ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);

    /* change snapused */
    dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
        -poa.used, -poa.comp, -poa.uncomp, tx);

    /* swap next's deadlist to our deadlist */
    dsl_deadlist_close(&ds->ds_deadlist);
    dsl_deadlist_close(&ds_next->ds_deadlist);
    SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
        ds->ds_phys->ds_deadlist_obj);
    dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
    dsl_deadlist_open(&ds_next->ds_deadlist, mos,
        ds_next->ds_phys->ds_deadlist_obj);
}

static int
old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
{
    int err;
    struct killarg ka;

    /*
     * Free everything that we point to (that's born after
     * the previous snapshot, if we are a clone)
     *
     * NB: this should be very quick, because we already
     * freed all the objects in open context.
     */
    ka.ds = ds;
    ka.tx = tx;
    err = traverse_dataset(ds,
        ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
        kill_blkptr, &ka);
    ASSERT3U(err, ==, 0);
    ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);

    return (err);
}
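
/*
 * Sync task that actually destroys 'ds' (or, with dsda->defer set on a
 * snapshot that still has user holds or clones, merely marks it
 * DS_FLAG_DEFER_DESTROY).  For a snapshot this folds its deadlist into
 * the next snapshot's and fixes up the unique/used accounting; for a
 * head dataset it frees everything the dataset points to (either
 * synchronously or via the async-destroy bptree) and unlinks it from
 * its dsl_dir.
 */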
void
dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
    struct dsl_ds_destroyarg *dsda = arg1;
    dsl_dataset_t *ds = dsda->ds;
    int err;
    int after_branch_point = FALSE;
    dsl_pool_t *dp = ds->ds_dir->dd_pool;
    objset_t *mos = dp->dp_meta_objset;
    dsl_dataset_t *ds_prev = NULL;
    boolean_t wont_destroy;
    uint64_t obj;

    wont_destroy = (dsda->defer &&
        (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));

    ASSERT(ds->ds_owner || wont_destroy);
    ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
    ASSERT(ds->ds_prev == NULL ||
        ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
    ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);

    if (wont_destroy) {
        ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
        dmu_buf_will_dirty(ds->ds_dbuf, tx);
        ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
        return;
    }

    /* signal any waiters that this dataset is going away */
    mutex_enter(&ds->ds_lock);
    ds->ds_owner = dsl_reaper;
    cv_broadcast(&ds->ds_exclusive_cv);
    mutex_exit(&ds->ds_lock);

    /* Remove our reservation */
    if (ds->ds_reserved != 0) {
        dsl_prop_setarg_t psa;
        uint64_t value = 0;

        dsl_prop_setarg_init_uint64(&psa, "refreservation",
            (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
            &value);
        psa.psa_effective_value = 0;    /* predict default value */

        dsl_dataset_set_reservation_sync(ds, &psa, tx);
        ASSERT3U(ds->ds_reserved, ==, 0);
    }

    ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));

    dsl_scan_ds_destroyed(ds, tx);

    obj = ds->ds_object;

    if (ds->ds_phys->ds_prev_snap_obj != 0) {
        if (ds->ds_prev) {
            ds_prev = ds->ds_prev;
        } else {
            VERIFY(0 == dsl_dataset_hold_obj(dp,
                ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
        }
        after_branch_point =
            (ds_prev->ds_phys->ds_next_snap_obj != obj);

        dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
        if (after_branch_point &&
            ds_prev->ds_phys->ds_next_clones_obj != 0) {
            remove_from_next_clones(ds_prev, obj, tx);
            if (ds->ds_phys->ds_next_snap_obj != 0) {
                VERIFY(0 == zap_add_int(mos,
                    ds_prev->ds_phys->ds_next_clones_obj,
                    ds->ds_phys->ds_next_snap_obj, tx));
            }
        }
        if (after_branch_point &&
            ds->ds_phys->ds_next_snap_obj == 0) {
            /* This clone is toast. */
            ASSERT(ds_prev->ds_phys->ds_num_children > 1);
            ds_prev->ds_phys->ds_num_children--;

            /*
             * If the clone's origin has no other clones, no
             * user holds, and has been marked for deferred
             * deletion, then we should have done the necessary
             * destroy setup for it.
             */
            if (ds_prev->ds_phys->ds_num_children == 1 &&
                ds_prev->ds_userrefs == 0 &&
                DS_IS_DEFER_DESTROY(ds_prev)) {
                ASSERT3P(dsda->rm_origin, !=, NULL);
            } else {
                ASSERT3P(dsda->rm_origin, ==, NULL);
            }
        } else if (!after_branch_point) {
            ds_prev->ds_phys->ds_next_snap_obj =
                ds->ds_phys->ds_next_snap_obj;
        }
    }

    if (dsl_dataset_is_snapshot(ds)) {
        dsl_dataset_t *ds_next;
        uint64_t old_unique;
        uint64_t used = 0, comp = 0, uncomp = 0;

        VERIFY(0 == dsl_dataset_hold_obj(dp,
            ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
        ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);

        old_unique = ds_next->ds_phys->ds_unique_bytes;

        dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
        ds_next->ds_phys->ds_prev_snap_obj =
            ds->ds_phys->ds_prev_snap_obj;
        ds_next->ds_phys->ds_prev_snap_txg =
            ds->ds_phys->ds_prev_snap_txg;
        ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
            ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);

        if (ds_next->ds_deadlist.dl_oldfmt) {
            process_old_deadlist(ds, ds_prev, ds_next,
                after_branch_point, tx);
        } else {
            /* Adjust prev's unique space. */
            if (ds_prev && !after_branch_point) {
                dsl_deadlist_space_range(&ds_next->ds_deadlist,
                    ds_prev->ds_phys->ds_prev_snap_txg,
                    ds->ds_phys->ds_prev_snap_txg,
                    &used, &comp, &uncomp);
                ds_prev->ds_phys->ds_unique_bytes += used;
            }

            /* Adjust snapused. */
            dsl_deadlist_space_range(&ds_next->ds_deadlist,
                ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
                &used, &comp, &uncomp);
            dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
                -used, -comp, -uncomp, tx);

            /* Move blocks to be freed to pool's free list. */
            dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
                &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
                tx);
            dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
                DD_USED_HEAD, used, comp, uncomp, tx);

            /* Merge our deadlist into next's and free it. */
            dsl_deadlist_merge(&ds_next->ds_deadlist,
                ds->ds_phys->ds_deadlist_obj, tx);
        }
        dsl_deadlist_close(&ds->ds_deadlist);
        dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);

        /* Collapse range in clone heads */
        dsl_dataset_remove_clones_key(ds,
            ds->ds_phys->ds_creation_txg, tx);

        if (dsl_dataset_is_snapshot(ds_next)) {
            dsl_dataset_t *ds_nextnext;

            /*
             * Update next's unique to include blocks which
             * were previously shared by only this snapshot
             * and it.  Those blocks will be born after the
             * prev snap and before this snap, and will have
             * died after the next snap and before the one
             * after that (ie. be on the snap after next's
             * deadlist).
             */
            VERIFY(0 == dsl_dataset_hold_obj(dp,
                ds_next->ds_phys->ds_next_snap_obj,
                FTAG, &ds_nextnext));
            dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
                ds->ds_phys->ds_prev_snap_txg,
                ds->ds_phys->ds_creation_txg,
                &used, &comp, &uncomp);
            ds_next->ds_phys->ds_unique_bytes += used;
            dsl_dataset_rele(ds_nextnext, FTAG);
            ASSERT3P(ds_next->ds_prev, ==, NULL);

            /* Collapse range in this head. */
            dsl_dataset_t *hds;
            VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
                ds->ds_dir->dd_phys->dd_head_dataset_obj,
                FTAG, &hds));
            dsl_deadlist_remove_key(&hds->ds_deadlist,
                ds->ds_phys->ds_creation_txg, tx);
            dsl_dataset_rele(hds, FTAG);

        } else {
            ASSERT3P(ds_next->ds_prev, ==, ds);
            dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
            ds_next->ds_prev = NULL;
            if (ds_prev) {
                VERIFY(0 == dsl_dataset_get_ref(dp,
                    ds->ds_phys->ds_prev_snap_obj,
                    ds_next, &ds_next->ds_prev));
            }

            dsl_dataset_recalc_head_uniq(ds_next);

            /*
             * Reduce the amount of our unconsumed refreservation
             * being charged to our parent by the amount of
             * new unique data we have gained.
             */
            if (old_unique < ds_next->ds_reserved) {
                int64_t mrsdelta;
                uint64_t new_unique =
                    ds_next->ds_phys->ds_unique_bytes;

                ASSERT(old_unique <= new_unique);
                mrsdelta = MIN(new_unique - old_unique,
                    ds_next->ds_reserved - old_unique);
                dsl_dir_diduse_space(ds->ds_dir,
                    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
            }
        }
        dsl_dataset_rele(ds_next, FTAG);
    } else {
        zfeature_info_t *async_destroy =
            &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];

        /*
         * There's no next snapshot, so this is a head dataset.
         * Destroy the deadlist.  Unless it's a clone, the
         * deadlist should be empty.  (If it's a clone, it's
         * safe to ignore the deadlist contents.)
         */
        dsl_deadlist_close(&ds->ds_deadlist);
        dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
        ds->ds_phys->ds_deadlist_obj = 0;

        if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
            err = old_synchronous_dataset_destroy(ds, tx);
        } else {
            /*
             * Move the bptree into the pool's list of trees to
             * clean up and update space accounting information.
             */
            uint64_t used, comp, uncomp;

            ASSERT(err == 0 || err == EBUSY);
            if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
                spa_feature_incr(dp->dp_spa, async_destroy, tx);
                dp->dp_bptree_obj = bptree_alloc(
                    dp->dp_meta_objset, tx);
                VERIFY(zap_add(dp->dp_meta_objset,
                    DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
                    &dp->dp_bptree_obj, tx) == 0);
            }

            used = ds->ds_dir->dd_phys->dd_used_bytes;
            comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
            uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;

            ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
                ds->ds_phys->ds_unique_bytes == used);

            bptree_add(dp->dp_meta_objset, dp->dp_bptree_obj,
                &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
                used, comp, uncomp, tx);
            dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
                -used, -comp, -uncomp, tx);
            dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
                used, comp, uncomp, tx);
        }

        if (ds->ds_prev != NULL) {
            if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
                VERIFY3U(0, ==, zap_remove_int(mos,
                    ds->ds_prev->ds_dir->dd_phys->dd_clones,
                    ds->ds_object, tx));
            }
            dsl_dataset_rele(ds->ds_prev, ds);
            ds->ds_prev = ds_prev = NULL;
        }
    }

    /*
     * This must be done after the dsl_traverse(), because it will
     * re-open the objset.
     */
1919 */ 1920 if (ds->ds_objset) { 1921 dmu_objset_evict(ds->ds_objset); 1922 ds->ds_objset = NULL; 1923 } 1924 1925 if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { 1926 /* Erase the link in the dir */ 1927 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); 1928 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; 1929 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); 1930 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); 1931 ASSERT(err == 0); 1932 } else { 1933 /* remove from snapshot namespace */ 1934 dsl_dataset_t *ds_head; 1935 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); 1936 VERIFY(0 == dsl_dataset_hold_obj(dp, 1937 ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); 1938 VERIFY(0 == dsl_dataset_get_snapname(ds)); 1939 #ifdef ZFS_DEBUG 1940 { 1941 uint64_t val; 1942 1943 err = dsl_dataset_snap_lookup(ds_head, 1944 ds->ds_snapname, &val); 1945 ASSERT3U(err, ==, 0); 1946 ASSERT3U(val, ==, obj); 1947 } 1948 #endif 1949 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); 1950 ASSERT(err == 0); 1951 dsl_dataset_rele(ds_head, FTAG); 1952 } 1953 1954 if (ds_prev && ds->ds_prev != ds_prev) 1955 dsl_dataset_rele(ds_prev, FTAG); 1956 1957 spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); 1958 spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx, 1959 "dataset = %llu", ds->ds_object); 1960 1961 if (ds->ds_phys->ds_next_clones_obj != 0) { 1962 uint64_t count; 1963 ASSERT(0 == zap_count(mos, 1964 ds->ds_phys->ds_next_clones_obj, &count) && count == 0); 1965 VERIFY(0 == dmu_object_free(mos, 1966 ds->ds_phys->ds_next_clones_obj, tx)); 1967 } 1968 if (ds->ds_phys->ds_props_obj != 0) 1969 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); 1970 if (ds->ds_phys->ds_userrefs_obj != 0) 1971 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); 1972 dsl_dir_close(ds->ds_dir, ds); 1973 ds->ds_dir = NULL; 1974 dsl_dataset_drain_refs(ds, tag); 1975 VERIFY(0 == dmu_object_free(mos, obj, tx)); 1976 1977 if (dsda->rm_origin) { 1978 /* 1979 * Remove the origin of the clone we just destroyed. 1980 */ 1981 struct dsl_ds_destroyarg ndsda = {0}; 1982 1983 ndsda.ds = dsda->rm_origin; 1984 dsl_dataset_destroy_sync(&ndsda, tag, tx); 1985 } 1986 } 1987 1988 static int 1989 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) 1990 { 1991 uint64_t asize; 1992 1993 if (!dmu_tx_is_syncing(tx)) 1994 return (0); 1995 1996 /* 1997 * If there's an fs-only reservation, any blocks that might become 1998 * owned by the snapshot dataset must be accommodated by space 1999 * outside of the reservation. 2000 */ 2001 ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); 2002 asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); 2003 if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) 2004 return (ENOSPC); 2005 2006 /* 2007 * Propagate any reserved space for this snapshot to other 2008 * snapshot checks in this sync group. 2009 */ 2010 if (asize > 0) 2011 dsl_dir_willuse_space(ds->ds_dir, asize, tx); 2012 2013 return (0); 2014 } 2015 2016 int 2017 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) 2018 { 2019 dsl_dataset_t *ds = arg1; 2020 const char *snapname = arg2; 2021 int err; 2022 uint64_t value; 2023 2024 /* 2025 * We don't allow multiple snapshots of the same txg. If there 2026 * is already one, try again. 2027 */ 2028 if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) 2029 return (EAGAIN); 2030 2031 /* 2032 * Check for conflicting snapshot name. 
2033 */ 2034 err = dsl_dataset_snap_lookup(ds, snapname, &value); 2035 if (err == 0) 2036 return (EEXIST); 2037 if (err != ENOENT) 2038 return (err); 2039 2040 /* 2041 * Check that the dataset's name is not too long. Name consists 2042 * of the dataset's length + 1 for the @-sign + snapshot name's length 2043 */ 2044 if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) 2045 return (ENAMETOOLONG); 2046 2047 err = dsl_dataset_snapshot_reserve_space(ds, tx); 2048 if (err) 2049 return (err); 2050 2051 ds->ds_trysnap_txg = tx->tx_txg; 2052 return (0); 2053 } 2054 2055 void 2056 dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2057 { 2058 dsl_dataset_t *ds = arg1; 2059 const char *snapname = arg2; 2060 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2061 dmu_buf_t *dbuf; 2062 dsl_dataset_phys_t *dsphys; 2063 uint64_t dsobj, crtxg; 2064 objset_t *mos = dp->dp_meta_objset; 2065 int err; 2066 2067 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); 2068 2069 /* 2070 * The origin's ds_creation_txg has to be < TXG_INITIAL 2071 */ 2072 if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) 2073 crtxg = 1; 2074 else 2075 crtxg = tx->tx_txg; 2076 2077 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, 2078 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); 2079 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); 2080 dmu_buf_will_dirty(dbuf, tx); 2081 dsphys = dbuf->db_data; 2082 bzero(dsphys, sizeof (dsl_dataset_phys_t)); 2083 dsphys->ds_dir_obj = ds->ds_dir->dd_object; 2084 dsphys->ds_fsid_guid = unique_create(); 2085 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, 2086 sizeof (dsphys->ds_guid)); 2087 dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; 2088 dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; 2089 dsphys->ds_next_snap_obj = ds->ds_object; 2090 dsphys->ds_num_children = 1; 2091 dsphys->ds_creation_time = gethrestime_sec(); 2092 dsphys->ds_creation_txg = crtxg; 2093 dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; 2094 dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes; 2095 dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; 2096 dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; 2097 dsphys->ds_flags = ds->ds_phys->ds_flags; 2098 dsphys->ds_bp = ds->ds_phys->ds_bp; 2099 dmu_buf_rele(dbuf, FTAG); 2100 2101 ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); 2102 if (ds->ds_prev) { 2103 uint64_t next_clones_obj = 2104 ds->ds_prev->ds_phys->ds_next_clones_obj; 2105 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == 2106 ds->ds_object || 2107 ds->ds_prev->ds_phys->ds_num_children > 1); 2108 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { 2109 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 2110 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 2111 ds->ds_prev->ds_phys->ds_creation_txg); 2112 ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; 2113 } else if (next_clones_obj != 0) { 2114 remove_from_next_clones(ds->ds_prev, 2115 dsphys->ds_next_snap_obj, tx); 2116 VERIFY3U(0, ==, zap_add_int(mos, 2117 next_clones_obj, dsobj, tx)); 2118 } 2119 } 2120 2121 /* 2122 * If we have a reference-reservation on this dataset, we will 2123 * need to increase the amount of refreservation being charged 2124 * since our unique space is going to zero. 
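 *
 * An illustrative example, with hypothetical figures: given a 10G
 * refreservation and 3G of unique data, delta = MIN(3G, 10G) = 3G
 * is newly charged to DD_USED_REFRSRV below, because those 3G stop
 * being unique (and so stop consuming the reservation) once the
 * snapshot exists.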
2125 */ 2126 if (ds->ds_reserved) { 2127 int64_t delta; 2128 ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); 2129 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); 2130 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, 2131 delta, 0, 0, tx); 2132 } 2133 2134 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2135 zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu", 2136 ds->ds_dir->dd_myname, snapname, dsobj, 2137 ds->ds_phys->ds_prev_snap_txg); 2138 ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, 2139 UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx); 2140 dsl_deadlist_close(&ds->ds_deadlist); 2141 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); 2142 dsl_deadlist_add_key(&ds->ds_deadlist, 2143 ds->ds_phys->ds_prev_snap_txg, tx); 2144 2145 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); 2146 ds->ds_phys->ds_prev_snap_obj = dsobj; 2147 ds->ds_phys->ds_prev_snap_txg = crtxg; 2148 ds->ds_phys->ds_unique_bytes = 0; 2149 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) 2150 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 2151 2152 err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, 2153 snapname, 8, 1, &dsobj, tx); 2154 ASSERT(err == 0); 2155 2156 if (ds->ds_prev) 2157 dsl_dataset_drop_ref(ds->ds_prev, ds); 2158 VERIFY(0 == dsl_dataset_get_ref(dp, 2159 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); 2160 2161 dsl_scan_ds_snapshotted(ds, tx); 2162 2163 dsl_dir_snap_cmtime_update(ds->ds_dir); 2164 2165 spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx, 2166 "dataset = %llu", dsobj); 2167 } 2168 2169 void 2170 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) 2171 { 2172 ASSERT(dmu_tx_is_syncing(tx)); 2173 ASSERT(ds->ds_objset != NULL); 2174 ASSERT(ds->ds_phys->ds_next_snap_obj == 0); 2175 2176 /* 2177 * in case we had to change ds_fsid_guid when we opened it, 2178 * sync it out now. 2179 */ 2180 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2181 ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; 2182 2183 dsl_dir_dirty(ds->ds_dir, tx); 2184 dmu_objset_sync(ds->ds_objset, zio, tx); 2185 } 2186 2187 static void 2188 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) 2189 { 2190 uint64_t count = 0; 2191 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 2192 zap_cursor_t zc; 2193 zap_attribute_t za; 2194 nvlist_t *propval; 2195 nvlist_t *val; 2196 2197 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); 2198 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2199 VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2200 2201 /* 2202 * There may be missing entries in ds_next_clones_obj 2203 * due to a bug in a previous version of the code. 2204 * Only trust it if it has the right number of entries. 2205 */ 2206 if (ds->ds_phys->ds_next_clones_obj != 0) { 2207 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, 2208 &count)); 2209 } 2210 if (count != ds->ds_phys->ds_num_children - 1) { 2211 goto fail; 2212 } 2213 for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); 2214 zap_cursor_retrieve(&zc, &za) == 0; 2215 zap_cursor_advance(&zc)) { 2216 dsl_dataset_t *clone; 2217 char buf[ZFS_MAXNAMELEN]; 2218 /* 2219 * Even though we hold the dp_config_rwlock, the dataset 2220 * may fail to open, returning ENOENT. If there is a 2221 * thread concurrently attempting to destroy this 2222 * dataset, it will have the ds_rwlock held for 2223 * RW_WRITER. 
Our call to dsl_dataset_hold_obj() -> 2224 * dsl_dataset_hold_ref() will fail its 2225 * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the 2226 * dp_config_rwlock, and wait for the destroy to progress 2227 * and signal ds_exclusive_cv. If the destroy was 2228 * successful, we will see that 2229 * DSL_DATASET_IS_DESTROYED(), and return ENOENT. 2230 */ 2231 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, 2232 za.za_first_integer, FTAG, &clone) != 0) 2233 continue; 2234 dsl_dir_name(clone->ds_dir, buf); 2235 VERIFY(nvlist_add_boolean(val, buf) == 0); 2236 dsl_dataset_rele(clone, FTAG); 2237 } 2238 zap_cursor_fini(&zc); 2239 VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); 2240 VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), 2241 propval) == 0); 2242 fail: 2243 nvlist_free(val); 2244 nvlist_free(propval); 2245 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); 2246 } 2247 2248 void 2249 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) 2250 { 2251 uint64_t refd, avail, uobjs, aobjs, ratio; 2252 2253 dsl_dir_stats(ds->ds_dir, nv); 2254 2255 dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); 2256 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); 2257 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); 2258 2259 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, 2260 ds->ds_phys->ds_creation_time); 2261 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, 2262 ds->ds_phys->ds_creation_txg); 2263 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, 2264 ds->ds_quota); 2265 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, 2266 ds->ds_reserved); 2267 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, 2268 ds->ds_phys->ds_guid); 2269 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, 2270 ds->ds_phys->ds_unique_bytes); 2271 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, 2272 ds->ds_object); 2273 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, 2274 ds->ds_userrefs); 2275 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, 2276 DS_IS_DEFER_DESTROY(ds) ? 1 : 0); 2277 2278 if (ds->ds_phys->ds_prev_snap_obj != 0) { 2279 uint64_t written, comp, uncomp; 2280 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2281 dsl_dataset_t *prev; 2282 2283 rw_enter(&dp->dp_config_rwlock, RW_READER); 2284 int err = dsl_dataset_hold_obj(dp, 2285 ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); 2286 rw_exit(&dp->dp_config_rwlock); 2287 if (err == 0) { 2288 err = dsl_dataset_space_written(prev, ds, &written, 2289 &comp, &uncomp); 2290 dsl_dataset_rele(prev, FTAG); 2291 if (err == 0) { 2292 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, 2293 written); 2294 } 2295 } 2296 } 2297 2298 ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : 2299 (ds->ds_phys->ds_uncompressed_bytes * 100 / 2300 ds->ds_phys->ds_compressed_bytes); 2301 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); 2302 2303 if (ds->ds_phys->ds_next_snap_obj) { 2304 /* 2305 * This is a snapshot; override the dd's space used with 2306 * our unique space and compression ratio. 
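 * (For a snapshot, "used" is ds_unique_bytes -- the space that
 * destroying it would free -- rather than the dsl_dir accounting
 * reported for a head dataset.)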
2307 */ 2308 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, 2309 ds->ds_phys->ds_unique_bytes); 2310 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); 2311 2312 get_clones_stat(ds, nv); 2313 } 2314 } 2315 2316 void 2317 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) 2318 { 2319 stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; 2320 stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; 2321 stat->dds_guid = ds->ds_phys->ds_guid; 2322 if (ds->ds_phys->ds_next_snap_obj) { 2323 stat->dds_is_snapshot = B_TRUE; 2324 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; 2325 } else { 2326 stat->dds_is_snapshot = B_FALSE; 2327 stat->dds_num_clones = 0; 2328 } 2329 2330 /* clone origin is really a dsl_dir thing... */ 2331 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); 2332 if (dsl_dir_is_clone(ds->ds_dir)) { 2333 dsl_dataset_t *ods; 2334 2335 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, 2336 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); 2337 dsl_dataset_name(ods, stat->dds_origin); 2338 dsl_dataset_drop_ref(ods, FTAG); 2339 } else { 2340 stat->dds_origin[0] = '\0'; 2341 } 2342 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); 2343 } 2344 2345 uint64_t 2346 dsl_dataset_fsid_guid(dsl_dataset_t *ds) 2347 { 2348 return (ds->ds_fsid_guid); 2349 } 2350 2351 void 2352 dsl_dataset_space(dsl_dataset_t *ds, 2353 uint64_t *refdbytesp, uint64_t *availbytesp, 2354 uint64_t *usedobjsp, uint64_t *availobjsp) 2355 { 2356 *refdbytesp = ds->ds_phys->ds_referenced_bytes; 2357 *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); 2358 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) 2359 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; 2360 if (ds->ds_quota != 0) { 2361 /* 2362 * Adjust available bytes according to refquota 2363 */ 2364 if (*refdbytesp < ds->ds_quota) 2365 *availbytesp = MIN(*availbytesp, 2366 ds->ds_quota - *refdbytesp); 2367 else 2368 *availbytesp = 0; 2369 } 2370 *usedobjsp = ds->ds_phys->ds_bp.blk_fill; 2371 *availobjsp = DN_MAX_OBJECT - *usedobjsp; 2372 } 2373 2374 boolean_t 2375 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) 2376 { 2377 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2378 2379 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 2380 dsl_pool_sync_context(dp)); 2381 if (ds->ds_prev == NULL) 2382 return (B_FALSE); 2383 if (ds->ds_phys->ds_bp.blk_birth > 2384 ds->ds_prev->ds_phys->ds_creation_txg) { 2385 objset_t *os, *os_prev; 2386 /* 2387 * It may be that only the ZIL differs, because it was 2388 * reset in the head. Don't count that as being 2389 * modified. 
2390 */ 2391 if (dmu_objset_from_ds(ds, &os) != 0) 2392 return (B_TRUE); 2393 if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0) 2394 return (B_TRUE); 2395 return (bcmp(&os->os_phys->os_meta_dnode, 2396 &os_prev->os_phys->os_meta_dnode, 2397 sizeof (os->os_phys->os_meta_dnode)) != 0); 2398 } 2399 return (B_FALSE); 2400 } 2401 2402 /* ARGSUSED */ 2403 static int 2404 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) 2405 { 2406 dsl_dataset_t *ds = arg1; 2407 char *newsnapname = arg2; 2408 dsl_dir_t *dd = ds->ds_dir; 2409 dsl_dataset_t *hds; 2410 uint64_t val; 2411 int err; 2412 2413 err = dsl_dataset_hold_obj(dd->dd_pool, 2414 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); 2415 if (err) 2416 return (err); 2417 2418 /* new name better not be in use */ 2419 err = dsl_dataset_snap_lookup(hds, newsnapname, &val); 2420 dsl_dataset_rele(hds, FTAG); 2421 2422 if (err == 0) 2423 err = EEXIST; 2424 else if (err == ENOENT) 2425 err = 0; 2426 2427 /* dataset name + 1 for the "@" + the new snapshot name must fit */ 2428 if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) 2429 err = ENAMETOOLONG; 2430 2431 return (err); 2432 } 2433 2434 static void 2435 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2436 { 2437 dsl_dataset_t *ds = arg1; 2438 const char *newsnapname = arg2; 2439 dsl_dir_t *dd = ds->ds_dir; 2440 objset_t *mos = dd->dd_pool->dp_meta_objset; 2441 dsl_dataset_t *hds; 2442 int err; 2443 2444 ASSERT(ds->ds_phys->ds_next_snap_obj != 0); 2445 2446 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, 2447 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); 2448 2449 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2450 err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); 2451 ASSERT3U(err, ==, 0); 2452 mutex_enter(&ds->ds_lock); 2453 (void) strcpy(ds->ds_snapname, newsnapname); 2454 mutex_exit(&ds->ds_lock); 2455 err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, 2456 ds->ds_snapname, 8, 1, &ds->ds_object, tx); 2457 ASSERT3U(err, ==, 0); 2458 2459 spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, 2460 "dataset = %llu", ds->ds_object); 2461 dsl_dataset_rele(hds, FTAG); 2462 } 2463 2464 struct renamesnaparg { 2465 dsl_sync_task_group_t *dstg; 2466 char failed[MAXPATHLEN]; 2467 char *oldsnap; 2468 char *newsnap; 2469 }; 2470 2471 static int 2472 dsl_snapshot_rename_one(const char *name, void *arg) 2473 { 2474 struct renamesnaparg *ra = arg; 2475 dsl_dataset_t *ds = NULL; 2476 char *snapname; 2477 int err; 2478 2479 snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); 2480 (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); 2481 2482 /* 2483 * For recursive snapshot renames the parent won't be changing 2484 * so we just pass name for both the to/from argument. 2485 */ 2486 err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); 2487 if (err != 0) { 2488 strfree(snapname); 2489 return (err == ENOENT ? 0 : err); 2490 } 2491 2492 #ifdef _KERNEL 2493 /* 2494 * For all filesystems undergoing rename, we'll need to unmount it. 2495 */ 2496 (void) zfs_unmount_snap(snapname, NULL); 2497 #endif 2498 err = dsl_dataset_hold(snapname, ra->dstg, &ds); 2499 strfree(snapname); 2500 if (err != 0) 2501 return (err == ENOENT ? 
0 : err); 2502 2503 dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, 2504 dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); 2505 2506 return (0); 2507 } 2508 2509 static int 2510 dsl_recursive_rename(char *oldname, const char *newname) 2511 { 2512 int err; 2513 struct renamesnaparg *ra; 2514 dsl_sync_task_t *dst; 2515 spa_t *spa; 2516 char *cp, *fsname = spa_strdup(oldname); 2517 int len = strlen(oldname) + 1; 2518 2519 /* truncate the snapshot name to get the fsname */ 2520 cp = strchr(fsname, '@'); 2521 *cp = '\0'; 2522 2523 err = spa_open(fsname, &spa, FTAG); 2524 if (err) { 2525 kmem_free(fsname, len); 2526 return (err); 2527 } 2528 ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); 2529 ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 2530 2531 ra->oldsnap = strchr(oldname, '@') + 1; 2532 ra->newsnap = strchr(newname, '@') + 1; 2533 *ra->failed = '\0'; 2534 2535 err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, 2536 DS_FIND_CHILDREN); 2537 kmem_free(fsname, len); 2538 2539 if (err == 0) { 2540 err = dsl_sync_task_group_wait(ra->dstg); 2541 } 2542 2543 for (dst = list_head(&ra->dstg->dstg_tasks); dst; 2544 dst = list_next(&ra->dstg->dstg_tasks, dst)) { 2545 dsl_dataset_t *ds = dst->dst_arg1; 2546 if (dst->dst_err) { 2547 dsl_dir_name(ds->ds_dir, ra->failed); 2548 (void) strlcat(ra->failed, "@", sizeof (ra->failed)); 2549 (void) strlcat(ra->failed, ra->newsnap, 2550 sizeof (ra->failed)); 2551 } 2552 dsl_dataset_rele(ds, ra->dstg); 2553 } 2554 2555 if (err) 2556 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); 2557 2558 dsl_sync_task_group_destroy(ra->dstg); 2559 kmem_free(ra, sizeof (struct renamesnaparg)); 2560 spa_close(spa, FTAG); 2561 return (err); 2562 } 2563 2564 static int 2565 dsl_valid_rename(const char *oldname, void *arg) 2566 { 2567 int delta = *(int *)arg; 2568 2569 if (strlen(oldname) + delta >= MAXNAMELEN) 2570 return (ENAMETOOLONG); 2571 2572 return (0); 2573 } 2574 2575 #pragma weak dmu_objset_rename = dsl_dataset_rename 2576 int 2577 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) 2578 { 2579 dsl_dir_t *dd; 2580 dsl_dataset_t *ds; 2581 const char *tail; 2582 int err; 2583 2584 err = dsl_dir_open(oldname, FTAG, &dd, &tail); 2585 if (err) 2586 return (err); 2587 2588 if (tail == NULL) { 2589 int delta = strlen(newname) - strlen(oldname); 2590 2591 /* if we're growing, validate child name lengths */ 2592 if (delta > 0) 2593 err = dmu_objset_find(oldname, dsl_valid_rename, 2594 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 2595 2596 if (err == 0) 2597 err = dsl_dir_rename(dd, newname); 2598 dsl_dir_close(dd, FTAG); 2599 return (err); 2600 } 2601 2602 if (tail[0] != '@') { 2603 /* the name ended in a nonexistent component */ 2604 dsl_dir_close(dd, FTAG); 2605 return (ENOENT); 2606 } 2607 2608 dsl_dir_close(dd, FTAG); 2609 2610 /* new name must be snapshot in same filesystem */ 2611 tail = strchr(newname, '@'); 2612 if (tail == NULL) 2613 return (EINVAL); 2614 tail++; 2615 if (strncmp(oldname, newname, tail - newname) != 0) 2616 return (EXDEV); 2617 2618 if (recursive) { 2619 err = dsl_recursive_rename(oldname, newname); 2620 } else { 2621 err = dsl_dataset_hold(oldname, FTAG, &ds); 2622 if (err) 2623 return (err); 2624 2625 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 2626 dsl_dataset_snapshot_rename_check, 2627 dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); 2628 2629 dsl_dataset_rele(ds, FTAG); 2630 } 2631 2632 return (err); 2633 } 2634 2635 struct promotenode { 2636 list_node_t 
link; 2637 dsl_dataset_t *ds; 2638 }; 2639 2640 struct promotearg { 2641 list_t shared_snaps, origin_snaps, clone_snaps; 2642 dsl_dataset_t *origin_origin; 2643 uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; 2644 char *err_ds; 2645 }; 2646 2647 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); 2648 static boolean_t snaplist_unstable(list_t *l); 2649 2650 static int 2651 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) 2652 { 2653 dsl_dataset_t *hds = arg1; 2654 struct promotearg *pa = arg2; 2655 struct promotenode *snap = list_head(&pa->shared_snaps); 2656 dsl_dataset_t *origin_ds = snap->ds; 2657 int err; 2658 uint64_t unused; 2659 2660 /* Check that it is a real clone */ 2661 if (!dsl_dir_is_clone(hds->ds_dir)) 2662 return (EINVAL); 2663 2664 /* Since this is so expensive, don't do the preliminary check */ 2665 if (!dmu_tx_is_syncing(tx)) 2666 return (0); 2667 2668 if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) 2669 return (EXDEV); 2670 2671 /* compute origin's new unique space */ 2672 snap = list_tail(&pa->clone_snaps); 2673 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2674 dsl_deadlist_space_range(&snap->ds->ds_deadlist, 2675 origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, 2676 &pa->unique, &unused, &unused); 2677 2678 /* 2679 * Walk the snapshots that we are moving 2680 * 2681 * Compute space to transfer. Consider the incremental changes 2682 * to used for each snapshot: 2683 * (my used) = (prev's used) + (blocks born) - (blocks killed) 2684 * So each snapshot gave birth to: 2685 * (blocks born) = (my used) - (prev's used) + (blocks killed) 2686 * So a sequence would look like: 2687 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) 2688 * Which simplifies to: 2689 * uN + kN + kN-1 + ... + k1 + k0 2690 * Note however, if we stop before we reach the ORIGIN we get: 2691 * uN + kN + kN-1 + ... + kM - uM-1 2692 */ 2693 pa->used = origin_ds->ds_phys->ds_referenced_bytes; 2694 pa->comp = origin_ds->ds_phys->ds_compressed_bytes; 2695 pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; 2696 for (snap = list_head(&pa->shared_snaps); snap; 2697 snap = list_next(&pa->shared_snaps, snap)) { 2698 uint64_t val, dlused, dlcomp, dluncomp; 2699 dsl_dataset_t *ds = snap->ds; 2700 2701 /* Check that the snapshot name does not conflict */ 2702 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2703 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); 2704 if (err == 0) { 2705 err = EEXIST; 2706 goto out; 2707 } 2708 if (err != ENOENT) 2709 goto out; 2710 2711 /* The very first snapshot does not have a deadlist */ 2712 if (ds->ds_phys->ds_prev_snap_obj == 0) 2713 continue; 2714 2715 dsl_deadlist_space(&ds->ds_deadlist, 2716 &dlused, &dlcomp, &dluncomp); 2717 pa->used += dlused; 2718 pa->comp += dlcomp; 2719 pa->uncomp += dluncomp; 2720 } 2721 2722 /* 2723 * If we are a clone of a clone then we never reached ORIGIN, 2724 * so we need to subtract out the clone origin's used space. 
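 * An illustrative example, with hypothetical figures: if the origin
 * snapshot references 5G and the shared snapshots' deadlists total
 * 2G, pa->used starts at 7G; for a clone of a clone whose origin's
 * origin references 1G, that 1G is subtracted below, leaving 6G to
 * transfer.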
2725 */ 2726 if (pa->origin_origin) { 2727 pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes; 2728 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; 2729 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; 2730 } 2731 2732 /* Check that there is enough space here */ 2733 err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, 2734 pa->used); 2735 if (err) 2736 return (err); 2737 2738 /* 2739 * Compute the amounts of space that will be used by snapshots 2740 * after the promotion (for both origin and clone). For each, 2741 * it is the amount of space that will be on all of their 2742 * deadlists (that was not born before their new origin). 2743 */ 2744 if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2745 uint64_t space; 2746 2747 /* 2748 * Note, typically this will not be a clone of a clone, 2749 * so dd_origin_txg will be < TXG_INITIAL, so 2750 * these snaplist_space() -> dsl_deadlist_space_range() 2751 * calls will be fast because they do not have to 2752 * iterate over all bps. 2753 */ 2754 snap = list_head(&pa->origin_snaps); 2755 err = snaplist_space(&pa->shared_snaps, 2756 snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); 2757 if (err) 2758 return (err); 2759 2760 err = snaplist_space(&pa->clone_snaps, 2761 snap->ds->ds_dir->dd_origin_txg, &space); 2762 if (err) 2763 return (err); 2764 pa->cloneusedsnap += space; 2765 } 2766 if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2767 err = snaplist_space(&pa->origin_snaps, 2768 origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); 2769 if (err) 2770 return (err); 2771 } 2772 2773 return (0); 2774 out: 2775 pa->err_ds = snap->ds->ds_snapname; 2776 return (err); 2777 } 2778 2779 static void 2780 dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2781 { 2782 dsl_dataset_t *hds = arg1; 2783 struct promotearg *pa = arg2; 2784 struct promotenode *snap = list_head(&pa->shared_snaps); 2785 dsl_dataset_t *origin_ds = snap->ds; 2786 dsl_dataset_t *origin_head; 2787 dsl_dir_t *dd = hds->ds_dir; 2788 dsl_pool_t *dp = hds->ds_dir->dd_pool; 2789 dsl_dir_t *odd = NULL; 2790 uint64_t oldnext_obj; 2791 int64_t delta; 2792 2793 ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); 2794 2795 snap = list_head(&pa->origin_snaps); 2796 origin_head = snap->ds; 2797 2798 /* 2799 * We need to explicitly open odd, since origin_ds's dd will be 2800 * changing. 
2801 */ 2802 VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, 2803 NULL, FTAG, &odd)); 2804 2805 /* change origin's next snap */ 2806 dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); 2807 oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; 2808 snap = list_tail(&pa->clone_snaps); 2809 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2810 origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; 2811 2812 /* change the origin's next clone */ 2813 if (origin_ds->ds_phys->ds_next_clones_obj) { 2814 remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); 2815 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2816 origin_ds->ds_phys->ds_next_clones_obj, 2817 oldnext_obj, tx)); 2818 } 2819 2820 /* change origin */ 2821 dmu_buf_will_dirty(dd->dd_dbuf, tx); 2822 ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); 2823 dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; 2824 dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; 2825 dmu_buf_will_dirty(odd->dd_dbuf, tx); 2826 odd->dd_phys->dd_origin_obj = origin_ds->ds_object; 2827 origin_head->ds_dir->dd_origin_txg = 2828 origin_ds->ds_phys->ds_creation_txg; 2829 2830 /* change dd_clone entries */ 2831 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 2832 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2833 odd->dd_phys->dd_clones, hds->ds_object, tx)); 2834 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2835 pa->origin_origin->ds_dir->dd_phys->dd_clones, 2836 hds->ds_object, tx)); 2837 2838 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2839 pa->origin_origin->ds_dir->dd_phys->dd_clones, 2840 origin_head->ds_object, tx)); 2841 if (dd->dd_phys->dd_clones == 0) { 2842 dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, 2843 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); 2844 } 2845 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2846 dd->dd_phys->dd_clones, origin_head->ds_object, tx)); 2847 2848 } 2849 2850 /* move snapshots to this dir */ 2851 for (snap = list_head(&pa->shared_snaps); snap; 2852 snap = list_next(&pa->shared_snaps, snap)) { 2853 dsl_dataset_t *ds = snap->ds; 2854 2855 /* unregister props as dsl_dir is changing */ 2856 if (ds->ds_objset) { 2857 dmu_objset_evict(ds->ds_objset); 2858 ds->ds_objset = NULL; 2859 } 2860 /* move snap name entry */ 2861 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2862 VERIFY(0 == dsl_dataset_snap_remove(origin_head, 2863 ds->ds_snapname, tx)); 2864 VERIFY(0 == zap_add(dp->dp_meta_objset, 2865 hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 2866 8, 1, &ds->ds_object, tx)); 2867 2868 /* change containing dsl_dir */ 2869 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2870 ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); 2871 ds->ds_phys->ds_dir_obj = dd->dd_object; 2872 ASSERT3P(ds->ds_dir, ==, odd); 2873 dsl_dir_close(ds->ds_dir, ds); 2874 VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, 2875 NULL, ds, &ds->ds_dir)); 2876 2877 /* move any clone references */ 2878 if (ds->ds_phys->ds_next_clones_obj && 2879 spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 2880 zap_cursor_t zc; 2881 zap_attribute_t za; 2882 2883 for (zap_cursor_init(&zc, dp->dp_meta_objset, 2884 ds->ds_phys->ds_next_clones_obj); 2885 zap_cursor_retrieve(&zc, &za) == 0; 2886 zap_cursor_advance(&zc)) { 2887 dsl_dataset_t *cnds; 2888 uint64_t o; 2889 2890 if (za.za_first_integer == oldnext_obj) { 2891 /* 2892 * We've already moved the 2893 * origin's reference. 
2894 */ 2895 continue; 2896 } 2897 2898 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, 2899 za.za_first_integer, FTAG, &cnds)); 2900 o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; 2901 2902 VERIFY3U(zap_remove_int(dp->dp_meta_objset, 2903 odd->dd_phys->dd_clones, o, tx), ==, 0); 2904 VERIFY3U(zap_add_int(dp->dp_meta_objset, 2905 dd->dd_phys->dd_clones, o, tx), ==, 0); 2906 dsl_dataset_rele(cnds, FTAG); 2907 } 2908 zap_cursor_fini(&zc); 2909 } 2910 2911 ASSERT3U(dsl_prop_numcb(ds), ==, 0); 2912 } 2913 2914 /* 2915 * Change space accounting. 2916 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either 2917 * both be valid, or both be 0 (resulting in delta == 0). This 2918 * is true for each of {clone,origin} independently. 2919 */ 2920 2921 delta = pa->cloneusedsnap - 2922 dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 2923 ASSERT3S(delta, >=, 0); 2924 ASSERT3U(pa->used, >=, delta); 2925 dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); 2926 dsl_dir_diduse_space(dd, DD_USED_HEAD, 2927 pa->used - delta, pa->comp, pa->uncomp, tx); 2928 2929 delta = pa->originusedsnap - 2930 odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 2931 ASSERT3S(delta, <=, 0); 2932 ASSERT3U(pa->used, >=, -delta); 2933 dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); 2934 dsl_dir_diduse_space(odd, DD_USED_HEAD, 2935 -pa->used - delta, -pa->comp, -pa->uncomp, tx); 2936 2937 origin_ds->ds_phys->ds_unique_bytes = pa->unique; 2938 2939 /* log history record */ 2940 spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, 2941 "dataset = %llu", hds->ds_object); 2942 2943 dsl_dir_close(odd, FTAG); 2944 } 2945 2946 static char *snaplist_tag = "snaplist"; 2947 /* 2948 * Make a list of dsl_dataset_t's for the snapshots between first_obj 2949 * (exclusive) and last_obj (inclusive). The list will be in reverse 2950 * order (last_obj will be the list_head()). If first_obj == 0, do all 2951 * snapshots back to this dataset's origin. 
2952 */ 2953 static int 2954 snaplist_make(dsl_pool_t *dp, boolean_t own, 2955 uint64_t first_obj, uint64_t last_obj, list_t *l) 2956 { 2957 uint64_t obj = last_obj; 2958 2959 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); 2960 2961 list_create(l, sizeof (struct promotenode), 2962 offsetof(struct promotenode, link)); 2963 2964 while (obj != first_obj) { 2965 dsl_dataset_t *ds; 2966 struct promotenode *snap; 2967 int err; 2968 2969 if (own) { 2970 err = dsl_dataset_own_obj(dp, obj, 2971 0, snaplist_tag, &ds); 2972 if (err == 0) 2973 dsl_dataset_make_exclusive(ds, snaplist_tag); 2974 } else { 2975 err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); 2976 } 2977 if (err == ENOENT) { 2978 /* lost race with snapshot destroy */ 2979 struct promotenode *last = list_tail(l); 2980 ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); 2981 obj = last->ds->ds_phys->ds_prev_snap_obj; 2982 continue; 2983 } else if (err) { 2984 return (err); 2985 } 2986 2987 if (first_obj == 0) 2988 first_obj = ds->ds_dir->dd_phys->dd_origin_obj; 2989 2990 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); 2991 snap->ds = ds; 2992 list_insert_tail(l, snap); 2993 obj = ds->ds_phys->ds_prev_snap_obj; 2994 } 2995 2996 return (0); 2997 } 2998 2999 static int 3000 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) 3001 { 3002 struct promotenode *snap; 3003 3004 *spacep = 0; 3005 for (snap = list_head(l); snap; snap = list_next(l, snap)) { 3006 uint64_t used, comp, uncomp; 3007 dsl_deadlist_space_range(&snap->ds->ds_deadlist, 3008 mintxg, UINT64_MAX, &used, &comp, &uncomp); 3009 *spacep += used; 3010 } 3011 return (0); 3012 } 3013 3014 static void 3015 snaplist_destroy(list_t *l, boolean_t own) 3016 { 3017 struct promotenode *snap; 3018 3019 if (!l || !list_link_active(&l->list_head)) 3020 return; 3021 3022 while ((snap = list_tail(l)) != NULL) { 3023 list_remove(l, snap); 3024 if (own) 3025 dsl_dataset_disown(snap->ds, snaplist_tag); 3026 else 3027 dsl_dataset_rele(snap->ds, snaplist_tag); 3028 kmem_free(snap, sizeof (struct promotenode)); 3029 } 3030 list_destroy(l); 3031 } 3032 3033 /* 3034 * Promote a clone. Nomenclature note: 3035 * "clone" or "cds": the original clone which is being promoted 3036 * "origin" or "ods": the snapshot which is originally clone's origin 3037 * "origin head" or "ohds": the dataset which is the head 3038 * (filesystem/volume) for the origin 3039 * "origin origin": the origin of the origin's filesystem (typically 3040 * NULL, indicating that the clone is not a clone of a clone). 3041 */ 3042 int 3043 dsl_dataset_promote(const char *name, char *conflsnap) 3044 { 3045 dsl_dataset_t *ds; 3046 dsl_dir_t *dd; 3047 dsl_pool_t *dp; 3048 dmu_object_info_t doi; 3049 struct promotearg pa = { 0 }; 3050 struct promotenode *snap; 3051 int err; 3052 3053 err = dsl_dataset_hold(name, FTAG, &ds); 3054 if (err) 3055 return (err); 3056 dd = ds->ds_dir; 3057 dp = dd->dd_pool; 3058 3059 err = dmu_object_info(dp->dp_meta_objset, 3060 ds->ds_phys->ds_snapnames_zapobj, &doi); 3061 if (err) { 3062 dsl_dataset_rele(ds, FTAG); 3063 return (err); 3064 } 3065 3066 if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { 3067 dsl_dataset_rele(ds, FTAG); 3068 return (EINVAL); 3069 } 3070 3071 /* 3072 * We are going to inherit all the snapshots taken before our 3073 * origin (i.e., our new origin will be our parent's origin). 3074 * Take ownership of them so that we can rename them into our 3075 * namespace. 
3076 */ 3077 rw_enter(&dp->dp_config_rwlock, RW_READER); 3078 3079 err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, 3080 &pa.shared_snaps); 3081 if (err != 0) 3082 goto out; 3083 3084 err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); 3085 if (err != 0) 3086 goto out; 3087 3088 snap = list_head(&pa.shared_snaps); 3089 ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); 3090 err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, 3091 snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); 3092 if (err != 0) 3093 goto out; 3094 3095 if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { 3096 err = dsl_dataset_hold_obj(dp, 3097 snap->ds->ds_dir->dd_phys->dd_origin_obj, 3098 FTAG, &pa.origin_origin); 3099 if (err != 0) 3100 goto out; 3101 } 3102 3103 out: 3104 rw_exit(&dp->dp_config_rwlock); 3105 3106 /* 3107 * Add in 128x the snapnames zapobj size, since we will be moving 3108 * a bunch of snapnames to the promoted ds, and dirtying their 3109 * bonus buffers. 3110 */ 3111 if (err == 0) { 3112 err = dsl_sync_task_do(dp, dsl_dataset_promote_check, 3113 dsl_dataset_promote_sync, ds, &pa, 3114 2 + 2 * doi.doi_physical_blocks_512); 3115 if (err && pa.err_ds && conflsnap) 3116 (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); 3117 } 3118 3119 snaplist_destroy(&pa.shared_snaps, B_TRUE); 3120 snaplist_destroy(&pa.clone_snaps, B_FALSE); 3121 snaplist_destroy(&pa.origin_snaps, B_FALSE); 3122 if (pa.origin_origin) 3123 dsl_dataset_rele(pa.origin_origin, FTAG); 3124 dsl_dataset_rele(ds, FTAG); 3125 return (err); 3126 } 3127 3128 struct cloneswaparg { 3129 dsl_dataset_t *cds; /* clone dataset */ 3130 dsl_dataset_t *ohds; /* origin's head dataset */ 3131 boolean_t force; 3132 int64_t unused_refres_delta; /* change in unconsumed refreservation */ 3133 }; 3134 3135 /* ARGSUSED */ 3136 static int 3137 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) 3138 { 3139 struct cloneswaparg *csa = arg1; 3140 3141 /* they should both be heads */ 3142 if (dsl_dataset_is_snapshot(csa->cds) || 3143 dsl_dataset_is_snapshot(csa->ohds)) 3144 return (EINVAL); 3145 3146 /* the branch point should be just before them */ 3147 if (csa->cds->ds_prev != csa->ohds->ds_prev) 3148 return (EINVAL); 3149 3150 /* cds should be the clone (unless they are unrelated) */ 3151 if (csa->cds->ds_prev != NULL && 3152 csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && 3153 csa->ohds->ds_object != 3154 csa->cds->ds_prev->ds_phys->ds_next_snap_obj) 3155 return (EINVAL); 3156 3157 /* the clone should be a child of the origin */ 3158 if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) 3159 return (EINVAL); 3160 3161 /* ohds shouldn't be modified unless 'force' */ 3162 if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) 3163 return (ETXTBSY); 3164 3165 /* adjust amount of any unconsumed refreservation */ 3166 csa->unused_refres_delta = 3167 (int64_t)MIN(csa->ohds->ds_reserved, 3168 csa->ohds->ds_phys->ds_unique_bytes) - 3169 (int64_t)MIN(csa->ohds->ds_reserved, 3170 csa->cds->ds_phys->ds_unique_bytes); 3171 3172 if (csa->unused_refres_delta > 0 && 3173 csa->unused_refres_delta > 3174 dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) 3175 return (ENOSPC); 3176 3177 if (csa->ohds->ds_quota != 0 && 3178 csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) 3179 return (EDQUOT); 3180 3181 return (0); 3182 } 3183 3184 /* ARGSUSED */ 3185 static void 3186 dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3187 { 3188 struct 
cloneswaparg *csa = arg1; 3189 dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; 3190 3191 ASSERT(csa->cds->ds_reserved == 0); 3192 ASSERT(csa->ohds->ds_quota == 0 || 3193 csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); 3194 3195 dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); 3196 dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); 3197 3198 if (csa->cds->ds_objset != NULL) { 3199 dmu_objset_evict(csa->cds->ds_objset); 3200 csa->cds->ds_objset = NULL; 3201 } 3202 3203 if (csa->ohds->ds_objset != NULL) { 3204 dmu_objset_evict(csa->ohds->ds_objset); 3205 csa->ohds->ds_objset = NULL; 3206 } 3207 3208 /* 3209 * Reset origin's unique bytes, if it exists. 3210 */ 3211 if (csa->cds->ds_prev) { 3212 dsl_dataset_t *origin = csa->cds->ds_prev; 3213 uint64_t comp, uncomp; 3214 3215 dmu_buf_will_dirty(origin->ds_dbuf, tx); 3216 dsl_deadlist_space_range(&csa->cds->ds_deadlist, 3217 origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, 3218 &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); 3219 } 3220 3221 /* swap blkptrs */ 3222 { 3223 blkptr_t tmp; 3224 tmp = csa->ohds->ds_phys->ds_bp; 3225 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; 3226 csa->cds->ds_phys->ds_bp = tmp; 3227 } 3228 3229 /* set dd_*_bytes */ 3230 { 3231 int64_t dused, dcomp, duncomp; 3232 uint64_t cdl_used, cdl_comp, cdl_uncomp; 3233 uint64_t odl_used, odl_comp, odl_uncomp; 3234 3235 ASSERT3U(csa->cds->ds_dir->dd_phys-> 3236 dd_used_breakdown[DD_USED_SNAP], ==, 0); 3237 3238 dsl_deadlist_space(&csa->cds->ds_deadlist, 3239 &cdl_used, &cdl_comp, &cdl_uncomp); 3240 dsl_deadlist_space(&csa->ohds->ds_deadlist, 3241 &odl_used, &odl_comp, &odl_uncomp); 3242 3243 dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used - 3244 (csa->ohds->ds_phys->ds_referenced_bytes + odl_used); 3245 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - 3246 (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); 3247 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + 3248 cdl_uncomp - 3249 (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); 3250 3251 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, 3252 dused, dcomp, duncomp, tx); 3253 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, 3254 -dused, -dcomp, -duncomp, tx); 3255 3256 /* 3257 * The difference in the space used by snapshots is the 3258 * difference in snapshot space due to the head's 3259 * deadlist (since that's the only thing that's 3260 * changing that affects the snapused). 3261 */ 3262 dsl_deadlist_space_range(&csa->cds->ds_deadlist, 3263 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, 3264 &cdl_used, &cdl_comp, &cdl_uncomp); 3265 dsl_deadlist_space_range(&csa->ohds->ds_deadlist, 3266 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, 3267 &odl_used, &odl_comp, &odl_uncomp); 3268 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, 3269 DD_USED_HEAD, DD_USED_SNAP, tx); 3270 } 3271 3272 /* swap ds_*_bytes */ 3273 SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes, 3274 csa->cds->ds_phys->ds_referenced_bytes); 3275 SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, 3276 csa->cds->ds_phys->ds_compressed_bytes); 3277 SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, 3278 csa->cds->ds_phys->ds_uncompressed_bytes); 3279 SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, 3280 csa->cds->ds_phys->ds_unique_bytes); 3281 3282 /* apply any parent delta for change in unconsumed refreservation */ 3283 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, 3284 csa->unused_refres_delta, 0, 0, tx); 3285 3286 /* 3287 * Swap deadlists. 
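 * (The on-disk deadlist object numbers are exchanged with SWITCH64,
 * but the in-core dsl_deadlist_t caches its object number, so each
 * list is closed first and then reopened against its new object.)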
3288 */ 3289 dsl_deadlist_close(&csa->cds->ds_deadlist); 3290 dsl_deadlist_close(&csa->ohds->ds_deadlist); 3291 SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, 3292 csa->cds->ds_phys->ds_deadlist_obj); 3293 dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, 3294 csa->cds->ds_phys->ds_deadlist_obj); 3295 dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, 3296 csa->ohds->ds_phys->ds_deadlist_obj); 3297 3298 dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); 3299 } 3300 3301 /* 3302 * Swap 'clone' with its origin head datasets. Used at the end of "zfs 3303 * recv" into an existing fs to swizzle the file system to the new 3304 * version, and by "zfs rollback". Can also be used to swap two 3305 * independent head datasets if neither has any snapshots. 3306 */ 3307 int 3308 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, 3309 boolean_t force) 3310 { 3311 struct cloneswaparg csa; 3312 int error; 3313 3314 ASSERT(clone->ds_owner); 3315 ASSERT(origin_head->ds_owner); 3316 retry: 3317 /* 3318 * Need exclusive access for the swap. If we're swapping these 3319 * datasets back after an error, we already hold the locks. 3320 */ 3321 if (!RW_WRITE_HELD(&clone->ds_rwlock)) 3322 rw_enter(&clone->ds_rwlock, RW_WRITER); 3323 if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && 3324 !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { 3325 rw_exit(&clone->ds_rwlock); 3326 rw_enter(&origin_head->ds_rwlock, RW_WRITER); 3327 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { 3328 rw_exit(&origin_head->ds_rwlock); 3329 goto retry; 3330 } 3331 } 3332 csa.cds = clone; 3333 csa.ohds = origin_head; 3334 csa.force = force; 3335 error = dsl_sync_task_do(clone->ds_dir->dd_pool, 3336 dsl_dataset_clone_swap_check, 3337 dsl_dataset_clone_swap_sync, &csa, NULL, 9); 3338 return (error); 3339 } 3340 3341 /* 3342 * Given a pool name and a dataset object number in that pool, 3343 * return the name of that dataset. 3344 */ 3345 int 3346 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) 3347 { 3348 spa_t *spa; 3349 dsl_pool_t *dp; 3350 dsl_dataset_t *ds; 3351 int error; 3352 3353 if ((error = spa_open(pname, &spa, FTAG)) != 0) 3354 return (error); 3355 dp = spa_get_dsl(spa); 3356 rw_enter(&dp->dp_config_rwlock, RW_READER); 3357 if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { 3358 dsl_dataset_name(ds, buf); 3359 dsl_dataset_rele(ds, FTAG); 3360 } 3361 rw_exit(&dp->dp_config_rwlock); 3362 spa_close(spa, FTAG); 3363 3364 return (error); 3365 } 3366 3367 int 3368 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, 3369 uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) 3370 { 3371 int error = 0; 3372 3373 ASSERT3S(asize, >, 0); 3374 3375 /* 3376 * *ref_rsrv is the portion of asize that will come from any 3377 * unconsumed refreservation space. 3378 */ 3379 *ref_rsrv = 0; 3380 3381 mutex_enter(&ds->ds_lock); 3382 /* 3383 * Make a space adjustment for reserved bytes. 
3384 */ 3385 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { 3386 ASSERT3U(*used, >=, 3387 ds->ds_reserved - ds->ds_phys->ds_unique_bytes); 3388 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); 3389 *ref_rsrv = 3390 asize - MIN(asize, parent_delta(ds, asize + inflight)); 3391 } 3392 3393 if (!check_quota || ds->ds_quota == 0) { 3394 mutex_exit(&ds->ds_lock); 3395 return (0); 3396 } 3397 /* 3398 * If they are requesting more space, and our current estimate 3399 * is over quota, they get to try again unless the actual 3400 * on-disk is over quota and there are no pending changes (which 3401 * may free up space for us). 3402 */ 3403 if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) { 3404 if (inflight > 0 || 3405 ds->ds_phys->ds_referenced_bytes < ds->ds_quota) 3406 error = ERESTART; 3407 else 3408 error = EDQUOT; 3409 } 3410 mutex_exit(&ds->ds_lock); 3411 3412 return (error); 3413 } 3414 3415 /* ARGSUSED */ 3416 static int 3417 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) 3418 { 3419 dsl_dataset_t *ds = arg1; 3420 dsl_prop_setarg_t *psa = arg2; 3421 int err; 3422 3423 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) 3424 return (ENOTSUP); 3425 3426 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) 3427 return (err); 3428 3429 if (psa->psa_effective_value == 0) 3430 return (0); 3431 3432 if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes || 3433 psa->psa_effective_value < ds->ds_reserved) 3434 return (ENOSPC); 3435 3436 return (0); 3437 } 3438 3439 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); 3440 3441 void 3442 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3443 { 3444 dsl_dataset_t *ds = arg1; 3445 dsl_prop_setarg_t *psa = arg2; 3446 uint64_t effective_value = psa->psa_effective_value; 3447 3448 dsl_prop_set_sync(ds, psa, tx); 3449 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); 3450 3451 if (ds->ds_quota != effective_value) { 3452 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3453 ds->ds_quota = effective_value; 3454 3455 spa_history_log_internal(LOG_DS_REFQUOTA, 3456 ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ", 3457 (longlong_t)ds->ds_quota, ds->ds_object); 3458 } 3459 } 3460 3461 int 3462 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) 3463 { 3464 dsl_dataset_t *ds; 3465 dsl_prop_setarg_t psa; 3466 int err; 3467 3468 dsl_prop_setarg_init_uint64(&psa, "refquota", source, "a); 3469 3470 err = dsl_dataset_hold(dsname, FTAG, &ds); 3471 if (err) 3472 return (err); 3473 3474 /* 3475 * If someone removes a file, then tries to set the quota, we 3476 * want to make sure the file freeing takes effect. 
3477 */ 3478 txg_wait_open(ds->ds_dir->dd_pool, 0); 3479 3480 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3481 dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, 3482 ds, &psa, 0); 3483 3484 dsl_dataset_rele(ds, FTAG); 3485 return (err); 3486 } 3487 3488 static int 3489 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) 3490 { 3491 dsl_dataset_t *ds = arg1; 3492 dsl_prop_setarg_t *psa = arg2; 3493 uint64_t effective_value; 3494 uint64_t unique; 3495 int err; 3496 3497 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < 3498 SPA_VERSION_REFRESERVATION) 3499 return (ENOTSUP); 3500 3501 if (dsl_dataset_is_snapshot(ds)) 3502 return (EINVAL); 3503 3504 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) 3505 return (err); 3506 3507 effective_value = psa->psa_effective_value; 3508 3509 /* 3510 * If we are doing the preliminary check in open context, the 3511 * space estimates may be inaccurate. 3512 */ 3513 if (!dmu_tx_is_syncing(tx)) 3514 return (0); 3515 3516 mutex_enter(&ds->ds_lock); 3517 if (!DS_UNIQUE_IS_ACCURATE(ds)) 3518 dsl_dataset_recalc_head_uniq(ds); 3519 unique = ds->ds_phys->ds_unique_bytes; 3520 mutex_exit(&ds->ds_lock); 3521 3522 if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { 3523 uint64_t delta = MAX(unique, effective_value) - 3524 MAX(unique, ds->ds_reserved); 3525 3526 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) 3527 return (ENOSPC); 3528 if (ds->ds_quota > 0 && 3529 effective_value > ds->ds_quota) 3530 return (ENOSPC); 3531 } 3532 3533 return (0); 3534 } 3535 3536 static void 3537 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3538 { 3539 dsl_dataset_t *ds = arg1; 3540 dsl_prop_setarg_t *psa = arg2; 3541 uint64_t effective_value = psa->psa_effective_value; 3542 uint64_t unique; 3543 int64_t delta; 3544 3545 dsl_prop_set_sync(ds, psa, tx); 3546 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); 3547 3548 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3549 3550 mutex_enter(&ds->ds_dir->dd_lock); 3551 mutex_enter(&ds->ds_lock); 3552 ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); 3553 unique = ds->ds_phys->ds_unique_bytes; 3554 delta = MAX(0, (int64_t)(effective_value - unique)) - 3555 MAX(0, (int64_t)(ds->ds_reserved - unique)); 3556 ds->ds_reserved = effective_value; 3557 mutex_exit(&ds->ds_lock); 3558 3559 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); 3560 mutex_exit(&ds->ds_dir->dd_lock); 3561 3562 spa_history_log_internal(LOG_DS_REFRESERV, 3563 ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu", 3564 (longlong_t)effective_value, ds->ds_object); 3565 } 3566 3567 int 3568 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, 3569 uint64_t reservation) 3570 { 3571 dsl_dataset_t *ds; 3572 dsl_prop_setarg_t psa; 3573 int err; 3574 3575 dsl_prop_setarg_init_uint64(&psa, "refreservation", source, 3576 &reservation); 3577 3578 err = dsl_dataset_hold(dsname, FTAG, &ds); 3579 if (err) 3580 return (err); 3581 3582 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3583 dsl_dataset_set_reservation_check, 3584 dsl_dataset_set_reservation_sync, ds, &psa, 0); 3585 3586 dsl_dataset_rele(ds, FTAG); 3587 return (err); 3588 } 3589 3590 typedef struct zfs_hold_cleanup_arg { 3591 dsl_pool_t *dp; 3592 uint64_t dsobj; 3593 char htag[MAXNAMELEN]; 3594 } zfs_hold_cleanup_arg_t; 3595 3596 static void 3597 dsl_dataset_user_release_onexit(void *arg) 3598 { 3599 zfs_hold_cleanup_arg_t *ca = arg; 3600 3601 (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, 3602 B_TRUE); 3603 kmem_free(ca, 
sizeof (zfs_hold_cleanup_arg_t)); 3604 } 3605 3606 void 3607 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, 3608 minor_t minor) 3609 { 3610 zfs_hold_cleanup_arg_t *ca; 3611 3612 ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); 3613 ca->dp = ds->ds_dir->dd_pool; 3614 ca->dsobj = ds->ds_object; 3615 (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); 3616 VERIFY3U(0, ==, zfs_onexit_add_cb(minor, 3617 dsl_dataset_user_release_onexit, ca, NULL)); 3618 } 3619 3620 /* 3621 * If you add new checks here, you may need to add 3622 * additional checks to the "temporary" case in 3623 * snapshot_check() in dmu_objset.c. 3624 */ 3625 static int 3626 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) 3627 { 3628 dsl_dataset_t *ds = arg1; 3629 struct dsl_ds_holdarg *ha = arg2; 3630 char *htag = ha->htag; 3631 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3632 int error = 0; uint64_t tmp; 3633 3634 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) 3635 return (ENOTSUP); 3636 3637 if (!dsl_dataset_is_snapshot(ds)) 3638 return (EINVAL); 3639 3640 /* tags must be unique */ 3641 mutex_enter(&ds->ds_lock); 3642 if (ds->ds_phys->ds_userrefs_obj) { 3643 /* the looked-up value is ignored; we only care whether the tag exists */ error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, 3644 8, 1, &tmp); 3645 if (error == 0) 3646 error = EEXIST; 3647 else if (error == ENOENT) 3648 error = 0; 3649 } 3650 mutex_exit(&ds->ds_lock); 3651 3652 if (error == 0 && ha->temphold && 3653 strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) 3654 error = E2BIG; 3655 3656 return (error); 3657 } 3658 3659 void 3660 dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3661 { 3662 dsl_dataset_t *ds = arg1; 3663 struct dsl_ds_holdarg *ha = arg2; 3664 char *htag = ha->htag; 3665 dsl_pool_t *dp = ds->ds_dir->dd_pool; 3666 objset_t *mos = dp->dp_meta_objset; 3667 uint64_t now = gethrestime_sec(); 3668 uint64_t zapobj; 3669 3670 mutex_enter(&ds->ds_lock); 3671 if (ds->ds_phys->ds_userrefs_obj == 0) { 3672 /* 3673 * This is the first user hold for this dataset. Create 3674 * the userrefs zap object. 
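 * The zap maps each hold tag name to the time the hold was taken
 * (a single uint64, written by the zap_add() below).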
3675 */ 3676 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3677 zapobj = ds->ds_phys->ds_userrefs_obj = 3678 zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); 3679 } else { 3680 zapobj = ds->ds_phys->ds_userrefs_obj; 3681 } 3682 ds->ds_userrefs++; 3683 mutex_exit(&ds->ds_lock); 3684 3685 VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); 3686 3687 if (ha->temphold) { 3688 VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, 3689 htag, &now, tx)); 3690 } 3691 3692 spa_history_log_internal(LOG_DS_USER_HOLD, 3693 dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag, 3694 (int)ha->temphold, ds->ds_object); 3695 } 3696 3697 static int 3698 dsl_dataset_user_hold_one(const char *dsname, void *arg) 3699 { 3700 struct dsl_ds_holdarg *ha = arg; 3701 dsl_dataset_t *ds; 3702 int error; 3703 char *name; 3704 3705 /* alloc a buffer to hold dsname@snapname plus terminating NULL */ 3706 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3707 error = dsl_dataset_hold(name, ha->dstg, &ds); 3708 strfree(name); 3709 if (error == 0) { 3710 ha->gotone = B_TRUE; 3711 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, 3712 dsl_dataset_user_hold_sync, ds, ha, 0); 3713 } else if (error == ENOENT && ha->recursive) { 3714 error = 0; 3715 } else { 3716 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3717 } 3718 return (error); 3719 } 3720 3721 int 3722 dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, 3723 boolean_t temphold) 3724 { 3725 struct dsl_ds_holdarg *ha; 3726 int error; 3727 3728 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3729 ha->htag = htag; 3730 ha->temphold = temphold; 3731 error = dsl_sync_task_do(ds->ds_dir->dd_pool, 3732 dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, 3733 ds, ha, 0); 3734 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3735 3736 return (error); 3737 } 3738 3739 int 3740 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, 3741 boolean_t recursive, boolean_t temphold, int cleanup_fd) 3742 { 3743 struct dsl_ds_holdarg *ha; 3744 dsl_sync_task_t *dst; 3745 spa_t *spa; 3746 int error; 3747 minor_t minor = 0; 3748 3749 if (cleanup_fd != -1) { 3750 /* Currently we only support cleanup-on-exit of tempholds. */ 3751 if (!temphold) 3752 return (EINVAL); 3753 error = zfs_onexit_fd_hold(cleanup_fd, &minor); 3754 if (error) 3755 return (error); 3756 } 3757 3758 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3759 3760 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3761 3762 error = spa_open(dsname, &spa, FTAG); 3763 if (error) { 3764 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3765 if (cleanup_fd != -1) 3766 zfs_onexit_fd_rele(cleanup_fd); 3767 return (error); 3768 } 3769 3770 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 3771 ha->htag = htag; 3772 ha->snapname = snapname; 3773 ha->recursive = recursive; 3774 ha->temphold = temphold; 3775 3776 if (recursive) { 3777 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, 3778 ha, DS_FIND_CHILDREN); 3779 } else { 3780 error = dsl_dataset_user_hold_one(dsname, ha); 3781 } 3782 if (error == 0) 3783 error = dsl_sync_task_group_wait(ha->dstg); 3784 3785 for (dst = list_head(&ha->dstg->dstg_tasks); dst; 3786 dst = list_next(&ha->dstg->dstg_tasks, dst)) { 3787 dsl_dataset_t *ds = dst->dst_arg1; 3788 3789 if (dst->dst_err) { 3790 dsl_dataset_name(ds, ha->failed); 3791 *strchr(ha->failed, '@') = '\0'; 3792 } else if (error == 0 && minor != 0 && temphold) { 3793 /* 3794 * If this hold is to be released upon process exit, 3795 * register that action now. 
int
dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
    boolean_t recursive, boolean_t temphold, int cleanup_fd)
{
	struct dsl_ds_holdarg *ha;
	dsl_sync_task_t *dst;
	spa_t *spa;
	int error;
	minor_t minor = 0;

	if (cleanup_fd != -1) {
		/* Currently we only support cleanup-on-exit of tempholds. */
		if (!temphold)
			return (EINVAL);
		error = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (error)
			return (error);
	}

	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);

	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));

	error = spa_open(dsname, &spa, FTAG);
	if (error) {
		kmem_free(ha, sizeof (struct dsl_ds_holdarg));
		if (cleanup_fd != -1)
			zfs_onexit_fd_rele(cleanup_fd);
		return (error);
	}

	ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	ha->htag = htag;
	ha->snapname = snapname;
	ha->recursive = recursive;
	ha->temphold = temphold;

	if (recursive) {
		error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
		    ha, DS_FIND_CHILDREN);
	} else {
		error = dsl_dataset_user_hold_one(dsname, ha);
	}
	if (error == 0)
		error = dsl_sync_task_group_wait(ha->dstg);

	for (dst = list_head(&ha->dstg->dstg_tasks); dst;
	    dst = list_next(&ha->dstg->dstg_tasks, dst)) {
		dsl_dataset_t *ds = dst->dst_arg1;

		if (dst->dst_err) {
			dsl_dataset_name(ds, ha->failed);
			*strchr(ha->failed, '@') = '\0';
		} else if (error == 0 && minor != 0 && temphold) {
			/*
			 * If this hold is to be released upon process exit,
			 * register that action now.
			 */
			dsl_register_onexit_hold_cleanup(ds, htag, minor);
		}
		dsl_dataset_rele(ds, ha->dstg);
	}

	if (error == 0 && recursive && !ha->gotone)
		error = ENOENT;

	if (error)
		(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));

	dsl_sync_task_group_destroy(ha->dstg);

	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
	spa_close(spa, FTAG);
	if (cleanup_fd != -1)
		zfs_onexit_fd_rele(cleanup_fd);
	return (error);
}

struct dsl_ds_releasearg {
	dsl_dataset_t *ds;
	const char *htag;
	boolean_t own;		/* do we own or just hold ds? */
};

static int
dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
    boolean_t *might_destroy)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t zapobj;
	uint64_t tmp;
	int error;

	*might_destroy = B_FALSE;

	mutex_enter(&ds->ds_lock);
	zapobj = ds->ds_phys->ds_userrefs_obj;
	if (zapobj == 0) {
		/* The tag can't possibly exist */
		mutex_exit(&ds->ds_lock);
		return (ESRCH);
	}

	/* Make sure the tag exists */
	error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
	if (error) {
		mutex_exit(&ds->ds_lock);
		if (error == ENOENT)
			error = ESRCH;
		return (error);
	}

	if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
	    DS_IS_DEFER_DESTROY(ds))
		*might_destroy = B_TRUE;

	mutex_exit(&ds->ds_lock);
	return (0);
}
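/*
 * Note on the test above: for a snapshot, ds_num_children counts the
 * ds_next reference plus any clones, so ds_num_children == 1 means the
 * snapshot has no clones.  A release therefore "might destroy" the
 * snapshot only when it would drop the last user reference
 * (ds_userrefs == 1), no clone depends on it, and a deferred destroy
 * is already pending.
 */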
static int
dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
{
	struct dsl_ds_releasearg *ra = arg1;
	dsl_dataset_t *ds = ra->ds;
	boolean_t might_destroy;
	int error;

	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
		return (ENOTSUP);

	error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
	if (error)
		return (error);

	if (might_destroy) {
		struct dsl_ds_destroyarg dsda = {0};

		if (dmu_tx_is_syncing(tx)) {
			/*
			 * If we're not prepared to remove the snapshot,
			 * we can't allow the release to happen right now.
			 */
			if (!ra->own)
				return (EBUSY);
		}
		dsda.ds = ds;
		dsda.releasing = B_TRUE;
		return (dsl_dataset_destroy_check(&dsda, tag, tx));
	}

	return (0);
}

static void
dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
	struct dsl_ds_releasearg *ra = arg1;
	dsl_dataset_t *ds = ra->ds;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj;
	uint64_t dsobj = ds->ds_object;
	uint64_t refs;
	int error;

	mutex_enter(&ds->ds_lock);
	ds->ds_userrefs--;
	refs = ds->ds_userrefs;
	mutex_exit(&ds->ds_lock);
	error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
	VERIFY(error == 0 || error == ENOENT);
	zapobj = ds->ds_phys->ds_userrefs_obj;
	VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
	if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
	    DS_IS_DEFER_DESTROY(ds)) {
		struct dsl_ds_destroyarg dsda = {0};

		ASSERT(ra->own);
		dsda.ds = ds;
		dsda.releasing = B_TRUE;
		/* We already did the destroy_check */
		dsl_dataset_destroy_sync(&dsda, tag, tx);
	}

	spa_history_log_internal(LOG_DS_USER_RELEASE,
	    dp->dp_spa, tx, "<%s> %lld dataset = %llu",
	    ra->htag, (longlong_t)refs, dsobj);
}

static int
dsl_dataset_user_release_one(const char *dsname, void *arg)
{
	struct dsl_ds_holdarg *ha = arg;
	struct dsl_ds_releasearg *ra;
	dsl_dataset_t *ds;
	int error;
	void *dtag = ha->dstg;
	char *name;
	boolean_t own = B_FALSE;
	boolean_t might_destroy;

	/* alloc a buffer to hold dsname@snapname, plus the terminating NUL */
	name = kmem_asprintf("%s@%s", dsname, ha->snapname);
	error = dsl_dataset_hold(name, dtag, &ds);
	strfree(name);
	if (error == ENOENT && ha->recursive)
		return (0);
	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
	if (error)
		return (error);

	ha->gotone = B_TRUE;

	ASSERT(dsl_dataset_is_snapshot(ds));

	error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
	if (error) {
		dsl_dataset_rele(ds, dtag);
		return (error);
	}

	if (might_destroy) {
#ifdef _KERNEL
		name = kmem_asprintf("%s@%s", dsname, ha->snapname);
		error = zfs_unmount_snap(name, NULL);
		strfree(name);
		if (error) {
			dsl_dataset_rele(ds, dtag);
			return (error);
		}
#endif
		if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
			dsl_dataset_rele(ds, dtag);
			return (EBUSY);
		} else {
			own = B_TRUE;
			dsl_dataset_make_exclusive(ds, dtag);
		}
	}

	ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
	ra->ds = ds;
	ra->htag = ha->htag;
	ra->own = own;
	dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
	    dsl_dataset_user_release_sync, ra, dtag, 0);

	return (0);
}
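/*
 * Why dsl_dataset_user_release_one() takes ownership above: if the
 * release could drop the last reference to a defer-destroyed snapshot,
 * the sync task may end up calling dsl_dataset_destroy_sync(), which
 * requires exclusive ownership of the dataset.  Acquiring ownership
 * here, in open context, is what lets dsl_dataset_user_release_check()
 * fail with EBUSY (rather than block) if it finds ra->own == B_FALSE
 * in syncing context.
 */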
int
dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
    boolean_t recursive)
{
	struct dsl_ds_holdarg *ha;
	dsl_sync_task_t *dst;
	spa_t *spa;
	int error;

top:
	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);

	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));

	error = spa_open(dsname, &spa, FTAG);
	if (error) {
		kmem_free(ha, sizeof (struct dsl_ds_holdarg));
		return (error);
	}

	ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
	ha->htag = htag;
	ha->snapname = snapname;
	ha->recursive = recursive;
	if (recursive) {
		error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
		    ha, DS_FIND_CHILDREN);
	} else {
		error = dsl_dataset_user_release_one(dsname, ha);
	}
	if (error == 0)
		error = dsl_sync_task_group_wait(ha->dstg);

	for (dst = list_head(&ha->dstg->dstg_tasks); dst;
	    dst = list_next(&ha->dstg->dstg_tasks, dst)) {
		struct dsl_ds_releasearg *ra = dst->dst_arg1;
		dsl_dataset_t *ds = ra->ds;

		if (dst->dst_err)
			dsl_dataset_name(ds, ha->failed);

		if (ra->own)
			dsl_dataset_disown(ds, ha->dstg);
		else
			dsl_dataset_rele(ds, ha->dstg);

		kmem_free(ra, sizeof (struct dsl_ds_releasearg));
	}

	if (error == 0 && recursive && !ha->gotone)
		error = ENOENT;

	if (error && error != EBUSY)
		(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));

	dsl_sync_task_group_destroy(ha->dstg);
	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
	spa_close(spa, FTAG);

	/*
	 * We can get EBUSY if we were racing with deferred destroy and
	 * dsl_dataset_user_release_check() hadn't done the necessary
	 * open-context setup.  We can also get EBUSY if we're racing
	 * with destroy and that thread is the ds_owner.  Either way
	 * the busy condition should be transient, so retry the release
	 * operation.
	 */
	if (error == EBUSY)
		goto top;

	return (error);
}
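/*
 * Minimal usage sketch (hypothetical caller; "tank/fs", "snap" and
 * "mytag" are placeholders).  Note that dsname is written back on
 * failure, so it must be a writable buffer:
 *
 *	char dsname[MAXNAMELEN];
 *	(void) strlcpy(dsname, "tank/fs", sizeof (dsname));
 *	error = dsl_dataset_user_release(dsname, "snap", "mytag",
 *	    B_FALSE);
 */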
/*
 * Called at spa_load time (with retry == B_FALSE) to release a stale
 * temporary user hold.  Also called by the onexit code (with retry == B_TRUE).
 */
int
dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
    boolean_t retry)
{
	dsl_dataset_t *ds;
	char *snap;
	char *name;
	int namelen;
	int error;

	do {
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
		rw_exit(&dp->dp_config_rwlock);
		if (error)
			return (error);
		namelen = dsl_dataset_namelen(ds) + 1;
		name = kmem_alloc(namelen, KM_SLEEP);
		dsl_dataset_name(ds, name);
		dsl_dataset_rele(ds, FTAG);

		snap = strchr(name, '@');
		*snap = '\0';
		++snap;
		error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
		kmem_free(name, namelen);

		/*
		 * The object can't have been destroyed because we have a hold,
		 * but it might have been renamed, resulting in ENOENT.  Retry
		 * if we've been requested to do so.
		 *
		 * It would be nice if we could use the dsobj all the way
		 * through and avoid ENOENT entirely.  But we might need to
		 * unmount the snapshot, and there's currently no way to look
		 * up a vfsp using a ZFS object id.
		 */
	} while ((error == ENOENT) && retry);

	return (error);
}

int
dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
{
	dsl_dataset_t *ds;
	int err;

	err = dsl_dataset_hold(dsname, FTAG, &ds);
	if (err)
		return (err);

	VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
	if (ds->ds_phys->ds_userrefs_obj != 0) {
		zap_attribute_t *za;
		zap_cursor_t zc;

		za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
		for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
		    ds->ds_phys->ds_userrefs_obj);
		    zap_cursor_retrieve(&zc, za) == 0;
		    zap_cursor_advance(&zc)) {
			VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
			    za->za_first_integer));
		}
		zap_cursor_fini(&zc);
		kmem_free(za, sizeof (zap_attribute_t));
	}
	dsl_dataset_rele(ds, FTAG);
	return (0);
}
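/*
 * Sketch of consuming dsl_dataset_get_holds() output (hypothetical
 * caller): the returned nvlist maps each hold tag to the time the hold
 * was taken, as stored by dsl_dataset_user_hold_sync() above.
 *
 *	nvlist_t *holds;
 *	if (dsl_dataset_get_holds("tank/fs@snap", &holds) == 0) {
 *		nvpair_t *pair = NULL;
 *		while ((pair = nvlist_next_nvpair(holds, pair)) != NULL)
 *			dprintf("hold: %s\n", nvpair_name(pair));
 *		nvlist_free(holds);
 *	}
 */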
/*
 * Note: this function is used as the callback for dmu_objset_find().  We
 * always return 0 so that we will continue to find and process
 * inconsistent datasets, even if we encounter an error trying to
 * process one of them.
 */
/* ARGSUSED */
int
dsl_destroy_inconsistent(const char *dsname, void *arg)
{
	dsl_dataset_t *ds;

	if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
		if (DS_IS_INCONSISTENT(ds))
			(void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
		else
			dsl_dataset_disown(ds, FTAG);
	}
	return (0);
}

/*
 * Return (in *usedp) the amount of space written in new that is not
 * present in oldsnap.  New may be a snapshot or the head.  Old must be
 * a snapshot before new, in new's filesystem (or its origin).  If not,
 * fail and return EINVAL.
 *
 * The written space is calculated by considering two components:  First, we
 * ignore any freed space, and calculate the written space as new's used space
 * minus old's used space.  Next, we add in the amount of space that was freed
 * between the two snapshots, thus reducing new's used space relative to old's.
 * Specifically, this is the space that was born before old->ds_creation_txg,
 * and freed before new (i.e., it is on new's deadlist or a previous deadlist).
 *
 *	space freed               [-----------------]
 *	snapshots  ---O-------O--------O-------O------
 *	                   oldsnap            new
 */
int
dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	int err = 0;
	uint64_t snapobj;
	dsl_pool_t *dp = new->ds_dir->dd_pool;

	*usedp = 0;
	*usedp += new->ds_phys->ds_referenced_bytes;
	*usedp -= oldsnap->ds_phys->ds_referenced_bytes;

	*compp = 0;
	*compp += new->ds_phys->ds_compressed_bytes;
	*compp -= oldsnap->ds_phys->ds_compressed_bytes;

	*uncompp = 0;
	*uncompp += new->ds_phys->ds_uncompressed_bytes;
	*uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	snapobj = new->ds_object;
	while (snapobj != oldsnap->ds_object) {
		dsl_dataset_t *snap;
		uint64_t used, comp, uncomp;

		if (snapobj == new->ds_object) {
			snap = new;
		} else {
			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
			if (err != 0)
				break;
		}

		if (snap->ds_phys->ds_prev_snap_txg ==
		    oldsnap->ds_phys->ds_creation_txg) {
			/*
			 * The blocks in the deadlist cannot be born after
			 * ds_prev_snap_txg, so get the whole deadlist space,
			 * which is more efficient (especially for old-format
			 * deadlists).  Unfortunately the deadlist code
			 * doesn't have enough information to make this
			 * optimization itself.
			 */
			dsl_deadlist_space(&snap->ds_deadlist,
			    &used, &comp, &uncomp);
		} else {
			dsl_deadlist_space_range(&snap->ds_deadlist,
			    0, oldsnap->ds_phys->ds_creation_txg,
			    &used, &comp, &uncomp);
		}
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;

		/*
		 * If we get to the beginning of the chain of snapshots
		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
		 * was not a snapshot of/before new.
		 */
		snapobj = snap->ds_phys->ds_prev_snap_obj;
		if (snap != new)
			dsl_dataset_rele(snap, FTAG);
		if (snapobj == 0) {
			err = EINVAL;
			break;
		}
	}
	rw_exit(&dp->dp_config_rwlock);
	return (err);
}
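/*
 * Worked example for the calculation above (made-up numbers): if
 * oldsnap references 10G, new references 12G, and 3G of data that
 * existed at oldsnap was freed before new, then the first component
 * gives 12G - 10G = 2G, and the deadlist walk adds the 3G back, so
 * *usedp = 5G, the true amount written between the two snapshots
 * (12G - (10G - 3G)).
 */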
/*
 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
 * lastsnap, and all snapshots in between are deleted.
 *
 *	blocks that would be freed    [---------------------------]
 *	snapshots  ---O-------O--------O-------O--------O
 *	                   firstsnap          lastsnap
 *
 * This is the set of blocks that were born after the snap before firstsnap
 * (birth > firstsnap->prev_snap_txg) and died before the snap after lastsnap
 * (i.e., they are on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
 * We calculate this by iterating over the relevant deadlists (from the snap
 * after lastsnap, backward to the snap after firstsnap), summing up the
 * space on each deadlist that was born after the snap before firstsnap.
 */
int
dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
    dsl_dataset_t *lastsnap,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	int err = 0;
	uint64_t snapobj;
	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;

	ASSERT(dsl_dataset_is_snapshot(firstsnap));
	ASSERT(dsl_dataset_is_snapshot(lastsnap));

	/*
	 * Check that the snapshots are in the same dsl_dir, and firstsnap
	 * is before lastsnap.
	 */
	if (firstsnap->ds_dir != lastsnap->ds_dir ||
	    firstsnap->ds_phys->ds_creation_txg >
	    lastsnap->ds_phys->ds_creation_txg)
		return (EINVAL);

	*usedp = *compp = *uncompp = 0;

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	snapobj = lastsnap->ds_phys->ds_next_snap_obj;
	while (snapobj != firstsnap->ds_object) {
		dsl_dataset_t *ds;
		uint64_t used, comp, uncomp;

		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
		if (err != 0)
			break;

		dsl_deadlist_space_range(&ds->ds_deadlist,
		    firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
		    &used, &comp, &uncomp);
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;

		snapobj = ds->ds_phys->ds_prev_snap_obj;
		ASSERT3U(snapobj, !=, 0);
		dsl_dataset_rele(ds, FTAG);
	}
	rw_exit(&dp->dp_config_rwlock);
	return (err);
}
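/*
 * Worked example (made-up numbers): with snapshots A, B, C, D in
 * creation order, firstsnap = B and lastsnap = C, the loop above
 * starts at D (the snap after lastsnap) and visits the deadlists of
 * D and C, counting only blocks born after A (firstsnap's previous
 * snapshot).  If D's deadlist holds 4G of such blocks and C's holds
 * 1G, *usedp ends up at 5G, the space a destroy of B through C would
 * reclaim.
 */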