Print this page
FAR: generating send-streams in portable format
This commit adds a switch '-F' to zfs send. With this switch set, zfs send
generates a stream in FAR format instead of the traditional zfs stream format.
The generated send stream is compatible with the stream generated by
'btrfs send' and can, in principle, easily be received into any filesystem.
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ new/usr/src/uts/common/fs/zfs/dsl_dataset.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2012 by Delphix. All rights reserved.
24 24 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
25 25 */
26 26
27 27 #include <sys/dmu_objset.h>
28 28 #include <sys/dsl_dataset.h>
29 29 #include <sys/dsl_dir.h>
30 30 #include <sys/dsl_prop.h>
31 31 #include <sys/dsl_synctask.h>
32 32 #include <sys/dmu_traverse.h>
33 33 #include <sys/dmu_impl.h>
34 34 #include <sys/dmu_tx.h>
35 35 #include <sys/arc.h>
36 36 #include <sys/zio.h>
37 37 #include <sys/zap.h>
38 38 #include <sys/zfeature.h>
39 39 #include <sys/unique.h>
40 40 #include <sys/zfs_context.h>
41 41 #include <sys/zfs_ioctl.h>
42 42 #include <sys/spa.h>
43 43 #include <sys/zfs_znode.h>
44 44 #include <sys/zfs_onexit.h>
45 45 #include <sys/zvol.h>
46 46 #include <sys/dsl_scan.h>
47 47 #include <sys/dsl_deadlist.h>
48 48
49 49 static char *dsl_reaper = "the grim reaper";
50 50
51 51 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
52 52 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
53 53 static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
54 54
55 55 #define SWITCH64(x, y) \
56 56 { \
57 57 uint64_t __tmp = (x); \
58 58 (x) = (y); \
59 59 (y) = __tmp; \
60 60 }
61 61
62 62 #define DS_REF_MAX (1ULL << 62)
63 63
64 64 #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
65 65
66 66 #define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper)
67 67
68 68
69 69 /*
70 70 * Figure out how much of this delta should be propogated to the dsl_dir
71 71 * layer. If there's a refreservation, that space has already been
72 72 * partially accounted for in our ancestors.
73 73 */
74 74 static int64_t
75 75 parent_delta(dsl_dataset_t *ds, int64_t delta)
76 76 {
77 77 uint64_t old_bytes, new_bytes;
78 78
79 79 if (ds->ds_reserved == 0)
80 80 return (delta);
81 81
82 82 old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
83 83 new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
84 84
85 85 ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
86 86 return (new_bytes - old_bytes);
87 87 }
88 88
89 89 void
90 90 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
91 91 {
92 92 int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
93 93 int compressed = BP_GET_PSIZE(bp);
94 94 int uncompressed = BP_GET_UCSIZE(bp);
95 95 int64_t delta;
96 96
97 97 dprintf_bp(bp, "ds=%p", ds);
98 98
99 99 ASSERT(dmu_tx_is_syncing(tx));
100 100 /* It could have been compressed away to nothing */
101 101 if (BP_IS_HOLE(bp))
102 102 return;
103 103 ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
104 104 ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
105 105 if (ds == NULL) {
106 106 dsl_pool_mos_diduse_space(tx->tx_pool,
107 107 used, compressed, uncompressed);
108 108 return;
109 109 }
110 110 dmu_buf_will_dirty(ds->ds_dbuf, tx);
111 111
112 112 mutex_enter(&ds->ds_dir->dd_lock);
113 113 mutex_enter(&ds->ds_lock);
114 114 delta = parent_delta(ds, used);
115 115 ds->ds_phys->ds_referenced_bytes += used;
116 116 ds->ds_phys->ds_compressed_bytes += compressed;
117 117 ds->ds_phys->ds_uncompressed_bytes += uncompressed;
118 118 ds->ds_phys->ds_unique_bytes += used;
119 119 mutex_exit(&ds->ds_lock);
120 120 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
121 121 compressed, uncompressed, tx);
122 122 dsl_dir_transfer_space(ds->ds_dir, used - delta,
123 123 DD_USED_REFRSRV, DD_USED_HEAD, tx);
124 124 mutex_exit(&ds->ds_dir->dd_lock);
125 125 }
126 126
127 127 int
128 128 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
129 129 boolean_t async)
130 130 {
131 131 if (BP_IS_HOLE(bp))
132 132 return (0);
133 133
134 134 ASSERT(dmu_tx_is_syncing(tx));
135 135 ASSERT(bp->blk_birth <= tx->tx_txg);
136 136
137 137 int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
138 138 int compressed = BP_GET_PSIZE(bp);
139 139 int uncompressed = BP_GET_UCSIZE(bp);
140 140
141 141 ASSERT(used > 0);
142 142 if (ds == NULL) {
143 143 dsl_free(tx->tx_pool, tx->tx_txg, bp);
144 144 dsl_pool_mos_diduse_space(tx->tx_pool,
145 145 -used, -compressed, -uncompressed);
146 146 return (used);
147 147 }
148 148 ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
149 149
150 150 ASSERT(!dsl_dataset_is_snapshot(ds));
151 151 dmu_buf_will_dirty(ds->ds_dbuf, tx);
152 152
153 153 if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
154 154 int64_t delta;
155 155
156 156 dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
157 157 dsl_free(tx->tx_pool, tx->tx_txg, bp);
158 158
159 159 mutex_enter(&ds->ds_dir->dd_lock);
160 160 mutex_enter(&ds->ds_lock);
161 161 ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
162 162 !DS_UNIQUE_IS_ACCURATE(ds));
163 163 delta = parent_delta(ds, -used);
164 164 ds->ds_phys->ds_unique_bytes -= used;
165 165 mutex_exit(&ds->ds_lock);
166 166 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
167 167 delta, -compressed, -uncompressed, tx);
168 168 dsl_dir_transfer_space(ds->ds_dir, -used - delta,
169 169 DD_USED_REFRSRV, DD_USED_HEAD, tx);
170 170 mutex_exit(&ds->ds_dir->dd_lock);
171 171 } else {
172 172 dprintf_bp(bp, "putting on dead list: %s", "");
173 173 if (async) {
174 174 /*
175 175 * We are here as part of zio's write done callback,
176 176 * which means we're a zio interrupt thread. We can't
177 177 * call dsl_deadlist_insert() now because it may block
178 178 * waiting for I/O. Instead, put bp on the deferred
179 179 * queue and let dsl_pool_sync() finish the job.
180 180 */
181 181 bplist_append(&ds->ds_pending_deadlist, bp);
182 182 } else {
183 183 dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
184 184 }
185 185 ASSERT3U(ds->ds_prev->ds_object, ==,
186 186 ds->ds_phys->ds_prev_snap_obj);
187 187 ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
188 188 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
189 189 if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
190 190 ds->ds_object && bp->blk_birth >
191 191 ds->ds_prev->ds_phys->ds_prev_snap_txg) {
192 192 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
193 193 mutex_enter(&ds->ds_prev->ds_lock);
194 194 ds->ds_prev->ds_phys->ds_unique_bytes += used;
195 195 mutex_exit(&ds->ds_prev->ds_lock);
196 196 }
197 197 if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
198 198 dsl_dir_transfer_space(ds->ds_dir, used,
199 199 DD_USED_HEAD, DD_USED_SNAP, tx);
200 200 }
201 201 }
202 202 mutex_enter(&ds->ds_lock);
203 203 ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
204 204 ds->ds_phys->ds_referenced_bytes -= used;
205 205 ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
206 206 ds->ds_phys->ds_compressed_bytes -= compressed;
207 207 ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
208 208 ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
209 209 mutex_exit(&ds->ds_lock);
210 210
211 211 return (used);
212 212 }
213 213
214 214 uint64_t
215 215 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
216 216 {
217 217 uint64_t trysnap = 0;
218 218
219 219 if (ds == NULL)
220 220 return (0);
221 221 /*
222 222 * The snapshot creation could fail, but that would cause an
223 223 * incorrect FALSE return, which would only result in an
224 224 * overestimation of the amount of space that an operation would
225 225 * consume, which is OK.
226 226 *
227 227 * There's also a small window where we could miss a pending
228 228 * snapshot, because we could set the sync task in the quiescing
229 229 * phase. So this should only be used as a guess.
230 230 */
231 231 if (ds->ds_trysnap_txg >
232 232 spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
233 233 trysnap = ds->ds_trysnap_txg;
234 234 return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
235 235 }
236 236
237 237 boolean_t
238 238 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
239 239 uint64_t blk_birth)
240 240 {
241 241 if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
242 242 return (B_FALSE);
243 243
244 244 ddt_prefetch(dsl_dataset_get_spa(ds), bp);
245 245
246 246 return (B_TRUE);
247 247 }
248 248
249 249 /* ARGSUSED */
250 250 static void
251 251 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
252 252 {
253 253 dsl_dataset_t *ds = dsv;
254 254
255 255 ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
256 256
257 257 unique_remove(ds->ds_fsid_guid);
258 258
259 259 if (ds->ds_objset != NULL)
260 260 dmu_objset_evict(ds->ds_objset);
261 261
262 262 if (ds->ds_prev) {
263 263 dsl_dataset_drop_ref(ds->ds_prev, ds);
264 264 ds->ds_prev = NULL;
265 265 }
266 266
267 267 bplist_destroy(&ds->ds_pending_deadlist);
268 268 if (db != NULL) {
269 269 dsl_deadlist_close(&ds->ds_deadlist);
270 270 } else {
271 271 ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
272 272 ASSERT(!ds->ds_deadlist.dl_oldfmt);
273 273 }
274 274 if (ds->ds_dir)
275 275 dsl_dir_close(ds->ds_dir, ds);
276 276
277 277 ASSERT(!list_link_active(&ds->ds_synced_link));
278 278
279 279 mutex_destroy(&ds->ds_lock);
280 280 mutex_destroy(&ds->ds_recvlock);
281 281 mutex_destroy(&ds->ds_opening_lock);
282 282 rw_destroy(&ds->ds_rwlock);
283 283 cv_destroy(&ds->ds_exclusive_cv);
284 284
285 285 kmem_free(ds, sizeof (dsl_dataset_t));
286 286 }
287 287
288 288 static int
289 289 dsl_dataset_get_snapname(dsl_dataset_t *ds)
290 290 {
291 291 dsl_dataset_phys_t *headphys;
292 292 int err;
293 293 dmu_buf_t *headdbuf;
294 294 dsl_pool_t *dp = ds->ds_dir->dd_pool;
295 295 objset_t *mos = dp->dp_meta_objset;
296 296
297 297 if (ds->ds_snapname[0])
298 298 return (0);
299 299 if (ds->ds_phys->ds_next_snap_obj == 0)
300 300 return (0);
301 301
302 302 err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
303 303 FTAG, &headdbuf);
304 304 if (err)
305 305 return (err);
306 306 headphys = headdbuf->db_data;
307 307 err = zap_value_search(dp->dp_meta_objset,
308 308 headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
309 309 dmu_buf_rele(headdbuf, FTAG);
310 310 return (err);
311 311 }
312 312
313 313 static int
314 314 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
315 315 {
316 316 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
317 317 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
318 318 matchtype_t mt;
319 319 int err;
320 320
321 321 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
322 322 mt = MT_FIRST;
323 323 else
324 324 mt = MT_EXACT;
325 325
326 326 err = zap_lookup_norm(mos, snapobj, name, 8, 1,
327 327 value, mt, NULL, 0, NULL);
328 328 if (err == ENOTSUP && mt == MT_FIRST)
329 329 err = zap_lookup(mos, snapobj, name, 8, 1, value);
330 330 return (err);
331 331 }
332 332
333 333 static int
334 334 dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
335 335 {
336 336 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
337 337 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
338 338 matchtype_t mt;
339 339 int err;
340 340
341 341 dsl_dir_snap_cmtime_update(ds->ds_dir);
342 342
343 343 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
344 344 mt = MT_FIRST;
345 345 else
346 346 mt = MT_EXACT;
347 347
348 348 err = zap_remove_norm(mos, snapobj, name, mt, tx);
349 349 if (err == ENOTSUP && mt == MT_FIRST)
350 350 err = zap_remove(mos, snapobj, name, tx);
351 351 return (err);
352 352 }
353 353
354 354 static int
355 355 dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
356 356 dsl_dataset_t **dsp)
357 357 {
358 358 objset_t *mos = dp->dp_meta_objset;
359 359 dmu_buf_t *dbuf;
360 360 dsl_dataset_t *ds;
361 361 int err;
362 362 dmu_object_info_t doi;
363 363
364 364 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
365 365 dsl_pool_sync_context(dp));
366 366
367 367 err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
368 368 if (err)
369 369 return (err);
370 370
371 371 /* Make sure dsobj has the correct object type. */
372 372 dmu_object_info_from_db(dbuf, &doi);
373 373 if (doi.doi_type != DMU_OT_DSL_DATASET)
374 374 return (EINVAL);
375 375
376 376 ds = dmu_buf_get_user(dbuf);
377 377 if (ds == NULL) {
378 378 dsl_dataset_t *winner;
379 379
380 380 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
381 381 ds->ds_dbuf = dbuf;
382 382 ds->ds_object = dsobj;
383 383 ds->ds_phys = dbuf->db_data;
384 384
385 385 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
386 386 mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
387 387 mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
388 388 mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
389 389
390 390 rw_init(&ds->ds_rwlock, 0, 0, 0);
391 391 cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
392 392
393 393 bplist_create(&ds->ds_pending_deadlist);
394 394 dsl_deadlist_open(&ds->ds_deadlist,
395 395 mos, ds->ds_phys->ds_deadlist_obj);
396 396
397 397 list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
398 398 offsetof(dmu_sendarg_t, dsa_link));
399 399
400 400 if (err == 0) {
401 401 err = dsl_dir_open_obj(dp,
402 402 ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
403 403 }
404 404 if (err) {
405 405 mutex_destroy(&ds->ds_lock);
406 406 mutex_destroy(&ds->ds_recvlock);
407 407 mutex_destroy(&ds->ds_opening_lock);
408 408 rw_destroy(&ds->ds_rwlock);
409 409 cv_destroy(&ds->ds_exclusive_cv);
410 410 bplist_destroy(&ds->ds_pending_deadlist);
411 411 dsl_deadlist_close(&ds->ds_deadlist);
412 412 kmem_free(ds, sizeof (dsl_dataset_t));
413 413 dmu_buf_rele(dbuf, tag);
414 414 return (err);
415 415 }
416 416
417 417 if (!dsl_dataset_is_snapshot(ds)) {
418 418 ds->ds_snapname[0] = '\0';
419 419 if (ds->ds_phys->ds_prev_snap_obj) {
420 420 err = dsl_dataset_get_ref(dp,
421 421 ds->ds_phys->ds_prev_snap_obj,
422 422 ds, &ds->ds_prev);
423 423 }
424 424 } else {
425 425 if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
426 426 err = dsl_dataset_get_snapname(ds);
427 427 if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
428 428 err = zap_count(
429 429 ds->ds_dir->dd_pool->dp_meta_objset,
430 430 ds->ds_phys->ds_userrefs_obj,
431 431 &ds->ds_userrefs);
432 432 }
433 433 }
434 434
435 435 if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
436 436 /*
437 437 * In sync context, we're called with either no lock
438 438 * or with the write lock. If we're not syncing,
439 439 * we're always called with the read lock held.
440 440 */
441 441 boolean_t need_lock =
442 442 !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
443 443 dsl_pool_sync_context(dp);
444 444
445 445 if (need_lock)
446 446 rw_enter(&dp->dp_config_rwlock, RW_READER);
447 447
448 448 err = dsl_prop_get_ds(ds,
449 449 "refreservation", sizeof (uint64_t), 1,
450 450 &ds->ds_reserved, NULL);
451 451 if (err == 0) {
452 452 err = dsl_prop_get_ds(ds,
453 453 "refquota", sizeof (uint64_t), 1,
454 454 &ds->ds_quota, NULL);
455 455 }
456 456
457 457 if (need_lock)
458 458 rw_exit(&dp->dp_config_rwlock);
459 459 } else {
460 460 ds->ds_reserved = ds->ds_quota = 0;
461 461 }
462 462
463 463 if (err == 0) {
464 464 winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
465 465 dsl_dataset_evict);
466 466 }
467 467 if (err || winner) {
468 468 bplist_destroy(&ds->ds_pending_deadlist);
469 469 dsl_deadlist_close(&ds->ds_deadlist);
470 470 if (ds->ds_prev)
471 471 dsl_dataset_drop_ref(ds->ds_prev, ds);
472 472 dsl_dir_close(ds->ds_dir, ds);
473 473 mutex_destroy(&ds->ds_lock);
474 474 mutex_destroy(&ds->ds_recvlock);
475 475 mutex_destroy(&ds->ds_opening_lock);
476 476 rw_destroy(&ds->ds_rwlock);
477 477 cv_destroy(&ds->ds_exclusive_cv);
478 478 kmem_free(ds, sizeof (dsl_dataset_t));
479 479 if (err) {
480 480 dmu_buf_rele(dbuf, tag);
481 481 return (err);
482 482 }
483 483 ds = winner;
484 484 } else {
485 485 ds->ds_fsid_guid =
486 486 unique_insert(ds->ds_phys->ds_fsid_guid);
487 487 }
488 488 }
489 489 ASSERT3P(ds->ds_dbuf, ==, dbuf);
490 490 ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
491 491 ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
492 492 spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
493 493 dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
494 494 mutex_enter(&ds->ds_lock);
495 495 if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
496 496 mutex_exit(&ds->ds_lock);
497 497 dmu_buf_rele(ds->ds_dbuf, tag);
498 498 return (ENOENT);
499 499 }
500 500 mutex_exit(&ds->ds_lock);
501 501 *dsp = ds;
502 502 return (0);
503 503 }
504 504
505 505 static int
506 506 dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
507 507 {
508 508 dsl_pool_t *dp = ds->ds_dir->dd_pool;
509 509
510 510 /*
511 511 * In syncing context we don't want the rwlock lock: there
512 512 * may be an existing writer waiting for sync phase to
513 513 * finish. We don't need to worry about such writers, since
514 514 * sync phase is single-threaded, so the writer can't be
515 515 * doing anything while we are active.
516 516 */
517 517 if (dsl_pool_sync_context(dp)) {
518 518 ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
519 519 return (0);
520 520 }
521 521
522 522 /*
523 523 * Normal users will hold the ds_rwlock as a READER until they
524 524 * are finished (i.e., call dsl_dataset_rele()). "Owners" will
525 525 * drop their READER lock after they set the ds_owner field.
526 526 *
527 527 * If the dataset is being destroyed, the destroy thread will
528 528 * obtain a WRITER lock for exclusive access after it's done its
529 529 * open-context work and then change the ds_owner to
530 530 * dsl_reaper once destruction is assured. So threads
531 531 * may block here temporarily, until the "destructability" of
532 532 * the dataset is determined.
533 533 */
534 534 ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
535 535 mutex_enter(&ds->ds_lock);
536 536 while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
537 537 rw_exit(&dp->dp_config_rwlock);
538 538 cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
539 539 if (DSL_DATASET_IS_DESTROYED(ds)) {
540 540 mutex_exit(&ds->ds_lock);
541 541 dsl_dataset_drop_ref(ds, tag);
542 542 rw_enter(&dp->dp_config_rwlock, RW_READER);
543 543 return (ENOENT);
544 544 }
545 545 /*
546 546 * The dp_config_rwlock lives above the ds_lock. And
547 547 * we need to check DSL_DATASET_IS_DESTROYED() while
548 548 * holding the ds_lock, so we have to drop and reacquire
549 549 * the ds_lock here.
550 550 */
551 551 mutex_exit(&ds->ds_lock);
552 552 rw_enter(&dp->dp_config_rwlock, RW_READER);
553 553 mutex_enter(&ds->ds_lock);
554 554 }
555 555 mutex_exit(&ds->ds_lock);
556 556 return (0);
557 557 }
558 558
559 559 int
560 560 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
561 561 dsl_dataset_t **dsp)
562 562 {
563 563 int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
564 564
565 565 if (err)
566 566 return (err);
567 567 return (dsl_dataset_hold_ref(*dsp, tag));
568 568 }
569 569
570 570 int
571 571 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
572 572 void *tag, dsl_dataset_t **dsp)
573 573 {
574 574 int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
575 575 if (err)
576 576 return (err);
577 577 if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
578 578 dsl_dataset_rele(*dsp, tag);
579 579 *dsp = NULL;
580 580 return (EBUSY);
581 581 }
582 582 return (0);
583 583 }
584 584
585 585 int
586 586 dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
587 587 {
588 588 dsl_dir_t *dd;
589 589 dsl_pool_t *dp;
590 590 const char *snapname;
591 591 uint64_t obj;
592 592 int err = 0;
593 593
594 594 err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
595 595 if (err)
596 596 return (err);
597 597
598 598 dp = dd->dd_pool;
599 599 obj = dd->dd_phys->dd_head_dataset_obj;
600 600 rw_enter(&dp->dp_config_rwlock, RW_READER);
601 601 if (obj)
602 602 err = dsl_dataset_get_ref(dp, obj, tag, dsp);
603 603 else
604 604 err = ENOENT;
605 605 if (err)
606 606 goto out;
607 607
608 608 err = dsl_dataset_hold_ref(*dsp, tag);
609 609
610 610 /* we may be looking for a snapshot */
611 611 if (err == 0 && snapname != NULL) {
612 612 dsl_dataset_t *ds = NULL;
613 613
614 614 if (*snapname++ != '@') {
615 615 dsl_dataset_rele(*dsp, tag);
616 616 err = ENOENT;
617 617 goto out;
618 618 }
619 619
620 620 dprintf("looking for snapshot '%s'\n", snapname);
621 621 err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
622 622 if (err == 0)
623 623 err = dsl_dataset_get_ref(dp, obj, tag, &ds);
624 624 dsl_dataset_rele(*dsp, tag);
625 625
626 626 ASSERT3U((err == 0), ==, (ds != NULL));
627 627
628 628 if (ds) {
629 629 mutex_enter(&ds->ds_lock);
630 630 if (ds->ds_snapname[0] == 0)
631 631 (void) strlcpy(ds->ds_snapname, snapname,
632 632 sizeof (ds->ds_snapname));
633 633 mutex_exit(&ds->ds_lock);
634 634 err = dsl_dataset_hold_ref(ds, tag);
635 635 *dsp = err ? NULL : ds;
636 636 }
637 637 }
638 638 out:
639 639 rw_exit(&dp->dp_config_rwlock);
640 640 dsl_dir_close(dd, FTAG);
641 641 return (err);
642 642 }
643 643
644 644 int
645 645 dsl_dataset_own(const char *name, boolean_t inconsistentok,
646 646 void *tag, dsl_dataset_t **dsp)
647 647 {
648 648 int err = dsl_dataset_hold(name, tag, dsp);
649 649 if (err)
650 650 return (err);
651 651 if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
652 652 dsl_dataset_rele(*dsp, tag);
653 653 return (EBUSY);
654 654 }
655 655 return (0);
656 656 }
657 657
658 658 void
659 659 dsl_dataset_name(dsl_dataset_t *ds, char *name)
660 660 {
661 661 if (ds == NULL) {
662 662 (void) strcpy(name, "mos");
663 663 } else {
664 664 dsl_dir_name(ds->ds_dir, name);
665 665 VERIFY(0 == dsl_dataset_get_snapname(ds));
666 666 if (ds->ds_snapname[0]) {
667 667 (void) strcat(name, "@");
668 668 /*
669 669 * We use a "recursive" mutex so that we
670 670 * can call dprintf_ds() with ds_lock held.
671 671 */
672 672 if (!MUTEX_HELD(&ds->ds_lock)) {
↓ open down ↓ |
672 lines elided |
↑ open up ↑ |
673 673 mutex_enter(&ds->ds_lock);
674 674 (void) strcat(name, ds->ds_snapname);
675 675 mutex_exit(&ds->ds_lock);
676 676 } else {
677 677 (void) strcat(name, ds->ds_snapname);
678 678 }
679 679 }
680 680 }
681 681 }
682 682
683 -static int
683 +int
684 684 dsl_dataset_namelen(dsl_dataset_t *ds)
685 685 {
686 686 int result;
687 687
688 688 if (ds == NULL) {
689 689 result = 3; /* "mos" */
690 690 } else {
691 691 result = dsl_dir_namelen(ds->ds_dir);
692 692 VERIFY(0 == dsl_dataset_get_snapname(ds));
693 693 if (ds->ds_snapname[0]) {
694 694 ++result; /* adding one for the @-sign */
695 695 if (!MUTEX_HELD(&ds->ds_lock)) {
696 696 mutex_enter(&ds->ds_lock);
697 697 result += strlen(ds->ds_snapname);
698 698 mutex_exit(&ds->ds_lock);
699 699 } else {
700 700 result += strlen(ds->ds_snapname);
701 701 }
702 702 }
703 703 }
704 704
705 705 return (result);
706 706 }
707 707
708 708 void
709 709 dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
710 710 {
711 711 dmu_buf_rele(ds->ds_dbuf, tag);
712 712 }
713 713
714 714 void
715 715 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
716 716 {
717 717 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
718 718 rw_exit(&ds->ds_rwlock);
719 719 }
720 720 dsl_dataset_drop_ref(ds, tag);
721 721 }
722 722
723 723 void
724 724 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
725 725 {
726 726 ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
727 727 (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
728 728
729 729 mutex_enter(&ds->ds_lock);
730 730 ds->ds_owner = NULL;
731 731 if (RW_WRITE_HELD(&ds->ds_rwlock)) {
732 732 rw_exit(&ds->ds_rwlock);
733 733 cv_broadcast(&ds->ds_exclusive_cv);
734 734 }
735 735 mutex_exit(&ds->ds_lock);
736 736 if (ds->ds_dbuf)
737 737 dsl_dataset_drop_ref(ds, tag);
738 738 else
739 739 dsl_dataset_evict(NULL, ds);
740 740 }
741 741
742 742 boolean_t
743 743 dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
744 744 {
745 745 boolean_t gotit = FALSE;
746 746
747 747 mutex_enter(&ds->ds_lock);
748 748 if (ds->ds_owner == NULL &&
749 749 (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
750 750 ds->ds_owner = tag;
751 751 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
752 752 rw_exit(&ds->ds_rwlock);
753 753 gotit = TRUE;
754 754 }
755 755 mutex_exit(&ds->ds_lock);
756 756 return (gotit);
757 757 }
758 758
759 759 void
760 760 dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
761 761 {
762 762 ASSERT3P(owner, ==, ds->ds_owner);
763 763 if (!RW_WRITE_HELD(&ds->ds_rwlock))
764 764 rw_enter(&ds->ds_rwlock, RW_WRITER);
765 765 }
766 766
767 767 uint64_t
768 768 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
769 769 uint64_t flags, dmu_tx_t *tx)
770 770 {
771 771 dsl_pool_t *dp = dd->dd_pool;
772 772 dmu_buf_t *dbuf;
773 773 dsl_dataset_phys_t *dsphys;
774 774 uint64_t dsobj;
775 775 objset_t *mos = dp->dp_meta_objset;
776 776
777 777 if (origin == NULL)
778 778 origin = dp->dp_origin_snap;
779 779
780 780 ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
781 781 ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
782 782 ASSERT(dmu_tx_is_syncing(tx));
783 783 ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
784 784
785 785 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
786 786 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
787 787 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
788 788 dmu_buf_will_dirty(dbuf, tx);
789 789 dsphys = dbuf->db_data;
790 790 bzero(dsphys, sizeof (dsl_dataset_phys_t));
791 791 dsphys->ds_dir_obj = dd->dd_object;
792 792 dsphys->ds_flags = flags;
793 793 dsphys->ds_fsid_guid = unique_create();
794 794 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
795 795 sizeof (dsphys->ds_guid));
796 796 dsphys->ds_snapnames_zapobj =
797 797 zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
798 798 DMU_OT_NONE, 0, tx);
799 799 dsphys->ds_creation_time = gethrestime_sec();
800 800 dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
801 801
802 802 if (origin == NULL) {
803 803 dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
804 804 } else {
805 805 dsl_dataset_t *ohds;
806 806
807 807 dsphys->ds_prev_snap_obj = origin->ds_object;
808 808 dsphys->ds_prev_snap_txg =
809 809 origin->ds_phys->ds_creation_txg;
810 810 dsphys->ds_referenced_bytes =
811 811 origin->ds_phys->ds_referenced_bytes;
812 812 dsphys->ds_compressed_bytes =
813 813 origin->ds_phys->ds_compressed_bytes;
814 814 dsphys->ds_uncompressed_bytes =
815 815 origin->ds_phys->ds_uncompressed_bytes;
816 816 dsphys->ds_bp = origin->ds_phys->ds_bp;
817 817 dsphys->ds_flags |= origin->ds_phys->ds_flags;
818 818
819 819 dmu_buf_will_dirty(origin->ds_dbuf, tx);
820 820 origin->ds_phys->ds_num_children++;
821 821
822 822 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
823 823 origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
824 824 dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
825 825 dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
826 826 dsl_dataset_rele(ohds, FTAG);
827 827
828 828 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
829 829 if (origin->ds_phys->ds_next_clones_obj == 0) {
830 830 origin->ds_phys->ds_next_clones_obj =
831 831 zap_create(mos,
832 832 DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
833 833 }
834 834 VERIFY(0 == zap_add_int(mos,
835 835 origin->ds_phys->ds_next_clones_obj,
836 836 dsobj, tx));
837 837 }
838 838
839 839 dmu_buf_will_dirty(dd->dd_dbuf, tx);
840 840 dd->dd_phys->dd_origin_obj = origin->ds_object;
841 841 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
842 842 if (origin->ds_dir->dd_phys->dd_clones == 0) {
843 843 dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
844 844 origin->ds_dir->dd_phys->dd_clones =
845 845 zap_create(mos,
846 846 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
847 847 }
848 848 VERIFY3U(0, ==, zap_add_int(mos,
849 849 origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
850 850 }
851 851 }
852 852
853 853 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
854 854 dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
855 855
856 856 dmu_buf_rele(dbuf, FTAG);
857 857
858 858 dmu_buf_will_dirty(dd->dd_dbuf, tx);
859 859 dd->dd_phys->dd_head_dataset_obj = dsobj;
860 860
861 861 return (dsobj);
862 862 }
863 863
864 864 uint64_t
865 865 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
866 866 dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
867 867 {
868 868 dsl_pool_t *dp = pdd->dd_pool;
869 869 uint64_t dsobj, ddobj;
870 870 dsl_dir_t *dd;
871 871
872 872 ASSERT(lastname[0] != '@');
873 873
874 874 ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
875 875 VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
876 876
877 877 dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);
878 878
879 879 dsl_deleg_set_create_perms(dd, tx, cr);
880 880
881 881 dsl_dir_close(dd, FTAG);
882 882
883 883 /*
884 884 * If we are creating a clone, make sure we zero out any stale
885 885 * data from the origin snapshots zil header.
886 886 */
887 887 if (origin != NULL) {
888 888 dsl_dataset_t *ds;
889 889 objset_t *os;
890 890
891 891 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
892 892 VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
893 893 bzero(&os->os_zil_header, sizeof (os->os_zil_header));
894 894 dsl_dataset_dirty(ds, tx);
895 895 dsl_dataset_rele(ds, FTAG);
896 896 }
897 897
898 898 return (dsobj);
899 899 }
900 900
901 901 /*
902 902 * The snapshots must all be in the same pool.
903 903 */
904 904 int
905 905 dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer,
906 906 nvlist_t *errlist)
907 907 {
908 908 int err;
909 909 dsl_sync_task_t *dst;
910 910 spa_t *spa;
911 911 nvpair_t *pair;
912 912 dsl_sync_task_group_t *dstg;
913 913
914 914 pair = nvlist_next_nvpair(snaps, NULL);
915 915 if (pair == NULL)
916 916 return (0);
917 917
918 918 err = spa_open(nvpair_name(pair), &spa, FTAG);
919 919 if (err)
920 920 return (err);
921 921 dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
922 922
923 923 for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
924 924 pair = nvlist_next_nvpair(snaps, pair)) {
925 925 dsl_dataset_t *ds;
926 926
927 927 err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
928 928 if (err == 0) {
929 929 struct dsl_ds_destroyarg *dsda;
930 930
931 931 dsl_dataset_make_exclusive(ds, dstg);
932 932 dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
933 933 KM_SLEEP);
934 934 dsda->ds = ds;
935 935 dsda->defer = defer;
936 936 dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
937 937 dsl_dataset_destroy_sync, dsda, dstg, 0);
938 938 } else if (err == ENOENT) {
939 939 err = 0;
940 940 } else {
941 941 fnvlist_add_int32(errlist, nvpair_name(pair), err);
942 942 break;
943 943 }
944 944 }
945 945
946 946 if (err == 0)
947 947 err = dsl_sync_task_group_wait(dstg);
948 948
949 949 for (dst = list_head(&dstg->dstg_tasks); dst;
950 950 dst = list_next(&dstg->dstg_tasks, dst)) {
951 951 struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
952 952 dsl_dataset_t *ds = dsda->ds;
953 953
954 954 /*
955 955 * Return the snapshots that triggered the error.
956 956 */
957 957 if (dst->dst_err != 0) {
958 958 char name[ZFS_MAXNAMELEN];
959 959 dsl_dataset_name(ds, name);
960 960 fnvlist_add_int32(errlist, name, dst->dst_err);
961 961 }
962 962 ASSERT3P(dsda->rm_origin, ==, NULL);
963 963 dsl_dataset_disown(ds, dstg);
964 964 kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
965 965 }
966 966
967 967 dsl_sync_task_group_destroy(dstg);
968 968 spa_close(spa, FTAG);
969 969 return (err);
970 970
971 971 }
972 972
973 973 static boolean_t
974 974 dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
975 975 {
976 976 boolean_t might_destroy = B_FALSE;
977 977
978 978 mutex_enter(&ds->ds_lock);
979 979 if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
980 980 DS_IS_DEFER_DESTROY(ds))
981 981 might_destroy = B_TRUE;
982 982 mutex_exit(&ds->ds_lock);
983 983
984 984 return (might_destroy);
985 985 }
986 986
987 987 /*
988 988 * If we're removing a clone, and these three conditions are true:
989 989 * 1) the clone's origin has no other children
990 990 * 2) the clone's origin has no user references
991 991 * 3) the clone's origin has been marked for deferred destruction
992 992 * Then, prepare to remove the origin as part of this sync task group.
993 993 */
994 994 static int
995 995 dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
996 996 {
997 997 dsl_dataset_t *ds = dsda->ds;
998 998 dsl_dataset_t *origin = ds->ds_prev;
999 999
1000 1000 if (dsl_dataset_might_destroy_origin(origin)) {
1001 1001 char *name;
1002 1002 int namelen;
1003 1003 int error;
1004 1004
1005 1005 namelen = dsl_dataset_namelen(origin) + 1;
1006 1006 name = kmem_alloc(namelen, KM_SLEEP);
1007 1007 dsl_dataset_name(origin, name);
1008 1008 #ifdef _KERNEL
1009 1009 error = zfs_unmount_snap(name, NULL);
1010 1010 if (error) {
1011 1011 kmem_free(name, namelen);
1012 1012 return (error);
1013 1013 }
1014 1014 #endif
1015 1015 error = dsl_dataset_own(name, B_TRUE, tag, &origin);
1016 1016 kmem_free(name, namelen);
1017 1017 if (error)
1018 1018 return (error);
1019 1019 dsda->rm_origin = origin;
1020 1020 dsl_dataset_make_exclusive(origin, tag);
1021 1021 }
1022 1022
1023 1023 return (0);
1024 1024 }
1025 1025
1026 1026 /*
1027 1027 * ds must be opened as OWNER. On return (whether successful or not),
1028 1028 * ds will be closed and caller can no longer dereference it.
1029 1029 */
1030 1030 int
1031 1031 dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
1032 1032 {
1033 1033 int err;
1034 1034 dsl_sync_task_group_t *dstg;
1035 1035 objset_t *os;
1036 1036 dsl_dir_t *dd;
1037 1037 uint64_t obj;
1038 1038 struct dsl_ds_destroyarg dsda = { 0 };
1039 1039
1040 1040 dsda.ds = ds;
1041 1041
1042 1042 if (dsl_dataset_is_snapshot(ds)) {
1043 1043 /* Destroying a snapshot is simpler */
1044 1044 dsl_dataset_make_exclusive(ds, tag);
1045 1045
1046 1046 dsda.defer = defer;
1047 1047 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
1048 1048 dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
1049 1049 &dsda, tag, 0);
1050 1050 ASSERT3P(dsda.rm_origin, ==, NULL);
1051 1051 goto out;
1052 1052 } else if (defer) {
1053 1053 err = EINVAL;
1054 1054 goto out;
1055 1055 }
1056 1056
1057 1057 dd = ds->ds_dir;
1058 1058
1059 1059 if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
1060 1060 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
1061 1061 /*
1062 1062 * Check for errors and mark this ds as inconsistent, in
1063 1063 * case we crash while freeing the objects.
1064 1064 */
1065 1065 err = dsl_sync_task_do(dd->dd_pool,
1066 1066 dsl_dataset_destroy_begin_check,
1067 1067 dsl_dataset_destroy_begin_sync, ds, NULL, 0);
1068 1068 if (err)
1069 1069 goto out;
1070 1070
1071 1071 err = dmu_objset_from_ds(ds, &os);
1072 1072 if (err)
1073 1073 goto out;
1074 1074
1075 1075 /*
1076 1076 * Remove all objects while in the open context so that
1077 1077 * there is less work to do in the syncing context.
1078 1078 */
1079 1079 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
1080 1080 ds->ds_phys->ds_prev_snap_txg)) {
1081 1081 /*
1082 1082 * Ignore errors, if there is not enough disk space
1083 1083 * we will deal with it in dsl_dataset_destroy_sync().
1084 1084 */
1085 1085 (void) dmu_free_object(os, obj);
1086 1086 }
1087 1087 if (err != ESRCH)
1088 1088 goto out;
1089 1089
1090 1090 /*
1091 1091 * Sync out all in-flight IO.
1092 1092 */
1093 1093 txg_wait_synced(dd->dd_pool, 0);
1094 1094
1095 1095 /*
1096 1096 * If we managed to free all the objects in open
1097 1097 * context, the user space accounting should be zero.
1098 1098 */
1099 1099 if (ds->ds_phys->ds_bp.blk_fill == 0 &&
1100 1100 dmu_objset_userused_enabled(os)) {
1101 1101 uint64_t count;
1102 1102
1103 1103 ASSERT(zap_count(os, DMU_USERUSED_OBJECT,
1104 1104 &count) != 0 || count == 0);
1105 1105 ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT,
1106 1106 &count) != 0 || count == 0);
1107 1107 }
1108 1108 }
1109 1109
1110 1110 rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
1111 1111 err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
1112 1112 rw_exit(&dd->dd_pool->dp_config_rwlock);
1113 1113
1114 1114 if (err)
1115 1115 goto out;
1116 1116
1117 1117 /*
1118 1118 * Blow away the dsl_dir + head dataset.
1119 1119 */
1120 1120 dsl_dataset_make_exclusive(ds, tag);
1121 1121 /*
1122 1122 * If we're removing a clone, we might also need to remove its
1123 1123 * origin.
1124 1124 */
1125 1125 do {
1126 1126 dsda.need_prep = B_FALSE;
1127 1127 if (dsl_dir_is_clone(dd)) {
1128 1128 err = dsl_dataset_origin_rm_prep(&dsda, tag);
1129 1129 if (err) {
1130 1130 dsl_dir_close(dd, FTAG);
1131 1131 goto out;
1132 1132 }
1133 1133 }
1134 1134
1135 1135 dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
1136 1136 dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
1137 1137 dsl_dataset_destroy_sync, &dsda, tag, 0);
1138 1138 dsl_sync_task_create(dstg, dsl_dir_destroy_check,
1139 1139 dsl_dir_destroy_sync, dd, FTAG, 0);
1140 1140 err = dsl_sync_task_group_wait(dstg);
1141 1141 dsl_sync_task_group_destroy(dstg);
1142 1142
1143 1143 /*
1144 1144 * We could be racing against 'zfs release' or 'zfs destroy -d'
1145 1145 * on the origin snap, in which case we can get EBUSY if we
1146 1146 * needed to destroy the origin snap but were not ready to
1147 1147 * do so.
1148 1148 */
1149 1149 if (dsda.need_prep) {
1150 1150 ASSERT(err == EBUSY);
1151 1151 ASSERT(dsl_dir_is_clone(dd));
1152 1152 ASSERT(dsda.rm_origin == NULL);
1153 1153 }
1154 1154 } while (dsda.need_prep);
1155 1155
1156 1156 if (dsda.rm_origin != NULL)
1157 1157 dsl_dataset_disown(dsda.rm_origin, tag);
1158 1158
1159 1159 /* if it is successful, dsl_dir_destroy_sync will close the dd */
1160 1160 if (err)
1161 1161 dsl_dir_close(dd, FTAG);
1162 1162 out:
1163 1163 dsl_dataset_disown(ds, tag);
1164 1164 return (err);
1165 1165 }
1166 1166
1167 1167 blkptr_t *
1168 1168 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
1169 1169 {
1170 1170 return (&ds->ds_phys->ds_bp);
1171 1171 }
1172 1172
1173 1173 void
1174 1174 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
1175 1175 {
1176 1176 ASSERT(dmu_tx_is_syncing(tx));
1177 1177 /* If it's the meta-objset, set dp_meta_rootbp */
1178 1178 if (ds == NULL) {
1179 1179 tx->tx_pool->dp_meta_rootbp = *bp;
1180 1180 } else {
1181 1181 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1182 1182 ds->ds_phys->ds_bp = *bp;
1183 1183 }
1184 1184 }
1185 1185
1186 1186 spa_t *
1187 1187 dsl_dataset_get_spa(dsl_dataset_t *ds)
1188 1188 {
1189 1189 return (ds->ds_dir->dd_pool->dp_spa);
1190 1190 }
1191 1191
1192 1192 void
1193 1193 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
1194 1194 {
1195 1195 dsl_pool_t *dp;
1196 1196
1197 1197 if (ds == NULL) /* this is the meta-objset */
1198 1198 return;
1199 1199
1200 1200 ASSERT(ds->ds_objset != NULL);
1201 1201
1202 1202 if (ds->ds_phys->ds_next_snap_obj != 0)
1203 1203 panic("dirtying snapshot!");
1204 1204
1205 1205 dp = ds->ds_dir->dd_pool;
1206 1206
1207 1207 if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
1208 1208 /* up the hold count until we can be written out */
1209 1209 dmu_buf_add_ref(ds->ds_dbuf, ds);
1210 1210 }
1211 1211 }
1212 1212
1213 1213 boolean_t
1214 1214 dsl_dataset_is_dirty(dsl_dataset_t *ds)
1215 1215 {
1216 1216 for (int t = 0; t < TXG_SIZE; t++) {
1217 1217 if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
1218 1218 ds, t))
1219 1219 return (B_TRUE);
1220 1220 }
1221 1221 return (B_FALSE);
1222 1222 }
1223 1223
1224 1224 /*
1225 1225 * The unique space in the head dataset can be calculated by subtracting
1226 1226 * the space used in the most recent snapshot, that is still being used
1227 1227 * in this file system, from the space currently in use. To figure out
1228 1228 * the space in the most recent snapshot still in use, we need to take
1229 1229 * the total space used in the snapshot and subtract out the space that
1230 1230 * has been freed up since the snapshot was taken.
1231 1231 */
1232 1232 static void
1233 1233 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
1234 1234 {
1235 1235 uint64_t mrs_used;
1236 1236 uint64_t dlused, dlcomp, dluncomp;
1237 1237
1238 1238 ASSERT(!dsl_dataset_is_snapshot(ds));
1239 1239
1240 1240 if (ds->ds_phys->ds_prev_snap_obj != 0)
1241 1241 mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
1242 1242 else
1243 1243 mrs_used = 0;
1244 1244
1245 1245 dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
1246 1246
1247 1247 ASSERT3U(dlused, <=, mrs_used);
1248 1248 ds->ds_phys->ds_unique_bytes =
1249 1249 ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
1250 1250
1251 1251 if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
1252 1252 SPA_VERSION_UNIQUE_ACCURATE)
1253 1253 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1254 1254 }
1255 1255
1256 1256 struct killarg {
1257 1257 dsl_dataset_t *ds;
1258 1258 dmu_tx_t *tx;
1259 1259 };
1260 1260
1261 1261 /* ARGSUSED */
1262 1262 static int
1263 1263 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
1264 1264 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1265 1265 {
1266 1266 struct killarg *ka = arg;
1267 1267 dmu_tx_t *tx = ka->tx;
1268 1268
1269 1269 if (bp == NULL)
1270 1270 return (0);
1271 1271
1272 1272 if (zb->zb_level == ZB_ZIL_LEVEL) {
1273 1273 ASSERT(zilog != NULL);
1274 1274 /*
1275 1275 * It's a block in the intent log. It has no
1276 1276 * accounting, so just free it.
1277 1277 */
1278 1278 dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
1279 1279 } else {
1280 1280 ASSERT(zilog == NULL);
1281 1281 ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
1282 1282 (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
1283 1283 }
1284 1284
1285 1285 return (0);
1286 1286 }
1287 1287
1288 1288 /* ARGSUSED */
1289 1289 static int
1290 1290 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
1291 1291 {
1292 1292 dsl_dataset_t *ds = arg1;
1293 1293 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1294 1294 uint64_t count;
1295 1295 int err;
1296 1296
1297 1297 /*
1298 1298 * Can't delete a head dataset if there are snapshots of it.
1299 1299 * (Except if the only snapshots are from the branch we cloned
1300 1300 * from.)
1301 1301 */
1302 1302 if (ds->ds_prev != NULL &&
1303 1303 ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1304 1304 return (EBUSY);
1305 1305
1306 1306 /*
1307 1307 * This is really a dsl_dir thing, but check it here so that
1308 1308 * we'll be less likely to leave this dataset inconsistent &
1309 1309 * nearly destroyed.
1310 1310 */
1311 1311 err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
1312 1312 if (err)
1313 1313 return (err);
1314 1314 if (count != 0)
1315 1315 return (EEXIST);
1316 1316
1317 1317 return (0);
1318 1318 }
1319 1319
1320 1320 /* ARGSUSED */
1321 1321 static void
1322 1322 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1323 1323 {
1324 1324 dsl_dataset_t *ds = arg1;
1325 1325
1326 1326 /* Mark it as inconsistent on-disk, in case we crash */
1327 1327 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1328 1328 ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
1329 1329
1330 1330 spa_history_log_internal_ds(ds, "destroy begin", tx, "");
1331 1331 }
1332 1332
1333 1333 static int
1334 1334 dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
1335 1335 dmu_tx_t *tx)
1336 1336 {
1337 1337 dsl_dataset_t *ds = dsda->ds;
1338 1338 dsl_dataset_t *ds_prev = ds->ds_prev;
1339 1339
1340 1340 if (dsl_dataset_might_destroy_origin(ds_prev)) {
1341 1341 struct dsl_ds_destroyarg ndsda = {0};
1342 1342
1343 1343 /*
1344 1344 * If we're not prepared to remove the origin, don't remove
1345 1345 * the clone either.
1346 1346 */
1347 1347 if (dsda->rm_origin == NULL) {
1348 1348 dsda->need_prep = B_TRUE;
1349 1349 return (EBUSY);
1350 1350 }
1351 1351
1352 1352 ndsda.ds = ds_prev;
1353 1353 ndsda.is_origin_rm = B_TRUE;
1354 1354 return (dsl_dataset_destroy_check(&ndsda, tag, tx));
1355 1355 }
1356 1356
1357 1357 /*
1358 1358 * If we're not going to remove the origin after all,
1359 1359 * undo the open context setup.
1360 1360 */
1361 1361 if (dsda->rm_origin != NULL) {
1362 1362 dsl_dataset_disown(dsda->rm_origin, tag);
1363 1363 dsda->rm_origin = NULL;
1364 1364 }
1365 1365
1366 1366 return (0);
1367 1367 }
1368 1368
1369 1369 /*
1370 1370 * If you add new checks here, you may need to add
1371 1371 * additional checks to the "temporary" case in
1372 1372 * snapshot_check() in dmu_objset.c.
1373 1373 */
1374 1374 /* ARGSUSED */
1375 1375 int
1376 1376 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
1377 1377 {
1378 1378 struct dsl_ds_destroyarg *dsda = arg1;
1379 1379 dsl_dataset_t *ds = dsda->ds;
1380 1380
1381 1381 /* we have an owner hold, so noone else can destroy us */
1382 1382 ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
1383 1383
1384 1384 /*
1385 1385 * Only allow deferred destroy on pools that support it.
1386 1386 * NOTE: deferred destroy is only supported on snapshots.
1387 1387 */
1388 1388 if (dsda->defer) {
1389 1389 if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
1390 1390 SPA_VERSION_USERREFS)
1391 1391 return (ENOTSUP);
1392 1392 ASSERT(dsl_dataset_is_snapshot(ds));
1393 1393 return (0);
1394 1394 }
1395 1395
1396 1396 /*
1397 1397 * Can't delete a head dataset if there are snapshots of it.
1398 1398 * (Except if the only snapshots are from the branch we cloned
1399 1399 * from.)
1400 1400 */
1401 1401 if (ds->ds_prev != NULL &&
1402 1402 ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1403 1403 return (EBUSY);
1404 1404
1405 1405 /*
1406 1406 * If we made changes this txg, traverse_dsl_dataset won't find
1407 1407 * them. Try again.
1408 1408 */
1409 1409 if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
1410 1410 return (EAGAIN);
1411 1411
1412 1412 if (dsl_dataset_is_snapshot(ds)) {
1413 1413 /*
1414 1414 * If this snapshot has an elevated user reference count,
1415 1415 * we can't destroy it yet.
1416 1416 */
1417 1417 if (ds->ds_userrefs > 0 && !dsda->releasing)
1418 1418 return (EBUSY);
1419 1419
1420 1420 mutex_enter(&ds->ds_lock);
1421 1421 /*
1422 1422 * Can't delete a branch point. However, if we're destroying
1423 1423 * a clone and removing its origin due to it having a user
1424 1424 * hold count of 0 and having been marked for deferred destroy,
1425 1425 * it's OK for the origin to have a single clone.
1426 1426 */
1427 1427 if (ds->ds_phys->ds_num_children >
1428 1428 (dsda->is_origin_rm ? 2 : 1)) {
1429 1429 mutex_exit(&ds->ds_lock);
1430 1430 return (EEXIST);
1431 1431 }
1432 1432 mutex_exit(&ds->ds_lock);
1433 1433 } else if (dsl_dir_is_clone(ds->ds_dir)) {
1434 1434 return (dsl_dataset_origin_check(dsda, arg2, tx));
1435 1435 }
1436 1436
1437 1437 /* XXX we should do some i/o error checking... */
1438 1438 return (0);
1439 1439 }
1440 1440
1441 1441 struct refsarg {
1442 1442 kmutex_t lock;
1443 1443 boolean_t gone;
1444 1444 kcondvar_t cv;
1445 1445 };
1446 1446
1447 1447 /* ARGSUSED */
1448 1448 static void
1449 1449 dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
1450 1450 {
1451 1451 struct refsarg *arg = argv;
1452 1452
1453 1453 mutex_enter(&arg->lock);
1454 1454 arg->gone = TRUE;
1455 1455 cv_signal(&arg->cv);
1456 1456 mutex_exit(&arg->lock);
1457 1457 }
1458 1458
1459 1459 static void
1460 1460 dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
1461 1461 {
1462 1462 struct refsarg arg;
1463 1463
1464 1464 mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
1465 1465 cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
1466 1466 arg.gone = FALSE;
1467 1467 (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
1468 1468 dsl_dataset_refs_gone);
1469 1469 dmu_buf_rele(ds->ds_dbuf, tag);
1470 1470 mutex_enter(&arg.lock);
1471 1471 while (!arg.gone)
1472 1472 cv_wait(&arg.cv, &arg.lock);
1473 1473 ASSERT(arg.gone);
1474 1474 mutex_exit(&arg.lock);
1475 1475 ds->ds_dbuf = NULL;
1476 1476 ds->ds_phys = NULL;
1477 1477 mutex_destroy(&arg.lock);
1478 1478 cv_destroy(&arg.cv);
1479 1479 }
1480 1480
1481 1481 static void
1482 1482 remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
1483 1483 {
1484 1484 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1485 1485 uint64_t count;
1486 1486 int err;
1487 1487
1488 1488 ASSERT(ds->ds_phys->ds_num_children >= 2);
1489 1489 err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
1490 1490 /*
1491 1491 * The err should not be ENOENT, but a bug in a previous version
1492 1492 * of the code could cause upgrade_clones_cb() to not set
1493 1493 * ds_next_snap_obj when it should, leading to a missing entry.
1494 1494 * If we knew that the pool was created after
1495 1495 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
1496 1496 * ENOENT. However, at least we can check that we don't have
1497 1497 * too many entries in the next_clones_obj even after failing to
1498 1498 * remove this one.
1499 1499 */
1500 1500 if (err != ENOENT) {
1501 1501 VERIFY0(err);
1502 1502 }
1503 1503 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
1504 1504 &count));
1505 1505 ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
1506 1506 }
1507 1507
1508 1508 static void
1509 1509 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
1510 1510 {
1511 1511 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1512 1512 zap_cursor_t zc;
1513 1513 zap_attribute_t za;
1514 1514
1515 1515 /*
1516 1516 * If it is the old version, dd_clones doesn't exist so we can't
1517 1517 * find the clones, but deadlist_remove_key() is a no-op so it
1518 1518 * doesn't matter.
1519 1519 */
1520 1520 if (ds->ds_dir->dd_phys->dd_clones == 0)
1521 1521 return;
1522 1522
1523 1523 for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
1524 1524 zap_cursor_retrieve(&zc, &za) == 0;
1525 1525 zap_cursor_advance(&zc)) {
1526 1526 dsl_dataset_t *clone;
1527 1527
1528 1528 VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1529 1529 za.za_first_integer, FTAG, &clone));
1530 1530 if (clone->ds_dir->dd_origin_txg > mintxg) {
1531 1531 dsl_deadlist_remove_key(&clone->ds_deadlist,
1532 1532 mintxg, tx);
1533 1533 dsl_dataset_remove_clones_key(clone, mintxg, tx);
1534 1534 }
1535 1535 dsl_dataset_rele(clone, FTAG);
1536 1536 }
1537 1537 zap_cursor_fini(&zc);
1538 1538 }
1539 1539
1540 1540 struct process_old_arg {
1541 1541 dsl_dataset_t *ds;
1542 1542 dsl_dataset_t *ds_prev;
1543 1543 boolean_t after_branch_point;
1544 1544 zio_t *pio;
1545 1545 uint64_t used, comp, uncomp;
1546 1546 };
1547 1547
1548 1548 static int
1549 1549 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1550 1550 {
1551 1551 struct process_old_arg *poa = arg;
1552 1552 dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
1553 1553
1554 1554 if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
1555 1555 dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
1556 1556 if (poa->ds_prev && !poa->after_branch_point &&
1557 1557 bp->blk_birth >
1558 1558 poa->ds_prev->ds_phys->ds_prev_snap_txg) {
1559 1559 poa->ds_prev->ds_phys->ds_unique_bytes +=
1560 1560 bp_get_dsize_sync(dp->dp_spa, bp);
1561 1561 }
1562 1562 } else {
1563 1563 poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
1564 1564 poa->comp += BP_GET_PSIZE(bp);
1565 1565 poa->uncomp += BP_GET_UCSIZE(bp);
1566 1566 dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
1567 1567 }
1568 1568 return (0);
1569 1569 }
1570 1570
1571 1571 static void
1572 1572 process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
1573 1573 dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
1574 1574 {
1575 1575 struct process_old_arg poa = { 0 };
1576 1576 dsl_pool_t *dp = ds->ds_dir->dd_pool;
1577 1577 objset_t *mos = dp->dp_meta_objset;
1578 1578
1579 1579 ASSERT(ds->ds_deadlist.dl_oldfmt);
1580 1580 ASSERT(ds_next->ds_deadlist.dl_oldfmt);
1581 1581
1582 1582 poa.ds = ds;
1583 1583 poa.ds_prev = ds_prev;
1584 1584 poa.after_branch_point = after_branch_point;
1585 1585 poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
1586 1586 VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
1587 1587 process_old_cb, &poa, tx));
1588 1588 VERIFY0(zio_wait(poa.pio));
1589 1589 ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
1590 1590
1591 1591 /* change snapused */
1592 1592 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1593 1593 -poa.used, -poa.comp, -poa.uncomp, tx);
1594 1594
1595 1595 /* swap next's deadlist to our deadlist */
1596 1596 dsl_deadlist_close(&ds->ds_deadlist);
1597 1597 dsl_deadlist_close(&ds_next->ds_deadlist);
1598 1598 SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
1599 1599 ds->ds_phys->ds_deadlist_obj);
1600 1600 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
1601 1601 dsl_deadlist_open(&ds_next->ds_deadlist, mos,
1602 1602 ds_next->ds_phys->ds_deadlist_obj);
1603 1603 }
1604 1604
1605 1605 static int
1606 1606 old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
1607 1607 {
1608 1608 int err;
1609 1609 struct killarg ka;
1610 1610
1611 1611 /*
1612 1612 * Free everything that we point to (that's born after
1613 1613 * the previous snapshot, if we are a clone)
1614 1614 *
1615 1615 * NB: this should be very quick, because we already
1616 1616 * freed all the objects in open context.
1617 1617 */
1618 1618 ka.ds = ds;
1619 1619 ka.tx = tx;
1620 1620 err = traverse_dataset(ds,
1621 1621 ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
1622 1622 kill_blkptr, &ka);
1623 1623 ASSERT0(err);
1624 1624 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
1625 1625
1626 1626 return (err);
1627 1627 }
1628 1628
1629 1629 void
1630 1630 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
1631 1631 {
1632 1632 struct dsl_ds_destroyarg *dsda = arg1;
1633 1633 dsl_dataset_t *ds = dsda->ds;
1634 1634 int err;
1635 1635 int after_branch_point = FALSE;
1636 1636 dsl_pool_t *dp = ds->ds_dir->dd_pool;
1637 1637 objset_t *mos = dp->dp_meta_objset;
1638 1638 dsl_dataset_t *ds_prev = NULL;
1639 1639 boolean_t wont_destroy;
1640 1640 uint64_t obj;
1641 1641
1642 1642 wont_destroy = (dsda->defer &&
1643 1643 (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
1644 1644
1645 1645 ASSERT(ds->ds_owner || wont_destroy);
1646 1646 ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
1647 1647 ASSERT(ds->ds_prev == NULL ||
1648 1648 ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
1649 1649 ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
1650 1650
1651 1651 if (wont_destroy) {
1652 1652 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
1653 1653 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1654 1654 ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
1655 1655 spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
1656 1656 return;
1657 1657 }
1658 1658
1659 1659 /* We need to log before removing it from the namespace. */
1660 1660 spa_history_log_internal_ds(ds, "destroy", tx, "");
1661 1661
1662 1662 /* signal any waiters that this dataset is going away */
1663 1663 mutex_enter(&ds->ds_lock);
1664 1664 ds->ds_owner = dsl_reaper;
1665 1665 cv_broadcast(&ds->ds_exclusive_cv);
1666 1666 mutex_exit(&ds->ds_lock);
1667 1667
1668 1668 /* Remove our reservation */
1669 1669 if (ds->ds_reserved != 0) {
1670 1670 dsl_prop_setarg_t psa;
1671 1671 uint64_t value = 0;
1672 1672
1673 1673 dsl_prop_setarg_init_uint64(&psa, "refreservation",
1674 1674 (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
1675 1675 &value);
1676 1676 psa.psa_effective_value = 0; /* predict default value */
1677 1677
1678 1678 dsl_dataset_set_reservation_sync(ds, &psa, tx);
1679 1679 ASSERT0(ds->ds_reserved);
1680 1680 }
1681 1681
1682 1682 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
1683 1683
1684 1684 dsl_scan_ds_destroyed(ds, tx);
1685 1685
1686 1686 obj = ds->ds_object;
1687 1687
1688 1688 if (ds->ds_phys->ds_prev_snap_obj != 0) {
1689 1689 if (ds->ds_prev) {
1690 1690 ds_prev = ds->ds_prev;
1691 1691 } else {
1692 1692 VERIFY(0 == dsl_dataset_hold_obj(dp,
1693 1693 ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
1694 1694 }
1695 1695 after_branch_point =
1696 1696 (ds_prev->ds_phys->ds_next_snap_obj != obj);
1697 1697
1698 1698 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
1699 1699 if (after_branch_point &&
1700 1700 ds_prev->ds_phys->ds_next_clones_obj != 0) {
1701 1701 remove_from_next_clones(ds_prev, obj, tx);
1702 1702 if (ds->ds_phys->ds_next_snap_obj != 0) {
1703 1703 VERIFY(0 == zap_add_int(mos,
1704 1704 ds_prev->ds_phys->ds_next_clones_obj,
1705 1705 ds->ds_phys->ds_next_snap_obj, tx));
1706 1706 }
1707 1707 }
1708 1708 if (after_branch_point &&
1709 1709 ds->ds_phys->ds_next_snap_obj == 0) {
1710 1710 /* This clone is toast. */
1711 1711 ASSERT(ds_prev->ds_phys->ds_num_children > 1);
1712 1712 ds_prev->ds_phys->ds_num_children--;
1713 1713
1714 1714 /*
1715 1715 * If the clone's origin has no other clones, no
1716 1716 * user holds, and has been marked for deferred
1717 1717 * deletion, then we should have done the necessary
1718 1718 * destroy setup for it.
1719 1719 */
1720 1720 if (ds_prev->ds_phys->ds_num_children == 1 &&
1721 1721 ds_prev->ds_userrefs == 0 &&
1722 1722 DS_IS_DEFER_DESTROY(ds_prev)) {
1723 1723 ASSERT3P(dsda->rm_origin, !=, NULL);
1724 1724 } else {
1725 1725 ASSERT3P(dsda->rm_origin, ==, NULL);
1726 1726 }
1727 1727 } else if (!after_branch_point) {
1728 1728 ds_prev->ds_phys->ds_next_snap_obj =
1729 1729 ds->ds_phys->ds_next_snap_obj;
1730 1730 }
1731 1731 }
1732 1732
1733 1733 if (dsl_dataset_is_snapshot(ds)) {
1734 1734 dsl_dataset_t *ds_next;
1735 1735 uint64_t old_unique;
1736 1736 uint64_t used = 0, comp = 0, uncomp = 0;
1737 1737
1738 1738 VERIFY(0 == dsl_dataset_hold_obj(dp,
1739 1739 ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
1740 1740 ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
1741 1741
1742 1742 old_unique = ds_next->ds_phys->ds_unique_bytes;
1743 1743
1744 1744 dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
1745 1745 ds_next->ds_phys->ds_prev_snap_obj =
1746 1746 ds->ds_phys->ds_prev_snap_obj;
1747 1747 ds_next->ds_phys->ds_prev_snap_txg =
1748 1748 ds->ds_phys->ds_prev_snap_txg;
1749 1749 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1750 1750 ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
1751 1751
1752 1752
1753 1753 if (ds_next->ds_deadlist.dl_oldfmt) {
1754 1754 process_old_deadlist(ds, ds_prev, ds_next,
1755 1755 after_branch_point, tx);
1756 1756 } else {
1757 1757 /* Adjust prev's unique space. */
1758 1758 if (ds_prev && !after_branch_point) {
1759 1759 dsl_deadlist_space_range(&ds_next->ds_deadlist,
1760 1760 ds_prev->ds_phys->ds_prev_snap_txg,
1761 1761 ds->ds_phys->ds_prev_snap_txg,
1762 1762 &used, &comp, &uncomp);
1763 1763 ds_prev->ds_phys->ds_unique_bytes += used;
1764 1764 }
1765 1765
1766 1766 /* Adjust snapused. */
1767 1767 dsl_deadlist_space_range(&ds_next->ds_deadlist,
1768 1768 ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
1769 1769 &used, &comp, &uncomp);
1770 1770 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1771 1771 -used, -comp, -uncomp, tx);
1772 1772
1773 1773 /* Move blocks to be freed to pool's free list. */
1774 1774 dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
1775 1775 &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
1776 1776 tx);
1777 1777 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
1778 1778 DD_USED_HEAD, used, comp, uncomp, tx);
1779 1779
1780 1780 /* Merge our deadlist into next's and free it. */
1781 1781 dsl_deadlist_merge(&ds_next->ds_deadlist,
1782 1782 ds->ds_phys->ds_deadlist_obj, tx);
1783 1783 }
1784 1784 dsl_deadlist_close(&ds->ds_deadlist);
1785 1785 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1786 1786
1787 1787 /* Collapse range in clone heads */
1788 1788 dsl_dataset_remove_clones_key(ds,
1789 1789 ds->ds_phys->ds_creation_txg, tx);
1790 1790
1791 1791 if (dsl_dataset_is_snapshot(ds_next)) {
1792 1792 dsl_dataset_t *ds_nextnext;
1793 1793
1794 1794 /*
1795 1795 * Update next's unique to include blocks which
1796 1796 * were previously shared by only this snapshot
1797 1797 * and it. Those blocks will be born after the
1798 1798 * prev snap and before this snap, and will have
1799 1799 * died after the next snap and before the one
1800 1800 * after that (ie. be on the snap after next's
1801 1801 * deadlist).
1802 1802 */
1803 1803 VERIFY(0 == dsl_dataset_hold_obj(dp,
1804 1804 ds_next->ds_phys->ds_next_snap_obj,
1805 1805 FTAG, &ds_nextnext));
1806 1806 dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
1807 1807 ds->ds_phys->ds_prev_snap_txg,
1808 1808 ds->ds_phys->ds_creation_txg,
1809 1809 &used, &comp, &uncomp);
1810 1810 ds_next->ds_phys->ds_unique_bytes += used;
1811 1811 dsl_dataset_rele(ds_nextnext, FTAG);
1812 1812 ASSERT3P(ds_next->ds_prev, ==, NULL);
1813 1813
1814 1814 /* Collapse range in this head. */
1815 1815 dsl_dataset_t *hds;
1816 1816 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
1817 1817 ds->ds_dir->dd_phys->dd_head_dataset_obj,
1818 1818 FTAG, &hds));
1819 1819 dsl_deadlist_remove_key(&hds->ds_deadlist,
1820 1820 ds->ds_phys->ds_creation_txg, tx);
1821 1821 dsl_dataset_rele(hds, FTAG);
1822 1822
1823 1823 } else {
1824 1824 ASSERT3P(ds_next->ds_prev, ==, ds);
1825 1825 dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
1826 1826 ds_next->ds_prev = NULL;
1827 1827 if (ds_prev) {
1828 1828 VERIFY(0 == dsl_dataset_get_ref(dp,
1829 1829 ds->ds_phys->ds_prev_snap_obj,
1830 1830 ds_next, &ds_next->ds_prev));
1831 1831 }
1832 1832
1833 1833 dsl_dataset_recalc_head_uniq(ds_next);
1834 1834
1835 1835 /*
1836 1836 * Reduce the amount of our unconsmed refreservation
1837 1837 * being charged to our parent by the amount of
1838 1838 * new unique data we have gained.
1839 1839 */
1840 1840 if (old_unique < ds_next->ds_reserved) {
1841 1841 int64_t mrsdelta;
1842 1842 uint64_t new_unique =
1843 1843 ds_next->ds_phys->ds_unique_bytes;
1844 1844
1845 1845 ASSERT(old_unique <= new_unique);
1846 1846 mrsdelta = MIN(new_unique - old_unique,
1847 1847 ds_next->ds_reserved - old_unique);
1848 1848 dsl_dir_diduse_space(ds->ds_dir,
1849 1849 DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
1850 1850 }
1851 1851 }
1852 1852 dsl_dataset_rele(ds_next, FTAG);
1853 1853 } else {
1854 1854 zfeature_info_t *async_destroy =
1855 1855 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
1856 1856 objset_t *os;
1857 1857
1858 1858 /*
1859 1859 * There's no next snapshot, so this is a head dataset.
1860 1860 * Destroy the deadlist. Unless it's a clone, the
1861 1861 * deadlist should be empty. (If it's a clone, it's
1862 1862 * safe to ignore the deadlist contents.)
1863 1863 */
1864 1864 dsl_deadlist_close(&ds->ds_deadlist);
1865 1865 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1866 1866 ds->ds_phys->ds_deadlist_obj = 0;
1867 1867
1868 1868 VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
1869 1869
1870 1870 if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
1871 1871 err = old_synchronous_dataset_destroy(ds, tx);
1872 1872 } else {
1873 1873 /*
1874 1874 * Move the bptree into the pool's list of trees to
1875 1875 * clean up and update space accounting information.
1876 1876 */
1877 1877 uint64_t used, comp, uncomp;
1878 1878
1879 1879 zil_destroy_sync(dmu_objset_zil(os), tx);
1880 1880
1881 1881 if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
1882 1882 spa_feature_incr(dp->dp_spa, async_destroy, tx);
1883 1883 dp->dp_bptree_obj = bptree_alloc(mos, tx);
1884 1884 VERIFY(zap_add(mos,
1885 1885 DMU_POOL_DIRECTORY_OBJECT,
1886 1886 DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
1887 1887 &dp->dp_bptree_obj, tx) == 0);
1888 1888 }
1889 1889
1890 1890 used = ds->ds_dir->dd_phys->dd_used_bytes;
1891 1891 comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
1892 1892 uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
1893 1893
1894 1894 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
1895 1895 ds->ds_phys->ds_unique_bytes == used);
1896 1896
1897 1897 bptree_add(mos, dp->dp_bptree_obj,
1898 1898 &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
1899 1899 used, comp, uncomp, tx);
1900 1900 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
1901 1901 -used, -comp, -uncomp, tx);
1902 1902 dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
1903 1903 used, comp, uncomp, tx);
1904 1904 }
1905 1905
1906 1906 if (ds->ds_prev != NULL) {
1907 1907 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
1908 1908 VERIFY3U(0, ==, zap_remove_int(mos,
1909 1909 ds->ds_prev->ds_dir->dd_phys->dd_clones,
1910 1910 ds->ds_object, tx));
1911 1911 }
1912 1912 dsl_dataset_rele(ds->ds_prev, ds);
1913 1913 ds->ds_prev = ds_prev = NULL;
1914 1914 }
1915 1915 }
1916 1916
1917 1917 /*
1918 1918 * This must be done after the dsl_traverse(), because it will
1919 1919 * re-open the objset.
1920 1920 */
1921 1921 if (ds->ds_objset) {
1922 1922 dmu_objset_evict(ds->ds_objset);
1923 1923 ds->ds_objset = NULL;
1924 1924 }
1925 1925
1926 1926 if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
1927 1927 /* Erase the link in the dir */
1928 1928 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1929 1929 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
1930 1930 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
1931 1931 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
1932 1932 ASSERT(err == 0);
1933 1933 } else {
1934 1934 /* remove from snapshot namespace */
1935 1935 dsl_dataset_t *ds_head;
1936 1936 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
1937 1937 VERIFY(0 == dsl_dataset_hold_obj(dp,
1938 1938 ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
1939 1939 VERIFY(0 == dsl_dataset_get_snapname(ds));
1940 1940 #ifdef ZFS_DEBUG
1941 1941 {
1942 1942 uint64_t val;
1943 1943
1944 1944 err = dsl_dataset_snap_lookup(ds_head,
1945 1945 ds->ds_snapname, &val);
1946 1946 ASSERT0(err);
1947 1947 ASSERT3U(val, ==, obj);
1948 1948 }
1949 1949 #endif
1950 1950 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
1951 1951 ASSERT(err == 0);
1952 1952 dsl_dataset_rele(ds_head, FTAG);
1953 1953 }
1954 1954
1955 1955 if (ds_prev && ds->ds_prev != ds_prev)
1956 1956 dsl_dataset_rele(ds_prev, FTAG);
1957 1957
1958 1958 spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
1959 1959
1960 1960 if (ds->ds_phys->ds_next_clones_obj != 0) {
1961 1961 uint64_t count;
1962 1962 ASSERT(0 == zap_count(mos,
1963 1963 ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
1964 1964 VERIFY(0 == dmu_object_free(mos,
1965 1965 ds->ds_phys->ds_next_clones_obj, tx));
1966 1966 }
1967 1967 if (ds->ds_phys->ds_props_obj != 0)
1968 1968 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
1969 1969 if (ds->ds_phys->ds_userrefs_obj != 0)
1970 1970 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
1971 1971 dsl_dir_close(ds->ds_dir, ds);
1972 1972 ds->ds_dir = NULL;
1973 1973 dsl_dataset_drain_refs(ds, tag);
1974 1974 VERIFY(0 == dmu_object_free(mos, obj, tx));
1975 1975
1976 1976 if (dsda->rm_origin) {
1977 1977 /*
1978 1978 * Remove the origin of the clone we just destroyed.
1979 1979 */
1980 1980 struct dsl_ds_destroyarg ndsda = {0};
1981 1981
1982 1982 ndsda.ds = dsda->rm_origin;
1983 1983 dsl_dataset_destroy_sync(&ndsda, tag, tx);
1984 1984 }
1985 1985 }
1986 1986
1987 1987 static int
1988 1988 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1989 1989 {
1990 1990 uint64_t asize;
1991 1991
1992 1992 if (!dmu_tx_is_syncing(tx))
1993 1993 return (0);
1994 1994
1995 1995 /*
1996 1996 * If there's an fs-only reservation, any blocks that might become
1997 1997 * owned by the snapshot dataset must be accommodated by space
1998 1998 * outside of the reservation.
1999 1999 */
2000 2000 ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
2001 2001 asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2002 2002 if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
2003 2003 return (ENOSPC);
2004 2004
2005 2005 /*
2006 2006 * Propagate any reserved space for this snapshot to other
2007 2007 * snapshot checks in this sync group.
2008 2008 */
2009 2009 if (asize > 0)
2010 2010 dsl_dir_willuse_space(ds->ds_dir, asize, tx);
2011 2011
2012 2012 return (0);
2013 2013 }
2014 2014
2015 2015 int
2016 2016 dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname,
2017 2017 dmu_tx_t *tx)
2018 2018 {
2019 2019 int err;
2020 2020 uint64_t value;
2021 2021
2022 2022 /*
2023 2023 * We don't allow multiple snapshots of the same txg. If there
2024 2024 * is already one, try again.
2025 2025 */
2026 2026 if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
2027 2027 return (EAGAIN);
2028 2028
2029 2029 /*
2030 2030 * Check for conflicting snapshot name.
2031 2031 */
2032 2032 err = dsl_dataset_snap_lookup(ds, snapname, &value);
2033 2033 if (err == 0)
2034 2034 return (EEXIST);
2035 2035 if (err != ENOENT)
2036 2036 return (err);
2037 2037
2038 2038 /*
2039 2039 * Check that the dataset's name is not too long. Name consists
2040 2040 * of the dataset's length + 1 for the @-sign + snapshot name's length
2041 2041 */
2042 2042 if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
2043 2043 return (ENAMETOOLONG);
2044 2044
2045 2045 err = dsl_dataset_snapshot_reserve_space(ds, tx);
2046 2046 if (err)
2047 2047 return (err);
2048 2048
2049 2049 ds->ds_trysnap_txg = tx->tx_txg;
2050 2050 return (0);
2051 2051 }
2052 2052
2053 2053 void
2054 2054 dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
2055 2055 dmu_tx_t *tx)
2056 2056 {
2057 2057 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2058 2058 dmu_buf_t *dbuf;
2059 2059 dsl_dataset_phys_t *dsphys;
2060 2060 uint64_t dsobj, crtxg;
2061 2061 objset_t *mos = dp->dp_meta_objset;
2062 2062 int err;
2063 2063
2064 2064 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
2065 2065
2066 2066 /*
2067 2067 * The origin's ds_creation_txg has to be < TXG_INITIAL
2068 2068 */
2069 2069 if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
2070 2070 crtxg = 1;
2071 2071 else
2072 2072 crtxg = tx->tx_txg;
2073 2073
2074 2074 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
2075 2075 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
2076 2076 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
2077 2077 dmu_buf_will_dirty(dbuf, tx);
2078 2078 dsphys = dbuf->db_data;
2079 2079 bzero(dsphys, sizeof (dsl_dataset_phys_t));
2080 2080 dsphys->ds_dir_obj = ds->ds_dir->dd_object;
2081 2081 dsphys->ds_fsid_guid = unique_create();
2082 2082 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
2083 2083 sizeof (dsphys->ds_guid));
2084 2084 dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
2085 2085 dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
2086 2086 dsphys->ds_next_snap_obj = ds->ds_object;
2087 2087 dsphys->ds_num_children = 1;
2088 2088 dsphys->ds_creation_time = gethrestime_sec();
2089 2089 dsphys->ds_creation_txg = crtxg;
2090 2090 dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
2091 2091 dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
2092 2092 dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
2093 2093 dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
2094 2094 dsphys->ds_flags = ds->ds_phys->ds_flags;
2095 2095 dsphys->ds_bp = ds->ds_phys->ds_bp;
2096 2096 dmu_buf_rele(dbuf, FTAG);
2097 2097
2098 2098 ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
2099 2099 if (ds->ds_prev) {
2100 2100 uint64_t next_clones_obj =
2101 2101 ds->ds_prev->ds_phys->ds_next_clones_obj;
2102 2102 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
2103 2103 ds->ds_object ||
2104 2104 ds->ds_prev->ds_phys->ds_num_children > 1);
2105 2105 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
2106 2106 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
2107 2107 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
2108 2108 ds->ds_prev->ds_phys->ds_creation_txg);
2109 2109 ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
2110 2110 } else if (next_clones_obj != 0) {
2111 2111 remove_from_next_clones(ds->ds_prev,
2112 2112 dsphys->ds_next_snap_obj, tx);
2113 2113 VERIFY3U(0, ==, zap_add_int(mos,
2114 2114 next_clones_obj, dsobj, tx));
2115 2115 }
2116 2116 }
2117 2117
2118 2118 /*
2119 2119 * If we have a reference-reservation on this dataset, we will
2120 2120 * need to increase the amount of refreservation being charged
2121 2121 * since our unique space is going to zero.
2122 2122 */
2123 2123 if (ds->ds_reserved) {
2124 2124 int64_t delta;
2125 2125 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
2126 2126 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2127 2127 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
2128 2128 delta, 0, 0, tx);
2129 2129 }
2130 2130
2131 2131 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2132 2132 zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
2133 2133 ds->ds_dir->dd_myname, snapname, dsobj,
2134 2134 ds->ds_phys->ds_prev_snap_txg);
2135 2135 ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
2136 2136 UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
2137 2137 dsl_deadlist_close(&ds->ds_deadlist);
2138 2138 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
2139 2139 dsl_deadlist_add_key(&ds->ds_deadlist,
2140 2140 ds->ds_phys->ds_prev_snap_txg, tx);
2141 2141
2142 2142 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
2143 2143 ds->ds_phys->ds_prev_snap_obj = dsobj;
2144 2144 ds->ds_phys->ds_prev_snap_txg = crtxg;
2145 2145 ds->ds_phys->ds_unique_bytes = 0;
2146 2146 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
2147 2147 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
2148 2148
2149 2149 err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
2150 2150 snapname, 8, 1, &dsobj, tx);
2151 2151 ASSERT(err == 0);
2152 2152
2153 2153 if (ds->ds_prev)
2154 2154 dsl_dataset_drop_ref(ds->ds_prev, ds);
2155 2155 VERIFY(0 == dsl_dataset_get_ref(dp,
2156 2156 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
2157 2157
2158 2158 dsl_scan_ds_snapshotted(ds, tx);
2159 2159
2160 2160 dsl_dir_snap_cmtime_update(ds->ds_dir);
2161 2161
2162 2162 spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
2163 2163 }
2164 2164
2165 2165 void
2166 2166 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
2167 2167 {
2168 2168 ASSERT(dmu_tx_is_syncing(tx));
2169 2169 ASSERT(ds->ds_objset != NULL);
2170 2170 ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
2171 2171
2172 2172 /*
2173 2173 * in case we had to change ds_fsid_guid when we opened it,
2174 2174 * sync it out now.
2175 2175 */
2176 2176 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2177 2177 ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
2178 2178
2179 2179 dmu_objset_sync(ds->ds_objset, zio, tx);
2180 2180 }
2181 2181
2182 2182 static void
2183 2183 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
2184 2184 {
2185 2185 uint64_t count = 0;
2186 2186 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
2187 2187 zap_cursor_t zc;
2188 2188 zap_attribute_t za;
2189 2189 nvlist_t *propval;
2190 2190 nvlist_t *val;
2191 2191
2192 2192 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2193 2193 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2194 2194 VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2195 2195
2196 2196 /*
2197 2197 * There may me missing entries in ds_next_clones_obj
2198 2198 * due to a bug in a previous version of the code.
2199 2199 * Only trust it if it has the right number of entries.
2200 2200 */
2201 2201 if (ds->ds_phys->ds_next_clones_obj != 0) {
2202 2202 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
2203 2203 &count));
2204 2204 }
2205 2205 if (count != ds->ds_phys->ds_num_children - 1) {
2206 2206 goto fail;
2207 2207 }
2208 2208 for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
2209 2209 zap_cursor_retrieve(&zc, &za) == 0;
2210 2210 zap_cursor_advance(&zc)) {
2211 2211 dsl_dataset_t *clone;
2212 2212 char buf[ZFS_MAXNAMELEN];
2213 2213 /*
2214 2214 * Even though we hold the dp_config_rwlock, the dataset
2215 2215 * may fail to open, returning ENOENT. If there is a
2216 2216 * thread concurrently attempting to destroy this
2217 2217 * dataset, it will have the ds_rwlock held for
2218 2218 * RW_WRITER. Our call to dsl_dataset_hold_obj() ->
2219 2219 * dsl_dataset_hold_ref() will fail its
2220 2220 * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the
2221 2221 * dp_config_rwlock, and wait for the destroy progress
2222 2222 * and signal ds_exclusive_cv. If the destroy was
2223 2223 * successful, we will see that
2224 2224 * DSL_DATASET_IS_DESTROYED(), and return ENOENT.
2225 2225 */
2226 2226 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
2227 2227 za.za_first_integer, FTAG, &clone) != 0)
2228 2228 continue;
2229 2229 dsl_dir_name(clone->ds_dir, buf);
2230 2230 VERIFY(nvlist_add_boolean(val, buf) == 0);
2231 2231 dsl_dataset_rele(clone, FTAG);
2232 2232 }
2233 2233 zap_cursor_fini(&zc);
2234 2234 VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0);
2235 2235 VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
2236 2236 propval) == 0);
2237 2237 fail:
2238 2238 nvlist_free(val);
2239 2239 nvlist_free(propval);
2240 2240 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2241 2241 }
2242 2242
2243 2243 void
2244 2244 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
2245 2245 {
2246 2246 uint64_t refd, avail, uobjs, aobjs, ratio;
2247 2247
2248 2248 ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
2249 2249 (ds->ds_phys->ds_uncompressed_bytes * 100 /
2250 2250 ds->ds_phys->ds_compressed_bytes);
2251 2251
2252 2252 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
2253 2253
2254 2254 if (dsl_dataset_is_snapshot(ds)) {
2255 2255 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
2256 2256 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
2257 2257 ds->ds_phys->ds_unique_bytes);
2258 2258 get_clones_stat(ds, nv);
2259 2259 } else {
2260 2260 dsl_dir_stats(ds->ds_dir, nv);
2261 2261 }
2262 2262
2263 2263 dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
2264 2264 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
2265 2265 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
2266 2266
2267 2267 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
2268 2268 ds->ds_phys->ds_creation_time);
2269 2269 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
2270 2270 ds->ds_phys->ds_creation_txg);
2271 2271 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
2272 2272 ds->ds_quota);
2273 2273 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
2274 2274 ds->ds_reserved);
2275 2275 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
2276 2276 ds->ds_phys->ds_guid);
2277 2277 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
2278 2278 ds->ds_phys->ds_unique_bytes);
2279 2279 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
2280 2280 ds->ds_object);
2281 2281 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
2282 2282 ds->ds_userrefs);
2283 2283 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
2284 2284 DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
2285 2285
2286 2286 if (ds->ds_phys->ds_prev_snap_obj != 0) {
2287 2287 uint64_t written, comp, uncomp;
2288 2288 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2289 2289 dsl_dataset_t *prev;
2290 2290
2291 2291 rw_enter(&dp->dp_config_rwlock, RW_READER);
2292 2292 int err = dsl_dataset_hold_obj(dp,
2293 2293 ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
2294 2294 rw_exit(&dp->dp_config_rwlock);
2295 2295 if (err == 0) {
2296 2296 err = dsl_dataset_space_written(prev, ds, &written,
2297 2297 &comp, &uncomp);
2298 2298 dsl_dataset_rele(prev, FTAG);
2299 2299 if (err == 0) {
2300 2300 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
2301 2301 written);
2302 2302 }
2303 2303 }
2304 2304 }
2305 2305 }
2306 2306
2307 2307 void
2308 2308 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
2309 2309 {
2310 2310 stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
2311 2311 stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
2312 2312 stat->dds_guid = ds->ds_phys->ds_guid;
2313 2313 stat->dds_origin[0] = '\0';
2314 2314 if (dsl_dataset_is_snapshot(ds)) {
2315 2315 stat->dds_is_snapshot = B_TRUE;
2316 2316 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
2317 2317 } else {
2318 2318 stat->dds_is_snapshot = B_FALSE;
2319 2319 stat->dds_num_clones = 0;
2320 2320
2321 2321 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2322 2322 if (dsl_dir_is_clone(ds->ds_dir)) {
2323 2323 dsl_dataset_t *ods;
2324 2324
2325 2325 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
2326 2326 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
2327 2327 dsl_dataset_name(ods, stat->dds_origin);
2328 2328 dsl_dataset_drop_ref(ods, FTAG);
2329 2329 }
2330 2330 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2331 2331 }
2332 2332 }
2333 2333
2334 2334 uint64_t
2335 2335 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
2336 2336 {
2337 2337 return (ds->ds_fsid_guid);
2338 2338 }
2339 2339
2340 2340 void
2341 2341 dsl_dataset_space(dsl_dataset_t *ds,
2342 2342 uint64_t *refdbytesp, uint64_t *availbytesp,
2343 2343 uint64_t *usedobjsp, uint64_t *availobjsp)
2344 2344 {
2345 2345 *refdbytesp = ds->ds_phys->ds_referenced_bytes;
2346 2346 *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
2347 2347 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
2348 2348 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
2349 2349 if (ds->ds_quota != 0) {
2350 2350 /*
2351 2351 * Adjust available bytes according to refquota
2352 2352 */
2353 2353 if (*refdbytesp < ds->ds_quota)
2354 2354 *availbytesp = MIN(*availbytesp,
2355 2355 ds->ds_quota - *refdbytesp);
2356 2356 else
2357 2357 *availbytesp = 0;
2358 2358 }
2359 2359 *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
2360 2360 *availobjsp = DN_MAX_OBJECT - *usedobjsp;
2361 2361 }
2362 2362
2363 2363 boolean_t
2364 2364 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
2365 2365 {
2366 2366 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2367 2367
2368 2368 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
2369 2369 dsl_pool_sync_context(dp));
2370 2370 if (ds->ds_prev == NULL)
2371 2371 return (B_FALSE);
2372 2372 if (ds->ds_phys->ds_bp.blk_birth >
2373 2373 ds->ds_prev->ds_phys->ds_creation_txg) {
2374 2374 objset_t *os, *os_prev;
2375 2375 /*
2376 2376 * It may be that only the ZIL differs, because it was
2377 2377 * reset in the head. Don't count that as being
2378 2378 * modified.
2379 2379 */
2380 2380 if (dmu_objset_from_ds(ds, &os) != 0)
2381 2381 return (B_TRUE);
2382 2382 if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
2383 2383 return (B_TRUE);
2384 2384 return (bcmp(&os->os_phys->os_meta_dnode,
2385 2385 &os_prev->os_phys->os_meta_dnode,
2386 2386 sizeof (os->os_phys->os_meta_dnode)) != 0);
2387 2387 }
2388 2388 return (B_FALSE);
2389 2389 }
2390 2390
2391 2391 /* ARGSUSED */
2392 2392 static int
2393 2393 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
2394 2394 {
2395 2395 dsl_dataset_t *ds = arg1;
2396 2396 char *newsnapname = arg2;
2397 2397 dsl_dir_t *dd = ds->ds_dir;
2398 2398 dsl_dataset_t *hds;
2399 2399 uint64_t val;
2400 2400 int err;
2401 2401
2402 2402 err = dsl_dataset_hold_obj(dd->dd_pool,
2403 2403 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
2404 2404 if (err)
2405 2405 return (err);
2406 2406
2407 2407 /* new name better not be in use */
2408 2408 err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
2409 2409 dsl_dataset_rele(hds, FTAG);
2410 2410
2411 2411 if (err == 0)
2412 2412 err = EEXIST;
2413 2413 else if (err == ENOENT)
2414 2414 err = 0;
2415 2415
2416 2416 /* dataset name + 1 for the "@" + the new snapshot name must fit */
2417 2417 if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
2418 2418 err = ENAMETOOLONG;
2419 2419
2420 2420 return (err);
2421 2421 }
2422 2422
2423 2423 static void
2424 2424 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
2425 2425 {
2426 2426 dsl_dataset_t *ds = arg1;
2427 2427 const char *newsnapname = arg2;
2428 2428 dsl_dir_t *dd = ds->ds_dir;
2429 2429 objset_t *mos = dd->dd_pool->dp_meta_objset;
2430 2430 dsl_dataset_t *hds;
2431 2431 int err;
2432 2432
2433 2433 ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
2434 2434
2435 2435 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
2436 2436 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));
2437 2437
2438 2438 VERIFY(0 == dsl_dataset_get_snapname(ds));
2439 2439 err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
2440 2440 ASSERT0(err);
2441 2441 mutex_enter(&ds->ds_lock);
2442 2442 (void) strcpy(ds->ds_snapname, newsnapname);
2443 2443 mutex_exit(&ds->ds_lock);
2444 2444 err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
2445 2445 ds->ds_snapname, 8, 1, &ds->ds_object, tx);
2446 2446 ASSERT0(err);
2447 2447
2448 2448 spa_history_log_internal_ds(ds, "rename", tx,
2449 2449 "-> @%s", newsnapname);
2450 2450 dsl_dataset_rele(hds, FTAG);
2451 2451 }
2452 2452
2453 2453 struct renamesnaparg {
2454 2454 dsl_sync_task_group_t *dstg;
2455 2455 char failed[MAXPATHLEN];
2456 2456 char *oldsnap;
2457 2457 char *newsnap;
2458 2458 };
2459 2459
2460 2460 static int
2461 2461 dsl_snapshot_rename_one(const char *name, void *arg)
2462 2462 {
2463 2463 struct renamesnaparg *ra = arg;
2464 2464 dsl_dataset_t *ds = NULL;
2465 2465 char *snapname;
2466 2466 int err;
2467 2467
2468 2468 snapname = kmem_asprintf("%s@%s", name, ra->oldsnap);
2469 2469 (void) strlcpy(ra->failed, snapname, sizeof (ra->failed));
2470 2470
2471 2471 /*
2472 2472 * For recursive snapshot renames the parent won't be changing
2473 2473 * so we just pass name for both the to/from argument.
2474 2474 */
2475 2475 err = zfs_secpolicy_rename_perms(snapname, snapname, CRED());
2476 2476 if (err != 0) {
2477 2477 strfree(snapname);
2478 2478 return (err == ENOENT ? 0 : err);
2479 2479 }
2480 2480
2481 2481 #ifdef _KERNEL
2482 2482 /*
2483 2483 * For all filesystems undergoing rename, we'll need to unmount it.
2484 2484 */
2485 2485 (void) zfs_unmount_snap(snapname, NULL);
2486 2486 #endif
2487 2487 err = dsl_dataset_hold(snapname, ra->dstg, &ds);
2488 2488 strfree(snapname);
2489 2489 if (err != 0)
2490 2490 return (err == ENOENT ? 0 : err);
2491 2491
2492 2492 dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
2493 2493 dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
2494 2494
2495 2495 return (0);
2496 2496 }
2497 2497
2498 2498 static int
2499 2499 dsl_recursive_rename(char *oldname, const char *newname)
2500 2500 {
2501 2501 int err;
2502 2502 struct renamesnaparg *ra;
2503 2503 dsl_sync_task_t *dst;
2504 2504 spa_t *spa;
2505 2505 char *cp, *fsname = spa_strdup(oldname);
2506 2506 int len = strlen(oldname) + 1;
2507 2507
2508 2508 /* truncate the snapshot name to get the fsname */
2509 2509 cp = strchr(fsname, '@');
2510 2510 *cp = '\0';
2511 2511
2512 2512 err = spa_open(fsname, &spa, FTAG);
2513 2513 if (err) {
2514 2514 kmem_free(fsname, len);
2515 2515 return (err);
2516 2516 }
2517 2517 ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
2518 2518 ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
2519 2519
2520 2520 ra->oldsnap = strchr(oldname, '@') + 1;
2521 2521 ra->newsnap = strchr(newname, '@') + 1;
2522 2522 *ra->failed = '\0';
2523 2523
2524 2524 err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
2525 2525 DS_FIND_CHILDREN);
2526 2526 kmem_free(fsname, len);
2527 2527
2528 2528 if (err == 0) {
2529 2529 err = dsl_sync_task_group_wait(ra->dstg);
2530 2530 }
2531 2531
2532 2532 for (dst = list_head(&ra->dstg->dstg_tasks); dst;
2533 2533 dst = list_next(&ra->dstg->dstg_tasks, dst)) {
2534 2534 dsl_dataset_t *ds = dst->dst_arg1;
2535 2535 if (dst->dst_err) {
2536 2536 dsl_dir_name(ds->ds_dir, ra->failed);
2537 2537 (void) strlcat(ra->failed, "@", sizeof (ra->failed));
2538 2538 (void) strlcat(ra->failed, ra->newsnap,
2539 2539 sizeof (ra->failed));
2540 2540 }
2541 2541 dsl_dataset_rele(ds, ra->dstg);
2542 2542 }
2543 2543
2544 2544 if (err)
2545 2545 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed));
2546 2546
2547 2547 dsl_sync_task_group_destroy(ra->dstg);
2548 2548 kmem_free(ra, sizeof (struct renamesnaparg));
2549 2549 spa_close(spa, FTAG);
2550 2550 return (err);
2551 2551 }
2552 2552
2553 2553 static int
2554 2554 dsl_valid_rename(const char *oldname, void *arg)
2555 2555 {
2556 2556 int delta = *(int *)arg;
2557 2557
2558 2558 if (strlen(oldname) + delta >= MAXNAMELEN)
2559 2559 return (ENAMETOOLONG);
2560 2560
2561 2561 return (0);
2562 2562 }
2563 2563
2564 2564 #pragma weak dmu_objset_rename = dsl_dataset_rename
2565 2565 int
2566 2566 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
2567 2567 {
2568 2568 dsl_dir_t *dd;
2569 2569 dsl_dataset_t *ds;
2570 2570 const char *tail;
2571 2571 int err;
2572 2572
2573 2573 err = dsl_dir_open(oldname, FTAG, &dd, &tail);
2574 2574 if (err)
2575 2575 return (err);
2576 2576
2577 2577 if (tail == NULL) {
2578 2578 int delta = strlen(newname) - strlen(oldname);
2579 2579
2580 2580 /* if we're growing, validate child name lengths */
2581 2581 if (delta > 0)
2582 2582 err = dmu_objset_find(oldname, dsl_valid_rename,
2583 2583 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
2584 2584
2585 2585 if (err == 0)
2586 2586 err = dsl_dir_rename(dd, newname);
2587 2587 dsl_dir_close(dd, FTAG);
2588 2588 return (err);
2589 2589 }
2590 2590
2591 2591 if (tail[0] != '@') {
2592 2592 /* the name ended in a nonexistent component */
2593 2593 dsl_dir_close(dd, FTAG);
2594 2594 return (ENOENT);
2595 2595 }
2596 2596
2597 2597 dsl_dir_close(dd, FTAG);
2598 2598
2599 2599 /* new name must be snapshot in same filesystem */
2600 2600 tail = strchr(newname, '@');
2601 2601 if (tail == NULL)
2602 2602 return (EINVAL);
2603 2603 tail++;
2604 2604 if (strncmp(oldname, newname, tail - newname) != 0)
2605 2605 return (EXDEV);
2606 2606
2607 2607 if (recursive) {
2608 2608 err = dsl_recursive_rename(oldname, newname);
2609 2609 } else {
2610 2610 err = dsl_dataset_hold(oldname, FTAG, &ds);
2611 2611 if (err)
2612 2612 return (err);
2613 2613
2614 2614 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
2615 2615 dsl_dataset_snapshot_rename_check,
2616 2616 dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
2617 2617
2618 2618 dsl_dataset_rele(ds, FTAG);
2619 2619 }
2620 2620
2621 2621 return (err);
2622 2622 }
2623 2623
2624 2624 struct promotenode {
2625 2625 list_node_t link;
2626 2626 dsl_dataset_t *ds;
2627 2627 };
2628 2628
2629 2629 struct promotearg {
2630 2630 list_t shared_snaps, origin_snaps, clone_snaps;
2631 2631 dsl_dataset_t *origin_origin;
2632 2632 uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
2633 2633 char *err_ds;
2634 2634 };
2635 2635
2636 2636 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
2637 2637 static boolean_t snaplist_unstable(list_t *l);
2638 2638
2639 2639 static int
2640 2640 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
2641 2641 {
2642 2642 dsl_dataset_t *hds = arg1;
2643 2643 struct promotearg *pa = arg2;
2644 2644 struct promotenode *snap = list_head(&pa->shared_snaps);
2645 2645 dsl_dataset_t *origin_ds = snap->ds;
2646 2646 int err;
2647 2647 uint64_t unused;
2648 2648
2649 2649 /* Check that it is a real clone */
2650 2650 if (!dsl_dir_is_clone(hds->ds_dir))
2651 2651 return (EINVAL);
2652 2652
2653 2653 /* Since this is so expensive, don't do the preliminary check */
2654 2654 if (!dmu_tx_is_syncing(tx))
2655 2655 return (0);
2656 2656
2657 2657 if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
2658 2658 return (EXDEV);
2659 2659
2660 2660 /* compute origin's new unique space */
2661 2661 snap = list_tail(&pa->clone_snaps);
2662 2662 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
2663 2663 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2664 2664 origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
2665 2665 &pa->unique, &unused, &unused);
2666 2666
2667 2667 /*
2668 2668 * Walk the snapshots that we are moving
2669 2669 *
2670 2670 * Compute space to transfer. Consider the incremental changes
2671 2671 * to used for each snapshot:
2672 2672 * (my used) = (prev's used) + (blocks born) - (blocks killed)
2673 2673 * So each snapshot gave birth to:
2674 2674 * (blocks born) = (my used) - (prev's used) + (blocks killed)
2675 2675 * So a sequence would look like:
2676 2676 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
2677 2677 * Which simplifies to:
2678 2678 * uN + kN + kN-1 + ... + k1 + k0
2679 2679 * Note however, if we stop before we reach the ORIGIN we get:
2680 2680 * uN + kN + kN-1 + ... + kM - uM-1
2681 2681 */
2682 2682 pa->used = origin_ds->ds_phys->ds_referenced_bytes;
2683 2683 pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
2684 2684 pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
2685 2685 for (snap = list_head(&pa->shared_snaps); snap;
2686 2686 snap = list_next(&pa->shared_snaps, snap)) {
2687 2687 uint64_t val, dlused, dlcomp, dluncomp;
2688 2688 dsl_dataset_t *ds = snap->ds;
2689 2689
2690 2690 /* Check that the snapshot name does not conflict */
2691 2691 VERIFY(0 == dsl_dataset_get_snapname(ds));
2692 2692 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
2693 2693 if (err == 0) {
2694 2694 err = EEXIST;
2695 2695 goto out;
2696 2696 }
2697 2697 if (err != ENOENT)
2698 2698 goto out;
2699 2699
2700 2700 /* The very first snapshot does not have a deadlist */
2701 2701 if (ds->ds_phys->ds_prev_snap_obj == 0)
2702 2702 continue;
2703 2703
2704 2704 dsl_deadlist_space(&ds->ds_deadlist,
2705 2705 &dlused, &dlcomp, &dluncomp);
2706 2706 pa->used += dlused;
2707 2707 pa->comp += dlcomp;
2708 2708 pa->uncomp += dluncomp;
2709 2709 }
2710 2710
2711 2711 /*
2712 2712 * If we are a clone of a clone then we never reached ORIGIN,
2713 2713 * so we need to subtract out the clone origin's used space.
2714 2714 */
2715 2715 if (pa->origin_origin) {
2716 2716 pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
2717 2717 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
2718 2718 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
2719 2719 }
2720 2720
2721 2721 /* Check that there is enough space here */
2722 2722 err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
2723 2723 pa->used);
2724 2724 if (err)
2725 2725 return (err);
2726 2726
2727 2727 /*
2728 2728 * Compute the amounts of space that will be used by snapshots
2729 2729 * after the promotion (for both origin and clone). For each,
2730 2730 * it is the amount of space that will be on all of their
2731 2731 * deadlists (that was not born before their new origin).
2732 2732 */
2733 2733 if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2734 2734 uint64_t space;
2735 2735
2736 2736 /*
2737 2737 * Note, typically this will not be a clone of a clone,
2738 2738 * so dd_origin_txg will be < TXG_INITIAL, so
2739 2739 * these snaplist_space() -> dsl_deadlist_space_range()
2740 2740 * calls will be fast because they do not have to
2741 2741 * iterate over all bps.
2742 2742 */
2743 2743 snap = list_head(&pa->origin_snaps);
2744 2744 err = snaplist_space(&pa->shared_snaps,
2745 2745 snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
2746 2746 if (err)
2747 2747 return (err);
2748 2748
2749 2749 err = snaplist_space(&pa->clone_snaps,
2750 2750 snap->ds->ds_dir->dd_origin_txg, &space);
2751 2751 if (err)
2752 2752 return (err);
2753 2753 pa->cloneusedsnap += space;
2754 2754 }
2755 2755 if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2756 2756 err = snaplist_space(&pa->origin_snaps,
2757 2757 origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
2758 2758 if (err)
2759 2759 return (err);
2760 2760 }
2761 2761
2762 2762 return (0);
2763 2763 out:
2764 2764 pa->err_ds = snap->ds->ds_snapname;
2765 2765 return (err);
2766 2766 }
2767 2767
/*
 * Sync-task callback that performs the promotion of a clone.
 *
 * arg1 is the clone's head dataset (hds) and arg2 is the struct promotearg
 * whose snapshot lists and space totals are consumed here.  The function
 * relinks the origin snapshot's next-snap/next-clones pointers, swaps the
 * origin relationship between the clone's dsl_dir (dd) and the origin's
 * dsl_dir (odd), moves every snapshot on pa->shared_snaps into dd, and then
 * adjusts the space accounting of both dsl_dirs.
 */
static void
dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *hds = arg1;
	struct promotearg *pa = arg2;
	/* The head of shared_snaps is treated as the origin snapshot. */
	struct promotenode *snap = list_head(&pa->shared_snaps);
	dsl_dataset_t *origin_ds = snap->ds;
	dsl_dataset_t *origin_head;
	dsl_dir_t *dd = hds->ds_dir;
	dsl_pool_t *dp = hds->ds_dir->dd_pool;
	dsl_dir_t *odd = NULL;
	uint64_t oldnext_obj;
	int64_t delta;

	ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));

	/* The head of origin_snaps is treated as the origin's head dataset. */
	snap = list_head(&pa->origin_snaps);
	origin_head = snap->ds;

	/*
	 * We need to explicitly open odd, since origin_ds's dd will be
	 * changing.
	 */
	VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
	    NULL, FTAG, &odd));

	/* change origin's next snap */
	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
	/* Remember the old successor; it is re-added as a next-clone below. */
	oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
	snap = list_tail(&pa->clone_snaps);
	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
	origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;

	/* change the origin's next clone */
	if (origin_ds->ds_phys->ds_next_clones_obj) {
		remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
		    origin_ds->ds_phys->ds_next_clones_obj,
		    oldnext_obj, tx));
	}

	/* change origin */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
	/* dd inherits odd's origin; odd becomes a clone of origin_ds. */
	dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
	dmu_buf_will_dirty(odd->dd_dbuf, tx);
	odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
	origin_head->ds_dir->dd_origin_txg =
	    origin_ds->ds_phys->ds_creation_txg;

	/* change dd_clone entries */
	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
		/* hds stops being a clone of odd ... */
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    odd->dd_phys->dd_clones, hds->ds_object, tx));
		/* ... and is listed under origin_origin's dsl_dir instead. */
		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
		    pa->origin_origin->ds_dir->dd_phys->dd_clones,
		    hds->ds_object, tx));

		/* origin_head moves the other way: it becomes a clone of dd. */
		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
		    pa->origin_origin->ds_dir->dd_phys->dd_clones,
		    origin_head->ds_object, tx));
		/* Create dd's clones zap on demand. */
		if (dd->dd_phys->dd_clones == 0) {
			dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
		}
		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
		    dd->dd_phys->dd_clones, origin_head->ds_object, tx));

	}

	/* move snapshots to this dir */
	for (snap = list_head(&pa->shared_snaps); snap;
	    snap = list_next(&pa->shared_snaps, snap)) {
		dsl_dataset_t *ds = snap->ds;

		/* unregister props as dsl_dir is changing */
		if (ds->ds_objset) {
			dmu_objset_evict(ds->ds_objset);
			ds->ds_objset = NULL;
		}
		/* move snap name entry */
		VERIFY(0 == dsl_dataset_get_snapname(ds));
		VERIFY(0 == dsl_dataset_snap_remove(origin_head,
		    ds->ds_snapname, tx));
		VERIFY(0 == zap_add(dp->dp_meta_objset,
		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
		    8, 1, &ds->ds_object, tx));

		/* change containing dsl_dir */
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
		ds->ds_phys->ds_dir_obj = dd->dd_object;
		ASSERT3P(ds->ds_dir, ==, odd);
		/* Swap the in-core dsl_dir hold from odd to dd. */
		dsl_dir_close(ds->ds_dir, ds);
		VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
		    NULL, ds, &ds->ds_dir));

		/* move any clone references */
		if (ds->ds_phys->ds_next_clones_obj &&
		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
			zap_cursor_t zc;
			zap_attribute_t za;

			for (zap_cursor_init(&zc, dp->dp_meta_objset,
			    ds->ds_phys->ds_next_clones_obj);
			    zap_cursor_retrieve(&zc, &za) == 0;
			    zap_cursor_advance(&zc)) {
				dsl_dataset_t *cnds;
				uint64_t o;

				if (za.za_first_integer == oldnext_obj) {
					/*
					 * We've already moved the
					 * origin's reference.
					 */
					continue;
				}

				VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &cnds));
				/* dd_clones is keyed by head dataset object. */
				o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;

				VERIFY3U(zap_remove_int(dp->dp_meta_objset,
				    odd->dd_phys->dd_clones, o, tx), ==, 0);
				VERIFY3U(zap_add_int(dp->dp_meta_objset,
				    dd->dd_phys->dd_clones, o, tx), ==, 0);
				dsl_dataset_rele(cnds, FTAG);
			}
			zap_cursor_fini(&zc);
		}

		ASSERT0(dsl_prop_numcb(ds));
	}

	/*
	 * Change space accounting.
	 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
	 * both be valid, or both be 0 (resulting in delta == 0).  This
	 * is true for each of {clone,origin} independently.
	 */

	/* The clone's dir gains snapshot space and the transferred head space. */
	delta = pa->cloneusedsnap -
	    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
	ASSERT3S(delta, >=, 0);
	ASSERT3U(pa->used, >=, delta);
	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
	dsl_dir_diduse_space(dd, DD_USED_HEAD,
	    pa->used - delta, pa->comp, pa->uncomp, tx);

	/* The origin's dir gives up the same amounts (delta is <= 0 here). */
	delta = pa->originusedsnap -
	    odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
	ASSERT3S(delta, <=, 0);
	ASSERT3U(pa->used, >=, -delta);
	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
	dsl_dir_diduse_space(odd, DD_USED_HEAD,
	    -pa->used - delta, -pa->comp, -pa->uncomp, tx);

	origin_ds->ds_phys->ds_unique_bytes = pa->unique;

	/* log history record */
	spa_history_log_internal_ds(hds, "promote", tx, "");

	/* Drop the explicit hold on odd taken at the top of this function. */
	dsl_dir_close(odd, FTAG);
}
2933 2933
2934 2934 static char *snaplist_tag = "snaplist";
2935 2935 /*
2936 2936 * Make a list of dsl_dataset_t's for the snapshots between first_obj
2937 2937 * (exclusive) and last_obj (inclusive). The list will be in reverse
2938 2938 * order (last_obj will be the list_head()). If first_obj == 0, do all
2939 2939 * snapshots back to this dataset's origin.
2940 2940 */
2941 2941 static int
2942 2942 snaplist_make(dsl_pool_t *dp, boolean_t own,
2943 2943 uint64_t first_obj, uint64_t last_obj, list_t *l)
2944 2944 {
2945 2945 uint64_t obj = last_obj;
2946 2946
2947 2947 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
2948 2948
2949 2949 list_create(l, sizeof (struct promotenode),
2950 2950 offsetof(struct promotenode, link));
2951 2951
2952 2952 while (obj != first_obj) {
2953 2953 dsl_dataset_t *ds;
2954 2954 struct promotenode *snap;
2955 2955 int err;
2956 2956
2957 2957 if (own) {
2958 2958 err = dsl_dataset_own_obj(dp, obj,
2959 2959 0, snaplist_tag, &ds);
2960 2960 if (err == 0)
2961 2961 dsl_dataset_make_exclusive(ds, snaplist_tag);
2962 2962 } else {
2963 2963 err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds);
2964 2964 }
2965 2965 if (err == ENOENT) {
2966 2966 /* lost race with snapshot destroy */
2967 2967 struct promotenode *last = list_tail(l);
2968 2968 ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj);
2969 2969 obj = last->ds->ds_phys->ds_prev_snap_obj;
2970 2970 continue;
2971 2971 } else if (err) {
2972 2972 return (err);
2973 2973 }
2974 2974
2975 2975 if (first_obj == 0)
2976 2976 first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
2977 2977
2978 2978 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
2979 2979 snap->ds = ds;
2980 2980 list_insert_tail(l, snap);
2981 2981 obj = ds->ds_phys->ds_prev_snap_obj;
2982 2982 }
2983 2983
2984 2984 return (0);
2985 2985 }
2986 2986
2987 2987 static int
2988 2988 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2989 2989 {
2990 2990 struct promotenode *snap;
2991 2991
2992 2992 *spacep = 0;
2993 2993 for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2994 2994 uint64_t used, comp, uncomp;
2995 2995 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2996 2996 mintxg, UINT64_MAX, &used, &comp, &uncomp);
2997 2997 *spacep += used;
2998 2998 }
2999 2999 return (0);
3000 3000 }
3001 3001
3002 3002 static void
3003 3003 snaplist_destroy(list_t *l, boolean_t own)
3004 3004 {
3005 3005 struct promotenode *snap;
3006 3006
3007 3007 if (!l || !list_link_active(&l->list_head))
3008 3008 return;
3009 3009
3010 3010 while ((snap = list_tail(l)) != NULL) {
3011 3011 list_remove(l, snap);
3012 3012 if (own)
3013 3013 dsl_dataset_disown(snap->ds, snaplist_tag);
3014 3014 else
3015 3015 dsl_dataset_rele(snap->ds, snaplist_tag);
3016 3016 kmem_free(snap, sizeof (struct promotenode));
3017 3017 }
3018 3018 list_destroy(l);
3019 3019 }
3020 3020
3021 3021 /*
3022 3022 * Promote a clone. Nomenclature note:
3023 3023 * "clone" or "cds": the original clone which is being promoted
3024 3024 * "origin" or "ods": the snapshot which is originally clone's origin
3025 3025 * "origin head" or "ohds": the dataset which is the head
3026 3026 * (filesystem/volume) for the origin
3027 3027 * "origin origin": the origin of the origin's filesystem (typically
3028 3028 * NULL, indicating that the clone is not a clone of a clone).
3029 3029 */
3030 3030 int
3031 3031 dsl_dataset_promote(const char *name, char *conflsnap)
3032 3032 {
3033 3033 dsl_dataset_t *ds;
3034 3034 dsl_dir_t *dd;
3035 3035 dsl_pool_t *dp;
3036 3036 dmu_object_info_t doi;
3037 3037 struct promotearg pa = { 0 };
3038 3038 struct promotenode *snap;
3039 3039 int err;
3040 3040
3041 3041 err = dsl_dataset_hold(name, FTAG, &ds);
3042 3042 if (err)
3043 3043 return (err);
3044 3044 dd = ds->ds_dir;
3045 3045 dp = dd->dd_pool;
3046 3046
3047 3047 err = dmu_object_info(dp->dp_meta_objset,
3048 3048 ds->ds_phys->ds_snapnames_zapobj, &doi);
3049 3049 if (err) {
3050 3050 dsl_dataset_rele(ds, FTAG);
3051 3051 return (err);
3052 3052 }
3053 3053
3054 3054 if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
3055 3055 dsl_dataset_rele(ds, FTAG);
3056 3056 return (EINVAL);
3057 3057 }
3058 3058
3059 3059 /*
3060 3060 * We are going to inherit all the snapshots taken before our
3061 3061 * origin (i.e., our new origin will be our parent's origin).
3062 3062 * Take ownership of them so that we can rename them into our
3063 3063 * namespace.
3064 3064 */
3065 3065 rw_enter(&dp->dp_config_rwlock, RW_READER);
3066 3066
3067 3067 err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
3068 3068 &pa.shared_snaps);
3069 3069 if (err != 0)
3070 3070 goto out;
3071 3071
3072 3072 err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
3073 3073 if (err != 0)
3074 3074 goto out;
3075 3075
3076 3076 snap = list_head(&pa.shared_snaps);
3077 3077 ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
3078 3078 err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
3079 3079 snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
3080 3080 if (err != 0)
3081 3081 goto out;
3082 3082
3083 3083 if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
3084 3084 err = dsl_dataset_hold_obj(dp,
3085 3085 snap->ds->ds_dir->dd_phys->dd_origin_obj,
3086 3086 FTAG, &pa.origin_origin);
3087 3087 if (err != 0)
3088 3088 goto out;
3089 3089 }
3090 3090
3091 3091 out:
3092 3092 rw_exit(&dp->dp_config_rwlock);
3093 3093
3094 3094 /*
3095 3095 * Add in 128x the snapnames zapobj size, since we will be moving
3096 3096 * a bunch of snapnames to the promoted ds, and dirtying their
3097 3097 * bonus buffers.
3098 3098 */
3099 3099 if (err == 0) {
3100 3100 err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
3101 3101 dsl_dataset_promote_sync, ds, &pa,
3102 3102 2 + 2 * doi.doi_physical_blocks_512);
3103 3103 if (err && pa.err_ds && conflsnap)
3104 3104 (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
3105 3105 }
3106 3106
3107 3107 snaplist_destroy(&pa.shared_snaps, B_TRUE);
3108 3108 snaplist_destroy(&pa.clone_snaps, B_FALSE);
3109 3109 snaplist_destroy(&pa.origin_snaps, B_FALSE);
3110 3110 if (pa.origin_origin)
3111 3111 dsl_dataset_rele(pa.origin_origin, FTAG);
3112 3112 dsl_dataset_rele(ds, FTAG);
3113 3113 return (err);
3114 3114 }
3115 3115
/*
 * Argument bundle shared by dsl_dataset_clone_swap_check() and
 * dsl_dataset_clone_swap_sync().  The first three fields are filled in by
 * dsl_dataset_clone_swap(); unused_refres_delta is computed by the check
 * function and applied by the sync function.
 */
struct cloneswaparg {
	dsl_dataset_t *cds; /* clone dataset */
	dsl_dataset_t *ohds; /* origin's head dataset */
	boolean_t force; /* swap even if ohds was modified since last snap */
	int64_t unused_refres_delta; /* change in unconsumed refreservation */
};
3122 3122
3123 3123 /* ARGSUSED */
3124 3124 static int
3125 3125 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
3126 3126 {
3127 3127 struct cloneswaparg *csa = arg1;
3128 3128
3129 3129 /* they should both be heads */
3130 3130 if (dsl_dataset_is_snapshot(csa->cds) ||
3131 3131 dsl_dataset_is_snapshot(csa->ohds))
3132 3132 return (EINVAL);
3133 3133
3134 3134 /* the branch point should be just before them */
3135 3135 if (csa->cds->ds_prev != csa->ohds->ds_prev)
3136 3136 return (EINVAL);
3137 3137
3138 3138 /* cds should be the clone (unless they are unrelated) */
3139 3139 if (csa->cds->ds_prev != NULL &&
3140 3140 csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
3141 3141 csa->ohds->ds_object !=
3142 3142 csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
3143 3143 return (EINVAL);
3144 3144
3145 3145 /* the clone should be a child of the origin */
3146 3146 if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
3147 3147 return (EINVAL);
3148 3148
3149 3149 /* ohds shouldn't be modified unless 'force' */
3150 3150 if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
3151 3151 return (ETXTBSY);
3152 3152
3153 3153 /* adjust amount of any unconsumed refreservation */
3154 3154 csa->unused_refres_delta =
3155 3155 (int64_t)MIN(csa->ohds->ds_reserved,
3156 3156 csa->ohds->ds_phys->ds_unique_bytes) -
3157 3157 (int64_t)MIN(csa->ohds->ds_reserved,
3158 3158 csa->cds->ds_phys->ds_unique_bytes);
3159 3159
3160 3160 if (csa->unused_refres_delta > 0 &&
3161 3161 csa->unused_refres_delta >
3162 3162 dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
3163 3163 return (ENOSPC);
3164 3164
3165 3165 if (csa->ohds->ds_quota != 0 &&
3166 3166 csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
3167 3167 return (EDQUOT);
3168 3168
3169 3169 return (0);
3170 3170 }
3171 3171
/*
 * Sync callback for the clone-swap sync task.  arg1 is the struct
 * cloneswaparg; arg2 is unused.
 *
 * Exchanges the contents of csa->cds and csa->ohds: their root block
 * pointers, their referenced/compressed/uncompressed/unique byte counts
 * and their deadlists, and adjusts the dsl_dir space accounting to match.
 */
/* ARGSUSED */
static void
dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	struct cloneswaparg *csa = arg1;
	dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;

	ASSERT(csa->cds->ds_reserved == 0);
	ASSERT(csa->ohds->ds_quota == 0 ||
	    csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);

	dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
	dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);

	/* Evict any open objsets on either dataset before swapping. */
	if (csa->cds->ds_objset != NULL) {
		dmu_objset_evict(csa->cds->ds_objset);
		csa->cds->ds_objset = NULL;
	}

	if (csa->ohds->ds_objset != NULL) {
		dmu_objset_evict(csa->ohds->ds_objset);
		csa->ohds->ds_objset = NULL;
	}

	/*
	 * Reset origin's unique bytes, if it exists.
	 */
	if (csa->cds->ds_prev) {
		dsl_dataset_t *origin = csa->cds->ds_prev;
		uint64_t comp, uncomp;

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		/* Only the "used" result is kept; comp/uncomp are discarded. */
		dsl_deadlist_space_range(&csa->cds->ds_deadlist,
		    origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
		    &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
	}

	/* swap blkptrs */
	{
		blkptr_t tmp;
		tmp = csa->ohds->ds_phys->ds_bp;
		csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
		csa->cds->ds_phys->ds_bp = tmp;
	}

	/* set dd_*_bytes */
	{
		int64_t dused, dcomp, duncomp;
		uint64_t cdl_used, cdl_comp, cdl_uncomp;
		uint64_t odl_used, odl_comp, odl_uncomp;

		ASSERT3U(csa->cds->ds_dir->dd_phys->
		    dd_used_breakdown[DD_USED_SNAP], ==, 0);

		dsl_deadlist_space(&csa->cds->ds_deadlist,
		    &cdl_used, &cdl_comp, &cdl_uncomp);
		dsl_deadlist_space(&csa->ohds->ds_deadlist,
		    &odl_used, &odl_comp, &odl_uncomp);

		/*
		 * Each delta is (clone's referenced + deadlist space) minus
		 * (origin head's referenced + deadlist space).
		 */
		dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used -
		    (csa->ohds->ds_phys->ds_referenced_bytes + odl_used);
		dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
		    (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
		duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
		    cdl_uncomp -
		    (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);

		/* Apply the delta to ohds's dir and its negation to cds's. */
		dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
		    dused, dcomp, duncomp, tx);
		dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
		    -dused, -dcomp, -duncomp, tx);

		/*
		 * The difference in the space used by snapshots is the
		 * difference in snapshot space due to the head's
		 * deadlist (since that's the only thing that's
		 * changing that affects the snapused).
		 */
		dsl_deadlist_space_range(&csa->cds->ds_deadlist,
		    csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
		    &cdl_used, &cdl_comp, &cdl_uncomp);
		dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
		    csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
		    &odl_used, &odl_comp, &odl_uncomp);
		dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
		    DD_USED_HEAD, DD_USED_SNAP, tx);
	}

	/* swap ds_*_bytes */
	SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes,
	    csa->cds->ds_phys->ds_referenced_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
	    csa->cds->ds_phys->ds_compressed_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
	    csa->cds->ds_phys->ds_uncompressed_bytes);
	SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
	    csa->cds->ds_phys->ds_unique_bytes);

	/* apply any parent delta for change in unconsumed refreservation */
	dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
	    csa->unused_refres_delta, 0, 0, tx);

	/*
	 * Swap deadlists.
	 */
	/* Close both, exchange the object numbers, then reopen each one. */
	dsl_deadlist_close(&csa->cds->ds_deadlist);
	dsl_deadlist_close(&csa->ohds->ds_deadlist);
	SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
	    csa->cds->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
	    csa->cds->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
	    csa->ohds->ds_phys->ds_deadlist_obj);

	dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);

	spa_history_log_internal_ds(csa->cds, "clone swap", tx,
	    "parent=%s", csa->ohds->ds_dir->dd_myname);
}
3291 3291
3292 3292 /*
3293 3293 * Swap 'clone' with its origin head datasets. Used at the end of "zfs
3294 3294 * recv" into an existing fs to swizzle the file system to the new
3295 3295 * version, and by "zfs rollback". Can also be used to swap two
3296 3296 * independent head datasets if neither has any snapshots.
3297 3297 */
3298 3298 int
3299 3299 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
3300 3300 boolean_t force)
3301 3301 {
3302 3302 struct cloneswaparg csa;
3303 3303 int error;
3304 3304
3305 3305 ASSERT(clone->ds_owner);
3306 3306 ASSERT(origin_head->ds_owner);
3307 3307 retry:
3308 3308 /*
3309 3309 * Need exclusive access for the swap. If we're swapping these
3310 3310 * datasets back after an error, we already hold the locks.
3311 3311 */
3312 3312 if (!RW_WRITE_HELD(&clone->ds_rwlock))
3313 3313 rw_enter(&clone->ds_rwlock, RW_WRITER);
3314 3314 if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
3315 3315 !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
3316 3316 rw_exit(&clone->ds_rwlock);
3317 3317 rw_enter(&origin_head->ds_rwlock, RW_WRITER);
3318 3318 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
3319 3319 rw_exit(&origin_head->ds_rwlock);
3320 3320 goto retry;
3321 3321 }
3322 3322 }
3323 3323 csa.cds = clone;
3324 3324 csa.ohds = origin_head;
3325 3325 csa.force = force;
3326 3326 error = dsl_sync_task_do(clone->ds_dir->dd_pool,
3327 3327 dsl_dataset_clone_swap_check,
3328 3328 dsl_dataset_clone_swap_sync, &csa, NULL, 9);
3329 3329 return (error);
3330 3330 }
3331 3331
3332 3332 /*
3333 3333 * Given a pool name and a dataset object number in that pool,
3334 3334 * return the name of that dataset.
3335 3335 */
3336 3336 int
3337 3337 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
3338 3338 {
3339 3339 spa_t *spa;
3340 3340 dsl_pool_t *dp;
3341 3341 dsl_dataset_t *ds;
3342 3342 int error;
3343 3343
3344 3344 if ((error = spa_open(pname, &spa, FTAG)) != 0)
3345 3345 return (error);
3346 3346 dp = spa_get_dsl(spa);
3347 3347 rw_enter(&dp->dp_config_rwlock, RW_READER);
3348 3348 if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
3349 3349 dsl_dataset_name(ds, buf);
3350 3350 dsl_dataset_rele(ds, FTAG);
3351 3351 }
3352 3352 rw_exit(&dp->dp_config_rwlock);
3353 3353 spa_close(spa, FTAG);
3354 3354
3355 3355 return (error);
3356 3356 }
3357 3357
3358 3358 int
3359 3359 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
3360 3360 uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
3361 3361 {
3362 3362 int error = 0;
3363 3363
3364 3364 ASSERT3S(asize, >, 0);
3365 3365
3366 3366 /*
3367 3367 * *ref_rsrv is the portion of asize that will come from any
3368 3368 * unconsumed refreservation space.
3369 3369 */
3370 3370 *ref_rsrv = 0;
3371 3371
3372 3372 mutex_enter(&ds->ds_lock);
3373 3373 /*
3374 3374 * Make a space adjustment for reserved bytes.
3375 3375 */
3376 3376 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
3377 3377 ASSERT3U(*used, >=,
3378 3378 ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3379 3379 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3380 3380 *ref_rsrv =
3381 3381 asize - MIN(asize, parent_delta(ds, asize + inflight));
3382 3382 }
3383 3383
3384 3384 if (!check_quota || ds->ds_quota == 0) {
3385 3385 mutex_exit(&ds->ds_lock);
3386 3386 return (0);
3387 3387 }
3388 3388 /*
3389 3389 * If they are requesting more space, and our current estimate
3390 3390 * is over quota, they get to try again unless the actual
3391 3391 * on-disk is over quota and there are no pending changes (which
3392 3392 * may free up space for us).
3393 3393 */
3394 3394 if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
3395 3395 if (inflight > 0 ||
3396 3396 ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
3397 3397 error = ERESTART;
3398 3398 else
3399 3399 error = EDQUOT;
3400 3400 }
3401 3401 mutex_exit(&ds->ds_lock);
3402 3402
3403 3403 return (error);
3404 3404 }
3405 3405
3406 3406 /* ARGSUSED */
3407 3407 static int
3408 3408 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
3409 3409 {
3410 3410 dsl_dataset_t *ds = arg1;
3411 3411 dsl_prop_setarg_t *psa = arg2;
3412 3412 int err;
3413 3413
3414 3414 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
3415 3415 return (ENOTSUP);
3416 3416
3417 3417 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3418 3418 return (err);
3419 3419
3420 3420 if (psa->psa_effective_value == 0)
3421 3421 return (0);
3422 3422
3423 3423 if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
3424 3424 psa->psa_effective_value < ds->ds_reserved)
3425 3425 return (ENOSPC);
3426 3426
3427 3427 return (0);
3428 3428 }
3429 3429
3430 3430 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
3431 3431
3432 3432 void
3433 3433 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3434 3434 {
3435 3435 dsl_dataset_t *ds = arg1;
3436 3436 dsl_prop_setarg_t *psa = arg2;
3437 3437 uint64_t effective_value = psa->psa_effective_value;
3438 3438
3439 3439 dsl_prop_set_sync(ds, psa, tx);
3440 3440 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3441 3441
3442 3442 if (ds->ds_quota != effective_value) {
3443 3443 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3444 3444 ds->ds_quota = effective_value;
3445 3445 }
3446 3446 }
3447 3447
3448 3448 int
3449 3449 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
3450 3450 {
3451 3451 dsl_dataset_t *ds;
3452 3452 dsl_prop_setarg_t psa;
3453 3453 int err;
3454 3454
3455 3455 dsl_prop_setarg_init_uint64(&psa, "refquota", source, "a);
3456 3456
3457 3457 err = dsl_dataset_hold(dsname, FTAG, &ds);
3458 3458 if (err)
3459 3459 return (err);
3460 3460
3461 3461 /*
3462 3462 * If someone removes a file, then tries to set the quota, we
3463 3463 * want to make sure the file freeing takes effect.
3464 3464 */
3465 3465 txg_wait_open(ds->ds_dir->dd_pool, 0);
3466 3466
3467 3467 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3468 3468 dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
3469 3469 ds, &psa, 0);
3470 3470
3471 3471 dsl_dataset_rele(ds, FTAG);
3472 3472 return (err);
3473 3473 }
3474 3474
3475 3475 static int
3476 3476 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
3477 3477 {
3478 3478 dsl_dataset_t *ds = arg1;
3479 3479 dsl_prop_setarg_t *psa = arg2;
3480 3480 uint64_t effective_value;
3481 3481 uint64_t unique;
3482 3482 int err;
3483 3483
3484 3484 if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
3485 3485 SPA_VERSION_REFRESERVATION)
3486 3486 return (ENOTSUP);
3487 3487
3488 3488 if (dsl_dataset_is_snapshot(ds))
3489 3489 return (EINVAL);
3490 3490
3491 3491 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3492 3492 return (err);
3493 3493
3494 3494 effective_value = psa->psa_effective_value;
3495 3495
3496 3496 /*
3497 3497 * If we are doing the preliminary check in open context, the
3498 3498 * space estimates may be inaccurate.
3499 3499 */
3500 3500 if (!dmu_tx_is_syncing(tx))
3501 3501 return (0);
3502 3502
3503 3503 mutex_enter(&ds->ds_lock);
3504 3504 if (!DS_UNIQUE_IS_ACCURATE(ds))
3505 3505 dsl_dataset_recalc_head_uniq(ds);
3506 3506 unique = ds->ds_phys->ds_unique_bytes;
3507 3507 mutex_exit(&ds->ds_lock);
3508 3508
3509 3509 if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
3510 3510 uint64_t delta = MAX(unique, effective_value) -
3511 3511 MAX(unique, ds->ds_reserved);
3512 3512
3513 3513 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
3514 3514 return (ENOSPC);
3515 3515 if (ds->ds_quota > 0 &&
3516 3516 effective_value > ds->ds_quota)
3517 3517 return (ENOSPC);
3518 3518 }
3519 3519
3520 3520 return (0);
3521 3521 }
3522 3522
3523 3523 static void
3524 3524 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3525 3525 {
3526 3526 dsl_dataset_t *ds = arg1;
3527 3527 dsl_prop_setarg_t *psa = arg2;
3528 3528 uint64_t effective_value = psa->psa_effective_value;
3529 3529 uint64_t unique;
3530 3530 int64_t delta;
3531 3531
3532 3532 dsl_prop_set_sync(ds, psa, tx);
3533 3533 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3534 3534
3535 3535 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3536 3536
3537 3537 mutex_enter(&ds->ds_dir->dd_lock);
3538 3538 mutex_enter(&ds->ds_lock);
3539 3539 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
3540 3540 unique = ds->ds_phys->ds_unique_bytes;
3541 3541 delta = MAX(0, (int64_t)(effective_value - unique)) -
3542 3542 MAX(0, (int64_t)(ds->ds_reserved - unique));
3543 3543 ds->ds_reserved = effective_value;
3544 3544 mutex_exit(&ds->ds_lock);
3545 3545
3546 3546 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
3547 3547 mutex_exit(&ds->ds_dir->dd_lock);
3548 3548 }
3549 3549
3550 3550 int
3551 3551 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
3552 3552 uint64_t reservation)
3553 3553 {
3554 3554 dsl_dataset_t *ds;
3555 3555 dsl_prop_setarg_t psa;
3556 3556 int err;
3557 3557
3558 3558 dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
3559 3559 &reservation);
3560 3560
3561 3561 err = dsl_dataset_hold(dsname, FTAG, &ds);
3562 3562 if (err)
3563 3563 return (err);
3564 3564
3565 3565 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3566 3566 dsl_dataset_set_reservation_check,
3567 3567 dsl_dataset_set_reservation_sync, ds, &psa, 0);
3568 3568
3569 3569 dsl_dataset_rele(ds, FTAG);
3570 3570 return (err);
3571 3571 }
3572 3572
/*
 * Argument handed to dsl_dataset_user_release_onexit(); allocated and
 * filled in by dsl_register_onexit_hold_cleanup().
 */
typedef struct zfs_hold_cleanup_arg {
	dsl_pool_t *dp;		/* pool containing the held dataset */
	uint64_t dsobj;		/* object number of the held dataset */
	char htag[MAXNAMELEN];	/* copy of the hold tag to release */
} zfs_hold_cleanup_arg_t;
3578 3578
3579 3579 static void
3580 3580 dsl_dataset_user_release_onexit(void *arg)
3581 3581 {
3582 3582 zfs_hold_cleanup_arg_t *ca = arg;
3583 3583
3584 3584 (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag,
3585 3585 B_TRUE);
3586 3586 kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
3587 3587 }
3588 3588
3589 3589 void
3590 3590 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
3591 3591 minor_t minor)
3592 3592 {
3593 3593 zfs_hold_cleanup_arg_t *ca;
3594 3594
3595 3595 ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP);
3596 3596 ca->dp = ds->ds_dir->dd_pool;
3597 3597 ca->dsobj = ds->ds_object;
3598 3598 (void) strlcpy(ca->htag, htag, sizeof (ca->htag));
3599 3599 VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
3600 3600 dsl_dataset_user_release_onexit, ca, NULL));
3601 3601 }
3602 3602
3603 3603 /*
3604 3604 * If you add new checks here, you may need to add
3605 3605 * additional checks to the "temporary" case in
3606 3606 * snapshot_check() in dmu_objset.c.
3607 3607 */
3608 3608 static int
3609 3609 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
3610 3610 {
3611 3611 dsl_dataset_t *ds = arg1;
3612 3612 struct dsl_ds_holdarg *ha = arg2;
3613 3613 const char *htag = ha->htag;
3614 3614 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3615 3615 int error = 0;
3616 3616
3617 3617 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3618 3618 return (ENOTSUP);
3619 3619
3620 3620 if (!dsl_dataset_is_snapshot(ds))
3621 3621 return (EINVAL);
3622 3622
3623 3623 /* tags must be unique */
3624 3624 mutex_enter(&ds->ds_lock);
3625 3625 if (ds->ds_phys->ds_userrefs_obj) {
3626 3626 error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
3627 3627 8, 1, tx);
3628 3628 if (error == 0)
3629 3629 error = EEXIST;
3630 3630 else if (error == ENOENT)
3631 3631 error = 0;
3632 3632 }
3633 3633 mutex_exit(&ds->ds_lock);
3634 3634
3635 3635 if (error == 0 && ha->temphold &&
3636 3636 strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
3637 3637 error = E2BIG;
3638 3638
3639 3639 return (error);
3640 3640 }
3641 3641
3642 3642 void
3643 3643 dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3644 3644 {
3645 3645 dsl_dataset_t *ds = arg1;
3646 3646 struct dsl_ds_holdarg *ha = arg2;
3647 3647 const char *htag = ha->htag;
3648 3648 dsl_pool_t *dp = ds->ds_dir->dd_pool;
3649 3649 objset_t *mos = dp->dp_meta_objset;
3650 3650 uint64_t now = gethrestime_sec();
3651 3651 uint64_t zapobj;
3652 3652
3653 3653 mutex_enter(&ds->ds_lock);
3654 3654 if (ds->ds_phys->ds_userrefs_obj == 0) {
3655 3655 /*
3656 3656 * This is the first user hold for this dataset. Create
3657 3657 * the userrefs zap object.
3658 3658 */
3659 3659 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3660 3660 zapobj = ds->ds_phys->ds_userrefs_obj =
3661 3661 zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
3662 3662 } else {
3663 3663 zapobj = ds->ds_phys->ds_userrefs_obj;
3664 3664 }
3665 3665 ds->ds_userrefs++;
3666 3666 mutex_exit(&ds->ds_lock);
3667 3667
3668 3668 VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
3669 3669
3670 3670 if (ha->temphold) {
3671 3671 VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
3672 3672 htag, &now, tx));
3673 3673 }
3674 3674
3675 3675 spa_history_log_internal_ds(ds, "hold", tx,
3676 3676 "tag = %s temp = %d holds now = %llu",
3677 3677 htag, (int)ha->temphold, ds->ds_userrefs);
3678 3678 }
3679 3679
3680 3680 static int
3681 3681 dsl_dataset_user_hold_one(const char *dsname, void *arg)
3682 3682 {
3683 3683 struct dsl_ds_holdarg *ha = arg;
3684 3684 dsl_dataset_t *ds;
3685 3685 int error;
3686 3686 char *name;
3687 3687
3688 3688 /* alloc a buffer to hold dsname@snapname plus terminating NULL */
3689 3689 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3690 3690 error = dsl_dataset_hold(name, ha->dstg, &ds);
3691 3691 strfree(name);
3692 3692 if (error == 0) {
3693 3693 ha->gotone = B_TRUE;
3694 3694 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
3695 3695 dsl_dataset_user_hold_sync, ds, ha, 0);
3696 3696 } else if (error == ENOENT && ha->recursive) {
3697 3697 error = 0;
3698 3698 } else {
3699 3699 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3700 3700 }
3701 3701 return (error);
3702 3702 }
3703 3703
3704 3704 int
3705 3705 dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
3706 3706 boolean_t temphold)
3707 3707 {
3708 3708 struct dsl_ds_holdarg *ha;
3709 3709 int error;
3710 3710
3711 3711 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3712 3712 ha->htag = htag;
3713 3713 ha->temphold = temphold;
3714 3714 error = dsl_sync_task_do(ds->ds_dir->dd_pool,
3715 3715 dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
3716 3716 ds, ha, 0);
3717 3717 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3718 3718
3719 3719 return (error);
3720 3720 }
3721 3721
3722 3722 int
3723 3723 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
3724 3724 boolean_t recursive, boolean_t temphold, int cleanup_fd)
3725 3725 {
3726 3726 struct dsl_ds_holdarg *ha;
3727 3727 dsl_sync_task_t *dst;
3728 3728 spa_t *spa;
3729 3729 int error;
3730 3730 minor_t minor = 0;
3731 3731
3732 3732 if (cleanup_fd != -1) {
3733 3733 /* Currently we only support cleanup-on-exit of tempholds. */
3734 3734 if (!temphold)
3735 3735 return (EINVAL);
3736 3736 error = zfs_onexit_fd_hold(cleanup_fd, &minor);
3737 3737 if (error)
3738 3738 return (error);
3739 3739 }
3740 3740
3741 3741 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3742 3742
3743 3743 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3744 3744
3745 3745 error = spa_open(dsname, &spa, FTAG);
3746 3746 if (error) {
3747 3747 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3748 3748 if (cleanup_fd != -1)
3749 3749 zfs_onexit_fd_rele(cleanup_fd);
3750 3750 return (error);
3751 3751 }
3752 3752
3753 3753 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
3754 3754 ha->htag = htag;
3755 3755 ha->snapname = snapname;
3756 3756 ha->recursive = recursive;
3757 3757 ha->temphold = temphold;
3758 3758
3759 3759 if (recursive) {
3760 3760 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
3761 3761 ha, DS_FIND_CHILDREN);
3762 3762 } else {
3763 3763 error = dsl_dataset_user_hold_one(dsname, ha);
3764 3764 }
3765 3765 if (error == 0)
3766 3766 error = dsl_sync_task_group_wait(ha->dstg);
3767 3767
3768 3768 for (dst = list_head(&ha->dstg->dstg_tasks); dst;
3769 3769 dst = list_next(&ha->dstg->dstg_tasks, dst)) {
3770 3770 dsl_dataset_t *ds = dst->dst_arg1;
3771 3771
3772 3772 if (dst->dst_err) {
3773 3773 dsl_dataset_name(ds, ha->failed);
3774 3774 *strchr(ha->failed, '@') = '\0';
3775 3775 } else if (error == 0 && minor != 0 && temphold) {
3776 3776 /*
3777 3777 * If this hold is to be released upon process exit,
3778 3778 * register that action now.
3779 3779 */
3780 3780 dsl_register_onexit_hold_cleanup(ds, htag, minor);
3781 3781 }
3782 3782 dsl_dataset_rele(ds, ha->dstg);
3783 3783 }
3784 3784
3785 3785 if (error == 0 && recursive && !ha->gotone)
3786 3786 error = ENOENT;
3787 3787
3788 3788 if (error)
3789 3789 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
3790 3790
3791 3791 dsl_sync_task_group_destroy(ha->dstg);
3792 3792
3793 3793 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3794 3794 spa_close(spa, FTAG);
3795 3795 if (cleanup_fd != -1)
3796 3796 zfs_onexit_fd_rele(cleanup_fd);
3797 3797 return (error);
3798 3798 }
3799 3799
3800 3800 struct dsl_ds_releasearg {
3801 3801 dsl_dataset_t *ds;
3802 3802 const char *htag;
3803 3803 boolean_t own; /* do we own or just hold ds? */
3804 3804 };
3805 3805
3806 3806 static int
3807 3807 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
3808 3808 boolean_t *might_destroy)
3809 3809 {
3810 3810 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3811 3811 uint64_t zapobj;
3812 3812 uint64_t tmp;
3813 3813 int error;
3814 3814
3815 3815 *might_destroy = B_FALSE;
3816 3816
3817 3817 mutex_enter(&ds->ds_lock);
3818 3818 zapobj = ds->ds_phys->ds_userrefs_obj;
3819 3819 if (zapobj == 0) {
3820 3820 /* The tag can't possibly exist */
3821 3821 mutex_exit(&ds->ds_lock);
3822 3822 return (ESRCH);
3823 3823 }
3824 3824
3825 3825 /* Make sure the tag exists */
3826 3826 error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
3827 3827 if (error) {
3828 3828 mutex_exit(&ds->ds_lock);
3829 3829 if (error == ENOENT)
3830 3830 error = ESRCH;
3831 3831 return (error);
3832 3832 }
3833 3833
3834 3834 if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
3835 3835 DS_IS_DEFER_DESTROY(ds))
3836 3836 *might_destroy = B_TRUE;
3837 3837
3838 3838 mutex_exit(&ds->ds_lock);
3839 3839 return (0);
3840 3840 }
3841 3841
3842 3842 static int
3843 3843 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
3844 3844 {
3845 3845 struct dsl_ds_releasearg *ra = arg1;
3846 3846 dsl_dataset_t *ds = ra->ds;
3847 3847 boolean_t might_destroy;
3848 3848 int error;
3849 3849
3850 3850 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3851 3851 return (ENOTSUP);
3852 3852
3853 3853 error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
3854 3854 if (error)
3855 3855 return (error);
3856 3856
3857 3857 if (might_destroy) {
3858 3858 struct dsl_ds_destroyarg dsda = {0};
3859 3859
3860 3860 if (dmu_tx_is_syncing(tx)) {
3861 3861 /*
3862 3862 * If we're not prepared to remove the snapshot,
3863 3863 * we can't allow the release to happen right now.
3864 3864 */
3865 3865 if (!ra->own)
3866 3866 return (EBUSY);
3867 3867 }
3868 3868 dsda.ds = ds;
3869 3869 dsda.releasing = B_TRUE;
3870 3870 return (dsl_dataset_destroy_check(&dsda, tag, tx));
3871 3871 }
3872 3872
3873 3873 return (0);
3874 3874 }
3875 3875
3876 3876 static void
3877 3877 dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
3878 3878 {
3879 3879 struct dsl_ds_releasearg *ra = arg1;
3880 3880 dsl_dataset_t *ds = ra->ds;
3881 3881 dsl_pool_t *dp = ds->ds_dir->dd_pool;
3882 3882 objset_t *mos = dp->dp_meta_objset;
3883 3883 uint64_t zapobj;
3884 3884 uint64_t refs;
3885 3885 int error;
3886 3886
3887 3887 mutex_enter(&ds->ds_lock);
3888 3888 ds->ds_userrefs--;
3889 3889 refs = ds->ds_userrefs;
3890 3890 mutex_exit(&ds->ds_lock);
3891 3891 error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
3892 3892 VERIFY(error == 0 || error == ENOENT);
3893 3893 zapobj = ds->ds_phys->ds_userrefs_obj;
3894 3894 VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
3895 3895
3896 3896 spa_history_log_internal_ds(ds, "release", tx,
3897 3897 "tag = %s refs now = %lld", ra->htag, (longlong_t)refs);
3898 3898
3899 3899 if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
3900 3900 DS_IS_DEFER_DESTROY(ds)) {
3901 3901 struct dsl_ds_destroyarg dsda = {0};
3902 3902
3903 3903 ASSERT(ra->own);
3904 3904 dsda.ds = ds;
3905 3905 dsda.releasing = B_TRUE;
3906 3906 /* We already did the destroy_check */
3907 3907 dsl_dataset_destroy_sync(&dsda, tag, tx);
3908 3908 }
3909 3909 }
3910 3910
3911 3911 static int
3912 3912 dsl_dataset_user_release_one(const char *dsname, void *arg)
3913 3913 {
3914 3914 struct dsl_ds_holdarg *ha = arg;
3915 3915 struct dsl_ds_releasearg *ra;
3916 3916 dsl_dataset_t *ds;
3917 3917 int error;
3918 3918 void *dtag = ha->dstg;
3919 3919 char *name;
3920 3920 boolean_t own = B_FALSE;
3921 3921 boolean_t might_destroy;
3922 3922
3923 3923 /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
3924 3924 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3925 3925 error = dsl_dataset_hold(name, dtag, &ds);
3926 3926 strfree(name);
3927 3927 if (error == ENOENT && ha->recursive)
3928 3928 return (0);
3929 3929 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3930 3930 if (error)
3931 3931 return (error);
3932 3932
3933 3933 ha->gotone = B_TRUE;
3934 3934
3935 3935 ASSERT(dsl_dataset_is_snapshot(ds));
3936 3936
3937 3937 error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
3938 3938 if (error) {
3939 3939 dsl_dataset_rele(ds, dtag);
3940 3940 return (error);
3941 3941 }
3942 3942
3943 3943 if (might_destroy) {
3944 3944 #ifdef _KERNEL
3945 3945 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3946 3946 error = zfs_unmount_snap(name, NULL);
3947 3947 strfree(name);
3948 3948 if (error) {
3949 3949 dsl_dataset_rele(ds, dtag);
3950 3950 return (error);
3951 3951 }
3952 3952 #endif
3953 3953 if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
3954 3954 dsl_dataset_rele(ds, dtag);
3955 3955 return (EBUSY);
3956 3956 } else {
3957 3957 own = B_TRUE;
3958 3958 dsl_dataset_make_exclusive(ds, dtag);
3959 3959 }
3960 3960 }
3961 3961
3962 3962 ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
3963 3963 ra->ds = ds;
3964 3964 ra->htag = ha->htag;
3965 3965 ra->own = own;
3966 3966 dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
3967 3967 dsl_dataset_user_release_sync, ra, dtag, 0);
3968 3968
3969 3969 return (0);
3970 3970 }
3971 3971
3972 3972 int
3973 3973 dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
3974 3974 boolean_t recursive)
3975 3975 {
3976 3976 struct dsl_ds_holdarg *ha;
3977 3977 dsl_sync_task_t *dst;
3978 3978 spa_t *spa;
3979 3979 int error;
3980 3980
3981 3981 top:
3982 3982 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3983 3983
3984 3984 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3985 3985
3986 3986 error = spa_open(dsname, &spa, FTAG);
3987 3987 if (error) {
3988 3988 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3989 3989 return (error);
3990 3990 }
3991 3991
3992 3992 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
3993 3993 ha->htag = htag;
3994 3994 ha->snapname = snapname;
3995 3995 ha->recursive = recursive;
3996 3996 if (recursive) {
3997 3997 error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
3998 3998 ha, DS_FIND_CHILDREN);
3999 3999 } else {
4000 4000 error = dsl_dataset_user_release_one(dsname, ha);
4001 4001 }
4002 4002 if (error == 0)
4003 4003 error = dsl_sync_task_group_wait(ha->dstg);
4004 4004
4005 4005 for (dst = list_head(&ha->dstg->dstg_tasks); dst;
4006 4006 dst = list_next(&ha->dstg->dstg_tasks, dst)) {
4007 4007 struct dsl_ds_releasearg *ra = dst->dst_arg1;
4008 4008 dsl_dataset_t *ds = ra->ds;
4009 4009
4010 4010 if (dst->dst_err)
4011 4011 dsl_dataset_name(ds, ha->failed);
4012 4012
4013 4013 if (ra->own)
4014 4014 dsl_dataset_disown(ds, ha->dstg);
4015 4015 else
4016 4016 dsl_dataset_rele(ds, ha->dstg);
4017 4017
4018 4018 kmem_free(ra, sizeof (struct dsl_ds_releasearg));
4019 4019 }
4020 4020
4021 4021 if (error == 0 && recursive && !ha->gotone)
4022 4022 error = ENOENT;
4023 4023
4024 4024 if (error && error != EBUSY)
4025 4025 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
4026 4026
4027 4027 dsl_sync_task_group_destroy(ha->dstg);
4028 4028 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
4029 4029 spa_close(spa, FTAG);
4030 4030
4031 4031 /*
4032 4032 * We can get EBUSY if we were racing with deferred destroy and
4033 4033 * dsl_dataset_user_release_check() hadn't done the necessary
4034 4034 * open context setup. We can also get EBUSY if we're racing
4035 4035 * with destroy and that thread is the ds_owner. Either way
4036 4036 * the busy condition should be transient, and we should retry
4037 4037 * the release operation.
4038 4038 */
4039 4039 if (error == EBUSY)
4040 4040 goto top;
4041 4041
4042 4042 return (error);
4043 4043 }
4044 4044
4045 4045 /*
4046 4046 * Called at spa_load time (with retry == B_FALSE) to release a stale
4047 4047 * temporary user hold. Also called by the onexit code (with retry == B_TRUE).
4048 4048 */
4049 4049 int
4050 4050 dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
4051 4051 boolean_t retry)
4052 4052 {
4053 4053 dsl_dataset_t *ds;
4054 4054 char *snap;
4055 4055 char *name;
4056 4056 int namelen;
4057 4057 int error;
4058 4058
4059 4059 do {
4060 4060 rw_enter(&dp->dp_config_rwlock, RW_READER);
4061 4061 error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
4062 4062 rw_exit(&dp->dp_config_rwlock);
4063 4063 if (error)
4064 4064 return (error);
4065 4065 namelen = dsl_dataset_namelen(ds)+1;
4066 4066 name = kmem_alloc(namelen, KM_SLEEP);
4067 4067 dsl_dataset_name(ds, name);
4068 4068 dsl_dataset_rele(ds, FTAG);
4069 4069
4070 4070 snap = strchr(name, '@');
4071 4071 *snap = '\0';
4072 4072 ++snap;
4073 4073 error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
4074 4074 kmem_free(name, namelen);
4075 4075
4076 4076 /*
4077 4077 * The object can't have been destroyed because we have a hold,
4078 4078 * but it might have been renamed, resulting in ENOENT. Retry
4079 4079 * if we've been requested to do so.
4080 4080 *
4081 4081 * It would be nice if we could use the dsobj all the way
4082 4082 * through and avoid ENOENT entirely. But we might need to
4083 4083 * unmount the snapshot, and there's currently no way to lookup
4084 4084 * a vfsp using a ZFS object id.
4085 4085 */
4086 4086 } while ((error == ENOENT) && retry);
4087 4087
4088 4088 return (error);
4089 4089 }
4090 4090
4091 4091 int
4092 4092 dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
4093 4093 {
4094 4094 dsl_dataset_t *ds;
4095 4095 int err;
4096 4096
4097 4097 err = dsl_dataset_hold(dsname, FTAG, &ds);
4098 4098 if (err)
4099 4099 return (err);
4100 4100
4101 4101 VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
4102 4102 if (ds->ds_phys->ds_userrefs_obj != 0) {
4103 4103 zap_attribute_t *za;
4104 4104 zap_cursor_t zc;
4105 4105
4106 4106 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
4107 4107 for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
4108 4108 ds->ds_phys->ds_userrefs_obj);
4109 4109 zap_cursor_retrieve(&zc, za) == 0;
4110 4110 zap_cursor_advance(&zc)) {
4111 4111 VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
4112 4112 za->za_first_integer));
4113 4113 }
4114 4114 zap_cursor_fini(&zc);
4115 4115 kmem_free(za, sizeof (zap_attribute_t));
4116 4116 }
4117 4117 dsl_dataset_rele(ds, FTAG);
4118 4118 return (0);
4119 4119 }
4120 4120
4121 4121 /*
4122 4122 * Note, this function is used as the callback for dmu_objset_find(). We
4123 4123 * always return 0 so that we will continue to find and process
4124 4124 * inconsistent datasets, even if we encounter an error trying to
4125 4125 * process one of them.
4126 4126 */
4127 4127 /* ARGSUSED */
4128 4128 int
4129 4129 dsl_destroy_inconsistent(const char *dsname, void *arg)
4130 4130 {
4131 4131 dsl_dataset_t *ds;
4132 4132
4133 4133 if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
4134 4134 if (DS_IS_INCONSISTENT(ds))
4135 4135 (void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
4136 4136 else
4137 4137 dsl_dataset_disown(ds, FTAG);
4138 4138 }
4139 4139 return (0);
4140 4140 }
4141 4141
4142 4142 /*
4143 4143 * Return (in *usedp) the amount of space written in new that is not
4144 4144 * present in oldsnap. New may be a snapshot or the head. Old must be
4145 4145 * a snapshot before new, in new's filesystem (or its origin). If not then
4146 4146 * fail and return EINVAL.
4147 4147 *
4148 4148 * The written space is calculated by considering two components: First, we
4149 4149 * ignore any freed space, and calculate the written as new's used space
4150 4150 * minus old's used space. Next, we add in the amount of space that was freed
4151 4151 * between the two snapshots, thus reducing new's used space relative to old's.
4152 4152 * Specifically, this is the space that was born before old->ds_creation_txg,
4153 4153 * and freed before new (ie. on new's deadlist or a previous deadlist).
4154 4154 *
4155 4155 * space freed [---------------------]
4156 4156 * snapshots ---O-------O--------O-------O------
4157 4157 * oldsnap new
4158 4158 */
4159 4159 int
4160 4160 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
4161 4161 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4162 4162 {
4163 4163 int err = 0;
4164 4164 uint64_t snapobj;
4165 4165 dsl_pool_t *dp = new->ds_dir->dd_pool;
4166 4166
4167 4167 *usedp = 0;
4168 4168 *usedp += new->ds_phys->ds_referenced_bytes;
4169 4169 *usedp -= oldsnap->ds_phys->ds_referenced_bytes;
4170 4170
4171 4171 *compp = 0;
4172 4172 *compp += new->ds_phys->ds_compressed_bytes;
4173 4173 *compp -= oldsnap->ds_phys->ds_compressed_bytes;
4174 4174
4175 4175 *uncompp = 0;
4176 4176 *uncompp += new->ds_phys->ds_uncompressed_bytes;
4177 4177 *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
4178 4178
4179 4179 rw_enter(&dp->dp_config_rwlock, RW_READER);
4180 4180 snapobj = new->ds_object;
4181 4181 while (snapobj != oldsnap->ds_object) {
4182 4182 dsl_dataset_t *snap;
4183 4183 uint64_t used, comp, uncomp;
4184 4184
4185 4185 if (snapobj == new->ds_object) {
4186 4186 snap = new;
4187 4187 } else {
4188 4188 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
4189 4189 if (err != 0)
4190 4190 break;
4191 4191 }
4192 4192
4193 4193 if (snap->ds_phys->ds_prev_snap_txg ==
4194 4194 oldsnap->ds_phys->ds_creation_txg) {
4195 4195 /*
4196 4196 * The blocks in the deadlist can not be born after
4197 4197 * ds_prev_snap_txg, so get the whole deadlist space,
4198 4198 * which is more efficient (especially for old-format
4199 4199 * deadlists). Unfortunately the deadlist code
4200 4200 * doesn't have enough information to make this
4201 4201 * optimization itself.
4202 4202 */
4203 4203 dsl_deadlist_space(&snap->ds_deadlist,
4204 4204 &used, &comp, &uncomp);
4205 4205 } else {
4206 4206 dsl_deadlist_space_range(&snap->ds_deadlist,
4207 4207 0, oldsnap->ds_phys->ds_creation_txg,
4208 4208 &used, &comp, &uncomp);
4209 4209 }
4210 4210 *usedp += used;
4211 4211 *compp += comp;
4212 4212 *uncompp += uncomp;
4213 4213
4214 4214 /*
4215 4215 * If we get to the beginning of the chain of snapshots
4216 4216 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
4217 4217 * was not a snapshot of/before new.
4218 4218 */
4219 4219 snapobj = snap->ds_phys->ds_prev_snap_obj;
4220 4220 if (snap != new)
4221 4221 dsl_dataset_rele(snap, FTAG);
4222 4222 if (snapobj == 0) {
4223 4223 err = EINVAL;
4224 4224 break;
4225 4225 }
4226 4226
4227 4227 }
4228 4228 rw_exit(&dp->dp_config_rwlock);
4229 4229 return (err);
4230 4230 }
4231 4231
4232 4232 /*
4233 4233 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
4234 4234 * lastsnap, and all snapshots in between are deleted.
4235 4235 *
4236 4236 * blocks that would be freed [---------------------------]
4237 4237 * snapshots ---O-------O--------O-------O--------O
4238 4238 * firstsnap lastsnap
4239 4239 *
4240 4240 * This is the set of blocks that were born after the snap before firstsnap,
4241 4241 * (birth > firstsnap->prev_snap_txg) and died before the snap after the
4242 4242 * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
4243 4243 * We calculate this by iterating over the relevant deadlists (from the snap
4244 4244 * after lastsnap, backward to the snap after firstsnap), summing up the
4245 4245 * space on the deadlist that was born after the snap before firstsnap.
4246 4246 */
4247 4247 int
4248 4248 dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
4249 4249 dsl_dataset_t *lastsnap,
4250 4250 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4251 4251 {
4252 4252 int err = 0;
4253 4253 uint64_t snapobj;
4254 4254 dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
4255 4255
4256 4256 ASSERT(dsl_dataset_is_snapshot(firstsnap));
4257 4257 ASSERT(dsl_dataset_is_snapshot(lastsnap));
4258 4258
4259 4259 /*
4260 4260 * Check that the snapshots are in the same dsl_dir, and firstsnap
4261 4261 * is before lastsnap.
4262 4262 */
4263 4263 if (firstsnap->ds_dir != lastsnap->ds_dir ||
4264 4264 firstsnap->ds_phys->ds_creation_txg >
4265 4265 lastsnap->ds_phys->ds_creation_txg)
4266 4266 return (EINVAL);
4267 4267
4268 4268 *usedp = *compp = *uncompp = 0;
4269 4269
4270 4270 rw_enter(&dp->dp_config_rwlock, RW_READER);
4271 4271 snapobj = lastsnap->ds_phys->ds_next_snap_obj;
4272 4272 while (snapobj != firstsnap->ds_object) {
4273 4273 dsl_dataset_t *ds;
4274 4274 uint64_t used, comp, uncomp;
4275 4275
4276 4276 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
4277 4277 if (err != 0)
4278 4278 break;
4279 4279
4280 4280 dsl_deadlist_space_range(&ds->ds_deadlist,
4281 4281 firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
4282 4282 &used, &comp, &uncomp);
4283 4283 *usedp += used;
4284 4284 *compp += comp;
4285 4285 *uncompp += uncomp;
4286 4286
4287 4287 snapobj = ds->ds_phys->ds_prev_snap_obj;
4288 4288 ASSERT3U(snapobj, !=, 0);
4289 4289 dsl_dataset_rele(ds, FTAG);
4290 4290 }
4291 4291 rw_exit(&dp->dp_config_rwlock);
4292 4292 return (err);
4293 4293 }
↓ open down ↓ |
3600 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX