OS-1566 filesystem limits for ZFS datasets
--- old/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ new/usr/src/uts/common/fs/zfs/dsl_dataset.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2012 by Delphix. All rights reserved.
24 24 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
25 25 */
26 26
27 27 #include <sys/dmu_objset.h>
28 28 #include <sys/dsl_dataset.h>
29 29 #include <sys/dsl_dir.h>
30 30 #include <sys/dsl_prop.h>
31 31 #include <sys/dsl_synctask.h>
32 32 #include <sys/dmu_traverse.h>
33 33 #include <sys/dmu_impl.h>
34 34 #include <sys/dmu_tx.h>
35 35 #include <sys/arc.h>
36 36 #include <sys/zio.h>
37 37 #include <sys/zap.h>
38 38 #include <sys/zfeature.h>
39 39 #include <sys/unique.h>
40 40 #include <sys/zfs_context.h>
41 41 #include <sys/zfs_ioctl.h>
42 42 #include <sys/spa.h>
43 43 #include <sys/zfs_znode.h>
44 44 #include <sys/zfs_onexit.h>
45 45 #include <sys/zvol.h>
46 46 #include <sys/dsl_scan.h>
47 47 #include <sys/dsl_deadlist.h>
48 +#include "zfs_prop.h"
48 49
49 50 static char *dsl_reaper = "the grim reaper";
50 51
51 52 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
52 53 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
53 54 static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
54 55
55 56 #define SWITCH64(x, y) \
56 57 { \
57 58 uint64_t __tmp = (x); \
58 59 (x) = (y); \
59 60 (y) = __tmp; \
60 61 }
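A usage sketch of SWITCH64 with illustrative values (the braces keep __tmp scoped to a single expansion):

	uint64_t a = 1, b = 2;
	SWITCH64(a, b);		/* now a == 2, b == 1 */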
61 62
62 63 #define DS_REF_MAX (1ULL << 62)
63 64
64 65 #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
65 66
66 67 #define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper)
67 68
68 69
69 70 /*
70 71 * Figure out how much of this delta should be propagated to the dsl_dir
71 72 * layer. If there's a refreservation, that space has already been
72 73 * partially accounted for in our ancestors.
73 74 */
74 75 static int64_t
75 76 parent_delta(dsl_dataset_t *ds, int64_t delta)
76 77 {
77 78 uint64_t old_bytes, new_bytes;
78 79
79 80 if (ds->ds_reserved == 0)
80 81 return (delta);
81 82
82 83 old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
83 84 new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
84 85
85 86 ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
86 87 return (new_bytes - old_bytes);
87 88 }
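A worked example of the clamping above, with hypothetical numbers: take ds_reserved = 100, ds_unique_bytes = 80, and a delta of +30.

	old_bytes = MAX(80, 100)      = 100
	new_bytes = MAX(80 + 30, 100) = 110

parent_delta() returns 10: the first 20 bytes of growth were already charged to the ancestors via the refreservation, so only the remaining 10 propagate to the dsl_dir layer.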
88 89
89 90 void
90 91 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
91 92 {
92 93 int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
93 94 int compressed = BP_GET_PSIZE(bp);
94 95 int uncompressed = BP_GET_UCSIZE(bp);
95 96 int64_t delta;
96 97
97 98 dprintf_bp(bp, "ds=%p", ds);
98 99
99 100 ASSERT(dmu_tx_is_syncing(tx));
100 101 /* It could have been compressed away to nothing */
101 102 if (BP_IS_HOLE(bp))
102 103 return;
103 104 ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
104 105 ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
105 106 if (ds == NULL) {
106 107 dsl_pool_mos_diduse_space(tx->tx_pool,
107 108 used, compressed, uncompressed);
108 109 return;
109 110 }
110 111 dmu_buf_will_dirty(ds->ds_dbuf, tx);
111 112
112 113 mutex_enter(&ds->ds_dir->dd_lock);
113 114 mutex_enter(&ds->ds_lock);
114 115 delta = parent_delta(ds, used);
115 116 ds->ds_phys->ds_referenced_bytes += used;
116 117 ds->ds_phys->ds_compressed_bytes += compressed;
117 118 ds->ds_phys->ds_uncompressed_bytes += uncompressed;
118 119 ds->ds_phys->ds_unique_bytes += used;
119 120 mutex_exit(&ds->ds_lock);
120 121 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
121 122 compressed, uncompressed, tx);
122 123 dsl_dir_transfer_space(ds->ds_dir, used - delta,
123 124 DD_USED_REFRSRV, DD_USED_HEAD, tx);
124 125 mutex_exit(&ds->ds_dir->dd_lock);
125 126 }
126 127
127 128 int
128 129 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
129 130 boolean_t async)
130 131 {
131 132 if (BP_IS_HOLE(bp))
132 133 return (0);
133 134
134 135 ASSERT(dmu_tx_is_syncing(tx));
135 136 ASSERT(bp->blk_birth <= tx->tx_txg);
136 137
137 138 int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
138 139 int compressed = BP_GET_PSIZE(bp);
139 140 int uncompressed = BP_GET_UCSIZE(bp);
140 141
141 142 ASSERT(used > 0);
142 143 if (ds == NULL) {
143 144 dsl_free(tx->tx_pool, tx->tx_txg, bp);
144 145 dsl_pool_mos_diduse_space(tx->tx_pool,
145 146 -used, -compressed, -uncompressed);
146 147 return (used);
147 148 }
148 149 ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
149 150
150 151 ASSERT(!dsl_dataset_is_snapshot(ds));
151 152 dmu_buf_will_dirty(ds->ds_dbuf, tx);
152 153
153 154 if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
154 155 int64_t delta;
155 156
156 157 dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
157 158 dsl_free(tx->tx_pool, tx->tx_txg, bp);
158 159
159 160 mutex_enter(&ds->ds_dir->dd_lock);
160 161 mutex_enter(&ds->ds_lock);
161 162 ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
162 163 !DS_UNIQUE_IS_ACCURATE(ds));
163 164 delta = parent_delta(ds, -used);
164 165 ds->ds_phys->ds_unique_bytes -= used;
165 166 mutex_exit(&ds->ds_lock);
166 167 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
167 168 delta, -compressed, -uncompressed, tx);
168 169 dsl_dir_transfer_space(ds->ds_dir, -used - delta,
169 170 DD_USED_REFRSRV, DD_USED_HEAD, tx);
170 171 mutex_exit(&ds->ds_dir->dd_lock);
171 172 } else {
172 173 dprintf_bp(bp, "putting on dead list: %s", "");
173 174 if (async) {
174 175 /*
175 176 * We are here as part of zio's write done callback,
176 177 * which means we're a zio interrupt thread. We can't
177 178 * call dsl_deadlist_insert() now because it may block
178 179 * waiting for I/O. Instead, put bp on the deferred
179 180 * queue and let dsl_pool_sync() finish the job.
180 181 */
181 182 bplist_append(&ds->ds_pending_deadlist, bp);
182 183 } else {
183 184 dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
184 185 }
185 186 ASSERT3U(ds->ds_prev->ds_object, ==,
186 187 ds->ds_phys->ds_prev_snap_obj);
187 188 ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
188 189 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
189 190 if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
190 191 ds->ds_object && bp->blk_birth >
191 192 ds->ds_prev->ds_phys->ds_prev_snap_txg) {
192 193 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
193 194 mutex_enter(&ds->ds_prev->ds_lock);
194 195 ds->ds_prev->ds_phys->ds_unique_bytes += used;
195 196 mutex_exit(&ds->ds_prev->ds_lock);
196 197 }
197 198 if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
198 199 dsl_dir_transfer_space(ds->ds_dir, used,
199 200 DD_USED_HEAD, DD_USED_SNAP, tx);
200 201 }
201 202 }
202 203 mutex_enter(&ds->ds_lock);
203 204 ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
204 205 ds->ds_phys->ds_referenced_bytes -= used;
205 206 ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
206 207 ds->ds_phys->ds_compressed_bytes -= compressed;
207 208 ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
208 209 ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
209 210 mutex_exit(&ds->ds_lock);
210 211
211 212 return (used);
212 213 }
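Per the comment in the async branch above, a zio interrupt thread must not block on I/O, so the free is parked on ds_pending_deadlist and dsl_pool_sync() finishes the job. Schematically, the drain in syncing context looks like this (the callback name is assumed; it is not shown in this file):

	/* syncing context: move deferred frees onto the real deadlist */
	bplist_iterate(&ds->ds_pending_deadlist,
	    deadlist_enqueue_cb, &ds->ds_deadlist, tx);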
213 214
214 215 uint64_t
215 216 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
216 217 {
217 218 uint64_t trysnap = 0;
218 219
219 220 if (ds == NULL)
220 221 return (0);
221 222 /*
222 223 * The snapshot creation could fail, but that would cause an
223 224 * incorrect FALSE return, which would only result in an
224 225 * overestimation of the amount of space that an operation would
225 226 * consume, which is OK.
226 227 *
227 228 * There's also a small window where we could miss a pending
228 229 * snapshot, because we could set the sync task in the quiescing
229 230 * phase. So this should only be used as a guess.
230 231 */
231 232 if (ds->ds_trysnap_txg >
232 233 spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
233 234 trysnap = ds->ds_trysnap_txg;
234 235 return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
235 236 }
236 237
237 238 boolean_t
238 239 dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
239 240 uint64_t blk_birth)
240 241 {
241 242 if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
242 243 return (B_FALSE);
243 244
244 245 ddt_prefetch(dsl_dataset_get_spa(ds), bp);
245 246
246 247 return (B_TRUE);
247 248 }
248 249
249 250 /* ARGSUSED */
250 251 static void
251 252 dsl_dataset_evict(dmu_buf_t *db, void *dsv)
252 253 {
253 254 dsl_dataset_t *ds = dsv;
254 255
255 256 ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
256 257
257 258 unique_remove(ds->ds_fsid_guid);
258 259
259 260 if (ds->ds_objset != NULL)
260 261 dmu_objset_evict(ds->ds_objset);
261 262
262 263 if (ds->ds_prev) {
263 264 dsl_dataset_drop_ref(ds->ds_prev, ds);
264 265 ds->ds_prev = NULL;
265 266 }
266 267
267 268 bplist_destroy(&ds->ds_pending_deadlist);
268 269 if (db != NULL) {
269 270 dsl_deadlist_close(&ds->ds_deadlist);
270 271 } else {
271 272 ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
272 273 ASSERT(!ds->ds_deadlist.dl_oldfmt);
273 274 }
274 275 if (ds->ds_dir)
275 276 dsl_dir_close(ds->ds_dir, ds);
276 277
277 278 ASSERT(!list_link_active(&ds->ds_synced_link));
278 279
279 280 mutex_destroy(&ds->ds_lock);
280 281 mutex_destroy(&ds->ds_recvlock);
281 282 mutex_destroy(&ds->ds_opening_lock);
282 283 rw_destroy(&ds->ds_rwlock);
283 284 cv_destroy(&ds->ds_exclusive_cv);
284 285
285 286 kmem_free(ds, sizeof (dsl_dataset_t));
286 287 }
287 288
288 289 static int
289 290 dsl_dataset_get_snapname(dsl_dataset_t *ds)
290 291 {
291 292 dsl_dataset_phys_t *headphys;
292 293 int err;
293 294 dmu_buf_t *headdbuf;
294 295 dsl_pool_t *dp = ds->ds_dir->dd_pool;
295 296 objset_t *mos = dp->dp_meta_objset;
296 297
297 298 if (ds->ds_snapname[0])
298 299 return (0);
299 300 if (ds->ds_phys->ds_next_snap_obj == 0)
300 301 return (0);
301 302
302 303 err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
303 304 FTAG, &headdbuf);
304 305 if (err)
305 306 return (err);
306 307 headphys = headdbuf->db_data;
307 308 err = zap_value_search(dp->dp_meta_objset,
308 309 headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
309 310 dmu_buf_rele(headdbuf, FTAG);
310 311 return (err);
311 312 }
312 313
313 314 static int
314 315 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
315 316 {
316 317 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
317 318 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
318 319 matchtype_t mt;
319 320 int err;
320 321
321 322 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
322 323 mt = MT_FIRST;
323 324 else
324 325 mt = MT_EXACT;
325 326
326 327 err = zap_lookup_norm(mos, snapobj, name, 8, 1,
327 328 value, mt, NULL, 0, NULL);
328 329 if (err == ENOTSUP && mt == MT_FIRST)
329 330 err = zap_lookup(mos, snapobj, name, 8, 1, value);
330 331 return (err);
331 332 }
332 333
333 334 static int
334 335 dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
335 336 {
336 337 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
337 338 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
338 339 matchtype_t mt;
339 340 int err;
340 341
341 342 dsl_dir_snap_cmtime_update(ds->ds_dir);
342 343
343 344 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
344 345 mt = MT_FIRST;
345 346 else
346 347 mt = MT_EXACT;
347 348
348 349 err = zap_remove_norm(mos, snapobj, name, mt, tx);
349 350 if (err == ENOTSUP && mt == MT_FIRST)
350 351 err = zap_remove(mos, snapobj, name, tx);
352 +
353 + if (err == 0)
354 + dsl_snapcount_adjust(ds->ds_dir, tx, -1, B_TRUE);
355 +
351 356 return (err);
352 357 }
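The added dsl_snapcount_adjust() call is the OS-1566 hook in this function: successfully removing a snapshot name decrements the directory's snapshot count, which the limit checks elsewhere in this change compare against the dataset's snapshot limit. From this call site the signature appears to be dsl_snapcount_adjust(dsl_dir_t *, dmu_tx_t *, int64_t delta, boolean_t); the trailing boolean's meaning is not visible in this hunk. A hypothetical sketch of the mirror-image call one would expect on the snapshot-creation path:

	/* hypothetical: creation increments what removal decrements */
	if (err == 0)
		dsl_snapcount_adjust(ds->ds_dir, tx, 1, B_TRUE);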
353 358
354 359 static int
355 360 dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
356 361 dsl_dataset_t **dsp)
357 362 {
358 363 objset_t *mos = dp->dp_meta_objset;
359 364 dmu_buf_t *dbuf;
360 365 dsl_dataset_t *ds;
361 366 int err;
362 367 dmu_object_info_t doi;
363 368
364 369 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
365 370 dsl_pool_sync_context(dp));
366 371
367 372 err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
368 373 if (err)
369 374 return (err);
370 375
371 376 /* Make sure dsobj has the correct object type. */
372 377 dmu_object_info_from_db(dbuf, &doi);
373 378 if (doi.doi_type != DMU_OT_DSL_DATASET)
374 379 return (EINVAL);
375 380
376 381 ds = dmu_buf_get_user(dbuf);
377 382 if (ds == NULL) {
378 383 dsl_dataset_t *winner;
379 384
380 385 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
381 386 ds->ds_dbuf = dbuf;
382 387 ds->ds_object = dsobj;
383 388 ds->ds_phys = dbuf->db_data;
384 389
385 390 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
386 391 mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
387 392 mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
388 393 mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
389 394
390 395 rw_init(&ds->ds_rwlock, 0, 0, 0);
391 396 cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
392 397
393 398 bplist_create(&ds->ds_pending_deadlist);
394 399 dsl_deadlist_open(&ds->ds_deadlist,
395 400 mos, ds->ds_phys->ds_deadlist_obj);
396 401
397 402 list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
398 403 offsetof(dmu_sendarg_t, dsa_link));
399 404
400 405 if (err == 0) {
401 406 err = dsl_dir_open_obj(dp,
402 407 ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
403 408 }
404 409 if (err) {
405 410 mutex_destroy(&ds->ds_lock);
406 411 mutex_destroy(&ds->ds_recvlock);
407 412 mutex_destroy(&ds->ds_opening_lock);
408 413 rw_destroy(&ds->ds_rwlock);
409 414 cv_destroy(&ds->ds_exclusive_cv);
410 415 bplist_destroy(&ds->ds_pending_deadlist);
411 416 dsl_deadlist_close(&ds->ds_deadlist);
412 417 kmem_free(ds, sizeof (dsl_dataset_t));
413 418 dmu_buf_rele(dbuf, tag);
414 419 return (err);
415 420 }
416 421
417 422 if (!dsl_dataset_is_snapshot(ds)) {
418 423 ds->ds_snapname[0] = '\0';
419 424 if (ds->ds_phys->ds_prev_snap_obj) {
420 425 err = dsl_dataset_get_ref(dp,
421 426 ds->ds_phys->ds_prev_snap_obj,
422 427 ds, &ds->ds_prev);
423 428 }
424 429 } else {
425 430 if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
426 431 err = dsl_dataset_get_snapname(ds);
427 432 if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
428 433 err = zap_count(
429 434 ds->ds_dir->dd_pool->dp_meta_objset,
430 435 ds->ds_phys->ds_userrefs_obj,
431 436 &ds->ds_userrefs);
432 437 }
433 438 }
434 439
435 440 if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
436 441 /*
437 442 * In sync context, we're called with either no lock
438 443 * or with the write lock. If we're not syncing,
439 444 * we're always called with the read lock held.
440 445 */
441 446 boolean_t need_lock =
442 447 !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
443 448 dsl_pool_sync_context(dp);
444 449
445 450 if (need_lock)
446 451 rw_enter(&dp->dp_config_rwlock, RW_READER);
447 452
448 453 err = dsl_prop_get_ds(ds,
449 454 "refreservation", sizeof (uint64_t), 1,
450 455 &ds->ds_reserved, NULL);
451 456 if (err == 0) {
452 457 err = dsl_prop_get_ds(ds,
453 458 "refquota", sizeof (uint64_t), 1,
454 459 &ds->ds_quota, NULL);
455 460 }
456 461
457 462 if (need_lock)
458 463 rw_exit(&dp->dp_config_rwlock);
459 464 } else {
460 465 ds->ds_reserved = ds->ds_quota = 0;
461 466 }
462 467
463 468 if (err == 0) {
464 469 winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
465 470 dsl_dataset_evict);
466 471 }
467 472 if (err || winner) {
468 473 bplist_destroy(&ds->ds_pending_deadlist);
469 474 dsl_deadlist_close(&ds->ds_deadlist);
470 475 if (ds->ds_prev)
471 476 dsl_dataset_drop_ref(ds->ds_prev, ds);
472 477 dsl_dir_close(ds->ds_dir, ds);
473 478 mutex_destroy(&ds->ds_lock);
474 479 mutex_destroy(&ds->ds_recvlock);
475 480 mutex_destroy(&ds->ds_opening_lock);
476 481 rw_destroy(&ds->ds_rwlock);
477 482 cv_destroy(&ds->ds_exclusive_cv);
478 483 kmem_free(ds, sizeof (dsl_dataset_t));
479 484 if (err) {
480 485 dmu_buf_rele(dbuf, tag);
481 486 return (err);
482 487 }
483 488 ds = winner;
484 489 } else {
485 490 ds->ds_fsid_guid =
486 491 unique_insert(ds->ds_phys->ds_fsid_guid);
487 492 }
488 493 }
489 494 ASSERT3P(ds->ds_dbuf, ==, dbuf);
490 495 ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
491 496 ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
492 497 spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
493 498 dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
494 499 mutex_enter(&ds->ds_lock);
495 500 if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
496 501 mutex_exit(&ds->ds_lock);
497 502 dmu_buf_rele(ds->ds_dbuf, tag);
498 503 return (ENOENT);
499 504 }
500 505 mutex_exit(&ds->ds_lock);
501 506 *dsp = ds;
502 507 return (0);
503 508 }
504 509
505 510 static int
506 511 dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
507 512 {
508 513 dsl_pool_t *dp = ds->ds_dir->dd_pool;
509 514
510 515 /*
511 516 * In syncing context we don't want the rwlock: there
512 517 * may be an existing writer waiting for sync phase to
513 518 * finish. We don't need to worry about such writers, since
514 519 * sync phase is single-threaded, so the writer can't be
515 520 * doing anything while we are active.
516 521 */
517 522 if (dsl_pool_sync_context(dp)) {
518 523 ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
519 524 return (0);
520 525 }
521 526
522 527 /*
523 528 * Normal users will hold the ds_rwlock as a READER until they
524 529 * are finished (i.e., call dsl_dataset_rele()). "Owners" will
525 530 * drop their READER lock after they set the ds_owner field.
526 531 *
527 532 * If the dataset is being destroyed, the destroy thread will
528 533 * obtain a WRITER lock for exclusive access after it's done its
529 534 * open-context work and then change the ds_owner to
530 535 * dsl_reaper once destruction is assured. So threads
531 536 * may block here temporarily, until the "destructability" of
532 537 * the dataset is determined.
533 538 */
534 539 ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
535 540 mutex_enter(&ds->ds_lock);
536 541 while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
537 542 rw_exit(&dp->dp_config_rwlock);
538 543 cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
539 544 if (DSL_DATASET_IS_DESTROYED(ds)) {
540 545 mutex_exit(&ds->ds_lock);
541 546 dsl_dataset_drop_ref(ds, tag);
542 547 rw_enter(&dp->dp_config_rwlock, RW_READER);
543 548 return (ENOENT);
544 549 }
545 550 /*
546 551 * The dp_config_rwlock lives above the ds_lock. And
547 552 * we need to check DSL_DATASET_IS_DESTROYED() while
548 553 * holding the ds_lock, so we have to drop and reacquire
549 554 * the ds_lock here.
550 555 */
551 556 mutex_exit(&ds->ds_lock);
552 557 rw_enter(&dp->dp_config_rwlock, RW_READER);
553 558 mutex_enter(&ds->ds_lock);
554 559 }
555 560 mutex_exit(&ds->ds_lock);
556 561 return (0);
557 562 }
558 563
559 564 int
560 565 dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
561 566 dsl_dataset_t **dsp)
562 567 {
563 568 int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
564 569
565 570 if (err)
566 571 return (err);
567 572 return (dsl_dataset_hold_ref(*dsp, tag));
568 573 }
569 574
570 575 int
571 576 dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
572 577 void *tag, dsl_dataset_t **dsp)
573 578 {
574 579 int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
575 580 if (err)
576 581 return (err);
577 582 if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
578 583 dsl_dataset_rele(*dsp, tag);
579 584 *dsp = NULL;
580 585 return (EBUSY);
581 586 }
582 587 return (0);
583 588 }
584 589
585 590 int
586 591 dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
587 592 {
588 593 dsl_dir_t *dd;
589 594 dsl_pool_t *dp;
590 595 const char *snapname;
591 596 uint64_t obj;
592 597 int err = 0;
593 598
594 599 err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
595 600 if (err)
596 601 return (err);
597 602
598 603 dp = dd->dd_pool;
599 604 obj = dd->dd_phys->dd_head_dataset_obj;
600 605 rw_enter(&dp->dp_config_rwlock, RW_READER);
601 606 if (obj)
602 607 err = dsl_dataset_get_ref(dp, obj, tag, dsp);
603 608 else
604 609 err = ENOENT;
605 610 if (err)
606 611 goto out;
607 612
608 613 err = dsl_dataset_hold_ref(*dsp, tag);
609 614
610 615 /* we may be looking for a snapshot */
611 616 if (err == 0 && snapname != NULL) {
612 617 dsl_dataset_t *ds = NULL;
613 618
614 619 if (*snapname++ != '@') {
615 620 dsl_dataset_rele(*dsp, tag);
616 621 err = ENOENT;
617 622 goto out;
618 623 }
619 624
620 625 dprintf("looking for snapshot '%s'\n", snapname);
621 626 err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
622 627 if (err == 0)
623 628 err = dsl_dataset_get_ref(dp, obj, tag, &ds);
624 629 dsl_dataset_rele(*dsp, tag);
625 630
626 631 ASSERT3U((err == 0), ==, (ds != NULL));
627 632
628 633 if (ds) {
629 634 mutex_enter(&ds->ds_lock);
630 635 if (ds->ds_snapname[0] == 0)
631 636 (void) strlcpy(ds->ds_snapname, snapname,
632 637 sizeof (ds->ds_snapname));
633 638 mutex_exit(&ds->ds_lock);
634 639 err = dsl_dataset_hold_ref(ds, tag);
635 640 *dsp = err ? NULL : ds;
636 641 }
637 642 }
638 643 out:
639 644 rw_exit(&dp->dp_config_rwlock);
640 645 dsl_dir_close(dd, FTAG);
641 646 return (err);
642 647 }
643 648
644 649 int
645 650 dsl_dataset_own(const char *name, boolean_t inconsistentok,
646 651 void *tag, dsl_dataset_t **dsp)
647 652 {
648 653 int err = dsl_dataset_hold(name, tag, dsp);
649 654 if (err)
650 655 return (err);
651 656 if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
652 657 dsl_dataset_rele(*dsp, tag);
653 658 return (EBUSY);
654 659 }
655 660 return (0);
656 661 }
657 662
658 663 void
659 664 dsl_dataset_name(dsl_dataset_t *ds, char *name)
660 665 {
661 666 if (ds == NULL) {
662 667 (void) strcpy(name, "mos");
663 668 } else {
664 669 dsl_dir_name(ds->ds_dir, name);
665 670 VERIFY(0 == dsl_dataset_get_snapname(ds));
666 671 if (ds->ds_snapname[0]) {
667 672 (void) strcat(name, "@");
668 673 /*
669 674 * We use a "recursive" mutex so that we
670 675 * can call dprintf_ds() with ds_lock held.
671 676 */
672 677 if (!MUTEX_HELD(&ds->ds_lock)) {
673 678 mutex_enter(&ds->ds_lock);
674 679 (void) strcat(name, ds->ds_snapname);
675 680 mutex_exit(&ds->ds_lock);
676 681 } else {
677 682 (void) strcat(name, ds->ds_snapname);
678 683 }
679 684 }
680 685 }
681 686 }
682 687
683 688 static int
684 689 dsl_dataset_namelen(dsl_dataset_t *ds)
685 690 {
686 691 int result;
687 692
688 693 if (ds == NULL) {
689 694 result = 3; /* "mos" */
690 695 } else {
691 696 result = dsl_dir_namelen(ds->ds_dir);
692 697 VERIFY(0 == dsl_dataset_get_snapname(ds));
693 698 if (ds->ds_snapname[0]) {
694 699 ++result; /* adding one for the @-sign */
695 700 if (!MUTEX_HELD(&ds->ds_lock)) {
696 701 mutex_enter(&ds->ds_lock);
697 702 result += strlen(ds->ds_snapname);
698 703 mutex_exit(&ds->ds_lock);
699 704 } else {
700 705 result += strlen(ds->ds_snapname);
701 706 }
702 707 }
703 708 }
704 709
705 710 return (result);
706 711 }
707 712
708 713 void
709 714 dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
710 715 {
711 716 dmu_buf_rele(ds->ds_dbuf, tag);
712 717 }
713 718
714 719 void
715 720 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
716 721 {
717 722 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
718 723 rw_exit(&ds->ds_rwlock);
719 724 }
720 725 dsl_dataset_drop_ref(ds, tag);
721 726 }
722 727
723 728 void
724 729 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
725 730 {
726 731 ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
727 732 (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
728 733
729 734 mutex_enter(&ds->ds_lock);
730 735 ds->ds_owner = NULL;
731 736 if (RW_WRITE_HELD(&ds->ds_rwlock)) {
732 737 rw_exit(&ds->ds_rwlock);
733 738 cv_broadcast(&ds->ds_exclusive_cv);
734 739 }
735 740 mutex_exit(&ds->ds_lock);
736 741 if (ds->ds_dbuf)
737 742 dsl_dataset_drop_ref(ds, tag);
738 743 else
739 744 dsl_dataset_evict(NULL, ds);
740 745 }
741 746
742 747 boolean_t
743 748 dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
744 749 {
745 750 boolean_t gotit = FALSE;
746 751
747 752 mutex_enter(&ds->ds_lock);
748 753 if (ds->ds_owner == NULL &&
749 754 (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
750 755 ds->ds_owner = tag;
751 756 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
752 757 rw_exit(&ds->ds_rwlock);
753 758 gotit = TRUE;
754 759 }
755 760 mutex_exit(&ds->ds_lock);
756 761 return (gotit);
757 762 }
758 763
759 764 void
760 765 dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
761 766 {
762 767 ASSERT3P(owner, ==, ds->ds_owner);
763 768 if (!RW_WRITE_HELD(&ds->ds_rwlock))
764 769 rw_enter(&ds->ds_rwlock, RW_WRITER);
765 770 }
766 771
767 772 uint64_t
768 773 dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
769 774 uint64_t flags, dmu_tx_t *tx)
770 775 {
771 776 dsl_pool_t *dp = dd->dd_pool;
772 777 dmu_buf_t *dbuf;
773 778 dsl_dataset_phys_t *dsphys;
774 779 uint64_t dsobj;
775 780 objset_t *mos = dp->dp_meta_objset;
776 781
777 782 if (origin == NULL)
778 783 origin = dp->dp_origin_snap;
779 784
780 785 ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
781 786 ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
782 787 ASSERT(dmu_tx_is_syncing(tx));
783 788 ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
784 789
785 790 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
786 791 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
787 792 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
788 793 dmu_buf_will_dirty(dbuf, tx);
789 794 dsphys = dbuf->db_data;
790 795 bzero(dsphys, sizeof (dsl_dataset_phys_t));
791 796 dsphys->ds_dir_obj = dd->dd_object;
792 797 dsphys->ds_flags = flags;
793 798 dsphys->ds_fsid_guid = unique_create();
794 799 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
795 800 sizeof (dsphys->ds_guid));
796 801 dsphys->ds_snapnames_zapobj =
797 802 zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
798 803 DMU_OT_NONE, 0, tx);
799 804 dsphys->ds_creation_time = gethrestime_sec();
800 805 dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
801 806
802 807 if (origin == NULL) {
803 808 dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
804 809 } else {
805 810 dsl_dataset_t *ohds;
806 811
807 812 dsphys->ds_prev_snap_obj = origin->ds_object;
808 813 dsphys->ds_prev_snap_txg =
809 814 origin->ds_phys->ds_creation_txg;
810 815 dsphys->ds_referenced_bytes =
811 816 origin->ds_phys->ds_referenced_bytes;
812 817 dsphys->ds_compressed_bytes =
813 818 origin->ds_phys->ds_compressed_bytes;
814 819 dsphys->ds_uncompressed_bytes =
815 820 origin->ds_phys->ds_uncompressed_bytes;
816 821 dsphys->ds_bp = origin->ds_phys->ds_bp;
817 822 dsphys->ds_flags |= origin->ds_phys->ds_flags;
818 823
819 824 dmu_buf_will_dirty(origin->ds_dbuf, tx);
820 825 origin->ds_phys->ds_num_children++;
821 826
822 827 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
823 828 origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
824 829 dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
825 830 dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
826 831 dsl_dataset_rele(ohds, FTAG);
827 832
828 833 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
829 834 if (origin->ds_phys->ds_next_clones_obj == 0) {
830 835 origin->ds_phys->ds_next_clones_obj =
831 836 zap_create(mos,
832 837 DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
833 838 }
834 839 VERIFY(0 == zap_add_int(mos,
835 840 origin->ds_phys->ds_next_clones_obj,
836 841 dsobj, tx));
837 842 }
838 843
839 844 dmu_buf_will_dirty(dd->dd_dbuf, tx);
840 845 dd->dd_phys->dd_origin_obj = origin->ds_object;
841 846 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
842 847 if (origin->ds_dir->dd_phys->dd_clones == 0) {
843 848 dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
844 849 origin->ds_dir->dd_phys->dd_clones =
845 850 zap_create(mos,
846 851 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
847 852 }
848 853 VERIFY3U(0, ==, zap_add_int(mos,
849 854 origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
850 855 }
851 856 }
852 857
853 858 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
854 859 dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
855 860
856 861 dmu_buf_rele(dbuf, FTAG);
857 862
858 863 dmu_buf_will_dirty(dd->dd_dbuf, tx);
859 864 dd->dd_phys->dd_head_dataset_obj = dsobj;
860 865
861 866 return (dsobj);
862 867 }
863 868
864 869 uint64_t
865 870 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
866 871 dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
867 872 {
868 873 dsl_pool_t *dp = pdd->dd_pool;
869 874 uint64_t dsobj, ddobj;
870 875 dsl_dir_t *dd;
871 876
872 877 ASSERT(lastname[0] != '@');
873 878
874 879 ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
875 880 VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
876 881
877 882 dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);
878 883
879 884 dsl_deleg_set_create_perms(dd, tx, cr);
880 885
881 886 dsl_dir_close(dd, FTAG);
882 887
883 888 /*
884 889 * If we are creating a clone, make sure we zero out any stale
885 890 * data from the origin snapshots zil header.
886 891 */
887 892 if (origin != NULL) {
888 893 dsl_dataset_t *ds;
889 894 objset_t *os;
890 895
891 896 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
892 897 VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
893 898 bzero(&os->os_zil_header, sizeof (os->os_zil_header));
894 899 dsl_dataset_dirty(ds, tx);
895 900 dsl_dataset_rele(ds, FTAG);
896 901 }
897 902
898 903 return (dsobj);
899 904 }
900 905
901 906 /*
902 907 * The snapshots must all be in the same pool.
903 908 */
904 909 int
905 910 dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer,
906 911 nvlist_t *errlist)
907 912 {
908 913 int err;
909 914 dsl_sync_task_t *dst;
910 915 spa_t *spa;
911 916 nvpair_t *pair;
912 917 dsl_sync_task_group_t *dstg;
913 918
914 919 pair = nvlist_next_nvpair(snaps, NULL);
915 920 if (pair == NULL)
916 921 return (0);
917 922
918 923 err = spa_open(nvpair_name(pair), &spa, FTAG);
919 924 if (err)
920 925 return (err);
921 926 dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
922 927
923 928 for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
924 929 pair = nvlist_next_nvpair(snaps, pair)) {
925 930 dsl_dataset_t *ds;
926 931
927 932 err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
928 933 if (err == 0) {
929 934 struct dsl_ds_destroyarg *dsda;
930 935
931 936 dsl_dataset_make_exclusive(ds, dstg);
932 937 dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
933 938 KM_SLEEP);
934 939 dsda->ds = ds;
935 940 dsda->defer = defer;
936 941 dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
937 942 dsl_dataset_destroy_sync, dsda, dstg, 0);
938 943 } else if (err == ENOENT) {
939 944 err = 0;
940 945 } else {
941 946 fnvlist_add_int32(errlist, nvpair_name(pair), err);
942 947 break;
943 948 }
944 949 }
945 950
946 951 if (err == 0)
947 952 err = dsl_sync_task_group_wait(dstg);
948 953
949 954 for (dst = list_head(&dstg->dstg_tasks); dst;
950 955 dst = list_next(&dstg->dstg_tasks, dst)) {
951 956 struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
952 957 dsl_dataset_t *ds = dsda->ds;
953 958
954 959 /*
955 960 * Return the snapshots that triggered the error.
956 961 */
957 962 if (dst->dst_err != 0) {
958 963 char name[ZFS_MAXNAMELEN];
959 964 dsl_dataset_name(ds, name);
960 965 fnvlist_add_int32(errlist, name, dst->dst_err);
961 966 }
962 967 ASSERT3P(dsda->rm_origin, ==, NULL);
963 968 dsl_dataset_disown(ds, dstg);
964 969 kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
965 970 }
966 971
967 972 dsl_sync_task_group_destroy(dstg);
968 973 spa_close(spa, FTAG);
969 974 return (err);
970 975
971 976 }
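Callers hand this function an nvlist whose pair names are full snapshot names, all in the same pool (only nvpair_name() is consulted; the pair values are ignored). A minimal construction sketch using the fnvlist wrappers already used above, with illustrative names:

	nvlist_t *snaps = fnvlist_alloc();
	nvlist_t *errlist = fnvlist_alloc();
	int err;

	fnvlist_add_boolean(snaps, "tank/fs@snap1");
	fnvlist_add_boolean(snaps, "tank/fs@snap2");
	err = dmu_snapshots_destroy_nvl(snaps, B_FALSE, errlist);

	fnvlist_free(snaps);
	fnvlist_free(errlist);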
972 977
973 978 static boolean_t
974 979 dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
975 980 {
976 981 boolean_t might_destroy = B_FALSE;
977 982
978 983 mutex_enter(&ds->ds_lock);
979 984 if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
980 985 DS_IS_DEFER_DESTROY(ds))
981 986 might_destroy = B_TRUE;
982 987 mutex_exit(&ds->ds_lock);
983 988
984 989 return (might_destroy);
985 990 }
986 991
987 992 /*
988 993 * If we're removing a clone, and these three conditions are true:
989 994 * 1) the clone's origin has no other children
990 995 * 2) the clone's origin has no user references
991 996 * 3) the clone's origin has been marked for deferred destruction
992 997 * Then, prepare to remove the origin as part of this sync task group.
993 998 */
994 999 static int
995 1000 dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
996 1001 {
997 1002 dsl_dataset_t *ds = dsda->ds;
998 1003 dsl_dataset_t *origin = ds->ds_prev;
999 1004
1000 1005 if (dsl_dataset_might_destroy_origin(origin)) {
1001 1006 char *name;
1002 1007 int namelen;
1003 1008 int error;
1004 1009
1005 1010 namelen = dsl_dataset_namelen(origin) + 1;
1006 1011 name = kmem_alloc(namelen, KM_SLEEP);
1007 1012 dsl_dataset_name(origin, name);
1008 1013 #ifdef _KERNEL
1009 1014 error = zfs_unmount_snap(name, NULL);
1010 1015 if (error) {
1011 1016 kmem_free(name, namelen);
1012 1017 return (error);
1013 1018 }
1014 1019 #endif
1015 1020 error = dsl_dataset_own(name, B_TRUE, tag, &origin);
1016 1021 kmem_free(name, namelen);
1017 1022 if (error)
1018 1023 return (error);
1019 1024 dsda->rm_origin = origin;
1020 1025 dsl_dataset_make_exclusive(origin, tag);
1021 1026 }
1022 1027
1023 1028 return (0);
1024 1029 }
1025 1030
1026 1031 /*
1027 1032 * ds must be opened as OWNER. On return (whether successful or not),
1028 1033 * ds will be closed and caller can no longer dereference it.
1029 1034 */
1030 1035 int
1031 1036 dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
1032 1037 {
1033 1038 int err;
1034 1039 dsl_sync_task_group_t *dstg;
1035 1040 objset_t *os;
1036 1041 dsl_dir_t *dd;
1037 1042 uint64_t obj;
1038 1043 struct dsl_ds_destroyarg dsda = { 0 };
1039 1044
1040 1045 dsda.ds = ds;
1041 1046
1042 1047 if (dsl_dataset_is_snapshot(ds)) {
1043 1048 /* Destroying a snapshot is simpler */
1044 1049 dsl_dataset_make_exclusive(ds, tag);
1045 1050
1046 1051 dsda.defer = defer;
1047 1052 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
1048 1053 dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
1049 1054 &dsda, tag, 0);
1050 1055 ASSERT3P(dsda.rm_origin, ==, NULL);
1051 1056 goto out;
1052 1057 } else if (defer) {
1053 1058 err = EINVAL;
1054 1059 goto out;
1055 1060 }
1056 1061
1057 1062 dd = ds->ds_dir;
1058 1063
1059 1064 if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
1060 1065 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
1061 1066 /*
1062 1067 * Check for errors and mark this ds as inconsistent, in
1063 1068 * case we crash while freeing the objects.
1064 1069 */
1065 1070 err = dsl_sync_task_do(dd->dd_pool,
1066 1071 dsl_dataset_destroy_begin_check,
1067 1072 dsl_dataset_destroy_begin_sync, ds, NULL, 0);
1068 1073 if (err)
1069 1074 goto out;
1070 1075
1071 1076 err = dmu_objset_from_ds(ds, &os);
1072 1077 if (err)
1073 1078 goto out;
1074 1079
1075 1080 /*
1076 1081 * Remove all objects while in the open context so that
1077 1082 * there is less work to do in the syncing context.
1078 1083 */
1079 1084 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
1080 1085 ds->ds_phys->ds_prev_snap_txg)) {
1081 1086 /*
1082 1087 * Ignore errors, if there is not enough disk space
1083 1088 * we will deal with it in dsl_dataset_destroy_sync().
1084 1089 */
1085 1090 (void) dmu_free_object(os, obj);
1086 1091 }
1087 1092 if (err != ESRCH)
1088 1093 goto out;
1089 1094
1090 1095 /*
1091 1096 * Sync out all in-flight IO.
1092 1097 */
1093 1098 txg_wait_synced(dd->dd_pool, 0);
1094 1099
1095 1100 /*
1096 1101 * If we managed to free all the objects in open
1097 1102 * context, the user space accounting should be zero.
1098 1103 */
1099 1104 if (ds->ds_phys->ds_bp.blk_fill == 0 &&
1100 1105 dmu_objset_userused_enabled(os)) {
1101 1106 uint64_t count;
1102 1107
1103 1108 ASSERT(zap_count(os, DMU_USERUSED_OBJECT,
1104 1109 &count) != 0 || count == 0);
1105 1110 ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT,
1106 1111 &count) != 0 || count == 0);
1107 1112 }
1108 1113 }
1109 1114
1110 1115 rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
1111 1116 err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
1112 1117 rw_exit(&dd->dd_pool->dp_config_rwlock);
1113 1118
1114 1119 if (err)
1115 1120 goto out;
1116 1121
1117 1122 /*
1118 1123 * Blow away the dsl_dir + head dataset.
1119 1124 */
1120 1125 dsl_dataset_make_exclusive(ds, tag);
1121 1126 /*
1122 1127 * If we're removing a clone, we might also need to remove its
1123 1128 * origin.
1124 1129 */
1125 1130 do {
1126 1131 dsda.need_prep = B_FALSE;
1127 1132 if (dsl_dir_is_clone(dd)) {
1128 1133 err = dsl_dataset_origin_rm_prep(&dsda, tag);
1129 1134 if (err) {
1130 1135 dsl_dir_close(dd, FTAG);
1131 1136 goto out;
1132 1137 }
1133 1138 }
1134 1139
1135 1140 dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
1136 1141 dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
1137 1142 dsl_dataset_destroy_sync, &dsda, tag, 0);
1138 1143 dsl_sync_task_create(dstg, dsl_dir_destroy_check,
1139 - dsl_dir_destroy_sync, dd, FTAG, 0);
1144 + dsl_dir_destroy_sync, dd, tag, 0);
1140 1145 err = dsl_sync_task_group_wait(dstg);
1141 1146 dsl_sync_task_group_destroy(dstg);
1142 1147
1143 1148 /*
1144 1149 * We could be racing against 'zfs release' or 'zfs destroy -d'
1145 1150 * on the origin snap, in which case we can get EBUSY if we
1146 1151 * needed to destroy the origin snap but were not ready to
1147 1152 * do so.
1148 1153 */
1149 1154 if (dsda.need_prep) {
1150 1155 ASSERT(err == EBUSY);
1151 1156 ASSERT(dsl_dir_is_clone(dd));
1152 1157 ASSERT(dsda.rm_origin == NULL);
1153 1158 }
1154 1159 } while (dsda.need_prep);
1155 1160
1156 1161 if (dsda.rm_origin != NULL)
1157 1162 dsl_dataset_disown(dsda.rm_origin, tag);
1158 1163
1159 1164 /* if it is successful, dsl_dir_destroy_sync will close the dd */
1160 1165 if (err)
1161 1166 dsl_dir_close(dd, FTAG);
1162 1167 out:
1163 1168 dsl_dataset_disown(ds, tag);
1164 1169 return (err);
1165 1170 }
1166 1171
1167 1172 blkptr_t *
1168 1173 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
1169 1174 {
1170 1175 return (&ds->ds_phys->ds_bp);
1171 1176 }
1172 1177
1173 1178 void
1174 1179 dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
1175 1180 {
1176 1181 ASSERT(dmu_tx_is_syncing(tx));
1177 1182 /* If it's the meta-objset, set dp_meta_rootbp */
1178 1183 if (ds == NULL) {
1179 1184 tx->tx_pool->dp_meta_rootbp = *bp;
1180 1185 } else {
1181 1186 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1182 1187 ds->ds_phys->ds_bp = *bp;
1183 1188 }
1184 1189 }
1185 1190
1186 1191 spa_t *
1187 1192 dsl_dataset_get_spa(dsl_dataset_t *ds)
1188 1193 {
1189 1194 return (ds->ds_dir->dd_pool->dp_spa);
1190 1195 }
1191 1196
1192 1197 void
1193 1198 dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
1194 1199 {
1195 1200 dsl_pool_t *dp;
1196 1201
1197 1202 if (ds == NULL) /* this is the meta-objset */
1198 1203 return;
1199 1204
1200 1205 ASSERT(ds->ds_objset != NULL);
1201 1206
1202 1207 if (ds->ds_phys->ds_next_snap_obj != 0)
1203 1208 panic("dirtying snapshot!");
1204 1209
1205 1210 dp = ds->ds_dir->dd_pool;
1206 1211
1207 1212 if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
1208 1213 /* up the hold count until we can be written out */
1209 1214 dmu_buf_add_ref(ds->ds_dbuf, ds);
1210 1215 }
1211 1216 }
1212 1217
1213 1218 boolean_t
1214 1219 dsl_dataset_is_dirty(dsl_dataset_t *ds)
1215 1220 {
1216 1221 for (int t = 0; t < TXG_SIZE; t++) {
1217 1222 if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
1218 1223 ds, t))
1219 1224 return (B_TRUE);
1220 1225 }
1221 1226 return (B_FALSE);
1222 1227 }
1223 1228
1224 1229 /*
1225 1230 * The unique space in the head dataset can be calculated by subtracting
1226 1231 * the space used in the most recent snapshot, that is still being used
1227 1232 * in this file system, from the space currently in use. To figure out
1228 1233 * the space in the most recent snapshot still in use, we need to take
1229 1234 * the total space used in the snapshot and subtract out the space that
1230 1235 * has been freed up since the snapshot was taken.
1231 1236 */
1232 1237 static void
1233 1238 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
1234 1239 {
1235 1240 uint64_t mrs_used;
1236 1241 uint64_t dlused, dlcomp, dluncomp;
1237 1242
1238 1243 ASSERT(!dsl_dataset_is_snapshot(ds));
1239 1244
1240 1245 if (ds->ds_phys->ds_prev_snap_obj != 0)
1241 1246 mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
1242 1247 else
1243 1248 mrs_used = 0;
1244 1249
1245 1250 dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
1246 1251
1247 1252 ASSERT3U(dlused, <=, mrs_used);
1248 1253 ds->ds_phys->ds_unique_bytes =
1249 1254 ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
1250 1255
1251 1256 if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
1252 1257 SPA_VERSION_UNIQUE_ACCURATE)
1253 1258 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1254 1259 }
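The formula in the comment above, unique = referenced - (mrs_used - dlused), with hypothetical numbers: the head references 500 bytes, the most recent snapshot referenced 400 (mrs_used), and 150 of those bytes have since been freed onto the head's deadlist (dlused). The snapshot data still in use by the head is 400 - 150 = 250, so ds_unique_bytes becomes 500 - 250 = 250.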
1255 1260
1256 1261 struct killarg {
1257 1262 dsl_dataset_t *ds;
1258 1263 dmu_tx_t *tx;
1259 1264 };
1260 1265
1261 1266 /* ARGSUSED */
1262 1267 static int
1263 1268 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
1264 1269 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1265 1270 {
1266 1271 struct killarg *ka = arg;
1267 1272 dmu_tx_t *tx = ka->tx;
1268 1273
1269 1274 if (bp == NULL)
1270 1275 return (0);
1271 1276
1272 1277 if (zb->zb_level == ZB_ZIL_LEVEL) {
1273 1278 ASSERT(zilog != NULL);
1274 1279 /*
1275 1280 * It's a block in the intent log. It has no
1276 1281 * accounting, so just free it.
1277 1282 */
1278 1283 dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
1279 1284 } else {
1280 1285 ASSERT(zilog == NULL);
1281 1286 ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
1282 1287 (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
1283 1288 }
1284 1289
1285 1290 return (0);
1286 1291 }
1287 1292
1288 1293 /* ARGSUSED */
1289 1294 static int
1290 1295 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
1291 1296 {
1292 1297 dsl_dataset_t *ds = arg1;
1293 1298 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1294 1299 uint64_t count;
1295 1300 int err;
1296 1301
1297 1302 /*
1298 1303 * Can't delete a head dataset if there are snapshots of it.
1299 1304 * (Except if the only snapshots are from the branch we cloned
1300 1305 * from.)
1301 1306 */
1302 1307 if (ds->ds_prev != NULL &&
1303 1308 ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1304 1309 return (EBUSY);
1305 1310
1306 1311 /*
1307 1312 * This is really a dsl_dir thing, but check it here so that
1308 1313 * we'll be less likely to leave this dataset inconsistent &
1309 1314 * nearly destroyed.
1310 1315 */
1311 1316 err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
1312 1317 if (err)
1313 1318 return (err);
1314 1319 if (count != 0)
1315 1320 return (EEXIST);
1316 1321
1317 1322 return (0);
1318 1323 }
1319 1324
1320 1325 /* ARGSUSED */
1321 1326 static void
1322 1327 dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1323 1328 {
1324 1329 dsl_dataset_t *ds = arg1;
1325 1330
1326 1331 /* Mark it as inconsistent on-disk, in case we crash */
1327 1332 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1328 1333 ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
1329 1334
1330 1335 spa_history_log_internal_ds(ds, "destroy begin", tx, "");
1331 1336 }
1332 1337
1333 1338 static int
1334 1339 dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
1335 1340 dmu_tx_t *tx)
1336 1341 {
1337 1342 dsl_dataset_t *ds = dsda->ds;
1338 1343 dsl_dataset_t *ds_prev = ds->ds_prev;
1339 1344
1340 1345 if (dsl_dataset_might_destroy_origin(ds_prev)) {
1341 1346 struct dsl_ds_destroyarg ndsda = {0};
1342 1347
1343 1348 /*
1344 1349 * If we're not prepared to remove the origin, don't remove
1345 1350 * the clone either.
1346 1351 */
1347 1352 if (dsda->rm_origin == NULL) {
1348 1353 dsda->need_prep = B_TRUE;
1349 1354 return (EBUSY);
1350 1355 }
1351 1356
1352 1357 ndsda.ds = ds_prev;
1353 1358 ndsda.is_origin_rm = B_TRUE;
1354 1359 return (dsl_dataset_destroy_check(&ndsda, tag, tx));
1355 1360 }
1356 1361
1357 1362 /*
1358 1363 * If we're not going to remove the origin after all,
1359 1364 * undo the open context setup.
1360 1365 */
1361 1366 if (dsda->rm_origin != NULL) {
1362 1367 dsl_dataset_disown(dsda->rm_origin, tag);
1363 1368 dsda->rm_origin = NULL;
1364 1369 }
1365 1370
1366 1371 return (0);
1367 1372 }
1368 1373
1369 1374 /*
1370 1375 * If you add new checks here, you may need to add
1371 1376 * additional checks to the "temporary" case in
1372 1377 * snapshot_check() in dmu_objset.c.
1373 1378 */
1374 1379 /* ARGSUSED */
1375 1380 int
1376 1381 dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
1377 1382 {
1378 1383 struct dsl_ds_destroyarg *dsda = arg1;
1379 1384 dsl_dataset_t *ds = dsda->ds;
1380 1385
1381 1386 /* we have an owner hold, so no one else can destroy us */
1382 1387 ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
1383 1388
1384 1389 /*
1385 1390 * Only allow deferred destroy on pools that support it.
1386 1391 * NOTE: deferred destroy is only supported on snapshots.
1387 1392 */
1388 1393 if (dsda->defer) {
1389 1394 if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
1390 1395 SPA_VERSION_USERREFS)
1391 1396 return (ENOTSUP);
1392 1397 ASSERT(dsl_dataset_is_snapshot(ds));
1393 1398 return (0);
1394 1399 }
1395 1400
1396 1401 /*
1397 1402 * Can't delete a head dataset if there are snapshots of it.
1398 1403 * (Except if the only snapshots are from the branch we cloned
1399 1404 * from.)
1400 1405 */
1401 1406 if (ds->ds_prev != NULL &&
1402 1407 ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
1403 1408 return (EBUSY);
1404 1409
1405 1410 /*
1406 1411 * If we made changes this txg, traverse_dsl_dataset won't find
1407 1412 * them. Try again.
1408 1413 */
1409 1414 if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
1410 1415 return (EAGAIN);
1411 1416
1412 1417 if (dsl_dataset_is_snapshot(ds)) {
1413 1418 /*
1414 1419 * If this snapshot has an elevated user reference count,
1415 1420 * we can't destroy it yet.
1416 1421 */
1417 1422 if (ds->ds_userrefs > 0 && !dsda->releasing)
1418 1423 return (EBUSY);
1419 1424
1420 1425 mutex_enter(&ds->ds_lock);
1421 1426 /*
1422 1427 * Can't delete a branch point. However, if we're destroying
1423 1428 * a clone and removing its origin due to it having a user
1424 1429 * hold count of 0 and having been marked for deferred destroy,
1425 1430 * it's OK for the origin to have a single clone.
1426 1431 */
1427 1432 if (ds->ds_phys->ds_num_children >
1428 1433 (dsda->is_origin_rm ? 2 : 1)) {
1429 1434 mutex_exit(&ds->ds_lock);
1430 1435 return (EEXIST);
1431 1436 }
1432 1437 mutex_exit(&ds->ds_lock);
1433 1438 } else if (dsl_dir_is_clone(ds->ds_dir)) {
1434 1439 return (dsl_dataset_origin_check(dsda, arg2, tx));
1435 1440 }
1436 1441
1437 1442 /* XXX we should do some i/o error checking... */
1438 1443 return (0);
1439 1444 }
1440 1445
1441 1446 struct refsarg {
1442 1447 kmutex_t lock;
1443 1448 boolean_t gone;
1444 1449 kcondvar_t cv;
1445 1450 };
1446 1451
1447 1452 /* ARGSUSED */
1448 1453 static void
1449 1454 dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
1450 1455 {
1451 1456 struct refsarg *arg = argv;
1452 1457
1453 1458 mutex_enter(&arg->lock);
1454 1459 arg->gone = TRUE;
1455 1460 cv_signal(&arg->cv);
1456 1461 mutex_exit(&arg->lock);
1457 1462 }
1458 1463
1459 1464 static void
1460 1465 dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
1461 1466 {
1462 1467 struct refsarg arg;
1463 1468
1464 1469 mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
1465 1470 cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
1466 1471 arg.gone = FALSE;
1467 1472 (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
1468 1473 dsl_dataset_refs_gone);
1469 1474 dmu_buf_rele(ds->ds_dbuf, tag);
1470 1475 mutex_enter(&arg.lock);
1471 1476 while (!arg.gone)
1472 1477 cv_wait(&arg.cv, &arg.lock);
1473 1478 ASSERT(arg.gone);
1474 1479 mutex_exit(&arg.lock);
1475 1480 ds->ds_dbuf = NULL;
1476 1481 ds->ds_phys = NULL;
1477 1482 mutex_destroy(&arg.lock);
1478 1483 cv_destroy(&arg.cv);
1479 1484 }
1480 1485
1481 1486 static void
1482 1487 remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
1483 1488 {
1484 1489 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1485 1490 uint64_t count;
1486 1491 int err;
1487 1492
1488 1493 ASSERT(ds->ds_phys->ds_num_children >= 2);
1489 1494 err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
1490 1495 /*
1491 1496 * The err should not be ENOENT, but a bug in a previous version
1492 1497 * of the code could cause upgrade_clones_cb() to not set
1493 1498 * ds_next_snap_obj when it should, leading to a missing entry.
1494 1499 * If we knew that the pool was created after
1495 1500 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
1496 1501 * ENOENT. However, at least we can check that we don't have
1497 1502 * too many entries in the next_clones_obj even after failing to
1498 1503 * remove this one.
1499 1504 */
1500 1505 if (err != ENOENT) {
1501 1506 VERIFY0(err);
1502 1507 }
1503 1508 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
1504 1509 &count));
1505 1510 ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
1506 1511 }
1507 1512
1508 1513 static void
1509 1514 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
1510 1515 {
1511 1516 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1512 1517 zap_cursor_t zc;
1513 1518 zap_attribute_t za;
1514 1519
1515 1520 /*
1516 1521 * If it is the old version, dd_clones doesn't exist so we can't
1517 1522 * find the clones, but deadlist_remove_key() is a no-op so it
1518 1523 * doesn't matter.
1519 1524 */
1520 1525 if (ds->ds_dir->dd_phys->dd_clones == 0)
1521 1526 return;
1522 1527
1523 1528 for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
1524 1529 zap_cursor_retrieve(&zc, &za) == 0;
1525 1530 zap_cursor_advance(&zc)) {
1526 1531 dsl_dataset_t *clone;
1527 1532
1528 1533 VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1529 1534 za.za_first_integer, FTAG, &clone));
1530 1535 if (clone->ds_dir->dd_origin_txg > mintxg) {
1531 1536 dsl_deadlist_remove_key(&clone->ds_deadlist,
1532 1537 mintxg, tx);
1533 1538 dsl_dataset_remove_clones_key(clone, mintxg, tx);
1534 1539 }
1535 1540 dsl_dataset_rele(clone, FTAG);
1536 1541 }
1537 1542 zap_cursor_fini(&zc);
1538 1543 }
1539 1544
1540 1545 struct process_old_arg {
1541 1546 dsl_dataset_t *ds;
1542 1547 dsl_dataset_t *ds_prev;
1543 1548 boolean_t after_branch_point;
1544 1549 zio_t *pio;
1545 1550 uint64_t used, comp, uncomp;
1546 1551 };
1547 1552
1548 1553 static int
1549 1554 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1550 1555 {
1551 1556 struct process_old_arg *poa = arg;
1552 1557 dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
1553 1558
1554 1559 if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
1555 1560 dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
1556 1561 if (poa->ds_prev && !poa->after_branch_point &&
1557 1562 bp->blk_birth >
1558 1563 poa->ds_prev->ds_phys->ds_prev_snap_txg) {
1559 1564 poa->ds_prev->ds_phys->ds_unique_bytes +=
1560 1565 bp_get_dsize_sync(dp->dp_spa, bp);
1561 1566 }
1562 1567 } else {
1563 1568 poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
1564 1569 poa->comp += BP_GET_PSIZE(bp);
1565 1570 poa->uncomp += BP_GET_UCSIZE(bp);
1566 1571 dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
1567 1572 }
1568 1573 return (0);
1569 1574 }
1570 1575
1571 1576 static void
1572 1577 process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
1573 1578 dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
1574 1579 {
1575 1580 struct process_old_arg poa = { 0 };
1576 1581 dsl_pool_t *dp = ds->ds_dir->dd_pool;
1577 1582 objset_t *mos = dp->dp_meta_objset;
1578 1583
1579 1584 ASSERT(ds->ds_deadlist.dl_oldfmt);
1580 1585 ASSERT(ds_next->ds_deadlist.dl_oldfmt);
1581 1586
1582 1587 poa.ds = ds;
1583 1588 poa.ds_prev = ds_prev;
1584 1589 poa.after_branch_point = after_branch_point;
1585 1590 poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
1586 1591 VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
1587 1592 process_old_cb, &poa, tx));
1588 1593 VERIFY0(zio_wait(poa.pio));
1589 1594 ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
1590 1595
1591 1596 /* change snapused */
1592 1597 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1593 1598 -poa.used, -poa.comp, -poa.uncomp, tx);
1594 1599
1595 1600 /* swap next's deadlist to our deadlist */
1596 1601 dsl_deadlist_close(&ds->ds_deadlist);
1597 1602 dsl_deadlist_close(&ds_next->ds_deadlist);
1598 1603 SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
1599 1604 ds->ds_phys->ds_deadlist_obj);
1600 1605 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
1601 1606 dsl_deadlist_open(&ds_next->ds_deadlist, mos,
1602 1607 ds_next->ds_phys->ds_deadlist_obj);
1603 1608 }
1604 1609
1605 1610 static int
1606 1611 old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
1607 1612 {
1608 1613 int err;
1609 1614 struct killarg ka;
1610 1615
1611 1616 /*
1612 1617 * Free everything that we point to (that's born after
1613 1618 * the previous snapshot, if we are a clone)
1614 1619 *
1615 1620 * NB: this should be very quick, because we already
1616 1621 * freed all the objects in open context.
1617 1622 */
1618 1623 ka.ds = ds;
1619 1624 ka.tx = tx;
1620 1625 err = traverse_dataset(ds,
1621 1626 ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
1622 1627 kill_blkptr, &ka);
1623 1628 ASSERT0(err);
1624 1629 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
1625 1630
1626 1631 return (err);
1627 1632 }
1628 1633
1629 1634 void
1630 1635 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
1631 1636 {
1632 1637 struct dsl_ds_destroyarg *dsda = arg1;
1633 1638 dsl_dataset_t *ds = dsda->ds;
1634 1639 int err;
1635 1640 int after_branch_point = FALSE;
1636 1641 dsl_pool_t *dp = ds->ds_dir->dd_pool;
1637 1642 objset_t *mos = dp->dp_meta_objset;
1638 1643 dsl_dataset_t *ds_prev = NULL;
1639 1644 boolean_t wont_destroy;
1640 1645 uint64_t obj;
1641 1646
1642 1647 wont_destroy = (dsda->defer &&
1643 1648 (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
1644 1649
1645 1650 ASSERT(ds->ds_owner || wont_destroy);
1646 1651 ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
1647 1652 ASSERT(ds->ds_prev == NULL ||
1648 1653 ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
1649 1654 ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
1650 1655
1651 1656 if (wont_destroy) {
1652 1657 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
1653 1658 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1654 1659 ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
1655 1660 spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
1656 1661 return;
1657 1662 }
1658 1663
1659 1664 /* We need to log before removing it from the namespace. */
1660 1665 spa_history_log_internal_ds(ds, "destroy", tx, "");
1661 1666
1662 1667 /* signal any waiters that this dataset is going away */
1663 1668 mutex_enter(&ds->ds_lock);
1664 1669 ds->ds_owner = dsl_reaper;
1665 1670 cv_broadcast(&ds->ds_exclusive_cv);
1666 1671 mutex_exit(&ds->ds_lock);
1667 1672
1668 1673 /* Remove our reservation */
1669 1674 if (ds->ds_reserved != 0) {
1670 1675 dsl_prop_setarg_t psa;
1671 1676 uint64_t value = 0;
1672 1677
1673 1678 dsl_prop_setarg_init_uint64(&psa, "refreservation",
1674 1679 (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
1675 1680 &value);
1676 1681 psa.psa_effective_value = 0; /* predict default value */
1677 1682
1678 1683 dsl_dataset_set_reservation_sync(ds, &psa, tx);
1679 1684 ASSERT0(ds->ds_reserved);
1680 1685 }
1681 1686
1682 1687 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
1683 1688
1684 1689 dsl_scan_ds_destroyed(ds, tx);
1685 1690
1686 1691 obj = ds->ds_object;
1687 1692
1688 1693 if (ds->ds_phys->ds_prev_snap_obj != 0) {
1689 1694 if (ds->ds_prev) {
1690 1695 ds_prev = ds->ds_prev;
1691 1696 } else {
1692 1697 VERIFY(0 == dsl_dataset_hold_obj(dp,
1693 1698 ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
1694 1699 }
1695 1700 after_branch_point =
1696 1701 (ds_prev->ds_phys->ds_next_snap_obj != obj);
1697 1702
1698 1703 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
1699 1704 if (after_branch_point &&
1700 1705 ds_prev->ds_phys->ds_next_clones_obj != 0) {
1701 1706 remove_from_next_clones(ds_prev, obj, tx);
1702 1707 if (ds->ds_phys->ds_next_snap_obj != 0) {
1703 1708 VERIFY(0 == zap_add_int(mos,
1704 1709 ds_prev->ds_phys->ds_next_clones_obj,
1705 1710 ds->ds_phys->ds_next_snap_obj, tx));
1706 1711 }
1707 1712 }
1708 1713 if (after_branch_point &&
1709 1714 ds->ds_phys->ds_next_snap_obj == 0) {
1710 1715 /* This clone is toast. */
1711 1716 ASSERT(ds_prev->ds_phys->ds_num_children > 1);
1712 1717 ds_prev->ds_phys->ds_num_children--;
1713 1718
1714 1719 /*
1715 1720 * If the clone's origin has no other clones, no
1716 1721 * user holds, and has been marked for deferred
1717 1722 * deletion, then we should have done the necessary
1718 1723 * destroy setup for it.
1719 1724 */
1720 1725 if (ds_prev->ds_phys->ds_num_children == 1 &&
1721 1726 ds_prev->ds_userrefs == 0 &&
1722 1727 DS_IS_DEFER_DESTROY(ds_prev)) {
1723 1728 ASSERT3P(dsda->rm_origin, !=, NULL);
1724 1729 } else {
1725 1730 ASSERT3P(dsda->rm_origin, ==, NULL);
1726 1731 }
1727 1732 } else if (!after_branch_point) {
1728 1733 ds_prev->ds_phys->ds_next_snap_obj =
1729 1734 ds->ds_phys->ds_next_snap_obj;
1730 1735 }
1731 1736 }
1732 1737
1733 1738 if (dsl_dataset_is_snapshot(ds)) {
1734 1739 dsl_dataset_t *ds_next;
1735 1740 uint64_t old_unique;
1736 1741 uint64_t used = 0, comp = 0, uncomp = 0;
1737 1742
1738 1743 VERIFY(0 == dsl_dataset_hold_obj(dp,
1739 1744 ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
1740 1745 ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
1741 1746
1742 1747 old_unique = ds_next->ds_phys->ds_unique_bytes;
1743 1748
1744 1749 dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
1745 1750 ds_next->ds_phys->ds_prev_snap_obj =
1746 1751 ds->ds_phys->ds_prev_snap_obj;
1747 1752 ds_next->ds_phys->ds_prev_snap_txg =
1748 1753 ds->ds_phys->ds_prev_snap_txg;
1749 1754 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1750 1755 ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
1751 1756
1752 1757
1753 1758 if (ds_next->ds_deadlist.dl_oldfmt) {
1754 1759 process_old_deadlist(ds, ds_prev, ds_next,
1755 1760 after_branch_point, tx);
1756 1761 } else {
1757 1762 /* Adjust prev's unique space. */
1758 1763 if (ds_prev && !after_branch_point) {
1759 1764 dsl_deadlist_space_range(&ds_next->ds_deadlist,
1760 1765 ds_prev->ds_phys->ds_prev_snap_txg,
1761 1766 ds->ds_phys->ds_prev_snap_txg,
1762 1767 &used, &comp, &uncomp);
1763 1768 ds_prev->ds_phys->ds_unique_bytes += used;
1764 1769 }
1765 1770
1766 1771 /* Adjust snapused. */
1767 1772 dsl_deadlist_space_range(&ds_next->ds_deadlist,
1768 1773 ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
1769 1774 &used, &comp, &uncomp);
1770 1775 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1771 1776 -used, -comp, -uncomp, tx);
1772 1777
1773 1778 /* Move blocks to be freed to pool's free list. */
1774 1779 dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
1775 1780 &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
1776 1781 tx);
1777 1782 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
1778 1783 DD_USED_HEAD, used, comp, uncomp, tx);
1779 1784
1780 1785 /* Merge our deadlist into next's and free it. */
1781 1786 dsl_deadlist_merge(&ds_next->ds_deadlist,
1782 1787 ds->ds_phys->ds_deadlist_obj, tx);
1783 1788 }
1784 1789 dsl_deadlist_close(&ds->ds_deadlist);
1785 1790 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1786 1791
1787 1792 /* Collapse range in clone heads */
1788 1793 dsl_dataset_remove_clones_key(ds,
1789 1794 ds->ds_phys->ds_creation_txg, tx);
1790 1795
1791 1796 if (dsl_dataset_is_snapshot(ds_next)) {
1792 1797 dsl_dataset_t *ds_nextnext;
1793 1798
1794 1799 /*
1795 1800 * Update next's unique to include blocks which
1796 1801 * were previously shared by only this snapshot
1797 1802 * and it. Those blocks will be born after the
1798 1803 * prev snap and before this snap, and will have
1799 1804 * died after the next snap and before the one
1800 1805 * after that (ie. be on the snap after next's
1801 1806 * deadlist).
1802 1807 */
1803 1808 VERIFY(0 == dsl_dataset_hold_obj(dp,
1804 1809 ds_next->ds_phys->ds_next_snap_obj,
1805 1810 FTAG, &ds_nextnext));
1806 1811 dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
1807 1812 ds->ds_phys->ds_prev_snap_txg,
1808 1813 ds->ds_phys->ds_creation_txg,
1809 1814 &used, &comp, &uncomp);
1810 1815 ds_next->ds_phys->ds_unique_bytes += used;
1811 1816 dsl_dataset_rele(ds_nextnext, FTAG);
1812 1817 ASSERT3P(ds_next->ds_prev, ==, NULL);
1813 1818
1814 1819 /* Collapse range in this head. */
1815 1820 dsl_dataset_t *hds;
1816 1821 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
1817 1822 ds->ds_dir->dd_phys->dd_head_dataset_obj,
1818 1823 FTAG, &hds));
1819 1824 dsl_deadlist_remove_key(&hds->ds_deadlist,
1820 1825 ds->ds_phys->ds_creation_txg, tx);
1821 1826 dsl_dataset_rele(hds, FTAG);
1822 1827
1823 1828 } else {
1824 1829 ASSERT3P(ds_next->ds_prev, ==, ds);
1825 1830 dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
1826 1831 ds_next->ds_prev = NULL;
1827 1832 if (ds_prev) {
1828 1833 VERIFY(0 == dsl_dataset_get_ref(dp,
1829 1834 ds->ds_phys->ds_prev_snap_obj,
1830 1835 ds_next, &ds_next->ds_prev));
1831 1836 }
1832 1837
1833 1838 dsl_dataset_recalc_head_uniq(ds_next);
1834 1839
1835 1840 /*
1836 1841 * Reduce the amount of our unconsumed refreservation
1837 1842 * being charged to our parent by the amount of
1838 1843 * new unique data we have gained.
1839 1844 */
1840 1845 if (old_unique < ds_next->ds_reserved) {
1841 1846 int64_t mrsdelta;
1842 1847 uint64_t new_unique =
1843 1848 ds_next->ds_phys->ds_unique_bytes;
1844 1849
1845 1850 ASSERT(old_unique <= new_unique);
1846 1851 mrsdelta = MIN(new_unique - old_unique,
1847 1852 ds_next->ds_reserved - old_unique);
1848 1853 dsl_dir_diduse_space(ds->ds_dir,
1849 1854 DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
1850 1855 }
1851 1856 }
1852 1857 dsl_dataset_rele(ds_next, FTAG);
1853 1858 } else {
1854 1859 zfeature_info_t *async_destroy =
1855 1860 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
1856 1861 objset_t *os;
1857 1862
1858 1863 /*
1859 1864 * There's no next snapshot, so this is a head dataset.
1860 1865 * Destroy the deadlist. Unless it's a clone, the
1861 1866 * deadlist should be empty. (If it's a clone, it's
1862 1867 * safe to ignore the deadlist contents.)
1863 1868 */
1864 1869 dsl_deadlist_close(&ds->ds_deadlist);
1865 1870 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1866 1871 ds->ds_phys->ds_deadlist_obj = 0;
1867 1872
1868 1873 VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os));
1869 1874
1870 1875 if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
1871 1876 err = old_synchronous_dataset_destroy(ds, tx);
1872 1877 } else {
1873 1878 /*
1874 1879 * Move the bptree into the pool's list of trees to
1875 1880 * clean up and update space accounting information.
1876 1881 */
1877 1882 uint64_t used, comp, uncomp;
1878 1883
1879 1884 zil_destroy_sync(dmu_objset_zil(os), tx);
1880 1885
1881 1886 if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
1882 1887 spa_feature_incr(dp->dp_spa, async_destroy, tx);
1883 1888 dp->dp_bptree_obj = bptree_alloc(mos, tx);
1884 1889 VERIFY(zap_add(mos,
1885 1890 DMU_POOL_DIRECTORY_OBJECT,
1886 1891 DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
1887 1892 &dp->dp_bptree_obj, tx) == 0);
1888 1893 }
1889 1894
1890 1895 used = ds->ds_dir->dd_phys->dd_used_bytes;
1891 1896 comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
1892 1897 uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
1893 1898
1894 1899 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
1895 1900 ds->ds_phys->ds_unique_bytes == used);
1896 1901
1897 1902 bptree_add(mos, dp->dp_bptree_obj,
1898 1903 &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
1899 1904 used, comp, uncomp, tx);
1900 1905 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
1901 1906 -used, -comp, -uncomp, tx);
1902 1907 dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
1903 1908 used, comp, uncomp, tx);
1904 1909 }
1905 1910
1906 1911 if (ds->ds_prev != NULL) {
1907 1912 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
1908 1913 VERIFY3U(0, ==, zap_remove_int(mos,
1909 1914 ds->ds_prev->ds_dir->dd_phys->dd_clones,
1910 1915 ds->ds_object, tx));
1911 1916 }
1912 1917 dsl_dataset_rele(ds->ds_prev, ds);
1913 1918 ds->ds_prev = ds_prev = NULL;
1914 1919 }
1915 1920 }
1916 1921
1917 1922 /*
1918 1923 * This must be done after the dsl_traverse(), because it will
1919 1924 * re-open the objset.
1920 1925 */
1921 1926 if (ds->ds_objset) {
1922 1927 dmu_objset_evict(ds->ds_objset);
1923 1928 ds->ds_objset = NULL;
1924 1929 }
1925 1930
1926 1931 if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
1927 1932 /* Erase the link in the dir */
1928 1933 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1929 1934 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
1930 1935 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
1931 1936 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
1932 1937 ASSERT(err == 0);
1933 1938 } else {
1934 1939 /* remove from snapshot namespace */
1935 1940 dsl_dataset_t *ds_head;
1936 1941 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
1937 1942 VERIFY(0 == dsl_dataset_hold_obj(dp,
1938 1943 ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
1939 1944 VERIFY(0 == dsl_dataset_get_snapname(ds));
1940 1945 #ifdef ZFS_DEBUG
1941 1946 {
1942 1947 uint64_t val;
1943 1948
1944 1949 err = dsl_dataset_snap_lookup(ds_head,
1945 1950 ds->ds_snapname, &val);
1946 1951 ASSERT0(err);
1947 1952 ASSERT3U(val, ==, obj);
1948 1953 }
1949 1954 #endif
1950 1955 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
1951 1956 ASSERT(err == 0);
1952 1957 dsl_dataset_rele(ds_head, FTAG);
1953 1958 }
1954 1959
1955 1960 if (ds_prev && ds->ds_prev != ds_prev)
1956 1961 dsl_dataset_rele(ds_prev, FTAG);
1957 1962
1958 1963 spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
1959 1964
1960 1965 if (ds->ds_phys->ds_next_clones_obj != 0) {
1961 1966 uint64_t count;
1962 1967 ASSERT(0 == zap_count(mos,
1963 1968 ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
1964 1969 VERIFY(0 == dmu_object_free(mos,
1965 1970 ds->ds_phys->ds_next_clones_obj, tx));
1966 1971 }
1967 1972 if (ds->ds_phys->ds_props_obj != 0)
1968 1973 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
1969 1974 if (ds->ds_phys->ds_userrefs_obj != 0)
1970 1975 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
1971 1976 dsl_dir_close(ds->ds_dir, ds);
1972 1977 ds->ds_dir = NULL;
1973 1978 dsl_dataset_drain_refs(ds, tag);
1974 1979 VERIFY(0 == dmu_object_free(mos, obj, tx));
1975 1980
1976 1981 if (dsda->rm_origin) {
1977 1982 /*
1978 1983 * Remove the origin of the clone we just destroyed.
1979 1984 */
1980 1985 struct dsl_ds_destroyarg ndsda = {0};
1981 1986
1982 1987 ndsda.ds = dsda->rm_origin;
1983 1988 dsl_dataset_destroy_sync(&ndsda, tag, tx);
1984 1989 }
1985 1990 }
1986 1991
1987 1992 static int
1988 1993 dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
1989 1994 {
1990 1995 uint64_t asize;
1991 1996
1992 1997 if (!dmu_tx_is_syncing(tx))
1993 1998 return (0);
1994 1999
1995 2000 /*
1996 2001 * If there's an fs-only reservation, any blocks that might become
1997 2002 * owned by the snapshot dataset must be accommodated by space
1998 2003 * outside of the reservation.
1999 2004 */
2000 2005 ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
2001 2006 asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2002 2007 if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
2003 2008 return (ENOSPC);
2004 2009
2005 2010 /*
2006 2011 * Propagate any reserved space for this snapshot to other
2007 2012 * snapshot checks in this sync group.
2008 2013 */
2009 2014 if (asize > 0)
2010 2015 dsl_dir_willuse_space(ds->ds_dir, asize, tx);
2011 2016
2012 2017 return (0);
2013 2018 }
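
To make the reservation check above concrete: only the dataset's unique bytes can become owned by the new snapshot, and only the portion covered by the refreservation needs fresh headroom. A minimal user-space sketch with made-up sizes (toy values and a local MIN macro, not the kernel's types):

	#include <stdint.h>
	#include <stdio.h>

	#define	MIN(a, b)	((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		/* Toy sizes: 3 GiB unique, 5 GiB refreservation, 2 GiB free. */
		uint64_t unique_bytes = 3ULL << 30;
		uint64_t reserved = 5ULL << 30;
		uint64_t avail = 2ULL << 30;

		/* Same computation as dsl_dataset_snapshot_reserve_space(). */
		uint64_t asize = MIN(unique_bytes, reserved);

		if (asize > avail)	/* here: 3 GiB > 2 GiB, so ENOSPC */
			(void) printf("ENOSPC\n");
		return (0);
	}
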
2014 2019
2020 +/*
2021 + * Check if adding additional snapshot(s) would exceed any snapshot limits.
2022 + * Note that all snapshot limits up to the root dataset (i.e. the pool itself)
2023 + * or the given ancestor must be satisfied. However, it is valid for the
2024 + * count to exceed the limit; this can happen if a snapshot is taken by an
2025 + * administrative user in the global zone (e.g. a recursive snapshot by root).
2026 + */
2015 2027 int
2028 +dsl_snapcount_check(dsl_dir_t *dd, uint64_t cnt, dsl_dir_t *ancestor)
2029 +{
2030 + uint64_t limit;
2031 + int err = 0;
2032 +
2033 + /*
2034 + * The limit is never enforced for the admin user in the global zone.
2035 + * If we're not in the global zone then we need to run this check in
2036 + * open context, since that's when we know what zone we're in and
2037 + * syncing is only performed in the global zone.
2038 + */
2039 + if (INGLOBALZONE(curproc))
2040 + return (0);
2041 +
2042 + /*
2043 + * If renaming a dataset with no snapshots, count adjustment is 0.
2044 + */
2045 + if (cnt == 0)
2046 + return (0);
2047 +
2048 + /*
2049 + * If an ancestor has been provided, stop checking the limit once we
2050 + * hit that dir. We need this during rename so that we don't overcount
2051 + * the check once we recurse up to the common ancestor.
2052 + */
2053 + if (ancestor == dd)
2054 + return (0);
2055 +
2056 + /*
2057 + * If we hit an uninitialized node while recursing up the tree, we can
2058 + * stop since we know the counts are not valid on this node and we
2059 + * know we won't touch this node's counts.
2060 + */
2061 + if (dd->dd_phys->dd_filesystem_count == 0)
2062 + return (0);
2063 +
2064 + /*
2065 + * If there's no value for this property, there's no need to enforce a
2066 + * snapshot limit.
2067 + */
2068 + err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_SNAPSHOT_LIMIT),
2069 + 8, 1, &limit, NULL, B_FALSE);
2070 + if (err == ENOENT)
2071 + return (0);
2072 + else if (err != 0)
2073 + return (err);
2074 +
2075 +#ifdef _KERNEL
2076 + extern void __dtrace_probe_zfs__ss__limit(uint64_t, uint64_t, char *);
2077 + __dtrace_probe_zfs__ss__limit(
2078 + (uint64_t)dd->dd_phys->dd_snapshot_count, (uint64_t)limit,
2079 + dd->dd_myname);
2080 +#endif
2081 +
2082 + if (limit != MAXLIMIT &&
2083 + (dd->dd_phys->dd_snapshot_count + cnt) > limit)
2084 + return (EDQUOT);
2085 +
2086 + if (dd->dd_parent != NULL)
2087 + err = dsl_snapcount_check(dd->dd_parent, cnt, ancestor);
2088 +
2089 + return (err);
2090 +}
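
The recursion above only ever walks rootward, so a check costs one property lookup per ancestor. A self-contained user-space sketch of the same walk, where toy_dir, toy_snapcount_check, and TOY_MAXLIMIT are stand-ins (not the kernel's dsl_dir_t or MAXLIMIT), the loop replaces the tail recursion, and the zone test and dsl_prop_get_dd() lookup are omitted:

	#include <errno.h>
	#include <stddef.h>
	#include <stdint.h>

	#define	TOY_MAXLIMIT	UINT64_MAX	/* stand-in for MAXLIMIT */

	struct toy_dir {
		struct toy_dir *parent;
		uint64_t fs_count;	/* dd_filesystem_count */
		uint64_t snap_count;	/* dd_snapshot_count */
		uint64_t snap_limit;	/* effective snapshot_limit */
	};

	static int
	toy_snapcount_check(struct toy_dir *dd, uint64_t cnt,
	    struct toy_dir *ancestor)
	{
		for (; dd != NULL && dd != ancestor; dd = dd->parent) {
			if (dd->fs_count == 0)	/* counts not maintained */
				return (0);
			if (dd->snap_limit != TOY_MAXLIMIT &&
			    dd->snap_count + cnt > dd->snap_limit)
				return (EDQUOT);
		}
		return (0);
	}

	int
	main(void)
	{
		/* tank -> tank/a -> tank/a/b, with snapshot_limit=10 on a. */
		struct toy_dir tank = { NULL, 1, 9, TOY_MAXLIMIT };
		struct toy_dir a = { &tank, 1, 9, 10 };
		struct toy_dir b = { &a, 1, 3, TOY_MAXLIMIT };

		/* Two more snapshots of tank/a/b would trip tank/a's limit. */
		return (toy_snapcount_check(&b, 2, NULL) == EDQUOT ? 0 : 1);
	}
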
2091 +
2092 +/*
2093 + * Adjust the snapshot count for the specified dsl_dir_t and all parents.
2094 + * When a new snapshot is created, increment the count on all parents, and when
2095 + * a snapshot is destroyed, decrement the count.
2096 + */
2097 +void
2098 +dsl_snapcount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
2099 + boolean_t first)
2100 +{
2101 + /*
2102 + * If we hit an uninitialized node while recursing up the tree, we can
2103 + * stop since we know the counts are not valid on this node and we
2104 + * know we shouldn't touch this node's counts. An uninitialized count
2105 + * on the node indicates that either the feature has not yet been
2106 + * activated or there are no limits on this part of the tree.
2107 + */
2108 + if (dd->dd_phys->dd_filesystem_count == 0)
2109 + return;
2110 +
2111 + /*
2112 + * The feature might have previously been active, so there could be
2113 + * non-0 counts on the nodes, but it might now be inactive.
2114 + *
2115 + * On initial entry we need to check if this feature is active, but
2116 + * we don't want to re-check this on each recursive call. Note: the
2117 + * feature cannot be active if it's not enabled. If the feature is not
2118 + * active, don't touch the on-disk count fields.
2119 + */
2120 + if (first) {
2121 + dsl_dataset_t *ds = NULL;
2122 + spa_t *spa;
2123 + zfeature_info_t *quota_feat =
2124 + &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];
2125 +
2126 + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
2127 + dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
2128 + spa = dsl_dataset_get_spa(ds);
2129 + dsl_dataset_rele(ds, FTAG);
2130 + if (!spa_feature_is_active(spa, quota_feat))
2131 + return;
2132 + }
2133 +
2134 + /*
2135 + * As with dsl_dataset_set_reservation_check(), we don't want to run
2136 + * this check in open context.
2137 + */
2138 + if (!dmu_tx_is_syncing(tx))
2139 + return;
2140 +
2141 + /* if renaming a dataset with no snapshots, count adjustment is 0 */
2142 + if (delta == 0)
2143 + return;
2144 +
2145 + /*
2146 + * If we hit an uninitialized node while recursing up the tree, we can
2147 + * stop since we know the counts are not valid on this node and we
2148 + * know we shouldn't touch this node's counts.
2149 + */
2150 + if (dd->dd_phys->dd_filesystem_count == 0)
2151 + return;
2152 +
2153 + /* Increment count for parent */
2154 + dmu_buf_will_dirty(dd->dd_dbuf, tx);
2155 +
2156 + mutex_enter(&dd->dd_lock);
2157 +
2158 + dd->dd_phys->dd_snapshot_count += delta;
2159 +
2160 + /* Roll up this additional count into our ancestors */
2161 + if (dd->dd_parent != NULL)
2162 + dsl_snapcount_adjust(dd->dd_parent, tx, delta, B_FALSE);
2163 +
2164 + mutex_exit(&dd->dd_lock);
2165 +}
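
A note on the design: the snapshot count is denormalized onto every ancestor rather than computed on demand, so dsl_snapcount_check() costs one lookup per level of the tree instead of requiring a scan of descendants at check time. The trade-off is that dsl_snapcount_adjust() must dirty each ancestor's dd_phys in the same transaction, and the feature-active test is made only on the initial call (first == B_TRUE) so it isn't repeated at every level of the recursion.
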
2166 +
2167 +int
2016 2168 dsl_dataset_snapshot_check(dsl_dataset_t *ds, const char *snapname,
2017 - dmu_tx_t *tx)
2169 + uint64_t cnt, dmu_tx_t *tx)
2018 2170 {
2019 2171 int err;
2020 2172 uint64_t value;
2021 2173
2022 2174 /*
2023 2175 * We don't allow multiple snapshots of the same txg. If there
2024 2176 * is already one, try again.
2025 2177 */
2026 2178 if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
2027 2179 return (EAGAIN);
2028 2180
2029 2181 /*
2030 2182 * Check for conflicting snapshot name.
2031 2183 */
2032 2184 err = dsl_dataset_snap_lookup(ds, snapname, &value);
2033 2185 if (err == 0)
2034 2186 return (EEXIST);
2035 2187 if (err != ENOENT)
2036 2188 return (err);
2037 2189
2038 2190 /*
2039 2191 * Check that the dataset's name is not too long. Name consists
2040 2192 * of the dataset's length + 1 for the @-sign + snapshot name's length
2041 2193 */
2042 2194 if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
2043 2195 return (ENAMETOOLONG);
2044 2196
2197 + err = dsl_snapcount_check(ds->ds_dir, cnt, NULL);
2198 + if (err)
2199 + return (err);
2200 +
2045 2201 err = dsl_dataset_snapshot_reserve_space(ds, tx);
2046 2202 if (err)
2047 2203 return (err);
2048 2204
2049 2205 ds->ds_trysnap_txg = tx->tx_txg;
2050 2206 return (0);
2051 2207 }
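
A sketch of how a caller might surface the distinct errors this check can now return. The helper and its message strings are illustrative only, not from the codebase; per the comment above, EAGAIN in particular is meant to be retried rather than reported:

	#include <errno.h>
	#include <stdio.h>

	static const char *
	snap_check_strerror(int err)
	{
		switch (err) {
		case 0:
			return ("ok");
		case EAGAIN:
			return ("snapshot already taken this txg; retry");
		case EEXIST:
			return ("snapshot name already in use");
		case ENAMETOOLONG:
			return ("dataset@snapname would exceed MAXNAMELEN");
		case EDQUOT:
			return ("snapshot_limit reached on an ancestor");
		case ENOSPC:
			return ("refreservation headroom unavailable");
		default:
			return ("unexpected error");
		}
	}

	int
	main(void)
	{
		(void) printf("%s\n", snap_check_strerror(EDQUOT));
		return (0);
	}
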
2052 2208
2053 2209 void
2054 2210 dsl_dataset_snapshot_sync(dsl_dataset_t *ds, const char *snapname,
2055 2211 dmu_tx_t *tx)
2056 2212 {
2057 2213 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2058 2214 dmu_buf_t *dbuf;
2059 2215 dsl_dataset_phys_t *dsphys;
2060 2216 uint64_t dsobj, crtxg;
2061 2217 objset_t *mos = dp->dp_meta_objset;
2062 2218 int err;
2063 2219
2064 2220 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
2065 2221
2222 + dsl_snapcount_adjust(ds->ds_dir, tx, 1, B_TRUE);
2223 +
2066 2224 /*
2067 2225 * The origin's ds_creation_txg has to be < TXG_INITIAL
2068 2226 */
2069 2227 if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
2070 2228 crtxg = 1;
2071 2229 else
2072 2230 crtxg = tx->tx_txg;
2073 2231
2074 2232 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
2075 2233 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
2076 2234 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
2077 2235 dmu_buf_will_dirty(dbuf, tx);
2078 2236 dsphys = dbuf->db_data;
2079 2237 bzero(dsphys, sizeof (dsl_dataset_phys_t));
2080 2238 dsphys->ds_dir_obj = ds->ds_dir->dd_object;
2081 2239 dsphys->ds_fsid_guid = unique_create();
2082 2240 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
2083 2241 sizeof (dsphys->ds_guid));
2084 2242 dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
2085 2243 dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
2086 2244 dsphys->ds_next_snap_obj = ds->ds_object;
2087 2245 dsphys->ds_num_children = 1;
2088 2246 dsphys->ds_creation_time = gethrestime_sec();
2089 2247 dsphys->ds_creation_txg = crtxg;
2090 2248 dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
2091 2249 dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
2092 2250 dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
2093 2251 dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
2094 2252 dsphys->ds_flags = ds->ds_phys->ds_flags;
2095 2253 dsphys->ds_bp = ds->ds_phys->ds_bp;
2096 2254 dmu_buf_rele(dbuf, FTAG);
2097 2255
2098 2256 ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
2099 2257 if (ds->ds_prev) {
2100 2258 uint64_t next_clones_obj =
2101 2259 ds->ds_prev->ds_phys->ds_next_clones_obj;
2102 2260 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
2103 2261 ds->ds_object ||
2104 2262 ds->ds_prev->ds_phys->ds_num_children > 1);
2105 2263 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
2106 2264 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
2107 2265 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
2108 2266 ds->ds_prev->ds_phys->ds_creation_txg);
2109 2267 ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
2110 2268 } else if (next_clones_obj != 0) {
2111 2269 remove_from_next_clones(ds->ds_prev,
2112 2270 dsphys->ds_next_snap_obj, tx);
2113 2271 VERIFY3U(0, ==, zap_add_int(mos,
2114 2272 next_clones_obj, dsobj, tx));
2115 2273 }
2116 2274 }
2117 2275
2118 2276 /*
2119 2277 * If we have a reference-reservation on this dataset, we will
2120 2278 * need to increase the amount of refreservation being charged
2121 2279 * since our unique space is going to zero.
2122 2280 */
2123 2281 if (ds->ds_reserved) {
2124 2282 int64_t delta;
2125 2283 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
2126 2284 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
2127 2285 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
2128 2286 delta, 0, 0, tx);
2129 2287 }
2130 2288
2131 2289 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2132 2290 zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
2133 2291 ds->ds_dir->dd_myname, snapname, dsobj,
2134 2292 ds->ds_phys->ds_prev_snap_txg);
2135 2293 ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
2136 2294 UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
2137 2295 dsl_deadlist_close(&ds->ds_deadlist);
2138 2296 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
2139 2297 dsl_deadlist_add_key(&ds->ds_deadlist,
2140 2298 ds->ds_phys->ds_prev_snap_txg, tx);
2141 2299
2142 2300 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
2143 2301 ds->ds_phys->ds_prev_snap_obj = dsobj;
2144 2302 ds->ds_phys->ds_prev_snap_txg = crtxg;
2145 2303 ds->ds_phys->ds_unique_bytes = 0;
2146 2304 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
2147 2305 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
2148 2306
2149 2307 err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
2150 2308 snapname, 8, 1, &dsobj, tx);
2151 2309 ASSERT(err == 0);
2152 2310
2153 2311 if (ds->ds_prev)
2154 2312 dsl_dataset_drop_ref(ds->ds_prev, ds);
2155 2313 VERIFY(0 == dsl_dataset_get_ref(dp,
2156 2314 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
2157 2315
2158 2316 dsl_scan_ds_snapshotted(ds, tx);
2159 2317
2160 2318 dsl_dir_snap_cmtime_update(ds->ds_dir);
2161 2319
2162 2320 spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
2163 2321 }
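
Note that the count is bumped at the very top of the sync task, before the new dsl_dataset_phys_t is even allocated; dsl_snapcount_adjust() internally declines to touch the on-disk counts unless the limit feature is active, so the unconditional call here is safe on pools that have never used limits.
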
2164 2322
2165 2323 void
2166 2324 dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
2167 2325 {
2168 2326 ASSERT(dmu_tx_is_syncing(tx));
2169 2327 ASSERT(ds->ds_objset != NULL);
2170 2328 ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
2171 2329
2172 2330 /*
2173 2331 * in case we had to change ds_fsid_guid when we opened it,
2174 2332 * sync it out now.
2175 2333 */
2176 2334 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2177 2335 ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
2178 2336
2179 2337 dmu_objset_sync(ds->ds_objset, zio, tx);
2180 2338 }
2181 2339
2182 2340 static void
2183 2341 get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
2184 2342 {
2185 2343 uint64_t count = 0;
2186 2344 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
2187 2345 zap_cursor_t zc;
2188 2346 zap_attribute_t za;
2189 2347 nvlist_t *propval;
2190 2348 nvlist_t *val;
2191 2349
2192 2350 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2193 2351 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2194 2352 VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2195 2353
2196 2354 /*
2197 2355 * There may be missing entries in ds_next_clones_obj
2198 2356 * due to a bug in a previous version of the code.
2199 2357 * Only trust it if it has the right number of entries.
2200 2358 */
2201 2359 if (ds->ds_phys->ds_next_clones_obj != 0) {
2202 2360 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
2203 2361 &count));
2204 2362 }
2205 2363 if (count != ds->ds_phys->ds_num_children - 1) {
2206 2364 goto fail;
2207 2365 }
2208 2366 for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
2209 2367 zap_cursor_retrieve(&zc, &za) == 0;
2210 2368 zap_cursor_advance(&zc)) {
2211 2369 dsl_dataset_t *clone;
2212 2370 char buf[ZFS_MAXNAMELEN];
2213 2371 /*
2214 2372 * Even though we hold the dp_config_rwlock, the dataset
2215 2373 * may fail to open, returning ENOENT. If there is a
2216 2374 * thread concurrently attempting to destroy this
2217 2375 * dataset, it will have the ds_rwlock held for
2218 2376 * RW_WRITER. Our call to dsl_dataset_hold_obj() ->
2219 2377 * dsl_dataset_hold_ref() will fail its
2220 2378 * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the
2221 2379 * dp_config_rwlock, and wait for the destroy progress
2222 2380 * and signal ds_exclusive_cv. If the destroy was
2223 2381 * successful, we will see that
2224 2382 * DSL_DATASET_IS_DESTROYED(), and return ENOENT.
2225 2383 */
2226 2384 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
2227 2385 za.za_first_integer, FTAG, &clone) != 0)
2228 2386 continue;
2229 2387 dsl_dir_name(clone->ds_dir, buf);
2230 2388 VERIFY(nvlist_add_boolean(val, buf) == 0);
2231 2389 dsl_dataset_rele(clone, FTAG);
2232 2390 }
2233 2391 zap_cursor_fini(&zc);
2234 2392 VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0);
2235 2393 VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
2236 2394 propval) == 0);
2237 2395 fail:
2238 2396 nvlist_free(val);
2239 2397 nvlist_free(propval);
2240 2398 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2241 2399 }
2242 2400
2243 2401 void
2244 2402 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
2245 2403 {
2246 2404 uint64_t refd, avail, uobjs, aobjs, ratio;
2247 2405
2248 2406 ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
2249 2407 (ds->ds_phys->ds_uncompressed_bytes * 100 /
2250 2408 ds->ds_phys->ds_compressed_bytes);
2251 2409
2252 2410 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
2253 2411
2254 2412 if (dsl_dataset_is_snapshot(ds)) {
2255 2413 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
2256 2414 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
2257 2415 ds->ds_phys->ds_unique_bytes);
2258 2416 get_clones_stat(ds, nv);
2259 2417 } else {
2260 2418 dsl_dir_stats(ds->ds_dir, nv);
2261 2419 }
2262 2420
2263 2421 dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
2264 2422 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
2265 2423 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
2266 2424
2267 2425 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
2268 2426 ds->ds_phys->ds_creation_time);
2269 2427 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
2270 2428 ds->ds_phys->ds_creation_txg);
2271 2429 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
2272 2430 ds->ds_quota);
2273 2431 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
2274 2432 ds->ds_reserved);
2275 2433 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
2276 2434 ds->ds_phys->ds_guid);
2277 2435 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
2278 2436 ds->ds_phys->ds_unique_bytes);
2279 2437 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
2280 2438 ds->ds_object);
2281 2439 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
2282 2440 ds->ds_userrefs);
2283 2441 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
2284 2442 DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
2285 2443
2286 2444 if (ds->ds_phys->ds_prev_snap_obj != 0) {
2287 2445 uint64_t written, comp, uncomp;
2288 2446 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2289 2447 dsl_dataset_t *prev;
2290 2448
2291 2449 rw_enter(&dp->dp_config_rwlock, RW_READER);
2292 2450 int err = dsl_dataset_hold_obj(dp,
2293 2451 ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
2294 2452 rw_exit(&dp->dp_config_rwlock);
2295 2453 if (err == 0) {
2296 2454 err = dsl_dataset_space_written(prev, ds, &written,
2297 2455 &comp, &uncomp);
2298 2456 dsl_dataset_rele(prev, FTAG);
2299 2457 if (err == 0) {
2300 2458 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
2301 2459 written);
2302 2460 }
2303 2461 }
2304 2462 }
2305 2463 }
2306 2464
2307 2465 void
2308 2466 dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
2309 2467 {
2310 2468 stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
2311 2469 stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
2312 2470 stat->dds_guid = ds->ds_phys->ds_guid;
2313 2471 stat->dds_origin[0] = '\0';
2314 2472 if (dsl_dataset_is_snapshot(ds)) {
2315 2473 stat->dds_is_snapshot = B_TRUE;
2316 2474 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
2317 2475 } else {
2318 2476 stat->dds_is_snapshot = B_FALSE;
2319 2477 stat->dds_num_clones = 0;
2320 2478
2321 2479 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
2322 2480 if (dsl_dir_is_clone(ds->ds_dir)) {
2323 2481 dsl_dataset_t *ods;
2324 2482
2325 2483 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
2326 2484 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
2327 2485 dsl_dataset_name(ods, stat->dds_origin);
2328 2486 dsl_dataset_drop_ref(ods, FTAG);
2329 2487 }
2330 2488 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2331 2489 }
2332 2490 }
2333 2491
2334 2492 uint64_t
2335 2493 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
2336 2494 {
2337 2495 return (ds->ds_fsid_guid);
2338 2496 }
2339 2497
2340 2498 void
2341 2499 dsl_dataset_space(dsl_dataset_t *ds,
2342 2500 uint64_t *refdbytesp, uint64_t *availbytesp,
2343 2501 uint64_t *usedobjsp, uint64_t *availobjsp)
2344 2502 {
2345 2503 *refdbytesp = ds->ds_phys->ds_referenced_bytes;
2346 2504 *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
2347 2505 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
2348 2506 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
2349 2507 if (ds->ds_quota != 0) {
2350 2508 /*
2351 2509 * Adjust available bytes according to refquota
2352 2510 */
2353 2511 if (*refdbytesp < ds->ds_quota)
2354 2512 *availbytesp = MIN(*availbytesp,
2355 2513 ds->ds_quota - *refdbytesp);
2356 2514 else
2357 2515 *availbytesp = 0;
2358 2516 }
2359 2517 *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
2360 2518 *availobjsp = DN_MAX_OBJECT - *usedobjsp;
2361 2519 }
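
The refquota clamp is easiest to see with numbers. A toy user-space computation (made-up sizes, local MIN macro) matching the branch above:

	#include <stdint.h>
	#include <stdio.h>

	#define	MIN(a, b)	((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		/* Toy sizes: 6 GiB referenced, 10 GiB refquota, 50 GiB free. */
		uint64_t refd = 6ULL << 30;
		uint64_t quota = 10ULL << 30;
		uint64_t avail = 50ULL << 30;

		if (quota != 0)
			avail = (refd < quota) ? MIN(avail, quota - refd) : 0;

		/* Prints 4: the dataset may only grow up to its refquota. */
		(void) printf("%llu GiB\n", (unsigned long long)(avail >> 30));
		return (0);
	}
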
2362 2520
2363 2521 boolean_t
2364 2522 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
2365 2523 {
2366 2524 dsl_pool_t *dp = ds->ds_dir->dd_pool;
2367 2525
2368 2526 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
2369 2527 dsl_pool_sync_context(dp));
2370 2528 if (ds->ds_prev == NULL)
2371 2529 return (B_FALSE);
2372 2530 if (ds->ds_phys->ds_bp.blk_birth >
2373 2531 ds->ds_prev->ds_phys->ds_creation_txg) {
2374 2532 objset_t *os, *os_prev;
2375 2533 /*
2376 2534 * It may be that only the ZIL differs, because it was
2377 2535 * reset in the head. Don't count that as being
2378 2536 * modified.
2379 2537 */
2380 2538 if (dmu_objset_from_ds(ds, &os) != 0)
2381 2539 return (B_TRUE);
2382 2540 if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
2383 2541 return (B_TRUE);
2384 2542 return (bcmp(&os->os_phys->os_meta_dnode,
2385 2543 &os_prev->os_phys->os_meta_dnode,
2386 2544 sizeof (os->os_phys->os_meta_dnode)) != 0);
2387 2545 }
2388 2546 return (B_FALSE);
2389 2547 }
2390 2548
2391 2549 /* ARGSUSED */
2392 2550 static int
2393 2551 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
2394 2552 {
2395 2553 dsl_dataset_t *ds = arg1;
2396 2554 char *newsnapname = arg2;
2397 2555 dsl_dir_t *dd = ds->ds_dir;
2398 2556 dsl_dataset_t *hds;
2399 2557 uint64_t val;
2400 2558 int err;
2401 2559
2402 2560 err = dsl_dataset_hold_obj(dd->dd_pool,
2403 2561 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
2404 2562 if (err)
2405 2563 return (err);
2406 2564
2407 2565 /* new name better not be in use */
2408 2566 err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
2409 2567 dsl_dataset_rele(hds, FTAG);
2410 2568
2411 2569 if (err == 0)
2412 2570 err = EEXIST;
2413 2571 else if (err == ENOENT)
2414 2572 err = 0;
2415 2573
2416 2574 /* dataset name + 1 for the "@" + the new snapshot name must fit */
2417 2575 if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
2418 2576 err = ENAMETOOLONG;
2419 2577
2420 2578 return (err);
2421 2579 }
2422 2580
2423 2581 static void
2424 2582 dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
2425 2583 {
2426 2584 dsl_dataset_t *ds = arg1;
2427 2585 const char *newsnapname = arg2;
2428 2586 dsl_dir_t *dd = ds->ds_dir;
2429 2587 objset_t *mos = dd->dd_pool->dp_meta_objset;
2430 2588 dsl_dataset_t *hds;
2431 2589 int err;
2432 2590
2433 2591 ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
2434 2592
2435 2593 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
2436 2594 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));
2437 2595
2438 2596 VERIFY(0 == dsl_dataset_get_snapname(ds));
2439 2597 err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
2440 2598 ASSERT0(err);
2441 2599 mutex_enter(&ds->ds_lock);
2442 2600 (void) strcpy(ds->ds_snapname, newsnapname);
2443 2601 mutex_exit(&ds->ds_lock);
2444 2602 err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
2445 2603 ds->ds_snapname, 8, 1, &ds->ds_object, tx);
2446 2604 ASSERT0(err);
2447 2605
2448 2606 spa_history_log_internal_ds(ds, "rename", tx,
2449 2607 "-> @%s", newsnapname);
2450 2608 dsl_dataset_rele(hds, FTAG);
2451 2609 }
2452 2610
2453 2611 struct renamesnaparg {
2454 2612 dsl_sync_task_group_t *dstg;
2455 2613 char failed[MAXPATHLEN];
2456 2614 char *oldsnap;
2457 2615 char *newsnap;
2458 2616 };
2459 2617
2460 2618 static int
2461 2619 dsl_snapshot_rename_one(const char *name, void *arg)
2462 2620 {
2463 2621 struct renamesnaparg *ra = arg;
2464 2622 dsl_dataset_t *ds = NULL;
2465 2623 char *snapname;
2466 2624 int err;
2467 2625
2468 2626 snapname = kmem_asprintf("%s@%s", name, ra->oldsnap);
2469 2627 (void) strlcpy(ra->failed, snapname, sizeof (ra->failed));
2470 2628
2471 2629 /*
2472 2630 * For recursive snapshot renames the parent won't be changing
2473 2631 * so we just pass name for both the to/from argument.
2474 2632 */
2475 2633 err = zfs_secpolicy_rename_perms(snapname, snapname, CRED());
2476 2634 if (err != 0) {
2477 2635 strfree(snapname);
2478 2636 return (err == ENOENT ? 0 : err);
2479 2637 }
2480 2638
2481 2639 #ifdef _KERNEL
2482 2640 /*
2483 2641 * For each filesystem undergoing rename, we'll need to unmount it.
2484 2642 */
2485 2643 (void) zfs_unmount_snap(snapname, NULL);
2486 2644 #endif
2487 2645 err = dsl_dataset_hold(snapname, ra->dstg, &ds);
2488 2646 strfree(snapname);
2489 2647 if (err != 0)
2490 2648 return (err == ENOENT ? 0 : err);
2491 2649
2492 2650 dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
2493 2651 dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
2494 2652
2495 2653 return (0);
2496 2654 }
2497 2655
2498 2656 static int
2499 2657 dsl_recursive_rename(char *oldname, const char *newname)
2500 2658 {
2501 2659 int err;
2502 2660 struct renamesnaparg *ra;
2503 2661 dsl_sync_task_t *dst;
2504 2662 spa_t *spa;
2505 2663 char *cp, *fsname = spa_strdup(oldname);
2506 2664 int len = strlen(oldname) + 1;
2507 2665
2508 2666 /* truncate the snapshot name to get the fsname */
2509 2667 cp = strchr(fsname, '@');
2510 2668 *cp = '\0';
2511 2669
2512 2670 err = spa_open(fsname, &spa, FTAG);
2513 2671 if (err) {
2514 2672 kmem_free(fsname, len);
2515 2673 return (err);
2516 2674 }
2517 2675 ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
2518 2676 ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
2519 2677
2520 2678 ra->oldsnap = strchr(oldname, '@') + 1;
2521 2679 ra->newsnap = strchr(newname, '@') + 1;
2522 2680 *ra->failed = '\0';
2523 2681
2524 2682 err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
2525 2683 DS_FIND_CHILDREN);
2526 2684 kmem_free(fsname, len);
2527 2685
2528 2686 if (err == 0) {
2529 2687 err = dsl_sync_task_group_wait(ra->dstg);
2530 2688 }
2531 2689
2532 2690 for (dst = list_head(&ra->dstg->dstg_tasks); dst;
2533 2691 dst = list_next(&ra->dstg->dstg_tasks, dst)) {
2534 2692 dsl_dataset_t *ds = dst->dst_arg1;
2535 2693 if (dst->dst_err) {
2536 2694 dsl_dir_name(ds->ds_dir, ra->failed);
2537 2695 (void) strlcat(ra->failed, "@", sizeof (ra->failed));
2538 2696 (void) strlcat(ra->failed, ra->newsnap,
2539 2697 sizeof (ra->failed));
2540 2698 }
2541 2699 dsl_dataset_rele(ds, ra->dstg);
2542 2700 }
2543 2701
2544 2702 if (err)
2545 2703 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed));
2546 2704
2547 2705 dsl_sync_task_group_destroy(ra->dstg);
2548 2706 kmem_free(ra, sizeof (struct renamesnaparg));
2549 2707 spa_close(spa, FTAG);
2550 2708 return (err);
2551 2709 }
2552 2710
2553 2711 static int
2554 2712 dsl_valid_rename(const char *oldname, void *arg)
2555 2713 {
2556 2714 int delta = *(int *)arg;
2557 2715
2558 2716 if (strlen(oldname) + delta >= MAXNAMELEN)
2559 2717 return (ENAMETOOLONG);
2560 2718
2561 2719 return (0);
2562 2720 }
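
For example, renaming tank/a to tank/alpha gives delta = +4, and every descendant (including snapshots) must still fit after growing by that amount. A toy version of the per-child test, where toy_valid_rename and TOY_MAXNAMELEN are stand-ins for the callback and kernel constant above:

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	#define	TOY_MAXNAMELEN	256

	static int
	toy_valid_rename(const char *oldname, int delta)
	{
		if (strlen(oldname) + delta >= TOY_MAXNAMELEN)
			return (ENAMETOOLONG);
		return (0);
	}

	int
	main(void)
	{
		int delta = (int)strlen("tank/alpha") - (int)strlen("tank/a");

		/* Prints 0: this child still fits after the rename. */
		(void) printf("%d\n",
		    toy_valid_rename("tank/a/child@snap", delta));
		return (0);
	}
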
2563 2721
2564 2722 #pragma weak dmu_objset_rename = dsl_dataset_rename
2565 2723 int
2566 2724 dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive)
2567 2725 {
2568 2726 dsl_dir_t *dd;
2569 2727 dsl_dataset_t *ds;
2570 2728 const char *tail;
2571 2729 int err;
2572 2730
2573 2731 err = dsl_dir_open(oldname, FTAG, &dd, &tail);
2574 2732 if (err)
2575 2733 return (err);
2576 2734
2577 2735 if (tail == NULL) {
2578 2736 int delta = strlen(newname) - strlen(oldname);
2579 2737
2580 2738 /* if we're growing, validate child name lengths */
2581 2739 if (delta > 0)
2582 2740 err = dmu_objset_find(oldname, dsl_valid_rename,
2583 2741 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
2584 2742
2585 2743 if (err == 0)
2586 2744 err = dsl_dir_rename(dd, newname);
2587 2745 dsl_dir_close(dd, FTAG);
2588 2746 return (err);
2589 2747 }
2590 2748
2591 2749 if (tail[0] != '@') {
2592 2750 /* the name ended in a nonexistent component */
2593 2751 dsl_dir_close(dd, FTAG);
2594 2752 return (ENOENT);
2595 2753 }
2596 2754
2597 2755 dsl_dir_close(dd, FTAG);
2598 2756
2599 2757 /* new name must be snapshot in same filesystem */
2600 2758 tail = strchr(newname, '@');
2601 2759 if (tail == NULL)
2602 2760 return (EINVAL);
2603 2761 tail++;
2604 2762 if (strncmp(oldname, newname, tail - newname) != 0)
2605 2763 return (EXDEV);
2606 2764
2607 2765 if (recursive) {
2608 2766 err = dsl_recursive_rename(oldname, newname);
2609 2767 } else {
2610 2768 err = dsl_dataset_hold(oldname, FTAG, &ds);
2611 2769 if (err)
2612 2770 return (err);
2613 2771
2614 2772 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
2615 2773 dsl_dataset_snapshot_rename_check,
2616 2774 dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
2617 2775
2618 2776 dsl_dataset_rele(ds, FTAG);
2619 2777 }
2620 2778
2621 2779 return (err);
2622 2780 }
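
The "same filesystem" restriction amounts to a prefix comparison up to and including the @-sign. A small sketch of the two tail checks (toy_snap_rename_check is an illustrative helper, not the kernel entry point):

	#include <errno.h>
	#include <string.h>

	static int
	toy_snap_rename_check(const char *oldname, const char *newname)
	{
		const char *tail = strchr(newname, '@');

		if (tail == NULL)
			return (EINVAL);	/* new name not a snapshot */
		tail++;
		if (strncmp(oldname, newname, tail - newname) != 0)
			return (EXDEV);		/* crosses filesystems */
		return (0);
	}

	int
	main(void)
	{
		/* tank/a@s1 -> tank/b@s2 crosses filesystems: EXDEV. */
		return (toy_snap_rename_check("tank/a@s1", "tank/b@s2") ==
		    EXDEV ? 0 : 1);
	}
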
2623 2781
2624 2782 struct promotenode {
2625 2783 list_node_t link;
2626 2784 dsl_dataset_t *ds;
2627 2785 };
2628 2786
2629 2787 struct promotearg {
2630 2788 list_t shared_snaps, origin_snaps, clone_snaps;
2631 2789 dsl_dataset_t *origin_origin;
2632 2790 uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
2633 2791 char *err_ds;
2634 2792 };
2635 2793
2636 2794 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
2637 2795 static boolean_t snaplist_unstable(list_t *l);
2638 2796
2639 2797 static int
2640 2798 dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
2641 2799 {
2642 2800 dsl_dataset_t *hds = arg1;
2643 2801 struct promotearg *pa = arg2;
2644 2802 struct promotenode *snap = list_head(&pa->shared_snaps);
2645 2803 dsl_dataset_t *origin_ds = snap->ds;
2646 2804 int err;
2647 2805 uint64_t unused;
2648 2806
2649 2807 /* Check that it is a real clone */
2650 2808 if (!dsl_dir_is_clone(hds->ds_dir))
2651 2809 return (EINVAL);
2652 2810
2653 2811 /* Since this is so expensive, don't do the preliminary check */
2654 2812 if (!dmu_tx_is_syncing(tx))
2655 2813 return (0);
2656 2814
2657 2815 if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
2658 2816 return (EXDEV);
2659 2817
2660 2818 /* compute origin's new unique space */
2661 2819 snap = list_tail(&pa->clone_snaps);
2662 2820 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
2663 2821 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2664 2822 origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
2665 2823 &pa->unique, &unused, &unused);
2666 2824
2667 2825 /*
2668 2826 * Walk the snapshots that we are moving
2669 2827 *
2670 2828 * Compute space to transfer. Consider the incremental changes
2671 2829 * to used for each snapshot:
2672 2830 * (my used) = (prev's used) + (blocks born) - (blocks killed)
2673 2831 * So each snapshot gave birth to:
2674 2832 * (blocks born) = (my used) - (prev's used) + (blocks killed)
2675 2833 * So a sequence would look like:
2676 2834 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
2677 2835 * Which simplifies to:
2678 2836 * uN + kN + kN-1 + ... + k1 + k0
2679 2837 * Note however, if we stop before we reach the ORIGIN we get:
2680 2838 * uN + kN + kN-1 + ... + kM - uM-1
2681 2839 */
2682 2840 pa->used = origin_ds->ds_phys->ds_referenced_bytes;
2683 2841 pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
2684 2842 pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
2685 2843 for (snap = list_head(&pa->shared_snaps); snap;
2686 2844 snap = list_next(&pa->shared_snaps, snap)) {
2687 2845 uint64_t val, dlused, dlcomp, dluncomp;
2688 2846 dsl_dataset_t *ds = snap->ds;
2689 2847
2690 2848 /* Check that the snapshot name does not conflict */
2691 2849 VERIFY(0 == dsl_dataset_get_snapname(ds));
2692 2850 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
2693 2851 if (err == 0) {
2694 2852 err = EEXIST;
2695 2853 goto out;
2696 2854 }
2697 2855 if (err != ENOENT)
2698 2856 goto out;
2699 2857
2700 2858 /* The very first snapshot does not have a deadlist */
2701 2859 if (ds->ds_phys->ds_prev_snap_obj == 0)
2702 2860 continue;
2703 2861
2704 2862 dsl_deadlist_space(&ds->ds_deadlist,
2705 2863 &dlused, &dlcomp, &dluncomp);
2706 2864 pa->used += dlused;
2707 2865 pa->comp += dlcomp;
2708 2866 pa->uncomp += dluncomp;
2709 2867 }
2710 2868
2711 2869 /*
2712 2870 * If we are a clone of a clone then we never reached ORIGIN,
2713 2871 * so we need to subtract out the clone origin's used space.
2714 2872 */
2715 2873 if (pa->origin_origin) {
2716 2874 pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
2717 2875 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
2718 2876 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
2719 2877 }
2720 2878
2721 - /* Check that there is enough space here */
2879 + /* Check that there is enough space and limit headroom here */
2722 2880 err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
2723 - pa->used);
2881 + origin_ds->ds_dir, pa->used, tx);
2724 2882 if (err)
2725 2883 return (err);
2726 2884
2727 2885 /*
2728 2886 * Compute the amounts of space that will be used by snapshots
2729 2887 * after the promotion (for both origin and clone). For each,
2730 2888 * it is the amount of space that will be on all of their
2731 2889 * deadlists (that was not born before their new origin).
2732 2890 */
2733 2891 if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2734 2892 uint64_t space;
2735 2893
2736 2894 /*
2737 2895 * Note, typically this will not be a clone of a clone,
2738 2896 * so dd_origin_txg will be < TXG_INITIAL, so
2739 2897 * these snaplist_space() -> dsl_deadlist_space_range()
2740 2898 * calls will be fast because they do not have to
2741 2899 * iterate over all bps.
2742 2900 */
2743 2901 snap = list_head(&pa->origin_snaps);
2744 2902 err = snaplist_space(&pa->shared_snaps,
2745 2903 snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap);
2746 2904 if (err)
2747 2905 return (err);
2748 2906
2749 2907 err = snaplist_space(&pa->clone_snaps,
2750 2908 snap->ds->ds_dir->dd_origin_txg, &space);
2751 2909 if (err)
2752 2910 return (err);
2753 2911 pa->cloneusedsnap += space;
2754 2912 }
2755 2913 if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2756 2914 err = snaplist_space(&pa->origin_snaps,
2757 2915 origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
2758 2916 if (err)
2759 2917 return (err);
2760 2918 }
2761 2919
2762 2920 return (0);
2763 2921 out:
2764 2922 pa->err_ds = snap->ds->ds_snapname;
2765 2923 return (err);
2766 2924 }
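
The telescoping simplification in dsl_dataset_promote_check()'s comment is easy to sanity-check with toy numbers: sum (ui - u(i-1) + ki) term by term and compare against uN plus the killed totals.

	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		/* Toy per-snapshot used (u) and killed (k) byte counts. */
		uint64_t u[] = { 10, 14, 12 };
		uint64_t k[] = { 2, 3, 5 };
		int n = 3;
		int64_t born = 0;

		/* (blocks born) = (my used) - (prev's used) + (blocks killed) */
		for (int i = 0; i < n; i++)
			born += (int64_t)u[i] + (int64_t)k[i] -
			    (int64_t)(i > 0 ? u[i - 1] : 0);

		/* uN + kN + ... + k1 + k0 = 12 + 5 + 3 + 2 = 22 */
		assert(born == (int64_t)(u[n - 1] + k[0] + k[1] + k[2]));
		return (0);
	}
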
2767 2925
2768 2926 static void
2769 2927 dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
2770 2928 {
2771 2929 dsl_dataset_t *hds = arg1;
2772 2930 struct promotearg *pa = arg2;
2773 2931 struct promotenode *snap = list_head(&pa->shared_snaps);
2774 2932 dsl_dataset_t *origin_ds = snap->ds;
2775 2933 dsl_dataset_t *origin_head;
2776 2934 dsl_dir_t *dd = hds->ds_dir;
2777 2935 dsl_pool_t *dp = hds->ds_dir->dd_pool;
2778 2936 dsl_dir_t *odd = NULL;
2779 2937 uint64_t oldnext_obj;
2780 2938 int64_t delta;
2781 2939
2782 2940 ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
2783 2941
2784 2942 snap = list_head(&pa->origin_snaps);
2785 2943 origin_head = snap->ds;
2786 2944
2787 2945 /*
2788 2946 * We need to explicitly open odd, since origin_ds's dd will be
2789 2947 * changing.
2790 2948 */
2791 2949 VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
2792 2950 NULL, FTAG, &odd));
2793 2951
2794 2952 /* change origin's next snap */
2795 2953 dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
2796 2954 oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
2797 2955 snap = list_tail(&pa->clone_snaps);
2798 2956 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
2799 2957 origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;
2800 2958
2801 2959 /* change the origin's next clone */
2802 2960 if (origin_ds->ds_phys->ds_next_clones_obj) {
2803 2961 remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
2804 2962 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
2805 2963 origin_ds->ds_phys->ds_next_clones_obj,
2806 2964 oldnext_obj, tx));
2807 2965 }
2808 2966
2809 2967 /* change origin */
2810 2968 dmu_buf_will_dirty(dd->dd_dbuf, tx);
2811 2969 ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
2812 2970 dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
2813 2971 dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
2814 2972 dmu_buf_will_dirty(odd->dd_dbuf, tx);
2815 2973 odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
2816 2974 origin_head->ds_dir->dd_origin_txg =
2817 2975 origin_ds->ds_phys->ds_creation_txg;
2818 2976
2819 2977 /* change dd_clone entries */
2820 2978 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2821 2979 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
2822 2980 odd->dd_phys->dd_clones, hds->ds_object, tx));
2823 2981 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
2824 2982 pa->origin_origin->ds_dir->dd_phys->dd_clones,
2825 2983 hds->ds_object, tx));
2826 2984
2827 2985 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
2828 2986 pa->origin_origin->ds_dir->dd_phys->dd_clones,
2829 2987 origin_head->ds_object, tx));
2830 2988 if (dd->dd_phys->dd_clones == 0) {
2831 2989 dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
2832 2990 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
2833 2991 }
2834 2992 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
2835 2993 dd->dd_phys->dd_clones, origin_head->ds_object, tx));
2836 2994
2837 2995 }
2838 2996
2839 2997 /* move snapshots to this dir */
2840 2998 for (snap = list_head(&pa->shared_snaps); snap;
2841 2999 snap = list_next(&pa->shared_snaps, snap)) {
2842 3000 dsl_dataset_t *ds = snap->ds;
2843 3001
2844 3002 /* unregister props as dsl_dir is changing */
2845 3003 if (ds->ds_objset) {
2846 3004 dmu_objset_evict(ds->ds_objset);
2847 3005 ds->ds_objset = NULL;
2848 3006 }
2849 3007 /* move snap name entry */
2850 3008 VERIFY(0 == dsl_dataset_get_snapname(ds));
2851 3009 VERIFY(0 == dsl_dataset_snap_remove(origin_head,
2852 3010 ds->ds_snapname, tx));
2853 3011 VERIFY(0 == zap_add(dp->dp_meta_objset,
2854 3012 hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
2855 3013 8, 1, &ds->ds_object, tx));
3014 + dsl_snapcount_adjust(hds->ds_dir, tx, 1, B_TRUE);
2856 3015
2857 3016 /* change containing dsl_dir */
2858 3017 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2859 3018 ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
2860 3019 ds->ds_phys->ds_dir_obj = dd->dd_object;
2861 3020 ASSERT3P(ds->ds_dir, ==, odd);
2862 3021 dsl_dir_close(ds->ds_dir, ds);
2863 3022 VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
2864 3023 NULL, ds, &ds->ds_dir));
2865 3024
2866 3025 /* move any clone references */
2867 3026 if (ds->ds_phys->ds_next_clones_obj &&
2868 3027 spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2869 3028 zap_cursor_t zc;
2870 3029 zap_attribute_t za;
2871 3030
2872 3031 for (zap_cursor_init(&zc, dp->dp_meta_objset,
2873 3032 ds->ds_phys->ds_next_clones_obj);
2874 3033 zap_cursor_retrieve(&zc, &za) == 0;
2875 3034 zap_cursor_advance(&zc)) {
2876 3035 dsl_dataset_t *cnds;
2877 3036 uint64_t o;
2878 3037
2879 3038 if (za.za_first_integer == oldnext_obj) {
2880 3039 /*
2881 3040 * We've already moved the
2882 3041 * origin's reference.
2883 3042 */
2884 3043 continue;
2885 3044 }
2886 3045
2887 3046 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
2888 3047 za.za_first_integer, FTAG, &cnds));
2889 3048 o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
2890 3049
2891 3050 VERIFY3U(zap_remove_int(dp->dp_meta_objset,
2892 3051 odd->dd_phys->dd_clones, o, tx), ==, 0);
2893 3052 VERIFY3U(zap_add_int(dp->dp_meta_objset,
2894 3053 dd->dd_phys->dd_clones, o, tx), ==, 0);
2895 3054 dsl_dataset_rele(cnds, FTAG);
2896 3055 }
2897 3056 zap_cursor_fini(&zc);
2898 3057 }
2899 3058
2900 3059 ASSERT0(dsl_prop_numcb(ds));
2901 3060 }
2902 3061
2903 3062 /*
2904 3063 * Change space accounting.
2905 3064 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
2906 3065 * both be valid, or both be 0 (resulting in delta == 0). This
2907 3066 * is true for each of {clone,origin} independently.
2908 3067 */
2909 3068
2910 3069 delta = pa->cloneusedsnap -
2911 3070 dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
2912 3071 ASSERT3S(delta, >=, 0);
2913 3072 ASSERT3U(pa->used, >=, delta);
2914 3073 dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
2915 3074 dsl_dir_diduse_space(dd, DD_USED_HEAD,
2916 3075 pa->used - delta, pa->comp, pa->uncomp, tx);
2917 3076
2918 3077 delta = pa->originusedsnap -
2919 3078 odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
2920 3079 ASSERT3S(delta, <=, 0);
2921 3080 ASSERT3U(pa->used, >=, -delta);
2922 3081 dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
2923 3082 dsl_dir_diduse_space(odd, DD_USED_HEAD,
2924 3083 -pa->used - delta, -pa->comp, -pa->uncomp, tx);
2925 3084
2926 3085 origin_ds->ds_phys->ds_unique_bytes = pa->unique;
2927 3086
2928 3087 /* log history record */
2929 3088 spa_history_log_internal_ds(hds, "promote", tx, "");
2930 3089
2931 3090 dsl_dir_close(odd, FTAG);
2932 3091 }
2933 3092
2934 3093 static char *snaplist_tag = "snaplist";
2935 3094 /*
2936 3095 * Make a list of dsl_dataset_t's for the snapshots between first_obj
2937 3096 * (exclusive) and last_obj (inclusive). The list will be in reverse
2938 3097 * order (last_obj will be the list_head()). If first_obj == 0, do all
2939 3098 * snapshots back to this dataset's origin.
2940 3099 */
2941 3100 static int
2942 3101 snaplist_make(dsl_pool_t *dp, boolean_t own,
2943 3102 uint64_t first_obj, uint64_t last_obj, list_t *l)
2944 3103 {
2945 3104 uint64_t obj = last_obj;
2946 3105
2947 3106 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
2948 3107
2949 3108 list_create(l, sizeof (struct promotenode),
2950 3109 offsetof(struct promotenode, link));
2951 3110
2952 3111 while (obj != first_obj) {
2953 3112 dsl_dataset_t *ds;
2954 3113 struct promotenode *snap;
2955 3114 int err;
2956 3115
2957 3116 if (own) {
2958 3117 err = dsl_dataset_own_obj(dp, obj,
2959 3118 0, snaplist_tag, &ds);
2960 3119 if (err == 0)
2961 3120 dsl_dataset_make_exclusive(ds, snaplist_tag);
2962 3121 } else {
2963 3122 err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds);
2964 3123 }
2965 3124 if (err == ENOENT) {
2966 3125 /* lost race with snapshot destroy */
2967 3126 struct promotenode *last = list_tail(l);
2968 3127 ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj);
2969 3128 obj = last->ds->ds_phys->ds_prev_snap_obj;
2970 3129 continue;
2971 3130 } else if (err) {
2972 3131 return (err);
2973 3132 }
2974 3133
2975 3134 if (first_obj == 0)
2976 3135 first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
2977 3136
2978 3137 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
2979 3138 snap->ds = ds;
2980 3139 list_insert_tail(l, snap);
2981 3140 obj = ds->ds_phys->ds_prev_snap_obj;
2982 3141 }
2983 3142
2984 3143 return (0);
2985 3144 }
2986 3145
2987 3146 static int
2988 3147 snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2989 3148 {
2990 3149 struct promotenode *snap;
2991 3150
2992 3151 *spacep = 0;
2993 3152 for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2994 3153 uint64_t used, comp, uncomp;
2995 3154 dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2996 3155 mintxg, UINT64_MAX, &used, &comp, &uncomp);
2997 3156 *spacep += used;
2998 3157 }
2999 3158 return (0);
3000 3159 }
3001 3160
3002 3161 static void
3003 3162 snaplist_destroy(list_t *l, boolean_t own)
3004 3163 {
3005 3164 struct promotenode *snap;
3006 3165
3007 3166 if (!l || !list_link_active(&l->list_head))
3008 3167 return;
3009 3168
3010 3169 while ((snap = list_tail(l)) != NULL) {
3011 3170 list_remove(l, snap);
3012 3171 if (own)
3013 3172 dsl_dataset_disown(snap->ds, snaplist_tag);
3014 3173 else
3015 3174 dsl_dataset_rele(snap->ds, snaplist_tag);
3016 3175 kmem_free(snap, sizeof (struct promotenode));
3017 3176 }
3018 3177 list_destroy(l);
3019 3178 }
3020 3179
3021 3180 /*
3022 3181 * Promote a clone. Nomenclature note:
3023 3182 * "clone" or "cds": the original clone which is being promoted
3024 3183 * "origin" or "ods": the snapshot which is originally clone's origin
3025 3184 * "origin head" or "ohds": the dataset which is the head
3026 3185 * (filesystem/volume) for the origin
3027 3186 * "origin origin": the origin of the origin's filesystem (typically
3028 3187 * NULL, indicating that the clone is not a clone of a clone).
3029 3188 */
3030 3189 int
3031 3190 dsl_dataset_promote(const char *name, char *conflsnap)
3032 3191 {
3033 3192 dsl_dataset_t *ds;
3034 3193 dsl_dir_t *dd;
3035 3194 dsl_pool_t *dp;
3036 3195 dmu_object_info_t doi;
3037 3196 struct promotearg pa = { 0 };
3038 3197 struct promotenode *snap;
3039 3198 int err;
3040 3199
3041 3200 err = dsl_dataset_hold(name, FTAG, &ds);
3042 3201 if (err)
3043 3202 return (err);
3044 3203 dd = ds->ds_dir;
3045 3204 dp = dd->dd_pool;
3046 3205
3047 3206 err = dmu_object_info(dp->dp_meta_objset,
3048 3207 ds->ds_phys->ds_snapnames_zapobj, &doi);
3049 3208 if (err) {
3050 3209 dsl_dataset_rele(ds, FTAG);
3051 3210 return (err);
3052 3211 }
3053 3212
3054 3213 if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
3055 3214 dsl_dataset_rele(ds, FTAG);
3056 3215 return (EINVAL);
3057 3216 }
3058 3217
3059 3218 /*
3060 3219 * We are going to inherit all the snapshots taken before our
3061 3220 * origin (i.e., our new origin will be our parent's origin).
3062 3221 * Take ownership of them so that we can rename them into our
3063 3222 * namespace.
3064 3223 */
3065 3224 rw_enter(&dp->dp_config_rwlock, RW_READER);
3066 3225
3067 3226 err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
3068 3227 &pa.shared_snaps);
3069 3228 if (err != 0)
3070 3229 goto out;
3071 3230
3072 3231 err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
3073 3232 if (err != 0)
3074 3233 goto out;
3075 3234
3076 3235 snap = list_head(&pa.shared_snaps);
3077 3236 ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
3078 3237 err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
3079 3238 snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
3080 3239 if (err != 0)
3081 3240 goto out;
3082 3241
3083 3242 if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
3084 3243 err = dsl_dataset_hold_obj(dp,
3085 3244 snap->ds->ds_dir->dd_phys->dd_origin_obj,
3086 3245 FTAG, &pa.origin_origin);
3087 3246 if (err != 0)
3088 3247 goto out;
3089 3248 }
3090 3249
3091 3250 out:
3092 3251 rw_exit(&dp->dp_config_rwlock);
3093 3252
3094 3253 /*
3095 3254 * Add in 128x the snapnames zapobj size, since we will be moving
3096 3255 * a bunch of snapnames to the promoted ds, and dirtying their
3097 3256 * bonus buffers.
3098 3257 */
3099 3258 if (err == 0) {
3100 3259 err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
3101 3260 dsl_dataset_promote_sync, ds, &pa,
3102 3261 2 + 2 * doi.doi_physical_blocks_512);
3103 3262 if (err && pa.err_ds && conflsnap)
3104 3263 (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
3105 3264 }
3106 3265
3107 3266 snaplist_destroy(&pa.shared_snaps, B_TRUE);
3108 3267 snaplist_destroy(&pa.clone_snaps, B_FALSE);
3109 3268 snaplist_destroy(&pa.origin_snaps, B_FALSE);
3110 3269 if (pa.origin_origin)
3111 3270 dsl_dataset_rele(pa.origin_origin, FTAG);
3112 3271 dsl_dataset_rele(ds, FTAG);
3113 3272 return (err);
3114 3273 }
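
To make the promote nomenclature concrete, a toy sketch of the re-parenting the sync task performs, with invented dataset names and the origin relationship reduced to a single pointer: the promoted clone adopts the origin head's origin (none here), and the origin head becomes a clone of the snapshot that moved.

        #include <stdio.h>

        struct ds {
                const char *name;
                const char *origin;     /* snapshot this dataset was cloned from */
        };

        int
        main(void)
        {
                struct ds ohds = { "pool/fs", NULL };           /* origin head */
                struct ds cds = { "pool/cl", "pool/fs@b" };     /* clone of ods */

                /*
                 * Promotion re-parents: the clone takes over the origin
                 * head's history (and its origin, none here), while the
                 * old head becomes a clone of the snapshot that moved.
                 */
                cds.origin = NULL;
                ohds.origin = "pool/cl@b";

                printf("%s origin: %s\n", cds.name,
                    cds.origin ? cds.origin : "-");
                printf("%s origin: %s\n", ohds.name, ohds.origin);
                return (0);
        }
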
3115 3274
3116 3275 struct cloneswaparg {
3117 3276 dsl_dataset_t *cds; /* clone dataset */
3118 3277 dsl_dataset_t *ohds; /* origin's head dataset */
3119 3278 boolean_t force;
3120 3279 int64_t unused_refres_delta; /* change in unconsumed refreservation */
3121 3280 };
3122 3281
3123 3282 /* ARGSUSED */
3124 3283 static int
3125 3284 dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
3126 3285 {
3127 3286 struct cloneswaparg *csa = arg1;
3128 3287
3129 3288 /* they should both be heads */
3130 3289 if (dsl_dataset_is_snapshot(csa->cds) ||
3131 3290 dsl_dataset_is_snapshot(csa->ohds))
3132 3291 return (EINVAL);
3133 3292
3134 3293 /* the branch point should be just before them */
3135 3294 if (csa->cds->ds_prev != csa->ohds->ds_prev)
3136 3295 return (EINVAL);
3137 3296
3138 3297 /* cds should be the clone (unless they are unrelated) */
3139 3298 if (csa->cds->ds_prev != NULL &&
3140 3299 csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
3141 3300 csa->ohds->ds_object !=
3142 3301 csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
3143 3302 return (EINVAL);
3144 3303
3145 3304 /* the clone should be a child of the origin */
3146 3305 if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
3147 3306 return (EINVAL);
3148 3307
3149 3308 /* ohds shouldn't be modified unless 'force' */
3150 3309 if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
3151 3310 return (ETXTBSY);
3152 3311
3153 3312 /* adjust amount of any unconsumed refreservation */
3154 3313 csa->unused_refres_delta =
3155 3314 (int64_t)MIN(csa->ohds->ds_reserved,
3156 3315 csa->ohds->ds_phys->ds_unique_bytes) -
3157 3316 (int64_t)MIN(csa->ohds->ds_reserved,
3158 3317 csa->cds->ds_phys->ds_unique_bytes);
3159 3318
3160 3319 if (csa->unused_refres_delta > 0 &&
3161 3320 csa->unused_refres_delta >
3162 3321 dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
3163 3322 return (ENOSPC);
3164 3323
3165 3324 if (csa->ohds->ds_quota != 0 &&
3166 3325 csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
3167 3326 return (EDQUOT);
3168 3327
3169 3328 return (0);
3170 3329 }
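
The unused_refres_delta computation above can be checked by hand. A minimal sketch with invented byte counts; a negative delta means the head will leave less of its refreservation unconsumed after adopting the clone's unique bytes, and only a positive delta is subject to the ENOSPC test.

        #include <stdio.h>
        #include <stdint.h>
        #include <inttypes.h>

        #define MIN(a, b)       ((a) < (b) ? (a) : (b))

        int
        main(void)
        {
                uint64_t reserved = 100;        /* ohds refreservation */
                uint64_t ohds_unique = 30;      /* head's unique bytes before */
                uint64_t cds_unique = 80;       /* unique bytes arriving */

                /*
                 * Unconsumed refreservation is reserved minus unique
                 * (floored at zero), so its change is the difference of
                 * the two MIN() terms, exactly as in the check function.
                 */
                int64_t delta = (int64_t)MIN(reserved, ohds_unique) -
                    (int64_t)MIN(reserved, cds_unique);

                /* 30 - 80 = -50: the swap consumes 50 more reserved bytes. */
                printf("unused_refres_delta = %" PRId64 "\n", delta);
                return (0);
        }
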
3171 3330
3172 3331 /* ARGSUSED */
3173 3332 static void
3174 3333 dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3175 3334 {
3176 3335 struct cloneswaparg *csa = arg1;
3177 3336 dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
3178 3337
3179 3338 ASSERT(csa->cds->ds_reserved == 0);
3180 3339 ASSERT(csa->ohds->ds_quota == 0 ||
3181 3340 csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);
3182 3341
3183 3342 dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
3184 3343 dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
3185 3344
3186 3345 if (csa->cds->ds_objset != NULL) {
3187 3346 dmu_objset_evict(csa->cds->ds_objset);
3188 3347 csa->cds->ds_objset = NULL;
3189 3348 }
3190 3349
3191 3350 if (csa->ohds->ds_objset != NULL) {
3192 3351 dmu_objset_evict(csa->ohds->ds_objset);
3193 3352 csa->ohds->ds_objset = NULL;
3194 3353 }
3195 3354
3196 3355 /*
3197 3356 * Reset origin's unique bytes, if it exists.
3198 3357 */
3199 3358 if (csa->cds->ds_prev) {
3200 3359 dsl_dataset_t *origin = csa->cds->ds_prev;
3201 3360 uint64_t comp, uncomp;
3202 3361
3203 3362 dmu_buf_will_dirty(origin->ds_dbuf, tx);
3204 3363 dsl_deadlist_space_range(&csa->cds->ds_deadlist,
3205 3364 origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
3206 3365 &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
3207 3366 }
3208 3367
3209 3368 /* swap blkptrs */
3210 3369 {
3211 3370 blkptr_t tmp;
3212 3371 tmp = csa->ohds->ds_phys->ds_bp;
3213 3372 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
3214 3373 csa->cds->ds_phys->ds_bp = tmp;
3215 3374 }
3216 3375
3217 3376 /* set dd_*_bytes */
3218 3377 {
3219 3378 int64_t dused, dcomp, duncomp;
3220 3379 uint64_t cdl_used, cdl_comp, cdl_uncomp;
3221 3380 uint64_t odl_used, odl_comp, odl_uncomp;
3222 3381
3223 3382 ASSERT3U(csa->cds->ds_dir->dd_phys->
3224 3383 dd_used_breakdown[DD_USED_SNAP], ==, 0);
3225 3384
3226 3385 dsl_deadlist_space(&csa->cds->ds_deadlist,
3227 3386 &cdl_used, &cdl_comp, &cdl_uncomp);
3228 3387 dsl_deadlist_space(&csa->ohds->ds_deadlist,
3229 3388 &odl_used, &odl_comp, &odl_uncomp);
3230 3389
3231 3390 dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used -
3232 3391 (csa->ohds->ds_phys->ds_referenced_bytes + odl_used);
3233 3392 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
3234 3393 (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
3235 3394 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
3236 3395 cdl_uncomp -
3237 3396 (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
3238 3397
3239 3398 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
3240 3399 dused, dcomp, duncomp, tx);
3241 3400 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
3242 3401 -dused, -dcomp, -duncomp, tx);
3243 3402
3244 3403 /*
3245 3404 * The difference in the space used by snapshots is the
3246 3405 * difference in snapshot space due to the head's
3247 3406 * deadlist (since that's the only thing that's
3248 3407 * changing that affects the snapused).
3249 3408 */
3250 3409 dsl_deadlist_space_range(&csa->cds->ds_deadlist,
3251 3410 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3252 3411 &cdl_used, &cdl_comp, &cdl_uncomp);
3253 3412 dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
3254 3413 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3255 3414 &odl_used, &odl_comp, &odl_uncomp);
3256 3415 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
3257 3416 DD_USED_HEAD, DD_USED_SNAP, tx);
3258 3417 }
3259 3418
3260 3419 /* swap ds_*_bytes */
3261 3420 SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes,
3262 3421 csa->cds->ds_phys->ds_referenced_bytes);
3263 3422 SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
3264 3423 csa->cds->ds_phys->ds_compressed_bytes);
3265 3424 SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
3266 3425 csa->cds->ds_phys->ds_uncompressed_bytes);
3267 3426 SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
3268 3427 csa->cds->ds_phys->ds_unique_bytes);
3269 3428
3270 3429 /* apply any parent delta for change in unconsumed refreservation */
3271 3430 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
3272 3431 csa->unused_refres_delta, 0, 0, tx);
3273 3432
3274 3433 /*
3275 3434 * Swap deadlists.
3276 3435 */
3277 3436 dsl_deadlist_close(&csa->cds->ds_deadlist);
3278 3437 dsl_deadlist_close(&csa->ohds->ds_deadlist);
3279 3438 SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
3280 3439 csa->cds->ds_phys->ds_deadlist_obj);
3281 3440 dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
3282 3441 csa->cds->ds_phys->ds_deadlist_obj);
3283 3442 dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
3284 3443 csa->ohds->ds_phys->ds_deadlist_obj);
3285 3444
3286 3445 dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
3287 3446
3288 3447 spa_history_log_internal_ds(csa->cds, "clone swap", tx,
3289 3448 "parent=%s", csa->ohds->ds_dir->dd_myname);
3290 3449 }
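
A worked example of the DD_USED_HEAD transfer in the sync function above: the origin head is charged the difference in referenced-plus-deadlist space and the clone is relieved of the same amount. All numbers are invented.

        #include <stdio.h>
        #include <stdint.h>
        #include <inttypes.h>

        int
        main(void)
        {
                uint64_t cds_ref = 500, cdl_used = 40;  /* clone: ref, deadlist */
                uint64_t ohds_ref = 450, odl_used = 10; /* head: ref, deadlist */

                /*
                 * The swap moves the clone's tree under the head, so the
                 * head's DD_USED_HEAD grows by the difference in
                 * referenced-plus-deadlist space and the clone's shrinks
                 * by the same amount.
                 */
                int64_t dused = (int64_t)(cds_ref + cdl_used) -
                    (int64_t)(ohds_ref + odl_used);

                printf("head %+" PRId64 " bytes, clone %+" PRId64 " bytes\n",
                    dused, -dused);
                return (0);
        }
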
3291 3450
3292 3451 /*
3293 3452  * Swap 'clone' with its origin head dataset. Used at the end of "zfs
3294 3453 * recv" into an existing fs to swizzle the file system to the new
3295 3454 * version, and by "zfs rollback". Can also be used to swap two
3296 3455 * independent head datasets if neither has any snapshots.
3297 3456 */
3298 3457 int
3299 3458 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
3300 3459 boolean_t force)
3301 3460 {
3302 3461 struct cloneswaparg csa;
3303 3462 int error;
3304 3463
3305 3464 ASSERT(clone->ds_owner);
3306 3465 ASSERT(origin_head->ds_owner);
3307 3466 retry:
3308 3467 /*
3309 3468 * Need exclusive access for the swap. If we're swapping these
3310 3469 * datasets back after an error, we already hold the locks.
3311 3470 */
3312 3471 if (!RW_WRITE_HELD(&clone->ds_rwlock))
3313 3472 rw_enter(&clone->ds_rwlock, RW_WRITER);
3314 3473 if (!RW_WRITE_HELD(&origin_head->ds_rwlock) &&
3315 3474 !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
3316 3475 rw_exit(&clone->ds_rwlock);
3317 3476 rw_enter(&origin_head->ds_rwlock, RW_WRITER);
3318 3477 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
3319 3478 rw_exit(&origin_head->ds_rwlock);
3320 3479 goto retry;
3321 3480 }
3322 3481 }
3323 3482 csa.cds = clone;
3324 3483 csa.ohds = origin_head;
3325 3484 csa.force = force;
3326 3485 error = dsl_sync_task_do(clone->ds_dir->dd_pool,
3327 3486 dsl_dataset_clone_swap_check,
3328 3487 dsl_dataset_clone_swap_sync, &csa, NULL, 9);
3329 3488 return (error);
3330 3489 }
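
The retry loop above takes two write locks without a global ordering, so it blocks on one, tries the other, and backs off on failure. A minimal userland sketch of the same dance, using POSIX rwlocks in place of krwlock_t.

        #include <pthread.h>
        #include <stdio.h>

        static pthread_rwlock_t clone_lock = PTHREAD_RWLOCK_INITIALIZER;
        static pthread_rwlock_t head_lock = PTHREAD_RWLOCK_INITIALIZER;

        /* Acquire both write locks without risking deadlock. */
        static void
        lock_both(void)
        {
                for (;;) {
                        pthread_rwlock_wrlock(&clone_lock);
                        if (pthread_rwlock_trywrlock(&head_lock) == 0)
                                return;         /* got both */
                        pthread_rwlock_unlock(&clone_lock);
                        /* Block on the contended lock, then try the other. */
                        pthread_rwlock_wrlock(&head_lock);
                        if (pthread_rwlock_trywrlock(&clone_lock) == 0)
                                return;
                        pthread_rwlock_unlock(&head_lock);
                }
        }

        int
        main(void)
        {
                lock_both();
                printf("both locks held\n");
                pthread_rwlock_unlock(&head_lock);
                pthread_rwlock_unlock(&clone_lock);
                return (0);
        }
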
3331 3490
3332 3491 /*
3333 3492 * Given a pool name and a dataset object number in that pool,
3334 3493 * return the name of that dataset.
3335 3494 */
3336 3495 int
3337 3496 dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
3338 3497 {
3339 3498 spa_t *spa;
3340 3499 dsl_pool_t *dp;
3341 3500 dsl_dataset_t *ds;
3342 3501 int error;
3343 3502
3344 3503 if ((error = spa_open(pname, &spa, FTAG)) != 0)
3345 3504 return (error);
3346 3505 dp = spa_get_dsl(spa);
3347 3506 rw_enter(&dp->dp_config_rwlock, RW_READER);
3348 3507 if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
3349 3508 dsl_dataset_name(ds, buf);
3350 3509 dsl_dataset_rele(ds, FTAG);
3351 3510 }
3352 3511 rw_exit(&dp->dp_config_rwlock);
3353 3512 spa_close(spa, FTAG);
3354 3513
3355 3514 return (error);
3356 3515 }
3357 3516
3358 3517 int
3359 3518 dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
3360 3519 uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
3361 3520 {
3362 3521 int error = 0;
3363 3522
3364 3523 ASSERT3S(asize, >, 0);
3365 3524
3366 3525 /*
3367 3526 * *ref_rsrv is the portion of asize that will come from any
3368 3527 * unconsumed refreservation space.
3369 3528 */
3370 3529 *ref_rsrv = 0;
3371 3530
3372 3531 mutex_enter(&ds->ds_lock);
3373 3532 /*
3374 3533 * Make a space adjustment for reserved bytes.
3375 3534 */
3376 3535 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
3377 3536 ASSERT3U(*used, >=,
3378 3537 ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3379 3538 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3380 3539 *ref_rsrv =
3381 3540 asize - MIN(asize, parent_delta(ds, asize + inflight));
3382 3541 }
3383 3542
3384 3543 if (!check_quota || ds->ds_quota == 0) {
3385 3544 mutex_exit(&ds->ds_lock);
3386 3545 return (0);
3387 3546 }
3388 3547 /*
3389 3548 * If they are requesting more space, and our current estimate
3390 3549 * is over quota, they get to try again unless the actual
3391 3550 * on-disk is over quota and there are no pending changes (which
3392 3551 * may free up space for us).
3393 3552 */
3394 3553 if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
3395 3554 if (inflight > 0 ||
3396 3555 ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
3397 3556 error = ERESTART;
3398 3557 else
3399 3558 error = EDQUOT;
3400 3559 }
3401 3560 mutex_exit(&ds->ds_lock);
3402 3561
3403 3562 return (error);
3404 3563 }
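
The ERESTART-versus-EDQUOT decision above, distilled into a standalone sketch. quota_check() and its values are invented; the real function also adjusts *used and *ref_rsrv for reserved bytes before this test.

        #include <errno.h>
        #include <stdint.h>
        #include <stdio.h>

        /*
         * Over-quota estimates get ERESTART while pending changes might
         * still free space; EDQUOT only when on-disk usage is already
         * over quota with nothing in flight.
         */
        static int
        quota_check(uint64_t referenced, uint64_t inflight, uint64_t quota)
        {
                if (quota == 0 || referenced + inflight < quota)
                        return (0);
                if (inflight > 0 || referenced < quota)
                        return (ERESTART);
                return (EDQUOT);
        }

        int
        main(void)
        {
                printf("%d\n", quota_check(90, 20, 100));   /* ERESTART */
                printf("%d\n", quota_check(120, 0, 100));   /* EDQUOT */
                printf("%d\n", quota_check(50, 10, 100));   /* 0: fits */
                return (0);
        }
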
3405 3564
3406 3565 /* ARGSUSED */
3407 3566 static int
3408 3567 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
3409 3568 {
3410 3569 dsl_dataset_t *ds = arg1;
3411 3570 dsl_prop_setarg_t *psa = arg2;
3412 3571 int err;
3413 3572
3414 3573 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
3415 3574 return (ENOTSUP);
3416 3575
3417 3576 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3418 3577 return (err);
3419 3578
3420 3579 if (psa->psa_effective_value == 0)
3421 3580 return (0);
3422 3581
3423 3582 if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
3424 3583 psa->psa_effective_value < ds->ds_reserved)
3425 3584 return (ENOSPC);
3426 3585
3427 3586 return (0);
3428 3587 }
3429 3588
3430 3589 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
3431 3590
3432 3591 void
3433 3592 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3434 3593 {
3435 3594 dsl_dataset_t *ds = arg1;
3436 3595 dsl_prop_setarg_t *psa = arg2;
3437 3596 uint64_t effective_value = psa->psa_effective_value;
3438 3597
3439 3598 dsl_prop_set_sync(ds, psa, tx);
3440 3599 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3441 3600
3442 3601 if (ds->ds_quota != effective_value) {
3443 3602 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3444 3603 ds->ds_quota = effective_value;
3445 3604 }
3446 3605 }
3447 3606
3448 3607 int
3449 3608 dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
3450 3609 {
3451 3610 dsl_dataset_t *ds;
3452 3611 dsl_prop_setarg_t psa;
3453 3612 int err;
3454 3613
3455 3614 	dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
3456 3615
3457 3616 err = dsl_dataset_hold(dsname, FTAG, &ds);
3458 3617 if (err)
3459 3618 return (err);
3460 3619
3461 3620 /*
3462 3621 * If someone removes a file, then tries to set the quota, we
3463 3622 * want to make sure the file freeing takes effect.
3464 3623 */
3465 3624 txg_wait_open(ds->ds_dir->dd_pool, 0);
3466 3625
3467 3626 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3468 3627 dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
3469 3628 ds, &psa, 0);
3470 3629
3471 3630 dsl_dataset_rele(ds, FTAG);
3472 3631 return (err);
3473 3632 }
3474 3633
3475 3634 static int
3476 3635 dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
3477 3636 {
3478 3637 dsl_dataset_t *ds = arg1;
3479 3638 dsl_prop_setarg_t *psa = arg2;
3480 3639 uint64_t effective_value;
3481 3640 uint64_t unique;
3482 3641 int err;
3483 3642
3484 3643 if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
3485 3644 SPA_VERSION_REFRESERVATION)
3486 3645 return (ENOTSUP);
3487 3646
3488 3647 if (dsl_dataset_is_snapshot(ds))
3489 3648 return (EINVAL);
3490 3649
3491 3650 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3492 3651 return (err);
3493 3652
3494 3653 effective_value = psa->psa_effective_value;
3495 3654
3496 3655 /*
3497 3656 * If we are doing the preliminary check in open context, the
3498 3657 * space estimates may be inaccurate.
3499 3658 */
3500 3659 if (!dmu_tx_is_syncing(tx))
3501 3660 return (0);
3502 3661
3503 3662 mutex_enter(&ds->ds_lock);
3504 3663 if (!DS_UNIQUE_IS_ACCURATE(ds))
3505 3664 dsl_dataset_recalc_head_uniq(ds);
3506 3665 unique = ds->ds_phys->ds_unique_bytes;
3507 3666 mutex_exit(&ds->ds_lock);
3508 3667
3509 3668 if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
3510 3669 uint64_t delta = MAX(unique, effective_value) -
3511 3670 MAX(unique, ds->ds_reserved);
3512 3671
3513 3672 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
3514 3673 return (ENOSPC);
3515 3674 if (ds->ds_quota > 0 &&
3516 3675 effective_value > ds->ds_quota)
3517 3676 return (ENOSPC);
3518 3677 }
3519 3678
3520 3679 return (0);
3521 3680 }
3522 3681
3523 3682 static void
3524 3683 dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3525 3684 {
3526 3685 dsl_dataset_t *ds = arg1;
3527 3686 dsl_prop_setarg_t *psa = arg2;
3528 3687 uint64_t effective_value = psa->psa_effective_value;
3529 3688 uint64_t unique;
3530 3689 int64_t delta;
3531 3690
3532 3691 dsl_prop_set_sync(ds, psa, tx);
3533 3692 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3534 3693
3535 3694 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3536 3695
3537 3696 mutex_enter(&ds->ds_dir->dd_lock);
3538 3697 mutex_enter(&ds->ds_lock);
3539 3698 ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
3540 3699 unique = ds->ds_phys->ds_unique_bytes;
3541 3700 delta = MAX(0, (int64_t)(effective_value - unique)) -
3542 3701 MAX(0, (int64_t)(ds->ds_reserved - unique));
3543 3702 ds->ds_reserved = effective_value;
3544 3703 mutex_exit(&ds->ds_lock);
3545 3704
3546 3705 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
3547 3706 mutex_exit(&ds->ds_dir->dd_lock);
3548 3707 }
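
Only the part of a refreservation that exceeds the dataset's unique bytes is charged as DD_USED_REFRSRV space, so the sync function computes the delta between the old and new excess. A small worked example with invented numbers.

        #include <stdio.h>
        #include <stdint.h>
        #include <inttypes.h>

        #define MAX(a, b)       ((a) > (b) ? (a) : (b))

        int
        main(void)
        {
                uint64_t unique = 40;           /* ds_unique_bytes */
                uint64_t old_resv = 100;        /* current refreservation */
                uint64_t new_resv = 150;        /* effective value being set */

                /* Only the excess over unique bytes is newly charged. */
                int64_t delta = MAX(0, (int64_t)(new_resv - unique)) -
                    MAX(0, (int64_t)(old_resv - unique));

                /* (150 - 40) - (100 - 40) = 50 more DD_USED_REFRSRV bytes. */
                printf("delta = %" PRId64 "\n", delta);
                return (0);
        }
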
3549 3708
3550 3709 int
3551 3710 dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
3552 3711 uint64_t reservation)
3553 3712 {
3554 3713 dsl_dataset_t *ds;
3555 3714 dsl_prop_setarg_t psa;
3556 3715 int err;
3557 3716
3558 3717 dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
3559 3718 &reservation);
3560 3719
3561 3720 err = dsl_dataset_hold(dsname, FTAG, &ds);
3562 3721 if (err)
3563 3722 return (err);
3564 3723
3565 3724 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
3566 3725 dsl_dataset_set_reservation_check,
3567 3726 dsl_dataset_set_reservation_sync, ds, &psa, 0);
3568 3727
3569 3728 dsl_dataset_rele(ds, FTAG);
3570 3729 return (err);
3571 3730 }
3572 3731
3573 3732 typedef struct zfs_hold_cleanup_arg {
3574 3733 dsl_pool_t *dp;
3575 3734 uint64_t dsobj;
3576 3735 char htag[MAXNAMELEN];
3577 3736 } zfs_hold_cleanup_arg_t;
3578 3737
3579 3738 static void
3580 3739 dsl_dataset_user_release_onexit(void *arg)
3581 3740 {
3582 3741 zfs_hold_cleanup_arg_t *ca = arg;
3583 3742
3584 3743 (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag,
3585 3744 B_TRUE);
3586 3745 kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
3587 3746 }
3588 3747
3589 3748 void
3590 3749 dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
3591 3750 minor_t minor)
3592 3751 {
3593 3752 zfs_hold_cleanup_arg_t *ca;
3594 3753
3595 3754 ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP);
3596 3755 ca->dp = ds->ds_dir->dd_pool;
3597 3756 ca->dsobj = ds->ds_object;
3598 3757 (void) strlcpy(ca->htag, htag, sizeof (ca->htag));
3599 3758 VERIFY3U(0, ==, zfs_onexit_add_cb(minor,
3600 3759 dsl_dataset_user_release_onexit, ca, NULL));
3601 3760 }
3602 3761
3603 3762 /*
3604 3763 * If you add new checks here, you may need to add
3605 3764 * additional checks to the "temporary" case in
3606 3765 * snapshot_check() in dmu_objset.c.
3607 3766 */
3608 3767 static int
3609 3768 dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
3610 3769 {
3611 3770 dsl_dataset_t *ds = arg1;
3612 3771 struct dsl_ds_holdarg *ha = arg2;
3613 3772 const char *htag = ha->htag;
3614 3773 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3615 3774 int error = 0;
3616 3775
3617 3776 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3618 3777 return (ENOTSUP);
3619 3778
3620 3779 if (!dsl_dataset_is_snapshot(ds))
3621 3780 return (EINVAL);
3622 3781
3623 3782 /* tags must be unique */
3624 3783 mutex_enter(&ds->ds_lock);
3625 3784 if (ds->ds_phys->ds_userrefs_obj) {
3626 3785 error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
3627 3786 8, 1, tx);
3628 3787 if (error == 0)
3629 3788 error = EEXIST;
3630 3789 else if (error == ENOENT)
3631 3790 error = 0;
3632 3791 }
3633 3792 mutex_exit(&ds->ds_lock);
3634 3793
3635 3794 if (error == 0 && ha->temphold &&
3636 3795 strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
3637 3796 error = E2BIG;
3638 3797
3639 3798 return (error);
3640 3799 }
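
The uniqueness test above inverts the lookup result: finding the tag is the failure (EEXIST), and ENOENT is success. A minimal sketch with a hypothetical tag_lookup() standing in for zap_lookup().

        #include <errno.h>
        #include <stdio.h>
        #include <string.h>

        /* Mock lookup: 0 if the tag exists, ENOENT otherwise. */
        static int
        tag_lookup(const char *tag)
        {
                return (strcmp(tag, "backup") == 0 ? 0 : ENOENT);
        }

        /* Same inversion as the check above: tags must be unique. */
        static int
        hold_check(const char *tag)
        {
                int error = tag_lookup(tag);

                if (error == 0)
                        error = EEXIST;
                else if (error == ENOENT)
                        error = 0;
                return (error);
        }

        int
        main(void)
        {
                printf("hold 'backup': %d\n", hold_check("backup")); /* EEXIST */
                printf("hold 'fresh':  %d\n", hold_check("fresh"));  /* 0 */
                return (0);
        }
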
3641 3800
3642 3801 void
3643 3802 dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3644 3803 {
3645 3804 dsl_dataset_t *ds = arg1;
3646 3805 struct dsl_ds_holdarg *ha = arg2;
3647 3806 const char *htag = ha->htag;
3648 3807 dsl_pool_t *dp = ds->ds_dir->dd_pool;
3649 3808 objset_t *mos = dp->dp_meta_objset;
3650 3809 uint64_t now = gethrestime_sec();
3651 3810 uint64_t zapobj;
3652 3811
3653 3812 mutex_enter(&ds->ds_lock);
3654 3813 if (ds->ds_phys->ds_userrefs_obj == 0) {
3655 3814 /*
3656 3815 * This is the first user hold for this dataset. Create
3657 3816 * the userrefs zap object.
3658 3817 */
3659 3818 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3660 3819 zapobj = ds->ds_phys->ds_userrefs_obj =
3661 3820 zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
3662 3821 } else {
3663 3822 zapobj = ds->ds_phys->ds_userrefs_obj;
3664 3823 }
3665 3824 ds->ds_userrefs++;
3666 3825 mutex_exit(&ds->ds_lock);
3667 3826
3668 3827 VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
3669 3828
3670 3829 if (ha->temphold) {
3671 3830 VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
3672 3831 htag, &now, tx));
3673 3832 }
3674 3833
3675 3834 spa_history_log_internal_ds(ds, "hold", tx,
3676 3835 "tag = %s temp = %d holds now = %llu",
3677 3836 htag, (int)ha->temphold, ds->ds_userrefs);
3678 3837 }
3679 3838
3680 3839 static int
3681 3840 dsl_dataset_user_hold_one(const char *dsname, void *arg)
3682 3841 {
3683 3842 struct dsl_ds_holdarg *ha = arg;
3684 3843 dsl_dataset_t *ds;
3685 3844 int error;
3686 3845 char *name;
3687 3846
3688 3847 /* alloc a buffer to hold dsname@snapname plus terminating NULL */
3689 3848 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3690 3849 error = dsl_dataset_hold(name, ha->dstg, &ds);
3691 3850 strfree(name);
3692 3851 if (error == 0) {
3693 3852 ha->gotone = B_TRUE;
3694 3853 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
3695 3854 dsl_dataset_user_hold_sync, ds, ha, 0);
3696 3855 } else if (error == ENOENT && ha->recursive) {
3697 3856 error = 0;
3698 3857 } else {
3699 3858 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3700 3859 }
3701 3860 return (error);
3702 3861 }
3703 3862
3704 3863 int
3705 3864 dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag,
3706 3865 boolean_t temphold)
3707 3866 {
3708 3867 struct dsl_ds_holdarg *ha;
3709 3868 int error;
3710 3869
3711 3870 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3712 3871 ha->htag = htag;
3713 3872 ha->temphold = temphold;
3714 3873 error = dsl_sync_task_do(ds->ds_dir->dd_pool,
3715 3874 dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync,
3716 3875 ds, ha, 0);
3717 3876 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3718 3877
3719 3878 return (error);
3720 3879 }
3721 3880
3722 3881 int
3723 3882 dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
3724 3883 boolean_t recursive, boolean_t temphold, int cleanup_fd)
3725 3884 {
3726 3885 struct dsl_ds_holdarg *ha;
3727 3886 dsl_sync_task_t *dst;
3728 3887 spa_t *spa;
3729 3888 int error;
3730 3889 minor_t minor = 0;
3731 3890
3732 3891 if (cleanup_fd != -1) {
3733 3892 /* Currently we only support cleanup-on-exit of tempholds. */
3734 3893 if (!temphold)
3735 3894 return (EINVAL);
3736 3895 error = zfs_onexit_fd_hold(cleanup_fd, &minor);
3737 3896 if (error)
3738 3897 return (error);
3739 3898 }
3740 3899
3741 3900 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3742 3901
3743 3902 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3744 3903
3745 3904 error = spa_open(dsname, &spa, FTAG);
3746 3905 if (error) {
3747 3906 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3748 3907 if (cleanup_fd != -1)
3749 3908 zfs_onexit_fd_rele(cleanup_fd);
3750 3909 return (error);
3751 3910 }
3752 3911
3753 3912 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
3754 3913 ha->htag = htag;
3755 3914 ha->snapname = snapname;
3756 3915 ha->recursive = recursive;
3757 3916 ha->temphold = temphold;
3758 3917
3759 3918 if (recursive) {
3760 3919 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
3761 3920 ha, DS_FIND_CHILDREN);
3762 3921 } else {
3763 3922 error = dsl_dataset_user_hold_one(dsname, ha);
3764 3923 }
3765 3924 if (error == 0)
3766 3925 error = dsl_sync_task_group_wait(ha->dstg);
3767 3926
3768 3927 for (dst = list_head(&ha->dstg->dstg_tasks); dst;
3769 3928 dst = list_next(&ha->dstg->dstg_tasks, dst)) {
3770 3929 dsl_dataset_t *ds = dst->dst_arg1;
3771 3930
3772 3931 if (dst->dst_err) {
3773 3932 dsl_dataset_name(ds, ha->failed);
3774 3933 *strchr(ha->failed, '@') = '\0';
3775 3934 } else if (error == 0 && minor != 0 && temphold) {
3776 3935 /*
3777 3936 * If this hold is to be released upon process exit,
3778 3937 * register that action now.
3779 3938 */
3780 3939 dsl_register_onexit_hold_cleanup(ds, htag, minor);
3781 3940 }
3782 3941 dsl_dataset_rele(ds, ha->dstg);
3783 3942 }
3784 3943
3785 3944 if (error == 0 && recursive && !ha->gotone)
3786 3945 error = ENOENT;
3787 3946
3788 3947 if (error)
3789 3948 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
3790 3949
3791 3950 dsl_sync_task_group_destroy(ha->dstg);
3792 3951
3793 3952 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3794 3953 spa_close(spa, FTAG);
3795 3954 if (cleanup_fd != -1)
3796 3955 zfs_onexit_fd_rele(cleanup_fd);
3797 3956 return (error);
3798 3957 }
3799 3958
3800 3959 struct dsl_ds_releasearg {
3801 3960 dsl_dataset_t *ds;
3802 3961 const char *htag;
3803 3962 boolean_t own; /* do we own or just hold ds? */
3804 3963 };
3805 3964
3806 3965 static int
3807 3966 dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
3808 3967 boolean_t *might_destroy)
3809 3968 {
3810 3969 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
3811 3970 uint64_t zapobj;
3812 3971 uint64_t tmp;
3813 3972 int error;
3814 3973
3815 3974 *might_destroy = B_FALSE;
3816 3975
3817 3976 mutex_enter(&ds->ds_lock);
3818 3977 zapobj = ds->ds_phys->ds_userrefs_obj;
3819 3978 if (zapobj == 0) {
3820 3979 /* The tag can't possibly exist */
3821 3980 mutex_exit(&ds->ds_lock);
3822 3981 return (ESRCH);
3823 3982 }
3824 3983
3825 3984 /* Make sure the tag exists */
3826 3985 error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
3827 3986 if (error) {
3828 3987 mutex_exit(&ds->ds_lock);
3829 3988 if (error == ENOENT)
3830 3989 error = ESRCH;
3831 3990 return (error);
3832 3991 }
3833 3992
3834 3993 if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
3835 3994 DS_IS_DEFER_DESTROY(ds))
3836 3995 *might_destroy = B_TRUE;
3837 3996
3838 3997 mutex_exit(&ds->ds_lock);
3839 3998 return (0);
3840 3999 }
3841 4000
3842 4001 static int
3843 4002 dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
3844 4003 {
3845 4004 struct dsl_ds_releasearg *ra = arg1;
3846 4005 dsl_dataset_t *ds = ra->ds;
3847 4006 boolean_t might_destroy;
3848 4007 int error;
3849 4008
3850 4009 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
3851 4010 return (ENOTSUP);
3852 4011
3853 4012 error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
3854 4013 if (error)
3855 4014 return (error);
3856 4015
3857 4016 if (might_destroy) {
3858 4017 struct dsl_ds_destroyarg dsda = {0};
3859 4018
3860 4019 if (dmu_tx_is_syncing(tx)) {
3861 4020 /*
3862 4021 * If we're not prepared to remove the snapshot,
3863 4022 * we can't allow the release to happen right now.
3864 4023 */
3865 4024 if (!ra->own)
3866 4025 return (EBUSY);
3867 4026 }
3868 4027 dsda.ds = ds;
3869 4028 dsda.releasing = B_TRUE;
3870 4029 return (dsl_dataset_destroy_check(&dsda, tag, tx));
3871 4030 }
3872 4031
3873 4032 return (0);
3874 4033 }
3875 4034
3876 4035 static void
3877 4036 dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx)
3878 4037 {
3879 4038 struct dsl_ds_releasearg *ra = arg1;
3880 4039 dsl_dataset_t *ds = ra->ds;
3881 4040 dsl_pool_t *dp = ds->ds_dir->dd_pool;
3882 4041 objset_t *mos = dp->dp_meta_objset;
3883 4042 uint64_t zapobj;
3884 4043 uint64_t refs;
3885 4044 int error;
3886 4045
3887 4046 mutex_enter(&ds->ds_lock);
3888 4047 ds->ds_userrefs--;
3889 4048 refs = ds->ds_userrefs;
3890 4049 mutex_exit(&ds->ds_lock);
3891 4050 error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
3892 4051 VERIFY(error == 0 || error == ENOENT);
3893 4052 zapobj = ds->ds_phys->ds_userrefs_obj;
3894 4053 VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
3895 4054
3896 4055 spa_history_log_internal_ds(ds, "release", tx,
3897 4056 "tag = %s refs now = %lld", ra->htag, (longlong_t)refs);
3898 4057
3899 4058 if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
3900 4059 DS_IS_DEFER_DESTROY(ds)) {
3901 4060 struct dsl_ds_destroyarg dsda = {0};
3902 4061
3903 4062 ASSERT(ra->own);
3904 4063 dsda.ds = ds;
3905 4064 dsda.releasing = B_TRUE;
3906 4065 /* We already did the destroy_check */
3907 4066 dsl_dataset_destroy_sync(&dsda, tag, tx);
3908 4067 }
3909 4068 }
3910 4069
3911 4070 static int
3912 4071 dsl_dataset_user_release_one(const char *dsname, void *arg)
3913 4072 {
3914 4073 struct dsl_ds_holdarg *ha = arg;
3915 4074 struct dsl_ds_releasearg *ra;
3916 4075 dsl_dataset_t *ds;
3917 4076 int error;
3918 4077 void *dtag = ha->dstg;
3919 4078 char *name;
3920 4079 boolean_t own = B_FALSE;
3921 4080 boolean_t might_destroy;
3922 4081
3923 4082 /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
3924 4083 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3925 4084 error = dsl_dataset_hold(name, dtag, &ds);
3926 4085 strfree(name);
3927 4086 if (error == ENOENT && ha->recursive)
3928 4087 return (0);
3929 4088 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3930 4089 if (error)
3931 4090 return (error);
3932 4091
3933 4092 ha->gotone = B_TRUE;
3934 4093
3935 4094 ASSERT(dsl_dataset_is_snapshot(ds));
3936 4095
3937 4096 error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
3938 4097 if (error) {
3939 4098 dsl_dataset_rele(ds, dtag);
3940 4099 return (error);
3941 4100 }
3942 4101
3943 4102 if (might_destroy) {
3944 4103 #ifdef _KERNEL
3945 4104 name = kmem_asprintf("%s@%s", dsname, ha->snapname);
3946 4105 error = zfs_unmount_snap(name, NULL);
3947 4106 strfree(name);
3948 4107 if (error) {
3949 4108 dsl_dataset_rele(ds, dtag);
3950 4109 return (error);
3951 4110 }
3952 4111 #endif
3953 4112 if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
3954 4113 dsl_dataset_rele(ds, dtag);
3955 4114 return (EBUSY);
3956 4115 } else {
3957 4116 own = B_TRUE;
3958 4117 dsl_dataset_make_exclusive(ds, dtag);
3959 4118 }
3960 4119 }
3961 4120
3962 4121 ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
3963 4122 ra->ds = ds;
3964 4123 ra->htag = ha->htag;
3965 4124 ra->own = own;
3966 4125 dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
3967 4126 dsl_dataset_user_release_sync, ra, dtag, 0);
3968 4127
3969 4128 return (0);
3970 4129 }
3971 4130
3972 4131 int
3973 4132 dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
3974 4133 boolean_t recursive)
3975 4134 {
3976 4135 struct dsl_ds_holdarg *ha;
3977 4136 dsl_sync_task_t *dst;
3978 4137 spa_t *spa;
3979 4138 int error;
3980 4139
3981 4140 top:
3982 4141 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
3983 4142
3984 4143 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
3985 4144
3986 4145 error = spa_open(dsname, &spa, FTAG);
3987 4146 if (error) {
3988 4147 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
3989 4148 return (error);
3990 4149 }
3991 4150
3992 4151 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
3993 4152 ha->htag = htag;
3994 4153 ha->snapname = snapname;
3995 4154 ha->recursive = recursive;
3996 4155 if (recursive) {
3997 4156 error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
3998 4157 ha, DS_FIND_CHILDREN);
3999 4158 } else {
4000 4159 error = dsl_dataset_user_release_one(dsname, ha);
4001 4160 }
4002 4161 if (error == 0)
4003 4162 error = dsl_sync_task_group_wait(ha->dstg);
4004 4163
4005 4164 for (dst = list_head(&ha->dstg->dstg_tasks); dst;
4006 4165 dst = list_next(&ha->dstg->dstg_tasks, dst)) {
4007 4166 struct dsl_ds_releasearg *ra = dst->dst_arg1;
4008 4167 dsl_dataset_t *ds = ra->ds;
4009 4168
4010 4169 if (dst->dst_err)
4011 4170 dsl_dataset_name(ds, ha->failed);
4012 4171
4013 4172 if (ra->own)
4014 4173 dsl_dataset_disown(ds, ha->dstg);
4015 4174 else
4016 4175 dsl_dataset_rele(ds, ha->dstg);
4017 4176
4018 4177 kmem_free(ra, sizeof (struct dsl_ds_releasearg));
4019 4178 }
4020 4179
4021 4180 if (error == 0 && recursive && !ha->gotone)
4022 4181 error = ENOENT;
4023 4182
4024 4183 if (error && error != EBUSY)
4025 4184 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
4026 4185
4027 4186 dsl_sync_task_group_destroy(ha->dstg);
4028 4187 kmem_free(ha, sizeof (struct dsl_ds_holdarg));
4029 4188 spa_close(spa, FTAG);
4030 4189
4031 4190 /*
4032 4191 * We can get EBUSY if we were racing with deferred destroy and
4033 4192 * dsl_dataset_user_release_check() hadn't done the necessary
4034 4193 * open context setup. We can also get EBUSY if we're racing
4035 4194 * with destroy and that thread is the ds_owner. Either way
4036 4195 * the busy condition should be transient, and we should retry
4037 4196 * the release operation.
4038 4197 */
4039 4198 if (error == EBUSY)
4040 4199 goto top;
4041 4200
4042 4201 return (error);
4043 4202 }
4044 4203
4045 4204 /*
4046 4205 * Called at spa_load time (with retry == B_FALSE) to release a stale
4047 4206 * temporary user hold. Also called by the onexit code (with retry == B_TRUE).
4048 4207 */
4049 4208 int
4050 4209 dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag,
4051 4210 boolean_t retry)
4052 4211 {
4053 4212 dsl_dataset_t *ds;
4054 4213 char *snap;
4055 4214 char *name;
4056 4215 int namelen;
4057 4216 int error;
4058 4217
4059 4218 do {
4060 4219 rw_enter(&dp->dp_config_rwlock, RW_READER);
4061 4220 error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
4062 4221 rw_exit(&dp->dp_config_rwlock);
4063 4222 if (error)
4064 4223 return (error);
4065 4224 namelen = dsl_dataset_namelen(ds)+1;
4066 4225 name = kmem_alloc(namelen, KM_SLEEP);
4067 4226 dsl_dataset_name(ds, name);
4068 4227 dsl_dataset_rele(ds, FTAG);
4069 4228
4070 4229 snap = strchr(name, '@');
4071 4230 *snap = '\0';
4072 4231 ++snap;
4073 4232 error = dsl_dataset_user_release(name, snap, htag, B_FALSE);
4074 4233 kmem_free(name, namelen);
4075 4234
4076 4235 /*
4077 4236 * The object can't have been destroyed because we have a hold,
4078 4237 * but it might have been renamed, resulting in ENOENT. Retry
4079 4238 * if we've been requested to do so.
4080 4239 *
4081 4240 * It would be nice if we could use the dsobj all the way
4082 4241 * through and avoid ENOENT entirely. But we might need to
4083 4242 * unmount the snapshot, and there's currently no way to lookup
4084 4243 * a vfsp using a ZFS object id.
4085 4244 */
4086 4245 } while ((error == ENOENT) && retry);
4087 4246
4088 4247 return (error);
4089 4248 }
4090 4249
4091 4250 int
4092 4251 dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
4093 4252 {
4094 4253 dsl_dataset_t *ds;
4095 4254 int err;
4096 4255
4097 4256 err = dsl_dataset_hold(dsname, FTAG, &ds);
4098 4257 if (err)
4099 4258 return (err);
4100 4259
4101 4260 VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
4102 4261 if (ds->ds_phys->ds_userrefs_obj != 0) {
4103 4262 zap_attribute_t *za;
4104 4263 zap_cursor_t zc;
4105 4264
4106 4265 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
4107 4266 for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
4108 4267 ds->ds_phys->ds_userrefs_obj);
4109 4268 zap_cursor_retrieve(&zc, za) == 0;
4110 4269 zap_cursor_advance(&zc)) {
4111 4270 VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
4112 4271 za->za_first_integer));
4113 4272 }
4114 4273 zap_cursor_fini(&zc);
4115 4274 kmem_free(za, sizeof (zap_attribute_t));
4116 4275 }
4117 4276 dsl_dataset_rele(ds, FTAG);
4118 4277 return (0);
4119 4278 }
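
A userland sketch of consuming the nvlist this function fills: one uint64 hold timestamp per tag, iterated with libnvpair (the tag name and timestamp are invented; build with -lnvpair).

        #include <stdio.h>
        #include <libnvpair.h>

        int
        main(void)
        {
                nvlist_t *nvl;
                nvpair_t *pair;

                if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
                        return (1);
                /* Stand-in for what dsl_dataset_get_holds() would return. */
                (void) nvlist_add_uint64(nvl, "backup", 1325376000ULL);

                for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
                    pair = nvlist_next_nvpair(nvl, pair)) {
                        uint64_t when;

                        (void) nvpair_value_uint64(pair, &when);
                        printf("tag %s, held at %llu\n", nvpair_name(pair),
                            (unsigned long long)when);
                }
                nvlist_free(nvl);
                return (0);
        }
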
4120 4279
4121 4280 /*
4122 4281 * Note, this function is used as the callback for dmu_objset_find(). We
4123 4282 * always return 0 so that we will continue to find and process
4124 4283 * inconsistent datasets, even if we encounter an error trying to
4125 4284 * process one of them.
4126 4285 */
4127 4286 /* ARGSUSED */
4128 4287 int
4129 4288 dsl_destroy_inconsistent(const char *dsname, void *arg)
4130 4289 {
4131 4290 dsl_dataset_t *ds;
4132 4291
4133 4292 if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
4134 4293 if (DS_IS_INCONSISTENT(ds))
4135 4294 (void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
4136 4295 else
4137 4296 dsl_dataset_disown(ds, FTAG);
4138 4297 }
4139 4298 return (0);
4140 4299 }
4141 4300
4142 4301 /*
4143 4302 * Return (in *usedp) the amount of space written in new that is not
4144 4303 * present in oldsnap. New may be a snapshot or the head. Old must be
4145 4304 * a snapshot before new, in new's filesystem (or its origin). If not then
4146 4305 * fail and return EINVAL.
4147 4306 *
4148 4307 * The written space is calculated by considering two components: First, we
4149 4308  * ignore any freed space, and calculate the written space as new's used space
4150 4309 * minus old's used space. Next, we add in the amount of space that was freed
4151 4310 * between the two snapshots, thus reducing new's used space relative to old's.
4152 4311 * Specifically, this is the space that was born before old->ds_creation_txg,
4153 4312 * and freed before new (ie. on new's deadlist or a previous deadlist).
4154 4313 *
4155 4314 * space freed [---------------------]
4156 4315 * snapshots ---O-------O--------O-------O------
4157 4316 * oldsnap new
4158 4317 */
4159 4318 int
4160 4319 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
4161 4320 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4162 4321 {
4163 4322 int err = 0;
4164 4323 uint64_t snapobj;
4165 4324 dsl_pool_t *dp = new->ds_dir->dd_pool;
4166 4325
4167 4326 *usedp = 0;
4168 4327 *usedp += new->ds_phys->ds_referenced_bytes;
4169 4328 *usedp -= oldsnap->ds_phys->ds_referenced_bytes;
4170 4329
4171 4330 *compp = 0;
4172 4331 *compp += new->ds_phys->ds_compressed_bytes;
4173 4332 *compp -= oldsnap->ds_phys->ds_compressed_bytes;
4174 4333
4175 4334 *uncompp = 0;
4176 4335 *uncompp += new->ds_phys->ds_uncompressed_bytes;
4177 4336 *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
4178 4337
4179 4338 rw_enter(&dp->dp_config_rwlock, RW_READER);
4180 4339 snapobj = new->ds_object;
4181 4340 while (snapobj != oldsnap->ds_object) {
4182 4341 dsl_dataset_t *snap;
4183 4342 uint64_t used, comp, uncomp;
4184 4343
4185 4344 if (snapobj == new->ds_object) {
4186 4345 snap = new;
4187 4346 } else {
4188 4347 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
4189 4348 if (err != 0)
4190 4349 break;
4191 4350 }
4192 4351
4193 4352 if (snap->ds_phys->ds_prev_snap_txg ==
4194 4353 oldsnap->ds_phys->ds_creation_txg) {
4195 4354 /*
4196 4355 * The blocks in the deadlist can not be born after
4197 4356 * ds_prev_snap_txg, so get the whole deadlist space,
4198 4357 * which is more efficient (especially for old-format
4199 4358 * deadlists). Unfortunately the deadlist code
4200 4359 * doesn't have enough information to make this
4201 4360 * optimization itself.
4202 4361 */
4203 4362 dsl_deadlist_space(&snap->ds_deadlist,
4204 4363 &used, &comp, &uncomp);
4205 4364 } else {
4206 4365 dsl_deadlist_space_range(&snap->ds_deadlist,
4207 4366 0, oldsnap->ds_phys->ds_creation_txg,
4208 4367 &used, &comp, &uncomp);
4209 4368 }
4210 4369 *usedp += used;
4211 4370 *compp += comp;
4212 4371 *uncompp += uncomp;
4213 4372
4214 4373 /*
4215 4374 * If we get to the beginning of the chain of snapshots
4216 4375 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
4217 4376 * was not a snapshot of/before new.
4218 4377 */
4219 4378 snapobj = snap->ds_phys->ds_prev_snap_obj;
4220 4379 if (snap != new)
4221 4380 dsl_dataset_rele(snap, FTAG);
4222 4381 if (snapobj == 0) {
4223 4382 err = EINVAL;
4224 4383 break;
4225 4384 }
4226 4385
4227 4386 }
4228 4387 rw_exit(&dp->dp_config_rwlock);
4229 4388 return (err);
4230 4389 }
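
The two components described in the block comment, checked by hand with invented byte counts: the growth in referenced space plus the space that was born before oldsnap but freed before new.

        #include <stdio.h>
        #include <stdint.h>
        #include <inttypes.h>

        int
        main(void)
        {
                uint64_t new_referenced = 900;  /* new's referenced bytes */
                uint64_t old_referenced = 600;  /* oldsnap's referenced bytes */
                uint64_t freed_between = 200;   /* born before oldsnap, freed
                                                   before new (deadlist sums) */

                /* written = growth in referenced space + freed in between */
                uint64_t written =
                    new_referenced - old_referenced + freed_between;

                printf("written = %" PRIu64 " bytes\n", written); /* 500 */
                return (0);
        }
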
4231 4390
4232 4391 /*
4233 4392 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
4234 4393 * lastsnap, and all snapshots in between are deleted.
4235 4394 *
4236 4395 * blocks that would be freed [---------------------------]
4237 4396 * snapshots ---O-------O--------O-------O--------O
4238 4397 * firstsnap lastsnap
4239 4398 *
4240 4399 * This is the set of blocks that were born after the snap before firstsnap,
4241 4400 * (birth > firstsnap->prev_snap_txg) and died before the snap after the
4242 4401 * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
4243 4402 * We calculate this by iterating over the relevant deadlists (from the snap
4244 4403 * after lastsnap, backward to the snap after firstsnap), summing up the
4245 4404 * space on the deadlist that was born after the snap before firstsnap.
4246 4405 */
4247 4406 int
4248 4407 dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
4249 4408 dsl_dataset_t *lastsnap,
4250 4409 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4251 4410 {
4252 4411 int err = 0;
4253 4412 uint64_t snapobj;
4254 4413 dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
4255 4414
4256 4415 ASSERT(dsl_dataset_is_snapshot(firstsnap));
4257 4416 ASSERT(dsl_dataset_is_snapshot(lastsnap));
4258 4417
4259 4418 /*
4260 4419 * Check that the snapshots are in the same dsl_dir, and firstsnap
4261 4420 * is before lastsnap.
4262 4421 */
4263 4422 if (firstsnap->ds_dir != lastsnap->ds_dir ||
4264 4423 firstsnap->ds_phys->ds_creation_txg >
4265 4424 lastsnap->ds_phys->ds_creation_txg)
4266 4425 return (EINVAL);
4267 4426
4268 4427 *usedp = *compp = *uncompp = 0;
4269 4428
4270 4429 rw_enter(&dp->dp_config_rwlock, RW_READER);
4271 4430 snapobj = lastsnap->ds_phys->ds_next_snap_obj;
4272 4431 while (snapobj != firstsnap->ds_object) {
4273 4432 dsl_dataset_t *ds;
4274 4433 uint64_t used, comp, uncomp;
4275 4434
4276 4435 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
4277 4436 if (err != 0)
4278 4437 break;
4279 4438
4280 4439 dsl_deadlist_space_range(&ds->ds_deadlist,
4281 4440 firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
4282 4441 &used, &comp, &uncomp);
4283 4442 *usedp += used;
4284 4443 *compp += comp;
4285 4444 *uncompp += uncomp;
4286 4445
4287 4446 snapobj = ds->ds_phys->ds_prev_snap_obj;
4288 4447 ASSERT3U(snapobj, !=, 0);
4289 4448 dsl_dataset_rele(ds, FTAG);
4290 4449 }
4291 4450 rw_exit(&dp->dp_config_rwlock);
4292 4451 return (err);
4293 4452 }
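
A toy model of the loop above: each deadlist between firstsnap and the snap after lastsnap contributes only the space born after firstsnap's previous snapshot. Entries and txgs are invented; dsl_deadlist_space_range() performs this against on-disk deadlists.

        #include <stdio.h>
        #include <stdint.h>
        #include <inttypes.h>

        struct dle {                    /* toy deadlist entry */
                uint64_t birth_txg;
                uint64_t used;
        };

        /* Sum entries born after mintxg, like dsl_deadlist_space_range(). */
        static uint64_t
        range_space(const struct dle *dl, size_t n, uint64_t mintxg)
        {
                uint64_t sum = 0;

                for (size_t i = 0; i < n; i++)
                        if (dl[i].birth_txg > mintxg)
                                sum += dl[i].used;
                return (sum);
        }

        int
        main(void)
        {
                /* Two deadlists between firstsnap and lastsnap's next snap. */
                struct dle dl1[] = { { 5, 100 }, { 12, 40 } };
                struct dle dl2[] = { { 3, 70 }, { 20, 25 } };
                uint64_t prev_txg = 10;         /* firstsnap's prev snap txg */

                uint64_t would_free = range_space(dl1, 2, prev_txg) +
                    range_space(dl2, 2, prev_txg);

                printf("would free %" PRIu64 " bytes\n", would_free); /* 65 */
                return (0);
        }
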
1428 lines elided