1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 * Copyright (c) 2013 Steven Hartland. All rights reserved. 25 */ 26 27 #include <sys/dsl_pool.h> 28 #include <sys/dsl_dataset.h> 29 #include <sys/dsl_prop.h> 30 #include <sys/dsl_dir.h> 31 #include <sys/dsl_synctask.h> 32 #include <sys/dsl_scan.h> 33 #include <sys/dnode.h> 34 #include <sys/dmu_tx.h> 35 #include <sys/dmu_objset.h> 36 #include <sys/arc.h> 37 #include <sys/zap.h> 38 #include <sys/zio.h> 39 #include <sys/zfs_context.h> 40 #include <sys/fs/zfs.h> 41 #include <sys/zfs_znode.h> 42 #include <sys/spa_impl.h> 43 #include <sys/dsl_deadlist.h> 44 #include <sys/bptree.h> 45 #include <sys/zfeature.h> 46 #include <sys/zil_impl.h> 47 #include <sys/dsl_userhold.h> 48 49 int zfs_no_write_throttle = 0; 50 int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ 51 int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */ 52 53 uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ 54 uint64_t zfs_write_limit_max = 0; /* max data payload per txg */ 55 uint64_t zfs_write_limit_inflated = 0; 56 uint64_t zfs_write_limit_override = 0; 57 58 kmutex_t zfs_write_limit_lock; 59 60 static pgcnt_t old_physmem = 0; 61 62 hrtime_t zfs_throttle_delay = MSEC2NSEC(10); 63 hrtime_t zfs_throttle_resolution = MSEC2NSEC(10); 64 65 int 66 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) 67 { 68 uint64_t obj; 69 int err; 70 71 err = zap_lookup(dp->dp_meta_objset, 72 dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, 73 name, sizeof (obj), 1, &obj); 74 if (err) 75 return (err); 76 77 return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); 78 } 79 80 static dsl_pool_t * 81 dsl_pool_open_impl(spa_t *spa, uint64_t txg) 82 { 83 dsl_pool_t *dp; 84 blkptr_t *bp = spa_get_rootblkptr(spa); 85 86 dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); 87 dp->dp_spa = spa; 88 dp->dp_meta_rootbp = *bp; 89 rrw_init(&dp->dp_config_rwlock, B_TRUE); 90 dp->dp_write_limit = zfs_write_limit_min; 91 txg_init(dp, txg); 92 93 txg_list_create(&dp->dp_dirty_datasets, 94 offsetof(dsl_dataset_t, ds_dirty_link)); 95 txg_list_create(&dp->dp_dirty_zilogs, 96 offsetof(zilog_t, zl_dirty_link)); 97 txg_list_create(&dp->dp_dirty_dirs, 98 offsetof(dsl_dir_t, dd_dirty_link)); 99 txg_list_create(&dp->dp_sync_tasks, 100 offsetof(dsl_sync_task_t, dst_node)); 101 102 mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); 103 104 dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, 105 1, 4, 0); 106 107 return (dp); 108 } 109 110 int 111 dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) 112 { 113 int err; 114 dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); 115 116 err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, 117 &dp->dp_meta_objset); 118 if (err != 0) 119 dsl_pool_close(dp); 120 else 121 *dpp = dp; 122 123 return (err); 124 } 125 126 int 127 dsl_pool_open(dsl_pool_t *dp) 128 { 129 int err; 130 dsl_dir_t *dd; 131 dsl_dataset_t *ds; 132 uint64_t obj; 133 134 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 135 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 136 DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, 137 &dp->dp_root_dir_obj); 138 if (err) 139 goto out; 140 141 err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, 142 NULL, dp, &dp->dp_root_dir); 143 if (err) 144 goto out; 145 146 err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); 147 if (err) 148 goto out; 149 150 if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { 151 err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); 152 if (err) 153 goto out; 154 err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, 155 FTAG, &ds); 156 if (err == 0) { 157 err = dsl_dataset_hold_obj(dp, 158 ds->ds_phys->ds_prev_snap_obj, dp, 159 &dp->dp_origin_snap); 160 dsl_dataset_rele(ds, FTAG); 161 } 162 dsl_dir_rele(dd, dp); 163 if (err) 164 goto out; 165 } 166 167 if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { 168 err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, 169 &dp->dp_free_dir); 170 if (err) 171 goto out; 172 173 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 174 DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); 175 if (err) 176 goto out; 177 VERIFY0(bpobj_open(&dp->dp_free_bpobj, 178 dp->dp_meta_objset, obj)); 179 } 180 181 if (spa_feature_is_active(dp->dp_spa, 182 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { 183 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 184 DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, 185 &dp->dp_bptree_obj); 186 if (err != 0) 187 goto out; 188 } 189 190 if (spa_feature_is_active(dp->dp_spa, 191 &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) { 192 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 193 DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, 194 &dp->dp_empty_bpobj); 195 if (err != 0) 196 goto out; 197 } 198 199 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 200 DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, 201 &dp->dp_tmp_userrefs_obj); 202 if (err == ENOENT) 203 err = 0; 204 if (err) 205 goto out; 206 207 err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); 208 209 out: 210 rrw_exit(&dp->dp_config_rwlock, FTAG); 211 return (err); 212 } 213 214 void 215 dsl_pool_close(dsl_pool_t *dp) 216 { 217 /* drop our references from dsl_pool_open() */ 218 219 /* 220 * Since we held the origin_snap from "syncing" context (which 221 * includes pool-opening context), it actually only got a "ref" 222 * and not a hold, so just drop that here. 223 */ 224 if (dp->dp_origin_snap) 225 dsl_dataset_rele(dp->dp_origin_snap, dp); 226 if (dp->dp_mos_dir) 227 dsl_dir_rele(dp->dp_mos_dir, dp); 228 if (dp->dp_free_dir) 229 dsl_dir_rele(dp->dp_free_dir, dp); 230 if (dp->dp_root_dir) 231 dsl_dir_rele(dp->dp_root_dir, dp); 232 233 bpobj_close(&dp->dp_free_bpobj); 234 235 /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ 236 if (dp->dp_meta_objset) 237 dmu_objset_evict(dp->dp_meta_objset); 238 239 txg_list_destroy(&dp->dp_dirty_datasets); 240 txg_list_destroy(&dp->dp_dirty_zilogs); 241 txg_list_destroy(&dp->dp_sync_tasks); 242 txg_list_destroy(&dp->dp_dirty_dirs); 243 244 arc_flush(dp->dp_spa); 245 txg_fini(dp); 246 dsl_scan_fini(dp); 247 rrw_destroy(&dp->dp_config_rwlock); 248 mutex_destroy(&dp->dp_lock); 249 taskq_destroy(dp->dp_vnrele_taskq); 250 if (dp->dp_blkstats) 251 kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); 252 kmem_free(dp, sizeof (dsl_pool_t)); 253 } 254 255 dsl_pool_t * 256 dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) 257 { 258 int err; 259 dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); 260 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 261 objset_t *os; 262 dsl_dataset_t *ds; 263 uint64_t obj; 264 265 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 266 267 /* create and open the MOS (meta-objset) */ 268 dp->dp_meta_objset = dmu_objset_create_impl(spa, 269 NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); 270 271 /* create the pool directory */ 272 err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 273 DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); 274 ASSERT0(err); 275 276 /* Initialize scan structures */ 277 VERIFY0(dsl_scan_init(dp, txg)); 278 279 /* create and open the root dir */ 280 dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); 281 VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, 282 NULL, dp, &dp->dp_root_dir)); 283 284 /* create and open the meta-objset dir */ 285 (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); 286 VERIFY0(dsl_pool_open_special_dir(dp, 287 MOS_DIR_NAME, &dp->dp_mos_dir)); 288 289 if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { 290 /* create and open the free dir */ 291 (void) dsl_dir_create_sync(dp, dp->dp_root_dir, 292 FREE_DIR_NAME, tx); 293 VERIFY0(dsl_pool_open_special_dir(dp, 294 FREE_DIR_NAME, &dp->dp_free_dir)); 295 296 /* create and open the free_bplist */ 297 obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx); 298 VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 299 DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); 300 VERIFY0(bpobj_open(&dp->dp_free_bpobj, 301 dp->dp_meta_objset, obj)); 302 } 303 304 if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) 305 dsl_pool_create_origin(dp, tx); 306 307 /* create the root dataset */ 308 obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); 309 310 /* create the root objset */ 311 VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); 312 os = dmu_objset_create_impl(dp->dp_spa, ds, 313 dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); 314 #ifdef _KERNEL 315 zfs_create_fs(os, kcred, zplprops, tx); 316 #endif 317 dsl_dataset_rele(ds, FTAG); 318 319 dmu_tx_commit(tx); 320 321 rrw_exit(&dp->dp_config_rwlock, FTAG); 322 323 return (dp); 324 } 325 326 /* 327 * Account for the meta-objset space in its placeholder dsl_dir. 328 */ 329 void 330 dsl_pool_mos_diduse_space(dsl_pool_t *dp, 331 int64_t used, int64_t comp, int64_t uncomp) 332 { 333 ASSERT3U(comp, ==, uncomp); /* it's all metadata */ 334 mutex_enter(&dp->dp_lock); 335 dp->dp_mos_used_delta += used; 336 dp->dp_mos_compressed_delta += comp; 337 dp->dp_mos_uncompressed_delta += uncomp; 338 mutex_exit(&dp->dp_lock); 339 } 340 341 static int 342 deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 343 { 344 dsl_deadlist_t *dl = arg; 345 dsl_deadlist_insert(dl, bp, tx); 346 return (0); 347 } 348 349 void 350 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) 351 { 352 zio_t *zio; 353 dmu_tx_t *tx; 354 dsl_dir_t *dd; 355 dsl_dataset_t *ds; 356 objset_t *mos = dp->dp_meta_objset; 357 hrtime_t start, write_time; 358 uint64_t data_written; 359 int err; 360 list_t synced_datasets; 361 362 list_create(&synced_datasets, sizeof (dsl_dataset_t), 363 offsetof(dsl_dataset_t, ds_synced_link)); 364 365 /* 366 * We need to copy dp_space_towrite() before doing 367 * dsl_sync_task_sync(), because 368 * dsl_dataset_snapshot_reserve_space() will increase 369 * dp_space_towrite but not actually write anything. 370 */ 371 data_written = dp->dp_space_towrite[txg & TXG_MASK]; 372 373 tx = dmu_tx_create_assigned(dp, txg); 374 375 dp->dp_read_overhead = 0; 376 start = gethrtime(); 377 378 zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 379 while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { 380 /* 381 * We must not sync any non-MOS datasets twice, because 382 * we may have taken a snapshot of them. However, we 383 * may sync newly-created datasets on pass 2. 384 */ 385 ASSERT(!list_link_active(&ds->ds_synced_link)); 386 list_insert_tail(&synced_datasets, ds); 387 dsl_dataset_sync(ds, zio, tx); 388 } 389 DTRACE_PROBE(pool_sync__1setup); 390 err = zio_wait(zio); 391 392 write_time = gethrtime() - start; 393 ASSERT(err == 0); 394 DTRACE_PROBE(pool_sync__2rootzio); 395 396 /* 397 * After the data blocks have been written (ensured by the zio_wait() 398 * above), update the user/group space accounting. 399 */ 400 for (ds = list_head(&synced_datasets); ds; 401 ds = list_next(&synced_datasets, ds)) 402 dmu_objset_do_userquota_updates(ds->ds_objset, tx); 403 404 /* 405 * Sync the datasets again to push out the changes due to 406 * userspace updates. This must be done before we process the 407 * sync tasks, so that any snapshots will have the correct 408 * user accounting information (and we won't get confused 409 * about which blocks are part of the snapshot). 410 */ 411 zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 412 while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { 413 ASSERT(list_link_active(&ds->ds_synced_link)); 414 dmu_buf_rele(ds->ds_dbuf, ds); 415 dsl_dataset_sync(ds, zio, tx); 416 } 417 err = zio_wait(zio); 418 419 /* 420 * Now that the datasets have been completely synced, we can 421 * clean up our in-memory structures accumulated while syncing: 422 * 423 * - move dead blocks from the pending deadlist to the on-disk deadlist 424 * - release hold from dsl_dataset_dirty() 425 */ 426 while (ds = list_remove_head(&synced_datasets)) { 427 objset_t *os = ds->ds_objset; 428 bplist_iterate(&ds->ds_pending_deadlist, 429 deadlist_enqueue_cb, &ds->ds_deadlist, tx); 430 ASSERT(!dmu_objset_is_dirty(os, txg)); 431 dmu_buf_rele(ds->ds_dbuf, ds); 432 } 433 434 start = gethrtime(); 435 while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) 436 dsl_dir_sync(dd, tx); 437 write_time += gethrtime() - start; 438 439 /* 440 * The MOS's space is accounted for in the pool/$MOS 441 * (dp_mos_dir). We can't modify the mos while we're syncing 442 * it, so we remember the deltas and apply them here. 443 */ 444 if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || 445 dp->dp_mos_uncompressed_delta != 0) { 446 dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, 447 dp->dp_mos_used_delta, 448 dp->dp_mos_compressed_delta, 449 dp->dp_mos_uncompressed_delta, tx); 450 dp->dp_mos_used_delta = 0; 451 dp->dp_mos_compressed_delta = 0; 452 dp->dp_mos_uncompressed_delta = 0; 453 } 454 455 start = gethrtime(); 456 if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || 457 list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { 458 zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 459 dmu_objset_sync(mos, zio, tx); 460 err = zio_wait(zio); 461 ASSERT(err == 0); 462 dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); 463 spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); 464 } 465 write_time += gethrtime() - start; 466 DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time, 467 hrtime_t, dp->dp_read_overhead); 468 write_time -= dp->dp_read_overhead; 469 470 /* 471 * If we modify a dataset in the same txg that we want to destroy it, 472 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. 473 * dsl_dir_destroy_check() will fail if there are unexpected holds. 474 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf 475 * and clearing the hold on it) before we process the sync_tasks. 476 * The MOS data dirtied by the sync_tasks will be synced on the next 477 * pass. 478 */ 479 DTRACE_PROBE(pool_sync__3task); 480 if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { 481 dsl_sync_task_t *dst; 482 /* 483 * No more sync tasks should have been added while we 484 * were syncing. 485 */ 486 ASSERT(spa_sync_pass(dp->dp_spa) == 1); 487 while (dst = txg_list_remove(&dp->dp_sync_tasks, txg)) 488 dsl_sync_task_sync(dst, tx); 489 } 490 491 dmu_tx_commit(tx); 492 493 dp->dp_space_towrite[txg & TXG_MASK] = 0; 494 ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); 495 496 /* 497 * If the write limit max has not been explicitly set, set it 498 * to a fraction of available physical memory (default 1/8th). 499 * Note that we must inflate the limit because the spa 500 * inflates write sizes to account for data replication. 501 * Check this each sync phase to catch changing memory size. 502 */ 503 if (physmem != old_physmem && zfs_write_limit_shift) { 504 mutex_enter(&zfs_write_limit_lock); 505 old_physmem = physmem; 506 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; 507 zfs_write_limit_inflated = MAX(zfs_write_limit_min, 508 spa_get_asize(dp->dp_spa, zfs_write_limit_max)); 509 mutex_exit(&zfs_write_limit_lock); 510 } 511 512 /* 513 * Attempt to keep the sync time consistent by adjusting the 514 * amount of write traffic allowed into each transaction group. 515 * Weight the throughput calculation towards the current value: 516 * thru = 3/4 old_thru + 1/4 new_thru 517 * 518 * Note: write_time is in nanosecs while dp_throughput is expressed in 519 * bytes per millisecond. 520 */ 521 ASSERT(zfs_write_limit_min > 0); 522 if (data_written > zfs_write_limit_min / 8 && 523 write_time > MSEC2NSEC(1)) { 524 uint64_t throughput = data_written / NSEC2MSEC(write_time); 525 526 if (dp->dp_throughput) 527 dp->dp_throughput = throughput / 4 + 528 3 * dp->dp_throughput / 4; 529 else 530 dp->dp_throughput = throughput; 531 dp->dp_write_limit = MIN(zfs_write_limit_inflated, 532 MAX(zfs_write_limit_min, 533 dp->dp_throughput * zfs_txg_synctime_ms)); 534 } 535 } 536 537 void 538 dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) 539 { 540 zilog_t *zilog; 541 dsl_dataset_t *ds; 542 543 while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) { 544 ds = dmu_objset_ds(zilog->zl_os); 545 zil_clean(zilog, txg); 546 ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); 547 dmu_buf_rele(ds->ds_dbuf, zilog); 548 } 549 ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); 550 } 551 552 /* 553 * TRUE if the current thread is the tx_sync_thread or if we 554 * are being called from SPA context during pool initialization. 555 */ 556 int 557 dsl_pool_sync_context(dsl_pool_t *dp) 558 { 559 return (curthread == dp->dp_tx.tx_sync_thread || 560 spa_is_initializing(dp->dp_spa)); 561 } 562 563 uint64_t 564 dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) 565 { 566 uint64_t space, resv; 567 568 /* 569 * Reserve about 1.6% (1/64), or at least 32MB, for allocation 570 * efficiency. 571 * XXX The intent log is not accounted for, so it must fit 572 * within this slop. 573 * 574 * If we're trying to assess whether it's OK to do a free, 575 * cut the reservation in half to allow forward progress 576 * (e.g. make it possible to rm(1) files from a full pool). 577 */ 578 space = spa_get_dspace(dp->dp_spa); 579 resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1); 580 if (netfree) 581 resv >>= 1; 582 583 return (space - resv); 584 } 585 586 int 587 dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) 588 { 589 uint64_t reserved = 0; 590 uint64_t write_limit = (zfs_write_limit_override ? 591 zfs_write_limit_override : dp->dp_write_limit); 592 593 if (zfs_no_write_throttle) { 594 atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], 595 space); 596 return (0); 597 } 598 599 /* 600 * Check to see if we have exceeded the maximum allowed IO for 601 * this transaction group. We can do this without locks since 602 * a little slop here is ok. Note that we do the reserved check 603 * with only half the requested reserve: this is because the 604 * reserve requests are worst-case, and we really don't want to 605 * throttle based off of worst-case estimates. 606 */ 607 if (write_limit > 0) { 608 reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK] 609 + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2; 610 611 if (reserved && reserved > write_limit) 612 return (SET_ERROR(ERESTART)); 613 } 614 615 atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space); 616 617 /* 618 * If this transaction group is over 7/8ths capacity, delay 619 * the caller 1 clock tick. This will slow down the "fill" 620 * rate until the sync process can catch up with us. 621 */ 622 if (reserved && reserved > (write_limit - (write_limit >> 3))) { 623 txg_delay(dp, tx->tx_txg, zfs_throttle_delay, 624 zfs_throttle_resolution); 625 } 626 627 return (0); 628 } 629 630 void 631 dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) 632 { 633 ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space); 634 atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space); 635 } 636 637 void 638 dsl_pool_memory_pressure(dsl_pool_t *dp) 639 { 640 uint64_t space_inuse = 0; 641 int i; 642 643 if (dp->dp_write_limit == zfs_write_limit_min) 644 return; 645 646 for (i = 0; i < TXG_SIZE; i++) { 647 space_inuse += dp->dp_space_towrite[i]; 648 space_inuse += dp->dp_tempreserved[i]; 649 } 650 dp->dp_write_limit = MAX(zfs_write_limit_min, 651 MIN(dp->dp_write_limit, space_inuse / 4)); 652 } 653 654 void 655 dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) 656 { 657 if (space > 0) { 658 mutex_enter(&dp->dp_lock); 659 dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space; 660 mutex_exit(&dp->dp_lock); 661 } 662 } 663 664 /* ARGSUSED */ 665 static int 666 upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) 667 { 668 dmu_tx_t *tx = arg; 669 dsl_dataset_t *ds, *prev = NULL; 670 int err; 671 672 err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); 673 if (err) 674 return (err); 675 676 while (ds->ds_phys->ds_prev_snap_obj != 0) { 677 err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, 678 FTAG, &prev); 679 if (err) { 680 dsl_dataset_rele(ds, FTAG); 681 return (err); 682 } 683 684 if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) 685 break; 686 dsl_dataset_rele(ds, FTAG); 687 ds = prev; 688 prev = NULL; 689 } 690 691 if (prev == NULL) { 692 prev = dp->dp_origin_snap; 693 694 /* 695 * The $ORIGIN can't have any data, or the accounting 696 * will be wrong. 697 */ 698 ASSERT0(prev->ds_phys->ds_bp.blk_birth); 699 700 /* The origin doesn't get attached to itself */ 701 if (ds->ds_object == prev->ds_object) { 702 dsl_dataset_rele(ds, FTAG); 703 return (0); 704 } 705 706 dmu_buf_will_dirty(ds->ds_dbuf, tx); 707 ds->ds_phys->ds_prev_snap_obj = prev->ds_object; 708 ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg; 709 710 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); 711 ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object; 712 713 dmu_buf_will_dirty(prev->ds_dbuf, tx); 714 prev->ds_phys->ds_num_children++; 715 716 if (ds->ds_phys->ds_next_snap_obj == 0) { 717 ASSERT(ds->ds_prev == NULL); 718 VERIFY0(dsl_dataset_hold_obj(dp, 719 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); 720 } 721 } 722 723 ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object); 724 ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object); 725 726 if (prev->ds_phys->ds_next_clones_obj == 0) { 727 dmu_buf_will_dirty(prev->ds_dbuf, tx); 728 prev->ds_phys->ds_next_clones_obj = 729 zap_create(dp->dp_meta_objset, 730 DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); 731 } 732 VERIFY0(zap_add_int(dp->dp_meta_objset, 733 prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); 734 735 dsl_dataset_rele(ds, FTAG); 736 if (prev != dp->dp_origin_snap) 737 dsl_dataset_rele(prev, FTAG); 738 return (0); 739 } 740 741 void 742 dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) 743 { 744 ASSERT(dmu_tx_is_syncing(tx)); 745 ASSERT(dp->dp_origin_snap != NULL); 746 747 VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, 748 tx, DS_FIND_CHILDREN)); 749 } 750 751 /* ARGSUSED */ 752 static int 753 upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 754 { 755 dmu_tx_t *tx = arg; 756 objset_t *mos = dp->dp_meta_objset; 757 758 if (ds->ds_dir->dd_phys->dd_origin_obj != 0) { 759 dsl_dataset_t *origin; 760 761 VERIFY0(dsl_dataset_hold_obj(dp, 762 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin)); 763 764 if (origin->ds_dir->dd_phys->dd_clones == 0) { 765 dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); 766 origin->ds_dir->dd_phys->dd_clones = zap_create(mos, 767 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); 768 } 769 770 VERIFY0(zap_add_int(dp->dp_meta_objset, 771 origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx)); 772 773 dsl_dataset_rele(origin, FTAG); 774 } 775 return (0); 776 } 777 778 void 779 dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) 780 { 781 ASSERT(dmu_tx_is_syncing(tx)); 782 uint64_t obj; 783 784 (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); 785 VERIFY0(dsl_pool_open_special_dir(dp, 786 FREE_DIR_NAME, &dp->dp_free_dir)); 787 788 /* 789 * We can't use bpobj_alloc(), because spa_version() still 790 * returns the old version, and we need a new-version bpobj with 791 * subobj support. So call dmu_object_alloc() directly. 792 */ 793 obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, 794 SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); 795 VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 796 DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); 797 VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); 798 799 VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 800 upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN)); 801 } 802 803 void 804 dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) 805 { 806 uint64_t dsobj; 807 dsl_dataset_t *ds; 808 809 ASSERT(dmu_tx_is_syncing(tx)); 810 ASSERT(dp->dp_origin_snap == NULL); 811 ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); 812 813 /* create the origin dir, ds, & snap-ds */ 814 dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, 815 NULL, 0, kcred, tx); 816 VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 817 dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); 818 VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, 819 dp, &dp->dp_origin_snap)); 820 dsl_dataset_rele(ds, FTAG); 821 } 822 823 taskq_t * 824 dsl_pool_vnrele_taskq(dsl_pool_t *dp) 825 { 826 return (dp->dp_vnrele_taskq); 827 } 828 829 /* 830 * Walk through the pool-wide zap object of temporary snapshot user holds 831 * and release them. 832 */ 833 void 834 dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) 835 { 836 zap_attribute_t za; 837 zap_cursor_t zc; 838 objset_t *mos = dp->dp_meta_objset; 839 uint64_t zapobj = dp->dp_tmp_userrefs_obj; 840 nvlist_t *holds; 841 842 if (zapobj == 0) 843 return; 844 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 845 846 holds = fnvlist_alloc(); 847 848 for (zap_cursor_init(&zc, mos, zapobj); 849 zap_cursor_retrieve(&zc, &za) == 0; 850 zap_cursor_advance(&zc)) { 851 char *htag; 852 uint64_t dsobj; 853 nvlist_t *tags; 854 855 htag = strchr(za.za_name, '-'); 856 *htag = '\0'; 857 ++htag; 858 if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) { 859 tags = fnvlist_alloc(); 860 fnvlist_add_boolean(tags, htag); 861 fnvlist_add_nvlist(holds, za.za_name, tags); 862 fnvlist_free(tags); 863 } else { 864 fnvlist_add_boolean(tags, htag); 865 } 866 } 867 dsl_dataset_user_release_tmp(dp, holds); 868 fnvlist_free(holds); 869 zap_cursor_fini(&zc); 870 } 871 872 /* 873 * Create the pool-wide zap object for storing temporary snapshot holds. 874 */ 875 void 876 dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) 877 { 878 objset_t *mos = dp->dp_meta_objset; 879 880 ASSERT(dp->dp_tmp_userrefs_obj == 0); 881 ASSERT(dmu_tx_is_syncing(tx)); 882 883 dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, 884 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); 885 } 886 887 static int 888 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, 889 const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) 890 { 891 objset_t *mos = dp->dp_meta_objset; 892 uint64_t zapobj = dp->dp_tmp_userrefs_obj; 893 char *name; 894 int error; 895 896 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 897 ASSERT(dmu_tx_is_syncing(tx)); 898 899 /* 900 * If the pool was created prior to SPA_VERSION_USERREFS, the 901 * zap object for temporary holds might not exist yet. 902 */ 903 if (zapobj == 0) { 904 if (holding) { 905 dsl_pool_user_hold_create_obj(dp, tx); 906 zapobj = dp->dp_tmp_userrefs_obj; 907 } else { 908 return (SET_ERROR(ENOENT)); 909 } 910 } 911 912 name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); 913 if (holding) 914 error = zap_add(mos, zapobj, name, 8, 1, &now, tx); 915 else 916 error = zap_remove(mos, zapobj, name, tx); 917 strfree(name); 918 919 return (error); 920 } 921 922 /* 923 * Add a temporary hold for the given dataset object and tag. 924 */ 925 int 926 dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, 927 uint64_t now, dmu_tx_t *tx) 928 { 929 return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); 930 } 931 932 /* 933 * Release a temporary hold for the given dataset object and tag. 934 */ 935 int 936 dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, 937 dmu_tx_t *tx) 938 { 939 return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL, 940 tx, B_FALSE)); 941 } 942 943 /* 944 * DSL Pool Configuration Lock 945 * 946 * The dp_config_rwlock protects against changes to DSL state (e.g. dataset 947 * creation / destruction / rename / property setting). It must be held for 948 * read to hold a dataset or dsl_dir. I.e. you must call 949 * dsl_pool_config_enter() or dsl_pool_hold() before calling 950 * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock 951 * must be held continuously until all datasets and dsl_dirs are released. 952 * 953 * The only exception to this rule is that if a "long hold" is placed on 954 * a dataset, then the dp_config_rwlock may be dropped while the dataset 955 * is still held. The long hold will prevent the dataset from being 956 * destroyed -- the destroy will fail with EBUSY. A long hold can be 957 * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset 958 * (by calling dsl_{dataset,objset}_{try}own{_obj}). 959 * 960 * Legitimate long-holders (including owners) should be long-running, cancelable 961 * tasks that should cause "zfs destroy" to fail. This includes DMU 962 * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), 963 * "zfs send", and "zfs diff". There are several other long-holders whose 964 * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). 965 * 966 * The usual formula for long-holding would be: 967 * dsl_pool_hold() 968 * dsl_dataset_hold() 969 * ... perform checks ... 970 * dsl_dataset_long_hold() 971 * dsl_pool_rele() 972 * ... perform long-running task ... 973 * dsl_dataset_long_rele() 974 * dsl_dataset_rele() 975 * 976 * Note that when the long hold is released, the dataset is still held but 977 * the pool is not held. The dataset may change arbitrarily during this time 978 * (e.g. it could be destroyed). Therefore you shouldn't do anything to the 979 * dataset except release it. 980 * 981 * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only 982 * or modifying operations. 983 * 984 * Modifying operations should generally use dsl_sync_task(). The synctask 985 * infrastructure enforces proper locking strategy with respect to the 986 * dp_config_rwlock. See the comment above dsl_sync_task() for details. 987 * 988 * Read-only operations will manually hold the pool, then the dataset, obtain 989 * information from the dataset, then release the pool and dataset. 990 * dmu_objset_{hold,rele}() are convenience routines that also do the pool 991 * hold/rele. 992 */ 993 994 int 995 dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) 996 { 997 spa_t *spa; 998 int error; 999 1000 error = spa_open(name, &spa, tag); 1001 if (error == 0) { 1002 *dp = spa_get_dsl(spa); 1003 dsl_pool_config_enter(*dp, tag); 1004 } 1005 return (error); 1006 } 1007 1008 void 1009 dsl_pool_rele(dsl_pool_t *dp, void *tag) 1010 { 1011 dsl_pool_config_exit(dp, tag); 1012 spa_close(dp->dp_spa, tag); 1013 } 1014 1015 void 1016 dsl_pool_config_enter(dsl_pool_t *dp, void *tag) 1017 { 1018 /* 1019 * We use a "reentrant" reader-writer lock, but not reentrantly. 1020 * 1021 * The rrwlock can (with the track_all flag) track all reading threads, 1022 * which is very useful for debugging which code path failed to release 1023 * the lock, and for verifying that the *current* thread does hold 1024 * the lock. 1025 * 1026 * (Unlike a rwlock, which knows that N threads hold it for 1027 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE 1028 * if any thread holds it for read, even if this thread doesn't). 1029 */ 1030 ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); 1031 rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); 1032 } 1033 1034 void 1035 dsl_pool_config_exit(dsl_pool_t *dp, void *tag) 1036 { 1037 rrw_exit(&dp->dp_config_rwlock, tag); 1038 } 1039 1040 boolean_t 1041 dsl_pool_config_held(dsl_pool_t *dp) 1042 { 1043 return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); 1044 }