1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 26 */ 27 28 /* Portions Copyright 2010 Robert Milkowski */ 29 30 #include <sys/cred.h> 31 #include <sys/zfs_context.h> 32 #include <sys/dmu_objset.h> 33 #include <sys/dsl_dir.h> 34 #include <sys/dsl_dataset.h> 35 #include <sys/dsl_prop.h> 36 #include <sys/dsl_pool.h> 37 #include <sys/dsl_synctask.h> 38 #include <sys/dsl_deleg.h> 39 #include <sys/dnode.h> 40 #include <sys/dbuf.h> 41 #include <sys/zvol.h> 42 #include <sys/dmu_tx.h> 43 #include <sys/zap.h> 44 #include <sys/zil.h> 45 #include <sys/dmu_impl.h> 46 #include <sys/zfs_ioctl.h> 47 #include <sys/sa.h> 48 #include <sys/zfs_onexit.h> 49 #include <sys/dsl_destroy.h> 50 51 /* 52 * Needed to close a window in dnode_move() that allows the objset to be freed 53 * before it can be safely accessed. 
 */
krwlock_t os_lock;

/*
 * Initialize the file-global os_lock.
 */
void
dmu_objset_init(void)
{
	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
}

/*
 * Destroy the file-global os_lock.
 */
void
dmu_objset_fini(void)
{
	rw_destroy(&os_lock);
}

/*
 * Return the spa this objset belongs to.
 */
spa_t *
dmu_objset_spa(objset_t *os)
{
	return (os->os_spa);
}

/*
 * Return the objset's intent log handle.
 */
zilog_t *
dmu_objset_zil(objset_t *os)
{
	return (os->os_zil);
}

/*
 * Return the dsl_pool for this objset.  When the objset has a dataset
 * with a ds_dir, the pool is taken from that dir; otherwise it is looked
 * up from the spa.
 */
dsl_pool_t *
dmu_objset_pool(objset_t *os)
{
	dsl_dataset_t *ds;

	if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
		return (ds->ds_dir->dd_pool);
	else
		return (spa_get_dsl(os->os_spa));
}

/*
 * Return the objset's dataset (NULL when none is attached).
 */
dsl_dataset_t *
dmu_objset_ds(objset_t *os)
{
	return (os->os_dsl_dataset);
}

/*
 * Return the objset type recorded in the objset's physical block.
 */
dmu_objset_type_t
dmu_objset_type(objset_t *os)
{
	return (os->os_phys->os_type);
}

/*
 * Write the name of the objset's dataset into buf, via dsl_dataset_name().
 * NOTE(review): buf presumably must hold MAXNAMELEN bytes, as the other
 * dsl_dataset_name() callers in this file provide -- confirm.
 */
void
dmu_objset_name(objset_t *os, char *buf)
{
	dsl_dataset_name(os->os_dsl_dataset, buf);
}

/*
 * Return the object number of the objset's dataset, or 0 if the objset
 * has no dataset.
 */
uint64_t
dmu_objset_id(objset_t *os)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;

	return (ds ? ds->ds_object : 0);
}

/*
 * Return the cached value of the "sync" property.
 */
zfs_sync_type_t
dmu_objset_syncprop(objset_t *os)
{
	return (os->os_sync);
}

/*
 * Return the cached value of the "logbias" property.
 */
zfs_logbias_op_t
dmu_objset_logbias(objset_t *os)
{
	return (os->os_logbias);
}

/*
 * Property callback: cache the effective checksum algorithm in the objset.
 * arg is the objset_t that was passed to dsl_prop_register().
 */
static void
checksum_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance should have been done by now.
	 */
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
}

/*
 * Property callback: cache the effective compression algorithm.
 */
static void
compression_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval != ZIO_COMPRESS_INHERIT);

	os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
}

/*
 * Property callback: cache the "copies" value.
 */
static void
copies_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval > 0);
	ASSERT(newval <= spa_max_replication(os->os_spa));

	os->os_copies = newval;
}

/*
 * Property callback: cache the dedup checksum.  The selected value is
 * split into the checksum proper (masked with ZIO_CHECKSUM_MASK) and a
 * boolean recording whether the ZIO_CHECKSUM_VERIFY bit was set.
 */
static void
dedup_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;
	spa_t *spa = os->os_spa;
	enum zio_checksum checksum;

	/*
	 * Inheritance should have been done by now.
	 */
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);

	os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
	os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
}

/*
 * Property callback: cache the "primarycache" value.
 */
static void
primary_cache_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
	    newval == ZFS_CACHE_METADATA);

	os->os_primary_cache = newval;
}

/*
 * Property callback: cache the "secondarycache" value.
 */
static void
secondary_cache_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
	    newval == ZFS_CACHE_METADATA);

	os->os_secondary_cache = newval;
}

/*
 * Property callback: cache the "sync" value and, if the objset already
 * has a ZIL, forward the new value to it.
 */
static void
sync_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
	    newval == ZFS_SYNC_DISABLED);

	os->os_sync = newval;
	if (os->os_zil)
		zil_set_sync(os->os_zil, newval);
}

/*
 * Property callback: cache the "redundant_metadata" value.
 */
static void
redundant_metadata_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
	    newval == ZFS_REDUNDANT_METADATA_MOST);

	os->os_redundant_metadata = newval;
}

/*
 * Property callback: cache the "logbias" value and, if the objset already
 * has a ZIL, forward the new value to it.
 */
static void
logbias_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
	    newval == ZFS_LOGBIAS_THROUGHPUT);
	os->os_logbias = newval;
	if (os->os_zil)
		zil_set_logbias(os->os_zil, newval);
}

/*
 * Byteswap an objset_phys_t in place.  size must be either the old
 * (pre-user-accounting) size or the full sizeof (objset_phys_t); the
 * userused/groupused dnodes are only swapped for the full-size layout.
 */
void
dmu_objset_byteswap(void *buf, size_t size)
{
	objset_phys_t *osp = buf;

	ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
	dnode_byteswap(&osp->os_meta_dnode);
	byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
	osp->os_type = BSWAP_64(osp->os_type);
	osp->os_flags = BSWAP_64(osp->os_flags);
	if (size == sizeof (objset_phys_t)) {
		dnode_byteswap(&osp->os_userused_dnode);
		dnode_byteswap(&osp->os_groupused_dnode);
	}
}

/*
 * Allocate and initialize an in-core objset_t for the objset rooted at bp.
 *
 * spa - pool the objset lives in
 * ds  - owning dataset, or NULL for the meta-objset; when non-NULL the
 *       caller must hold ds->ds_opening_lock
 * bp  - root block pointer; a hole means a brand-new, zero-filled objset
 * osp - on success, receives the new objset
 *
 * Returns 0 on success.  On failure the partially built objset is freed
 * and the error from arc_read() (ECKSUM is mapped to EIO) or from
 * dsl_prop_register() is returned; *osp is not written.
 */
int
dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    objset_t **osp)
{
	objset_t *os;
	int i, err;

	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));

	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
	os->os_dsl_dataset = ds;
	os->os_spa = spa;
	os->os_rootbp = bp;
	if (!BP_IS_HOLE(os->os_rootbp)) {
		uint32_t aflags = ARC_WAIT;
		zbookmark_t zb;
		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

		if (DMU_OS_IS_L2CACHEABLE(os))
			aflags |= ARC_L2CACHE;
		if (DMU_OS_IS_L2COMPRESSIBLE(os))
			aflags |= ARC_L2COMPRESS;

		dprintf_bp(os->os_rootbp, "reading %s", "");
		err = arc_read(NULL, spa, os->os_rootbp,
		    arc_getbuf_func, &os->os_phys_buf,
		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
		if (err != 0) {
			kmem_free(os, sizeof (objset_t));
			/* convert checksum errors into IO errors */
			if (err == ECKSUM)
				err = SET_ERROR(EIO);
			return (err);
		}

		/*
		 * Increase the blocksize if we are permitted: copy the
		 * on-disk contents into a zeroed full-size buffer and
		 * drop the reference on the smaller one.
		 */
		if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
		    arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
			arc_buf_t *buf = arc_buf_alloc(spa,
			    sizeof (objset_phys_t), &os->os_phys_buf,
			    ARC_BUFC_METADATA);
			bzero(buf->b_data, sizeof (objset_phys_t));
			bcopy(os->os_phys_buf->b_data, buf->b_data,
			    arc_buf_size(os->os_phys_buf));
			(void) arc_buf_remove_ref(os->os_phys_buf,
			    &os->os_phys_buf);
			os->os_phys_buf = buf;
		}

		os->os_phys = os->os_phys_buf->b_data;
		os->os_flags = os->os_phys->os_flags;
	} else {
		/* New objset: start from an all-zero physical block. */
		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
		    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
		os->os_phys_buf = arc_buf_alloc(spa, size,
		    &os->os_phys_buf, ARC_BUFC_METADATA);
		os->os_phys = os->os_phys_buf->b_data;
		bzero(os->os_phys, size);
	}

	/*
	 * Note: the changed_cb will be called once before the register
	 * func returns, thus changing the checksum/compression from the
	 * default (fletcher2/off).  Snapshots don't need to know about
	 * checksum/compression/copies.
	 */
	if (ds != NULL) {
		/*
		 * Registration stops at the first failure: every later
		 * call is guarded by err == 0.
		 */
		err = dsl_prop_register(ds,
		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
		    primary_cache_changed_cb, os);
		if (err == 0) {
			err = dsl_prop_register(ds,
			    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
			    secondary_cache_changed_cb, os);
		}
		if (!dsl_dataset_is_snapshot(ds)) {
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
				    checksum_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    compression_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_COPIES),
				    copies_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_DEDUP),
				    dedup_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
				    logbias_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_SYNC),
				    sync_changed_cb, os);
			}
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(
				    ZFS_PROP_REDUNDANT_METADATA),
				    redundant_metadata_changed_cb, os);
			}
		}
		if (err != 0) {
			/*
			 * NOTE(review): callbacks registered before the
			 * failing one are not unregistered here -- confirm
			 * that dsl_prop_register() failure leaves nothing
			 * pointing at the freed objset.
			 */
			VERIFY(arc_buf_remove_ref(os->os_phys_buf,
			    &os->os_phys_buf));
			kmem_free(os, sizeof (objset_t));
			return (err);
		}
	} else {
		/* It's the meta-objset. */
		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
		os->os_compress = ZIO_COMPRESS_LZJB;
		os->os_copies = spa_max_replication(spa);
		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
		os->os_dedup_verify = B_FALSE;
		os->os_logbias = ZFS_LOGBIAS_LATENCY;
		os->os_sync = ZFS_SYNC_STANDARD;
		os->os_primary_cache = ZFS_CACHE_ALL;
		os->os_secondary_cache = ZFS_CACHE_ALL;
	}

	/* Snapshots keep the zeroed ZIL header left by kmem_zalloc(). */
	if (ds == NULL || !dsl_dataset_is_snapshot(ds))
		os->os_zil_header = os->os_phys->os_zil_header;
	os->os_zil = zil_alloc(os, &os->os_zil_header);

	/* The dirty and free lists of a txg share dn_dirty_link[i]. */
	for (i = 0; i < TXG_SIZE; i++) {
		list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[i]));
		list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[i]));
	}
	list_create(&os->os_dnodes, sizeof (dnode_t),
	    offsetof(dnode_t, dn_link));
	list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_link));

	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);

	DMU_META_DNODE(os) = dnode_special_open(os,
	    &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
	    &os->os_meta_dnode);
	/* The accounting dnodes exist only in the full-size layout. */
	if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
		DMU_USERUSED_DNODE(os) = dnode_special_open(os,
		    &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
		    &os->os_userused_dnode);
		DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
		    &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
		    &os->os_groupused_dnode);
	}

	*osp = os;
	return (0);
}

/*
 * Return the objset for dataset ds in *osp, opening it on first use and
 * caching it in ds->ds_objset.  ds_opening_lock serializes the open;
 * ds_lock is taken only around publishing the pointer.
 *
 * Returns 0 on success or the error from dmu_objset_open_impl(), in which
 * case *osp is set to the still-NULL ds->ds_objset.
 */
int
dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
{
	int err = 0;

	mutex_enter(&ds->ds_opening_lock);
	if (ds->ds_objset == NULL) {
		objset_t *os;
		err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
		    ds, dsl_dataset_get_blkptr(ds), &os);

		if (err == 0) {
			mutex_enter(&ds->ds_lock);
			ASSERT(ds->ds_objset == NULL);
			ds->ds_objset = os;
			mutex_exit(&ds->ds_lock);
		}
	}
	*osp = ds->ds_objset;
	mutex_exit(&ds->ds_opening_lock);
	return (err);
}

/*
 * Holds the pool while the objset is held.  Therefore only one objset
 * can be held at a time.
 *
 * On success both the pool and the dataset remain held with 'tag'.
 * On failure every hold taken here is dropped before returning the error.
 */
int
dmu_objset_hold(const char *name, void *tag, objset_t **osp)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;

	err = dsl_pool_hold(name, tag, &dp);
	if (err != 0)
		return (err);
	err = dsl_dataset_hold(dp, name, tag, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	err = dmu_objset_from_ds(ds, osp);
	if (err != 0) {
		dsl_dataset_rele(ds, tag);
		dsl_pool_rele(dp, tag);
	}

	return (err);
}

/*
 * dsl_pool must not be held when this is called.
 * Upon successful return, there will be a longhold on the dataset,
 * and the dsl_pool will not be held.
498 */ 499 int 500 dmu_objset_own(const char *name, dmu_objset_type_t type, 501 boolean_t readonly, void *tag, objset_t **osp) 502 { 503 dsl_pool_t *dp; 504 dsl_dataset_t *ds; 505 int err; 506 507 err = dsl_pool_hold(name, FTAG, &dp); 508 if (err != 0) 509 return (err); 510 err = dsl_dataset_own(dp, name, tag, &ds); 511 if (err != 0) { 512 dsl_pool_rele(dp, FTAG); 513 return (err); 514 } 515 516 err = dmu_objset_from_ds(ds, osp); 517 dsl_pool_rele(dp, FTAG); 518 if (err != 0) { 519 dsl_dataset_disown(ds, tag); 520 } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { 521 dsl_dataset_disown(ds, tag); 522 return (SET_ERROR(EINVAL)); 523 } else if (!readonly && dsl_dataset_is_snapshot(ds)) { 524 dsl_dataset_disown(ds, tag); 525 return (SET_ERROR(EROFS)); 526 } 527 return (err); 528 } 529 530 void 531 dmu_objset_rele(objset_t *os, void *tag) 532 { 533 dsl_pool_t *dp = dmu_objset_pool(os); 534 dsl_dataset_rele(os->os_dsl_dataset, tag); 535 dsl_pool_rele(dp, tag); 536 } 537 538 /* 539 * When we are called, os MUST refer to an objset associated with a dataset 540 * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner 541 * == tag. We will then release and reacquire ownership of the dataset while 542 * holding the pool config_rwlock to avoid intervening namespace or ownership 543 * changes may occur. 544 * 545 * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to 546 * release the hold on its dataset and acquire a new one on the dataset of the 547 * same name so that it can be partially torn down and reconstructed. 
 */
void
dmu_objset_refresh_ownership(objset_t *os, void *tag)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds, *newds;
	char name[MAXNAMELEN];

	ds = os->os_dsl_dataset;
	VERIFY3P(ds, !=, NULL);
	VERIFY3P(ds->ds_owner, ==, tag);
	VERIFY(dsl_dataset_long_held(ds));

	/* Capture the name and pool before ownership is dropped. */
	dsl_dataset_name(ds, name);
	dp = dmu_objset_pool(os);
	dsl_pool_config_enter(dp, FTAG);
	dmu_objset_disown(os, tag);
	VERIFY0(dsl_dataset_own(dp, name, tag, &newds));
	/* Re-owning by name must yield the very same dataset. */
	VERIFY3P(newds, ==, os->os_dsl_dataset);
	dsl_pool_config_exit(dp, FTAG);
}

/*
 * Give up ownership (taken with 'tag') of the objset's dataset.
 */
void
dmu_objset_disown(objset_t *os, void *tag)
{
	dsl_dataset_disown(os->os_dsl_dataset, tag);
}

/*
 * Evict the dbufs of every held dnode in the objset.  os_lock protects
 * the os_dnodes list and is dropped around each dnode_evict_dbufs() call;
 * a reference on the next dnode is taken before the lock is dropped.
 */
void
dmu_objset_evict_dbufs(objset_t *os)
{
	dnode_t *dn;

	mutex_enter(&os->os_lock);

	/* process the mdn last, since the other dnodes have holds on it */
	list_remove(&os->os_dnodes, DMU_META_DNODE(os));
	list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));

	/*
	 * Find the first dnode with holds.  We have to do this dance
	 * because dnode_add_ref() only works if you already have a
	 * hold.  If there are no holds then it has no dbufs so OK to
	 * skip.
	 */
	for (dn = list_head(&os->os_dnodes);
	    dn && !dnode_add_ref(dn, FTAG);
	    dn = list_next(&os->os_dnodes, dn))
		continue;

	while (dn) {
		dnode_t *next_dn = dn;

		/* Find and reference the next held dnode, if any. */
		do {
			next_dn = list_next(&os->os_dnodes, next_dn);
		} while (next_dn && !dnode_add_ref(next_dn, FTAG));

		mutex_exit(&os->os_lock);
		dnode_evict_dbufs(dn);
		dnode_rele(dn, FTAG);
		mutex_enter(&os->os_lock);
		dn = next_dn;
	}
	mutex_exit(&os->os_lock);
}

/*
 * Tear down and free an in-core objset.  The objset must not be dirty in
 * any txg.  This unregisters the property callbacks registered at open
 * time, tears down SA state, evicts dbufs, closes the special dnodes,
 * frees the ZIL, drops the physical buffer, and finally frees the
 * objset_t itself.
 */
void
dmu_objset_evict(objset_t *os)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;

	for (int t = 0; t < TXG_SIZE; t++)
		ASSERT(!dmu_objset_is_dirty(os, t));

	if (ds) {
		/* These callbacks are only registered for non-snapshots. */
		if (!dsl_dataset_is_snapshot(ds)) {
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
			    checksum_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
			    compression_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_COPIES),
			    copies_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_DEDUP),
			    dedup_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
			    logbias_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_SYNC),
			    sync_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
			    redundant_metadata_changed_cb, os));
		}
		VERIFY0(dsl_prop_unregister(ds,
		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
		    primary_cache_changed_cb, os));
		VERIFY0(dsl_prop_unregister(ds,
		    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
		    secondary_cache_changed_cb, os));
	}

	if (os->os_sa)
		sa_tear_down(os);

	dmu_objset_evict_dbufs(os);

	dnode_special_close(&os->os_meta_dnode);
	/* The user/group accounting dnodes are opened and closed as a pair. */
	if (DMU_USERUSED_DNODE(os)) {
		dnode_special_close(&os->os_userused_dnode);
		dnode_special_close(&os->os_groupused_dnode);
	}
	zil_free(os->os_zil);

	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);

	VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));

	/*
	 * This is a barrier to prevent the objset from going away in
	 * dnode_move() until we can safely ensure that the objset is still in
	 * use.  We consider the objset valid before the barrier and invalid
	 * after the barrier.
	 */
	rw_enter(&os_lock, RW_READER);
	rw_exit(&os_lock);

	mutex_destroy(&os->os_lock);
	mutex_destroy(&os->os_obj_lock);
	mutex_destroy(&os->os_user_ptr_lock);
	kmem_free(os, sizeof (objset_t));
}

/*
 * Return the snapshot cmtime of the dsl_dir containing this objset's
 * dataset.  The objset must have a dataset; it is dereferenced
 * unconditionally.
 */
timestruc_t
dmu_objset_snap_cmtime(objset_t *os)
{
	return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
}

/*
 * Create a new objset of the given type in syncing context and return it.
 * With a dataset the objset is obtained through dmu_objset_from_ds();
 * with ds == NULL it is opened directly from bp.
 */
/* called from dsl for meta-objset */
objset_t *
dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
    dmu_objset_type_t type, dmu_tx_t *tx)
{
	objset_t *os;
	dnode_t *mdn;

	ASSERT(dmu_tx_is_syncing(tx));

	if (ds != NULL)
		VERIFY0(dmu_objset_from_ds(ds, &os));
	else
		VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));

	mdn = DMU_META_DNODE(os);

	dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
	    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);

	/*
	 * We don't want to have to increase the meta-dnode's nlevels
	 * later, because then we could do it in quiescing context while
	 * we are also accessing it in open context.
	 *
	 * This precaution is not necessary for the MOS (ds == NULL),
	 * because the MOS is only updated in syncing context.
	 * This is most fortunate: the MOS is the only objset that
	 * needs to be synced multiple times as spa_sync() iterates
	 * to convergence, so minimizing its dn_nlevels matters.
	 */
	if (ds != NULL) {
		int levels = 1;

		/*
		 * Determine the number of levels necessary for the meta-dnode
		 * to contain DN_MAX_OBJECT dnodes.
		 */
		while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
		    (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
		    DN_MAX_OBJECT * sizeof (dnode_phys_t))
			levels++;

		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
		    mdn->dn_nlevels = levels;
	}

	ASSERT(type != DMU_OST_NONE);
	ASSERT(type != DMU_OST_ANY);
	ASSERT(type < DMU_OST_NUMTYPES);
	os->os_phys->os_type = type;
	/* A new objset starts with its user accounting complete. */
	if (dmu_objset_userused_enabled(os)) {
		os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
		os->os_flags = os->os_phys->os_flags;
	}

	dsl_dataset_dirty(ds, tx);

	return (os);
}

/*
 * Argument bundle handed to the create check/sync functions.
 */
typedef struct dmu_objset_create_arg {
	const char *doca_name;		/* name of the dataset to create */
	cred_t *doca_cred;		/* creator's credentials */
	void (*doca_userfunc)(objset_t *os, void *arg,
	    cred_t *cr, dmu_tx_t *tx);	/* optional post-create callback */
	void *doca_userarg;		/* opaque argument for doca_userfunc */
	dmu_objset_type_t doca_type;	/* objset type to create */
	uint64_t doca_flags;		/* passed to dsl_dataset_create_sync() */
} dmu_objset_create_arg_t;

/*
 * Check function for dmu_objset_create().  Fails with EINVAL if the name
 * contains '@', with EEXIST if the whole name already resolves to a dir
 * (no remaining tail component), or with whatever dsl_dir_hold() or the
 * filesystem-limit check returns.
 */
/*ARGSUSED*/
static int
dmu_objset_create_check(void *arg, dmu_tx_t *tx)
{
	dmu_objset_create_arg_t *doca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *pdd;
	const char *tail;
	int error;

	if (strchr(doca->doca_name, '@') != NULL)
		return (SET_ERROR(EINVAL));

	error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
	if (error != 0)
		return (error);
	if (tail == NULL) {
		dsl_dir_rele(pdd, FTAG);
		return (SET_ERROR(EEXIST));
	}
	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
	    doca->doca_cred);
	dsl_dir_rele(pdd, FTAG);

	return (error);
}

/*
 * Sync function for dmu_objset_create(): create the dataset under its
 * parent dir, create the objset in it, run the caller's callback if one
 * was supplied, and log the creation to the pool history.
 */
static void
dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
{
	dmu_objset_create_arg_t *doca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *pdd;
	const char *tail;
	dsl_dataset_t *ds;
	uint64_t obj;
	blkptr_t *bp;
	objset_t *os;

	VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));

	obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
	    doca->doca_cred, tx);

	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
	bp = dsl_dataset_get_blkptr(ds);
	os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
	    ds, bp, doca->doca_type, tx);

	if (doca->doca_userfunc != NULL) {
		doca->doca_userfunc(os, doca->doca_userarg,
		    doca->doca_cred, tx);
	}

	spa_history_log_internal_ds(ds, "create", tx, "");
	dsl_dataset_rele(ds, FTAG);
	dsl_dir_rele(pdd, FTAG);
}

/*
 * Create a new objset called 'name' of the given type by running the
 * create check/sync pair as a sync task.  If func is non-NULL it is
 * called with arg from the sync function once the objset exists.
 * Returns the result of dsl_sync_task().
 */
int
dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
    void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
{
	dmu_objset_create_arg_t doca;

	doca.doca_name = name;
	doca.doca_cred = CRED();
	doca.doca_flags = flags;
	doca.doca_userfunc = func;
	doca.doca_userarg = arg;
	doca.doca_type = type;

	return (dsl_sync_task(name,
	    dmu_objset_create_check, dmu_objset_create_sync, &doca, 5));
}

/*
 * Argument bundle handed to the clone check/sync functions.
 */
typedef struct dmu_objset_clone_arg {
	const char *doca_clone;		/* name of the clone to create */
	const char *doca_origin;	/* name of the origin snapshot */
	cred_t *doca_cred;		/* creator's credentials */
} dmu_objset_clone_arg_t;

/*
 * Check function for dmu_objset_clone().  Validates the clone name the
 * same way as create (EINVAL on '@', EEXIST if it already exists), then
 * requires that both the parent dir and the origin live in this pool
 * (EXDEV) and that the origin is a snapshot (EINVAL).  Any failure of the
 * filesystem-limit check is reported as EDQUOT.
 */
/*ARGSUSED*/
static int
dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
{
	dmu_objset_clone_arg_t *doca = arg;
	dsl_dir_t *pdd;
	const char *tail;
	int error;
	dsl_dataset_t *origin;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	if (strchr(doca->doca_clone, '@') != NULL)
		return (SET_ERROR(EINVAL));

	error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
	if (error != 0)
		return (error);
	if (tail == NULL) {
		dsl_dir_rele(pdd, FTAG);
		return (SET_ERROR(EEXIST));
	}
	/* You can't clone across pools. */
	if (pdd->dd_pool != dp) {
		dsl_dir_rele(pdd, FTAG);
		return (SET_ERROR(EXDEV));
	}
	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
	    doca->doca_cred);
	if (error != 0) {
		dsl_dir_rele(pdd, FTAG);
		return (SET_ERROR(EDQUOT));
	}
	dsl_dir_rele(pdd, FTAG);

	error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
	if (error != 0)
		return (error);

	/* You can't clone across pools. */
	if (origin->ds_dir->dd_pool != dp) {
		dsl_dataset_rele(origin, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/* You can only clone snapshots, not the head datasets. */
	if (!dsl_dataset_is_snapshot(origin)) {
		dsl_dataset_rele(origin, FTAG);
		return (SET_ERROR(EINVAL));
	}
	dsl_dataset_rele(origin, FTAG);

	return (0);
}

/*
 * Sync function for dmu_objset_clone(): create the clone dataset with the
 * origin snapshot as its origin and log the operation, including the
 * origin's name and object number, to the pool history.
 */
static void
dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
{
	dmu_objset_clone_arg_t *doca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *pdd;
	const char *tail;
	dsl_dataset_t *origin, *ds;
	uint64_t obj;
	char namebuf[MAXNAMELEN];

	VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
	VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));

	obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
	    doca->doca_cred, tx);

	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
	dsl_dataset_name(origin, namebuf);
	spa_history_log_internal_ds(ds, "clone", tx,
	    "origin=%s (%llu)", namebuf, origin->ds_object);
	dsl_dataset_rele(ds, FTAG);
	dsl_dataset_rele(origin, FTAG);
	dsl_dir_rele(pdd, FTAG);
}

/*
 * Create 'clone' as a clone of the snapshot 'origin' by running the clone
 * check/sync pair as a sync task.  Returns the result of dsl_sync_task().
 */
int
dmu_objset_clone(const char *clone, const char *origin)
{
	dmu_objset_clone_arg_t doca;

	doca.doca_clone = clone;
	doca.doca_origin = origin;
	doca.doca_cred = CRED();

	return (dsl_sync_task(clone,
	    dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5));
}

int
dmu_objset_snapshot_one(const char *fsname, const char *snapname) 939 { 940 int err; 941 char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); 942 nvlist_t *snaps = fnvlist_alloc(); 943 944 fnvlist_add_boolean(snaps, longsnap); 945 strfree(longsnap); 946 err = dsl_dataset_snapshot(snaps, NULL, NULL); 947 fnvlist_free(snaps); 948 return (err); 949 } 950 951 static void 952 dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) 953 { 954 dnode_t *dn; 955 956 while (dn = list_head(list)) { 957 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 958 ASSERT(dn->dn_dbuf->db_data_pending); 959 /* 960 * Initialize dn_zio outside dnode_sync() because the 961 * meta-dnode needs to set it ouside dnode_sync(). 962 */ 963 dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; 964 ASSERT(dn->dn_zio); 965 966 ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); 967 list_remove(list, dn); 968 969 if (newlist) { 970 (void) dnode_add_ref(dn, newlist); 971 list_insert_tail(newlist, dn); 972 } 973 974 dnode_sync(dn, tx); 975 } 976 } 977 978 /* ARGSUSED */ 979 static void 980 dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) 981 { 982 blkptr_t *bp = zio->io_bp; 983 objset_t *os = arg; 984 dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; 985 986 ASSERT(!BP_IS_EMBEDDED(bp)); 987 ASSERT3P(bp, ==, os->os_rootbp); 988 ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); 989 ASSERT0(BP_GET_LEVEL(bp)); 990 991 /* 992 * Update rootbp fill count: it should be the number of objects 993 * allocated in the object set (not counting the "special" 994 * objects that are stored in the objset_phys_t -- the meta 995 * dnode and user/group accounting objects). 
996 */ 997 bp->blk_fill = 0; 998 for (int i = 0; i < dnp->dn_nblkptr; i++) 999 bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]); 1000 } 1001 1002 /* ARGSUSED */ 1003 static void 1004 dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) 1005 { 1006 blkptr_t *bp = zio->io_bp; 1007 blkptr_t *bp_orig = &zio->io_bp_orig; 1008 objset_t *os = arg; 1009 1010 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 1011 ASSERT(BP_EQUAL(bp, bp_orig)); 1012 } else { 1013 dsl_dataset_t *ds = os->os_dsl_dataset; 1014 dmu_tx_t *tx = os->os_synctx; 1015 1016 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 1017 dsl_dataset_block_born(ds, bp, tx); 1018 } 1019 } 1020 1021 /* called from dsl */ 1022 void 1023 dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) 1024 { 1025 int txgoff; 1026 zbookmark_t zb; 1027 zio_prop_t zp; 1028 zio_t *zio; 1029 list_t *list; 1030 list_t *newlist = NULL; 1031 dbuf_dirty_record_t *dr; 1032 1033 dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); 1034 1035 ASSERT(dmu_tx_is_syncing(tx)); 1036 /* XXX the write_done callback should really give us the tx... */ 1037 os->os_synctx = tx; 1038 1039 if (os->os_dsl_dataset == NULL) { 1040 /* 1041 * This is the MOS. If we have upgraded, 1042 * spa_max_replication() could change, so reset 1043 * os_copies here. 1044 */ 1045 os->os_copies = spa_max_replication(os->os_spa); 1046 } 1047 1048 /* 1049 * Create the root block IO 1050 */ 1051 SET_BOOKMARK(&zb, os->os_dsl_dataset ? 
1052 os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 1053 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 1054 arc_release(os->os_phys_buf, &os->os_phys_buf); 1055 1056 dmu_write_policy(os, NULL, 0, 0, &zp); 1057 1058 zio = arc_write(pio, os->os_spa, tx->tx_txg, 1059 os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), 1060 DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, 1061 NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, 1062 ZIO_FLAG_MUSTSUCCEED, &zb); 1063 1064 /* 1065 * Sync special dnodes - the parent IO for the sync is the root block 1066 */ 1067 DMU_META_DNODE(os)->dn_zio = zio; 1068 dnode_sync(DMU_META_DNODE(os), tx); 1069 1070 os->os_phys->os_flags = os->os_flags; 1071 1072 if (DMU_USERUSED_DNODE(os) && 1073 DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { 1074 DMU_USERUSED_DNODE(os)->dn_zio = zio; 1075 dnode_sync(DMU_USERUSED_DNODE(os), tx); 1076 DMU_GROUPUSED_DNODE(os)->dn_zio = zio; 1077 dnode_sync(DMU_GROUPUSED_DNODE(os), tx); 1078 } 1079 1080 txgoff = tx->tx_txg & TXG_MASK; 1081 1082 if (dmu_objset_userused_enabled(os)) { 1083 newlist = &os->os_synced_dnodes; 1084 /* 1085 * We must create the list here because it uses the 1086 * dn_dirty_link[] of this txg. 1087 */ 1088 list_create(newlist, sizeof (dnode_t), 1089 offsetof(dnode_t, dn_dirty_link[txgoff])); 1090 } 1091 1092 dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); 1093 dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); 1094 1095 list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; 1096 while (dr = list_head(list)) { 1097 ASSERT0(dr->dr_dbuf->db_level); 1098 list_remove(list, dr); 1099 if (dr->dr_zio) 1100 zio_nowait(dr->dr_zio); 1101 } 1102 /* 1103 * Free intent log blocks up to this tx. 
1104 */ 1105 zil_sync(os->os_zil, tx); 1106 os->os_phys->os_zil_header = os->os_zil_header; 1107 zio_nowait(zio); 1108 } 1109 1110 boolean_t 1111 dmu_objset_is_dirty(objset_t *os, uint64_t txg) 1112 { 1113 return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || 1114 !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); 1115 } 1116 1117 static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; 1118 1119 void 1120 dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) 1121 { 1122 used_cbs[ost] = cb; 1123 } 1124 1125 boolean_t 1126 dmu_objset_userused_enabled(objset_t *os) 1127 { 1128 return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && 1129 used_cbs[os->os_phys->os_type] != NULL && 1130 DMU_USERUSED_DNODE(os) != NULL); 1131 } 1132 1133 static void 1134 do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, 1135 uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) 1136 { 1137 if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { 1138 int64_t delta = DNODE_SIZE + used; 1139 if (subtract) 1140 delta = -delta; 1141 VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, 1142 user, delta, tx)); 1143 VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, 1144 group, delta, tx)); 1145 } 1146 } 1147 1148 void 1149 dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) 1150 { 1151 dnode_t *dn; 1152 list_t *list = &os->os_synced_dnodes; 1153 1154 ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); 1155 1156 while (dn = list_head(list)) { 1157 int flags; 1158 ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); 1159 ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || 1160 dn->dn_phys->dn_flags & 1161 DNODE_FLAG_USERUSED_ACCOUNTED); 1162 1163 /* Allocate the user/groupused objects if necessary. 
*/ 1164 if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { 1165 VERIFY(0 == zap_create_claim(os, 1166 DMU_USERUSED_OBJECT, 1167 DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); 1168 VERIFY(0 == zap_create_claim(os, 1169 DMU_GROUPUSED_OBJECT, 1170 DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); 1171 } 1172 1173 /* 1174 * We intentionally modify the zap object even if the 1175 * net delta is zero. Otherwise 1176 * the block of the zap obj could be shared between 1177 * datasets but need to be different between them after 1178 * a bprewrite. 1179 */ 1180 1181 flags = dn->dn_id_flags; 1182 ASSERT(flags); 1183 if (flags & DN_ID_OLD_EXIST) { 1184 do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, 1185 dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); 1186 } 1187 if (flags & DN_ID_NEW_EXIST) { 1188 do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), 1189 dn->dn_phys->dn_flags, dn->dn_newuid, 1190 dn->dn_newgid, B_FALSE, tx); 1191 } 1192 1193 mutex_enter(&dn->dn_mtx); 1194 dn->dn_oldused = 0; 1195 dn->dn_oldflags = 0; 1196 if (dn->dn_id_flags & DN_ID_NEW_EXIST) { 1197 dn->dn_olduid = dn->dn_newuid; 1198 dn->dn_oldgid = dn->dn_newgid; 1199 dn->dn_id_flags |= DN_ID_OLD_EXIST; 1200 if (dn->dn_bonuslen == 0) 1201 dn->dn_id_flags |= DN_ID_CHKED_SPILL; 1202 else 1203 dn->dn_id_flags |= DN_ID_CHKED_BONUS; 1204 } 1205 dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); 1206 mutex_exit(&dn->dn_mtx); 1207 1208 list_remove(list, dn); 1209 dnode_rele(dn, list); 1210 } 1211 } 1212 1213 /* 1214 * Returns a pointer to data to find uid/gid from 1215 * 1216 * If a dirty record for transaction group that is syncing can't 1217 * be found then NULL is returned. In the NULL case it is assumed 1218 * the uid/gid aren't changing. 
1219 */ 1220 static void * 1221 dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) 1222 { 1223 dbuf_dirty_record_t *dr, **drp; 1224 void *data; 1225 1226 if (db->db_dirtycnt == 0) 1227 return (db->db.db_data); /* Nothing is changing */ 1228 1229 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1230 if (dr->dr_txg == tx->tx_txg) 1231 break; 1232 1233 if (dr == NULL) { 1234 data = NULL; 1235 } else { 1236 dnode_t *dn; 1237 1238 DB_DNODE_ENTER(dr->dr_dbuf); 1239 dn = DB_DNODE(dr->dr_dbuf); 1240 1241 if (dn->dn_bonuslen == 0 && 1242 dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) 1243 data = dr->dt.dl.dr_data->b_data; 1244 else 1245 data = dr->dt.dl.dr_data; 1246 1247 DB_DNODE_EXIT(dr->dr_dbuf); 1248 } 1249 1250 return (data); 1251 } 1252 1253 void 1254 dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) 1255 { 1256 objset_t *os = dn->dn_objset; 1257 void *data = NULL; 1258 dmu_buf_impl_t *db = NULL; 1259 uint64_t *user = NULL; 1260 uint64_t *group = NULL; 1261 int flags = dn->dn_id_flags; 1262 int error; 1263 boolean_t have_spill = B_FALSE; 1264 1265 if (!dmu_objset_userused_enabled(dn->dn_objset)) 1266 return; 1267 1268 if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| 1269 DN_ID_CHKED_SPILL))) 1270 return; 1271 1272 if (before && dn->dn_bonuslen != 0) 1273 data = DN_BONUS(dn->dn_phys); 1274 else if (!before && dn->dn_bonuslen != 0) { 1275 if (dn->dn_bonus) { 1276 db = dn->dn_bonus; 1277 mutex_enter(&db->db_mtx); 1278 data = dmu_objset_userquota_find_data(db, tx); 1279 } else { 1280 data = DN_BONUS(dn->dn_phys); 1281 } 1282 } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { 1283 int rf = 0; 1284 1285 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) 1286 rf |= DB_RF_HAVESTRUCT; 1287 error = dmu_spill_hold_by_dnode(dn, 1288 rf | DB_RF_MUST_SUCCEED, 1289 FTAG, (dmu_buf_t **)&db); 1290 ASSERT(error == 0); 1291 mutex_enter(&db->db_mtx); 1292 data = (before) ? 
db->db.db_data : 1293 dmu_objset_userquota_find_data(db, tx); 1294 have_spill = B_TRUE; 1295 } else { 1296 mutex_enter(&dn->dn_mtx); 1297 dn->dn_id_flags |= DN_ID_CHKED_BONUS; 1298 mutex_exit(&dn->dn_mtx); 1299 return; 1300 } 1301 1302 if (before) { 1303 ASSERT(data); 1304 user = &dn->dn_olduid; 1305 group = &dn->dn_oldgid; 1306 } else if (data) { 1307 user = &dn->dn_newuid; 1308 group = &dn->dn_newgid; 1309 } 1310 1311 /* 1312 * Must always call the callback in case the object 1313 * type has changed and that type isn't an object type to track 1314 */ 1315 error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, 1316 user, group); 1317 1318 /* 1319 * Preserve existing uid/gid when the callback can't determine 1320 * what the new uid/gid are and the callback returned EEXIST. 1321 * The EEXIST error tells us to just use the existing uid/gid. 1322 * If we don't know what the old values are then just assign 1323 * them to 0, since that is a new file being created. 1324 */ 1325 if (!before && data == NULL && error == EEXIST) { 1326 if (flags & DN_ID_OLD_EXIST) { 1327 dn->dn_newuid = dn->dn_olduid; 1328 dn->dn_newgid = dn->dn_oldgid; 1329 } else { 1330 dn->dn_newuid = 0; 1331 dn->dn_newgid = 0; 1332 } 1333 error = 0; 1334 } 1335 1336 if (db) 1337 mutex_exit(&db->db_mtx); 1338 1339 mutex_enter(&dn->dn_mtx); 1340 if (error == 0 && before) 1341 dn->dn_id_flags |= DN_ID_OLD_EXIST; 1342 if (error == 0 && !before) 1343 dn->dn_id_flags |= DN_ID_NEW_EXIST; 1344 1345 if (have_spill) { 1346 dn->dn_id_flags |= DN_ID_CHKED_SPILL; 1347 } else { 1348 dn->dn_id_flags |= DN_ID_CHKED_BONUS; 1349 } 1350 mutex_exit(&dn->dn_mtx); 1351 if (have_spill) 1352 dmu_buf_rele((dmu_buf_t *)db, FTAG); 1353 } 1354 1355 boolean_t 1356 dmu_objset_userspace_present(objset_t *os) 1357 { 1358 return (os->os_phys->os_flags & 1359 OBJSET_FLAG_USERACCOUNTING_COMPLETE); 1360 } 1361 1362 int 1363 dmu_objset_userspace_upgrade(objset_t *os) 1364 { 1365 uint64_t obj; 1366 int err = 0; 1367 1368 if 
(dmu_objset_userspace_present(os)) 1369 return (0); 1370 if (!dmu_objset_userused_enabled(os)) 1371 return (SET_ERROR(ENOTSUP)); 1372 if (dmu_objset_is_snapshot(os)) 1373 return (SET_ERROR(EINVAL)); 1374 1375 /* 1376 * We simply need to mark every object dirty, so that it will be 1377 * synced out and now accounted. If this is called 1378 * concurrently, or if we already did some work before crashing, 1379 * that's fine, since we track each object's accounted state 1380 * independently. 1381 */ 1382 1383 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 1384 dmu_tx_t *tx; 1385 dmu_buf_t *db; 1386 int objerr; 1387 1388 if (issig(JUSTLOOKING) && issig(FORREAL)) 1389 return (SET_ERROR(EINTR)); 1390 1391 objerr = dmu_bonus_hold(os, obj, FTAG, &db); 1392 if (objerr != 0) 1393 continue; 1394 tx = dmu_tx_create(os); 1395 dmu_tx_hold_bonus(tx, obj); 1396 objerr = dmu_tx_assign(tx, TXG_WAIT); 1397 if (objerr != 0) { 1398 dmu_tx_abort(tx); 1399 continue; 1400 } 1401 dmu_buf_will_dirty(db, tx); 1402 dmu_buf_rele(db, FTAG); 1403 dmu_tx_commit(tx); 1404 } 1405 1406 os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; 1407 txg_wait_synced(dmu_objset_pool(os), 0); 1408 return (0); 1409 } 1410 1411 void 1412 dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, 1413 uint64_t *usedobjsp, uint64_t *availobjsp) 1414 { 1415 dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, 1416 usedobjsp, availobjsp); 1417 } 1418 1419 uint64_t 1420 dmu_objset_fsid_guid(objset_t *os) 1421 { 1422 return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); 1423 } 1424 1425 void 1426 dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) 1427 { 1428 stat->dds_type = os->os_phys->os_type; 1429 if (os->os_dsl_dataset) 1430 dsl_dataset_fast_stat(os->os_dsl_dataset, stat); 1431 } 1432 1433 void 1434 dmu_objset_stats(objset_t *os, nvlist_t *nv) 1435 { 1436 ASSERT(os->os_dsl_dataset || 1437 os->os_phys->os_type == DMU_OST_META); 1438 1439 if 
(os->os_dsl_dataset != NULL) 1440 dsl_dataset_stats(os->os_dsl_dataset, nv); 1441 1442 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, 1443 os->os_phys->os_type); 1444 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, 1445 dmu_objset_userspace_present(os)); 1446 } 1447 1448 int 1449 dmu_objset_is_snapshot(objset_t *os) 1450 { 1451 if (os->os_dsl_dataset != NULL) 1452 return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); 1453 else 1454 return (B_FALSE); 1455 } 1456 1457 int 1458 dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, 1459 boolean_t *conflict) 1460 { 1461 dsl_dataset_t *ds = os->os_dsl_dataset; 1462 uint64_t ignored; 1463 1464 if (ds->ds_phys->ds_snapnames_zapobj == 0) 1465 return (SET_ERROR(ENOENT)); 1466 1467 return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, 1468 ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, 1469 real, maxlen, conflict)); 1470 } 1471 1472 int 1473 dmu_snapshot_list_next(objset_t *os, int namelen, char *name, 1474 uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) 1475 { 1476 dsl_dataset_t *ds = os->os_dsl_dataset; 1477 zap_cursor_t cursor; 1478 zap_attribute_t attr; 1479 1480 ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); 1481 1482 if (ds->ds_phys->ds_snapnames_zapobj == 0) 1483 return (SET_ERROR(ENOENT)); 1484 1485 zap_cursor_init_serialized(&cursor, 1486 ds->ds_dir->dd_pool->dp_meta_objset, 1487 ds->ds_phys->ds_snapnames_zapobj, *offp); 1488 1489 if (zap_cursor_retrieve(&cursor, &attr) != 0) { 1490 zap_cursor_fini(&cursor); 1491 return (SET_ERROR(ENOENT)); 1492 } 1493 1494 if (strlen(attr.za_name) + 1 > namelen) { 1495 zap_cursor_fini(&cursor); 1496 return (SET_ERROR(ENAMETOOLONG)); 1497 } 1498 1499 (void) strcpy(name, attr.za_name); 1500 if (idp) 1501 *idp = attr.za_first_integer; 1502 if (case_conflict) 1503 *case_conflict = attr.za_normalization_conflict; 1504 zap_cursor_advance(&cursor); 1505 *offp = zap_cursor_serialize(&cursor); 1506 zap_cursor_fini(&cursor); 1507 
1508 return (0); 1509 } 1510 1511 int 1512 dmu_dir_list_next(objset_t *os, int namelen, char *name, 1513 uint64_t *idp, uint64_t *offp) 1514 { 1515 dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; 1516 zap_cursor_t cursor; 1517 zap_attribute_t attr; 1518 1519 /* there is no next dir on a snapshot! */ 1520 if (os->os_dsl_dataset->ds_object != 1521 dd->dd_phys->dd_head_dataset_obj) 1522 return (SET_ERROR(ENOENT)); 1523 1524 zap_cursor_init_serialized(&cursor, 1525 dd->dd_pool->dp_meta_objset, 1526 dd->dd_phys->dd_child_dir_zapobj, *offp); 1527 1528 if (zap_cursor_retrieve(&cursor, &attr) != 0) { 1529 zap_cursor_fini(&cursor); 1530 return (SET_ERROR(ENOENT)); 1531 } 1532 1533 if (strlen(attr.za_name) + 1 > namelen) { 1534 zap_cursor_fini(&cursor); 1535 return (SET_ERROR(ENAMETOOLONG)); 1536 } 1537 1538 (void) strcpy(name, attr.za_name); 1539 if (idp) 1540 *idp = attr.za_first_integer; 1541 zap_cursor_advance(&cursor); 1542 *offp = zap_cursor_serialize(&cursor); 1543 zap_cursor_fini(&cursor); 1544 1545 return (0); 1546 } 1547 1548 /* 1549 * Find objsets under and including ddobj, call func(ds) on each. 1550 */ 1551 int 1552 dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, 1553 int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) 1554 { 1555 dsl_dir_t *dd; 1556 dsl_dataset_t *ds; 1557 zap_cursor_t zc; 1558 zap_attribute_t *attr; 1559 uint64_t thisobj; 1560 int err; 1561 1562 ASSERT(dsl_pool_config_held(dp)); 1563 1564 err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); 1565 if (err != 0) 1566 return (err); 1567 1568 /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ 1569 if (dd->dd_myname[0] == '$') { 1570 dsl_dir_rele(dd, FTAG); 1571 return (0); 1572 } 1573 1574 thisobj = dd->dd_phys->dd_head_dataset_obj; 1575 attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 1576 1577 /* 1578 * Iterate over all children. 
1579 */ 1580 if (flags & DS_FIND_CHILDREN) { 1581 for (zap_cursor_init(&zc, dp->dp_meta_objset, 1582 dd->dd_phys->dd_child_dir_zapobj); 1583 zap_cursor_retrieve(&zc, attr) == 0; 1584 (void) zap_cursor_advance(&zc)) { 1585 ASSERT3U(attr->za_integer_length, ==, 1586 sizeof (uint64_t)); 1587 ASSERT3U(attr->za_num_integers, ==, 1); 1588 1589 err = dmu_objset_find_dp(dp, attr->za_first_integer, 1590 func, arg, flags); 1591 if (err != 0) 1592 break; 1593 } 1594 zap_cursor_fini(&zc); 1595 1596 if (err != 0) { 1597 dsl_dir_rele(dd, FTAG); 1598 kmem_free(attr, sizeof (zap_attribute_t)); 1599 return (err); 1600 } 1601 } 1602 1603 /* 1604 * Iterate over all snapshots. 1605 */ 1606 if (flags & DS_FIND_SNAPSHOTS) { 1607 dsl_dataset_t *ds; 1608 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); 1609 1610 if (err == 0) { 1611 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; 1612 dsl_dataset_rele(ds, FTAG); 1613 1614 for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); 1615 zap_cursor_retrieve(&zc, attr) == 0; 1616 (void) zap_cursor_advance(&zc)) { 1617 ASSERT3U(attr->za_integer_length, ==, 1618 sizeof (uint64_t)); 1619 ASSERT3U(attr->za_num_integers, ==, 1); 1620 1621 err = dsl_dataset_hold_obj(dp, 1622 attr->za_first_integer, FTAG, &ds); 1623 if (err != 0) 1624 break; 1625 err = func(dp, ds, arg); 1626 dsl_dataset_rele(ds, FTAG); 1627 if (err != 0) 1628 break; 1629 } 1630 zap_cursor_fini(&zc); 1631 } 1632 } 1633 1634 dsl_dir_rele(dd, FTAG); 1635 kmem_free(attr, sizeof (zap_attribute_t)); 1636 1637 if (err != 0) 1638 return (err); 1639 1640 /* 1641 * Apply to self. 1642 */ 1643 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); 1644 if (err != 0) 1645 return (err); 1646 err = func(dp, ds, arg); 1647 dsl_dataset_rele(ds, FTAG); 1648 return (err); 1649 } 1650 1651 /* 1652 * Find all objsets under name, and for each, call 'func(child_name, arg)'. 1653 * The dp_config_rwlock must not be held when this is called, and it 1654 * will not be held when the callback is called. 
1655 * Therefore this function should only be used when the pool is not changing 1656 * (e.g. in syncing context), or the callback can deal with the possible races. 1657 */ 1658 static int 1659 dmu_objset_find_impl(spa_t *spa, const char *name, 1660 int func(const char *, void *), void *arg, int flags) 1661 { 1662 dsl_dir_t *dd; 1663 dsl_pool_t *dp = spa_get_dsl(spa); 1664 dsl_dataset_t *ds; 1665 zap_cursor_t zc; 1666 zap_attribute_t *attr; 1667 char *child; 1668 uint64_t thisobj; 1669 int err; 1670 1671 dsl_pool_config_enter(dp, FTAG); 1672 1673 err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); 1674 if (err != 0) { 1675 dsl_pool_config_exit(dp, FTAG); 1676 return (err); 1677 } 1678 1679 /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ 1680 if (dd->dd_myname[0] == '$') { 1681 dsl_dir_rele(dd, FTAG); 1682 dsl_pool_config_exit(dp, FTAG); 1683 return (0); 1684 } 1685 1686 thisobj = dd->dd_phys->dd_head_dataset_obj; 1687 attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 1688 1689 /* 1690 * Iterate over all children. 1691 */ 1692 if (flags & DS_FIND_CHILDREN) { 1693 for (zap_cursor_init(&zc, dp->dp_meta_objset, 1694 dd->dd_phys->dd_child_dir_zapobj); 1695 zap_cursor_retrieve(&zc, attr) == 0; 1696 (void) zap_cursor_advance(&zc)) { 1697 ASSERT3U(attr->za_integer_length, ==, 1698 sizeof (uint64_t)); 1699 ASSERT3U(attr->za_num_integers, ==, 1); 1700 1701 child = kmem_asprintf("%s/%s", name, attr->za_name); 1702 dsl_pool_config_exit(dp, FTAG); 1703 err = dmu_objset_find_impl(spa, child, 1704 func, arg, flags); 1705 dsl_pool_config_enter(dp, FTAG); 1706 strfree(child); 1707 if (err != 0) 1708 break; 1709 } 1710 zap_cursor_fini(&zc); 1711 1712 if (err != 0) { 1713 dsl_dir_rele(dd, FTAG); 1714 dsl_pool_config_exit(dp, FTAG); 1715 kmem_free(attr, sizeof (zap_attribute_t)); 1716 return (err); 1717 } 1718 } 1719 1720 /* 1721 * Iterate over all snapshots. 
1722 */ 1723 if (flags & DS_FIND_SNAPSHOTS) { 1724 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); 1725 1726 if (err == 0) { 1727 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; 1728 dsl_dataset_rele(ds, FTAG); 1729 1730 for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); 1731 zap_cursor_retrieve(&zc, attr) == 0; 1732 (void) zap_cursor_advance(&zc)) { 1733 ASSERT3U(attr->za_integer_length, ==, 1734 sizeof (uint64_t)); 1735 ASSERT3U(attr->za_num_integers, ==, 1); 1736 1737 child = kmem_asprintf("%s@%s", 1738 name, attr->za_name); 1739 dsl_pool_config_exit(dp, FTAG); 1740 err = func(child, arg); 1741 dsl_pool_config_enter(dp, FTAG); 1742 strfree(child); 1743 if (err != 0) 1744 break; 1745 } 1746 zap_cursor_fini(&zc); 1747 } 1748 } 1749 1750 dsl_dir_rele(dd, FTAG); 1751 kmem_free(attr, sizeof (zap_attribute_t)); 1752 dsl_pool_config_exit(dp, FTAG); 1753 1754 if (err != 0) 1755 return (err); 1756 1757 /* Apply to self. */ 1758 return (func(name, arg)); 1759 } 1760 1761 /* 1762 * See comment above dmu_objset_find_impl(). 1763 */ 1764 int 1765 dmu_objset_find(char *name, int func(const char *, void *), void *arg, 1766 int flags) 1767 { 1768 spa_t *spa; 1769 int error; 1770 1771 error = spa_open(name, &spa, FTAG); 1772 if (error != 0) 1773 return (error); 1774 error = dmu_objset_find_impl(spa, name, func, arg, flags); 1775 spa_close(spa, FTAG); 1776 return (error); 1777 } 1778 1779 void 1780 dmu_objset_set_user(objset_t *os, void *user_ptr) 1781 { 1782 ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); 1783 os->os_user_ptr = user_ptr; 1784 } 1785 1786 void * 1787 dmu_objset_get_user(objset_t *os) 1788 { 1789 ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); 1790 return (os->os_user_ptr); 1791 } 1792 1793 /* 1794 * Determine name of filesystem, given name of snapshot. 
1795 * buf must be at least MAXNAMELEN bytes 1796 */ 1797 int 1798 dmu_fsname(const char *snapname, char *buf) 1799 { 1800 char *atp = strchr(snapname, '@'); 1801 if (atp == NULL) 1802 return (SET_ERROR(EINVAL)); 1803 if (atp - snapname >= MAXNAMELEN) 1804 return (SET_ERROR(ENAMETOOLONG)); 1805 (void) strlcpy(buf, snapname, atp - snapname + 1); 1806 return (0); 1807 }